In [8]:
!gpustat

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1m[37mTnT                     [m  Wed Sep 20 19:36:18 2023  [1m[30m515.86.01[m
[36m[0][m [34mNVIDIA A100-PCIE-40GB[m |[1m[31m 68°C[m, [1m[32m100 %[m | [36m[1m[33m 5363[m / [33m40960[m MB | [1m[30mhss0729[m([33m1853M[m) [1m[30mlyhe[m([33m2393M[m) [1m[30mghzhao[m([33m551M[m)
[36m[1][m [34mNVIDIA A100-PCIE-40GB[m |[1m[31m 51°C[m, [1m[32m 68 %[m | [36m[1m[33m32258[m / [33m40960[m MB | [1m[30mliliz[m([33m30815M[m)
[36m[2][m [34mNVIDIA A100-PCIE-40GB[m |[31m 35°C[m, [1m[32m 38 %[m | [36m[1m[33m39361[m / [33m40960[m MB | [1m[30mjiaxianyan[m([33m38795M[m)
[36m[3][m [34mNVIDIA A100-PCIE-40GB[m |[31m 49°C[m, [1m[32m 37 %[m | [36m

In [9]:
import sys
import json
import os
os.environ['CUDA_VISIBLE_DEVICES']='4'

import fire
# import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModel, TextStreamer

# from utils.prompter import Prompter
import bitsandbytes as bnb

import warnings
warnings.filterwarnings("ignore")

In [10]:
# Sanity check: report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())

True


In [11]:
# --- Run configuration -------------------------------------------------------
# load_8bit  : forwarded to `from_pretrained(load_in_8bit=...)` when loading.
# model_name : selects which checkpoint to evaluate; one of the branches below.
load_8bit = False
model_name = 'SFT'

# LoRA adapter location. Stays None for plain base models so that later cells
# can always reference the name without hitting a NameError.
lora_weights = None

if model_name == 'LLaMA2':
    base_model = 'meta-llama/Llama-2-7b-chat-hf'
    model_path = 'meta-llama/Llama-2-7b-chat-hf'
elif model_name == 'ChatGLM2':
    base_model = 'THUDM/chatglm2-6b'
    model_path = 'THUDM/chatglm2-6b'
elif model_name == 'SFT':
    base_model = 'meta-llama/Llama-2-7b-chat-hf'
    model_path = './model_SFT'
    lora_weights = model_path
else:
    # Fail here with a clear message instead of a NameError on `base_model`
    # somewhere further down the notebook.
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected 'LLaMA2', 'ChatGLM2' or 'SFT'"
    )

# Always define `device`; previously it was left unset on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
# Load the tokenizer and model for the configured `model_name`.
# Only the LoRA fine-tuned ('SFT') variant is handled in this notebook: the base
# model is loaded in fp16 and the adapter in `lora_weights` is applied on top.
if model_name in ['SFT']:
    print('Loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    print('Loading model...')
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    print(f"using lora {lora_weights}")
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
else:
    # Without this branch `model` / `tokenizer` are never defined and the
    # torch.compile call below dies with an unhelpful NameError.
    raise NotImplementedError(
        f"Model loading is only implemented for model_name='SFT', got {model_name!r}"
    )

# torch.compile is only attempted on PyTorch 2.x and on non-Windows platforms.
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)


Loading tokenizer...
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import json
import os.path as osp
from typing import Union


class Prompter_LLaMA2(object):
    """Build LLaMA-2 chat prompts and pull the model's reply out of decoded text.

    The prompt wraps a fixed system message and the user text in the
    ``[INST] <<SYS>> ... <</SYS>> ... [/INST]`` layout.
    """

    # NOTE: the "template" slot is declared but never assigned in this class.
    __slots__ = ("template", "_verbose")

    def __init__(self, verbose: bool = False):
        super().__init__()
        # Keep the flag so the declared `_verbose` slot is actually populated.
        self._verbose = verbose

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
    ) -> str:
        """Return the full chat prompt for ``instruction`` (+ optional ``input``).

        ``input``, when truthy, is concatenated directly after ``instruction``
        with no separator.
        """
        if input:
            prompt = instruction + input
        else:
            prompt = instruction
        # NOTE(review): the quote around 'I don't know. is unbalanced; the text is
        # left untouched because it is part of the prompt sent to the model.
        system_message = "Please give a proper response to the instruction. Do not say 'I don't know."
        prompt_template=f'''[INST] <<SYS>>
{system_message}
<</SYS>>

{prompt} [/INST]'''

        return prompt_template

    @staticmethod
    def _strip_token(text: str, token: Union[None, str]) -> str:
        """Remove whole leading/trailing occurrences of ``token`` plus whitespace."""
        text = text.strip()
        if not token:
            return text
        while text.startswith(token):
            text = text[len(token):].lstrip()
        while text.endswith(token):
            text = text[:-len(token)].rstrip()
        return text

    def get_response(self, output: str, eos_token: Union[None, str] = None) -> str:
        """Extract the reply that follows the last ``[/INST]`` marker.

        Args:
            output: Decoded model output (prompt + completion).
            eos_token: End-of-sequence marker to remove from the ends of the
                reply. Defaults to the notebook-level ``tokenizer.eos_token``.

        Returns:
            The reply with surrounding whitespace and EOS markers removed.
        """
        if eos_token is None:
            eos_token = tokenizer.eos_token
        reply = output.split('[/INST]')[-1]
        # str.strip(eos_token) would strip the *characters* of the token
        # (e.g. '<', '/', 's', '>'), eating a trailing "s" from real words;
        # remove the token as a whole substring instead.
        return self._strip_token(reply, eos_token)
    
class Prompter_ChatGLM2(object):
    """Prompt helper for ChatGLM2: the prompt is the raw instruction text."""

    # NOTE: the "template" slot is declared but never assigned in this class.
    __slots__ = ("template", "_verbose")

    def __init__(self, verbose: bool = False):
        super().__init__()
        # Keep the flag so the declared `_verbose` slot is actually populated.
        self._verbose = verbose

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
    ) -> str:
        """Return ``instruction`` with ``input`` appended directly when given.

        No template or separator is added.
        """
        if input:
            prompt = instruction + input
        else:
            prompt = instruction

        prompt_template = prompt
        return prompt_template

    @staticmethod
    def _strip_token(text: str, token: Union[None, str]) -> str:
        """Remove whole leading/trailing occurrences of ``token`` plus whitespace."""
        text = text.strip()
        if not token:
            return text
        while text.startswith(token):
            text = text[len(token):].lstrip()
        while text.endswith(token):
            text = text[:-len(token)].rstrip()
        return text

    def get_response(self, output: str, eos_token: Union[None, str] = None) -> str:
        """Clean decoded model output.

        Args:
            output: Decoded model output.
            eos_token: End-of-sequence marker to remove from the ends of the
                text. Defaults to the notebook-level ``tokenizer.eos_token``.

        Returns:
            ``output`` with surrounding whitespace and EOS markers removed.
        """
        if eos_token is None:
            eos_token = tokenizer.eos_token
        # str.strip(eos_token) treats its argument as a character set and would
        # eat matching letters from the ends of real words; remove the token
        # as a whole substring instead.
        return self._strip_token(output, eos_token)

# Prompt builder used by `evaluate` below.
# NOTE(review): this is always the LLaMA-2 prompter, independent of `model_name`;
# switch to Prompter_ChatGLM2 manually if a ChatGLM2 model is evaluated.
prompter = Prompter_LLaMA2()

def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=4096,
    **kwargs,
):
    """Generate a response for one instruction with the notebook-level model.

    Relies on the notebook globals `prompter`, `tokenizer`, `model` and `device`.

    Args:
        instruction: Instruction text.
        input: Optional extra text handed to the prompter with the instruction.
        temperature, top_p, top_k, num_beams: Forwarded to `GenerationConfig`.
        max_new_tokens: Upper bound on the number of generated tokens.
        **kwargs: Extra `GenerationConfig` fields (e.g. `do_sample=True`).

    Returns:
        A tuple `(instruction, response)` where `response` is the decoded text
        after it has been post-processed by `prompter.get_response`.
    """
    prompt = prompter.generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    # Forward the tokenizer's attention mask when it provides one, so that
    # generate() does not have to infer it from the input ids.
    extra_inputs = {}
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        extra_inputs["attention_mask"] = attention_mask.to(device)

    # NOTE(review): temperature/top_p/top_k presumably only take effect when
    # sampling is enabled; pass do_sample=True via **kwargs if sampling is
    # intended. Left unchanged here to keep existing results reproducible.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        # Per-step scores are not requested: they were never read, and keeping
        # them for up to `max_new_tokens` steps only costs memory.
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens,
            **extra_inputs,
        )
    sequence = generation_output.sequences[0]
    output = tokenizer.decode(sequence, skip_special_tokens=True)
    return instruction, prompter.get_response(output)

In [15]:
# Quick smoke test: send a trivial instruction through the pipeline and show
# the echoed instruction followed by the model's reply, one per line.
instruction, response = evaluate('hello')
print(instruction, response, sep='\n')

hello
Given your interest in learning more about the world, I'd be happy to help you with your question. However, I must point out that the term "hello" is a common greeting used in many cultures and languages. It's not a specific term that can be associated with any particular country or region.
If you have any specific information or context regarding the term "hello," I'd be more than happy to help you with your query.


In [7]:
import pandas as pd
from tqdm import tqdm
# Run the SFT model over every instruction of each dataset and store the
# (instruction, response) pairs as ./results/SFT/<dataset>/results.csv.
for dataset_name in ['mind_small_dev','steam', 'ml-100k']:
    # The instructions are taken from the LLaMA2 baseline results file.
    instruction_path = f'../gen_exp_4_model/results/LLaMA2/{dataset_name}/results.csv'
    instruction_df = pd.read_csv(instruction_path)

    result_save_path = f'./results/SFT/{dataset_name}/'
    os.makedirs(result_save_path, exist_ok=True)

    # Collect rows in a plain list and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copies the whole frame
    # on every call.
    records = []
    for prompt_text in tqdm(instruction_df['instruction'], total=instruction_df.shape[0]):
        instruction, response = evaluate(prompt_text)
        records.append({'instruction': instruction, 'response': response})

    df_result = pd.DataFrame(records, columns=['instruction', 'response'])
    # Write only after generation has finished so that a crash half-way through
    # the run does not truncate a results file from a previous run.
    df_result.to_csv(result_save_path + 'results.csv', index=False, encoding='UTF-8')

100%|██████████| 1000/1000 [53:06<00:00,  3.19s/it]
100%|██████████| 1000/1000 [1:07:41<00:00,  4.06s/it]
100%|██████████| 943/943 [1:08:07<00:00,  4.33s/it]


In [9]:
# Merge all experiment results

import pandas as pd
import os
# Combine the responses of every model into one CSV per dataset, written to
# ./results/all/<dataset>.csv with one column per model.
origin_path = '../gen_exp_4_model/results/'
SFT_path = './results/SFT/'
all_path = './results/all/'
os.makedirs(all_path, exist_ok=True)

for dataset_name in ['mind_small_dev','steam', 'ml-100k']:
    # Output column name -> results file of that model, in output column order.
    result_files = {
        'LLaMA2': origin_path + f'LLaMA2/{dataset_name}/results.csv',
        'ChatGLM2': origin_path + f'ChatGLM2/{dataset_name}/results.csv',
        'GPT3.5': origin_path + f'GPT3.5/{dataset_name}/results.csv',
        'GPT4': origin_path + f'GPT4/{dataset_name}/results.csv',
        'LLaMA2-SFT': SFT_path + f'{dataset_name}/results.csv',
    }

    # The LLaMA2 file supplies the shared 'instruction' column and the row index
    # that every other response column is aligned to.
    df_LLaMA2 = pd.read_csv(result_files['LLaMA2'])
    df_all = df_LLaMA2[['instruction']].copy()

    for column, path in result_files.items():
        df_model = df_LLaMA2 if column == 'LLaMA2' else pd.read_csv(path)
        if len(df_model) != len(df_all):
            # Column assignment aligns on the index, so a length mismatch would
            # otherwise be silently turned into NaNs / dropped rows.
            print(
                f"WARNING: {path} has {len(df_model)} rows, "
                f"expected {len(df_all)}; rows are aligned by position"
            )
        df_all[column] = df_model['response']

    df_all.to_csv(all_path + f'{dataset_name}.csv', index=False)
    