In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5,6,7"

In [2]:
from transformers import GenerationConfig, LlamaTokenizer, LlamaForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = LlamaTokenizer.from_pretrained("yahma/llama-13b-hf")

In [30]:
text = "这是个测试的\n\n"
full_tokens = tokenizer(
    text,
    truncation=True,
    padding=False,
    max_length=128,
    return_tensors="pt"
) # need eos

In [31]:
full_tokens

{'input_ids': tensor([[    1, 29871, 30810, 30392, 30502, 31851, 31787, 30210,    13,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
tokenizer.eos_token_id

2

In [9]:
tokenizer.pad_token_id = 0  

In [17]:
tokenizer.convert_tokens_to_ids("#")

29937

In [18]:
data = {'input_ids': [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 17506, 366, 975, 29888, 5367, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 2776, 3236, 12902, 565, 366, 508, 1074, 916, 1234, 29889, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 17506, 366, 975, 29888, 5367, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 2776, 3236, 12902, 565, 366, 508, 1074, 916, 1234, 29889, 2]}

In [19]:
raw_data = {
    "instruction": "Are you overfitting?",
    "input": "",
    "output": "Of course nah if you can see other answer."
    }
for ele in data['input_ids']:
    token = tokenizer.convert_ids_to_tokens(ele)
    print(token)

<s>
▁Below
▁is
▁an
▁instruction
▁that
▁describes
▁a
▁task
.
▁Write
▁a
▁response
▁that
▁appropri
ately
▁comple
tes
▁the
▁request
.
<0x0A>
<0x0A>
##
#
▁Inst
ruction
:
<0x0A>
Are
▁you
▁over
f
itting
?
<0x0A>
<0x0A>
##
#
▁Response
:
<0x0A>
Of
▁course
▁nah
▁if
▁you
▁can
▁see
▁other
▁answer
.
</s>


**加载训练完的PEFT模型**

训练好了一个PEFT模型之后，会在模型输出路径下有一个文件，`adapter_config.json`，这个文件中会记录这个Peft模型的类别等信息，比如lora的配置文件是长这样的：
```json
{
  "base_model_name_or_path": "yahma/llama-7b-hf",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 16,
  "target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM"
}
```
所以当我们需要用到这个模型的时候，指定base model以及模型的输出路径，peft库会自动的帮我们去做模型的加载。

In [20]:
from peft import PeftModel

In [21]:
import torch
out_dir = "models/llama-13b-lora-alpaca-round-0/checkpoint-5900"
base_model = "yahma/llama-13b-hf"
model = None

model = LlamaForCausalLM.from_pretrained(
    base_model, 
    load_in_8bit=False, 
    torch_dtype=torch.float16, 
    device_map="auto")
model = PeftModel.from_pretrained(
    model=model, 
    model_id=out_dir, 
    torch_dtype=torch.float16)
model.half()
model.eval()

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:33<00:00, 11.01s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 5120, padding_idx=0)
        (layers): ModuleList(
          (0-39): 40 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=5120, out_features=5120, bias=False
                (lora_dropout): Dropout(p=0.05, inplace=False)
                (lora_A): Linear(in_features=5120, out_features=8, bias=False)
                (lora_B): Linear(in_features=8, out_features=5120, bias=False)
              )
              (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
              (v_proj): Linear(
                in_features=5120, out_features=5120, bias=False
                (lora_dropout): Dropout(p=0.05, inplace=False)
                (lora_A): Linear(in_features=5120, out_features=8, bias=False)
                (lora_B): Linear(in_features=8, out_features=5120, 

In [22]:
from datasets import load_dataset, Dataset
data_belle = load_dataset("BelleGroup/train_3.5M_CN")
print(data_belle)

Found cached dataset json (/home/duhu/.cache/huggingface/datasets/BelleGroup___json/BelleGroup--train_3.5M_CN-d0ea45919c9eb506/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.71it/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 3606402
    })
})





In [23]:
data_belle['train'][1]

{'conversations': [{'from': 'human',
   'value': '给定一段文本和关键词列表，删除文本中包含所有给定关键词的子字符串。\n文本："这是一个测试句子，目的是看看模型是否可以正确地从这个句子中删除关键词。"\\n关键词列表：[‘测试’，‘模型’]'},
  {'from': 'assistant',
   'value': '删除包含所有给定关键词的子字符串后，文本变为："这是一个句子，目的是看看是否可以正确地从这个句子中删除关键词。"'},
  {'from': 'human', 'value': '好的。现在请你将这个文本中的所有的逗号都替换成空格。'},
  {'from': 'assistant',
   'value': '好的，请稍等一下，现在我会将文本中的所有逗号替换为空格。处理后文本为："这是一个句子 目的是看看是否可以正确地从这个句子中删除关键词。"。处理结果如何？'}],
 'id': '16012449'}

In [42]:
from datasets import Dataset

prompt_pre = (
    "The following is a conversation between an AI assistant called Doer and a human user called User. "
    "The assistant is intelligent, knowledgeable and polite to answer questions of user.\n\n"
)
prompt_history = "User: {input}\n\nDoer: {output}\n\n"
prompt_post = "User: {input}\n\nDoer: "

def generate_prompt(data_point, stage='val'):
    user_prompt = prompt_pre # 固定开场白
    # 这里面的字段是conversions，而不是input，因为上面的例子的字段是conversations
    conversations = data_point['conversations']
    # 获取多轮对话的轮数
#     assert len(conversations) % 2 == 0, f"{data_point} not compeleted finised the conversation"
    num_turns = max(len(conversations) // 2, 1)
    print(num_turns)
#     for i in range(num_turns - 1): # 最后一轮对话单独处理，此处不处理
#         assert conversations[2*i]['from'] == "human"
#         assert conversations[2*i+1]['from'] == "assistant"
#         human = conversations[2*i]['value']
#         assistant = conversations[2*i+1]['value']
#         user_prompt += prompt_history.format_map({'input': human, 'output': assistant})
    # 添加最后一轮对话的输入部分
    user_prompt += prompt_post.format_map({'input': conversations[2*num_turns-2]['value']})
    # 根据是训练还是推理，用不同的方式来处理最后一轮对话的回答部分
    if stage == 'train':
        user_prompt += conversations[2*num_turns-1]['value']
    
    return {"prompt": user_prompt}

data = {
  "conversations": [
    {
      "from": "human",
      "value": "你是谁？"
    }
  ],
  "id": "16012449"
}
one_prompter_raw = generate_prompt(data)
one_prompter = one_prompter_raw['prompt']
one_prompter

1


'The following is a conversation between an AI assistant called Doer and a human user called User. The assistant is intelligent, knowledgeable and polite to answer questions of user.\n\nUser: 你是谁？\n\nDoer: '

In [None]:
# some parameters
temperature=.1
top_p=0.75
top_k=40
num_beams=4
max_new_tokens=256

# generate config
generation_config = GenerationConfig(
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    num_beams=num_beams,
    bos_token_id=1,
    eos_token_id=2,
    pad_token_id=0,
    repetition_penalty=2.,
    bad_words_ids=tokenizer(['\n\nUser:','\n\Doer:'], add_special_tokens=False).input_ids,
)

# get inputs
inputs = tokenizer(one_prompter, return_tensors='pt')
print(inputs)
input_ids = inputs['input_ids'].to("cuda")
with torch.no_grad():
    generation_output = model.generate(
        input_ids = input_ids,
        generation_config = generation_config,
        return_dict_in_generate = True,
        output_scores=True,
        max_new_tokens=max_new_tokens
    )

s = generation_output.sequences[0]
output = tokenizer.decode(s)
print(output)

{'input_ids': tensor([[    1,   450,  1494,   338,   263, 14983,  1546,   385,   319, 29902,
         20255,  2000,  1938,   261,   322,   263,  5199,  1404,  2000,  4911,
         29889,   450, 20255,   338, 13052,   296, 29892,  7134,   519,   322,
          1248,   568,   304,  1234,  5155,   310,  1404, 29889,    13,    13,
          2659, 29901, 29871, 30919, 30392,   235,   179,   132, 30882,    13,
            13,  6132,   261, 29901, 29871]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}


In [34]:

import transformers
import copy

class prompt:
    def __init__(self, tokenizer, max_len, add_eos=True):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.add_eos=add_eos


class chat_prompt(prompt):
    prompt_pre = (
        "The following is a conversation between an AI assistant called Assistant and a human user called User. "
        "The assistant is intelligent, knowledgeable and polite to answer questions of user.\n\n"
    )
    prompt_history = "User:{input}\n\nAssistant:{output}\n\n"
    prompt_post = "User:{input}\n\nAssistant:"

    def preprocess_gen(self, data_point):
        user_prompt = self.prompt_pre
        len_avail = self.max_len - len(self.tokenizer(user_prompt, add_special_tokens=False)['input_ids'])
        input_prompt = self.prompt_post.format_map({'input':data_point['input']})
        len_avail -= len(self.tokenizer(input_prompt, add_special_tokens=False)['input_ids'])
        lens = len(data_point['history'])
        tokenized_lens = []
        for i in range(lens):
            tmp_prompt = self.prompt_history.format_map(data_point['history'][i])
            tokenized_lens.append(len(self.tokenizer(tmp_prompt,add_special_tokens=False)["input_ids"]))
        
        # 启发式：/2 优先除前面的
        i = 0
        while sum(tokenized_lens) > len_avail and i < lens:
            history = data_point['history'][i]
            tmp_len1 = len(history['input'])
            tmp_len2 = len(history['output'])
            if tmp_len2 > tmp_len1:
                history['output'] = history['output'][:tmp_len2//2]
            else:
                history['input'] = history['input'][:tmp_len1//2]
            prompt = self.prompt_history.format_map(history)
            single_len =(len(self.tokenizer(prompt,add_special_tokens=False)["input_ids"]))
            tokenized_lens[i] = single_len
            i += 1
        total_len = sum(tokenized_lens)
        # 还不够的话 直接截断
        while total_len > len_avail and i < lens - 1 :
            total_len -= tokenized_lens[i]
            data_point['history'] = data_point['history'][1:]
            i += 1
        # 最终合并
        for i in range(lens):
            user_prompt += self.prompt_history.format_map(data_point['history'][i])
        user_prompt += input_prompt
        print({'real_input:':user_prompt})
        inputs = self.tokenizer(user_prompt)["input_ids"]
        return inputs

    def preprocess_train(self, data_point):
        user_prompt = self.prompt_pre
        lens = len(data_point['input'])
        for i in range(lens-1):
            user_prompt += self.prompt_history.format_map({'input':data_point['input'][i].strip(),'output':data_point['output'][i].strip()})
        user_prompt += self.prompt_post.format_map({'input':data_point['input'][-1].strip()})

        len_user_prompt_tokens = len(self.tokenizer(
            user_prompt,
            truncation=True,
            max_length=self.max_len,
        )["input_ids"]) - 1 # remove extra eos
        if self.add_eos:
            full_tokens = self.tokenizer(
                user_prompt + data_point["output"][-1].strip(),
                truncation=True,
                padding=False,
                max_length=self.max_len,
            )["input_ids"] # need eos
        else:
            full_tokens = self.tokenizer(
                user_prompt + data_point["output"][-1].strip(),
                truncation=True,
                padding=False,
                max_length=self.max_len+1,
            )["input_ids"][:-1] # delete eos
        return {
            "input_ids": full_tokens,
            "labels": [-100] * len_user_prompt_tokens + full_tokens[len_user_prompt_tokens:],
            "attention_mask": [1] * (len(full_tokens)),
        }

    def data_collator(self,):
        return transformers.DataCollatorForSeq2Seq(self.tokenizer)

    def postprocess(self, text, render=False):
        output = text.split("Assistant:")[-1].strip()
        if 'User:' in output:
            output = output.split("User:")[0]
        output = output.replace('�','') 
        if render:
            # fix gradio chatbot markdown code render bug
            lines = output.split("\n")
            for i, line in enumerate(lines):
                if "```" in line:
                    if line != "```":
                        lines[i] = f'<pre><code class="language-{lines[i][3:]}">'
                    else:
                        lines[i] = '</code></pre>'
                else:
                    if i > 0:
                        lines[i] = "<br/>" + line.replace("<", "&lt;").replace(">", "&gt;").replace("__", '\_\_')
            output =  "".join(lines)
            # output = output.replace('<br/><pre>','\n<pre>') work for html; but not for gradio
        return output

    def get_data_collator():
        return transformers.DataCollatorForLanguageModeling


In [43]:
data_point = {
    "input": ["A B C D E F G H", "J K L M N O P Q"],
    "output": [" 1 2 3", "4 5 6"]
}
prompt = chat_prompt(tokenizer, max_len=128, add_eos=True)

In [44]:
import json
print(prompt.preprocess_train(data_point))

{'input_ids': [1, 450, 1494, 338, 263, 14983, 1546, 385, 319, 29902, 20255, 2000, 4007, 22137, 322, 263, 5199, 1404, 2000, 4911, 29889, 450, 20255, 338, 13052, 296, 29892, 7134, 519, 322, 1248, 568, 304, 1234, 5155, 310, 1404, 29889, 13, 13, 2659, 29901, 29909, 350, 315, 360, 382, 383, 402, 379, 13, 13, 7900, 22137, 29901, 29896, 29871, 29906, 29871, 29941, 13, 13, 2659, 29901, 29967, 476, 365, 341, 405, 438, 349, 660, 13, 13, 7900, 22137, 29901, 29946, 29871, 29945, 29871, 29953], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 29901, 29946, 29871, 29945, 29871, 29953], 'at

In [46]:
tokenizer.convert_ids_to_tokens(29953)

'6'