一、导入相关包

In [1]:
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer,
                          PreTrainedTokenizerFast)
from peft import LoraConfig, TaskType, get_peft_model
from typing import Dict
import pandas as pd
import torch
from tqdm import tqdm
import random
import warnings

# pd.set_option('future.no_silent_downcasting', True)
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides the 'Got too long input_ids!'
# warning that process_func raises via warnings.warn.
warnings.filterwarnings('ignore')

二、加载数据集

In [2]:
# Input file location; read_review expects one "<label> <review>" record per line.
train_path = '../data/train.ft.txt'
# test_path = '../data/test.ft.txt'

def read_review(path: str) -> pd.DataFrame:
    """Load a labelled review file into a two-column DataFrame.

    Every non-blank line is split at its first space into a label token and
    the review text. The label tokens ``__label__1`` / ``__label__2`` are
    rewritten to ``'1'`` / ``'2'``; any other label value is kept unchanged.

    Args:
        path: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        A DataFrame with string columns ``Label`` and ``Review``, one row per
        non-blank input line, in file order. A line that holds only a label
        produces an empty ``Review``.
    """
    reviews_list = []
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        # readlines() keeps the total known, so tqdm can show a full progress bar.
        for line in tqdm(file.readlines()):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            # partition() never raises: without a space the review is simply ''.
            label, _, review = line.partition(' ')
            reviews_list.append((label, review))
    df = pd.DataFrame(reviews_list, columns=['Label', 'Review'])
    # Shorten the raw label tokens to plain '1' / '2' strings.
    df['Label'] = df['Label'].replace({'__label__1': '1', '__label__2': '2'})
    return df

# Parse the training file into a DataFrame; the bare name on the last line displays it.
train_df = read_review(train_path)
train_df

100%|██████████| 350443/350443 [00:00<00:00, 1049595.82it/s]


Unnamed: 0,Label,Review
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
350438,1,Oracle SQL needs Help: This book waltzes throu...
350439,1,Unusual writing style: This book chronicles th...
350440,1,"Junk.: They didn't give ""no stars"" as an optio..."
350441,1,WORTHLESS!!!: This book only presents a sequen...


In [3]:
# Convert the pandas DataFrame into a `datasets.Dataset`; the bare name displays its summary.
train_ds = Dataset.from_pandas(train_df)
train_ds

Dataset({
    features: ['Label', 'Review'],
    num_rows: 350443
})

In [4]:
# Peek at the first three records of the dataset.
train_ds[:3]

{'Label': ['2', '2', '2'],
 'Review': ['Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^',
  "The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.",
  'Amazing!: Thi

三、数据集预处理（转化为input_ids、attention_mask等）

In [5]:
model_path = "../model/LLM-Research/Meta-Llama-3-8B"

# Load the fast tokenizer from the local checkpoint directory only.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    use_fast=True,
)

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
# Create a pad token (the model does not come with one); training needs it later.
# The EOS token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Turn one record into a few-shot prompt plus supervised target; a base model needs no chat template.
def process_func(origin: Dict[str, str], train_ds: Dataset, tokenizer: PreTrainedTokenizerFast, 
                 max_length: int=1024, shot_num: int=3):
    """Build the tokenised few-shot training example for a single review.

    The prompt is an instruction line, ``shot_num`` labelled examples drawn at
    random (with replacement, via the global ``random`` state) from
    ``train_ds``, and finally the review in ``origin`` with an open
    ``Label:`` slot. The supervised target is ``origin['Label']`` followed by
    the tokenizer's EOS token.

    Args:
        origin: Record with string fields ``'Review'`` and ``'Label'``.
        train_ds: Indexable pool of records (same fields) for the examples.
        tokenizer: Callable returning ``input_ids`` / ``attention_mask`` and
            exposing ``eos_token_id``.
        max_length: Upper bound on the number of tokens in the result.
        shot_num: Number of in-context examples to include.

    Returns:
        Dict with equally long lists ``input_ids``, ``attention_mask`` and
        ``labels``. ``labels`` is ``-100`` on every prompt position, so only
        the answer and EOS contribute to the loss.

    Warns:
        UserWarning: when the example is longer than ``max_length``. The
            prompt is then shortened from the front so that the answer and
            EOS are always kept.
    """
    # Randomly pick the in-context examples.
    shot_list = random.choices(train_ds, k=shot_num)

    # Assemble the prompt text from the instruction, the examples and the query.
    pieces = ["Please classify the following text into Label 1 or Label 2. "
              "And 2 represents positive emotions and 1 represents negative emotions. \n"]
    for shot in shot_list:
        pieces.append("\n    Text: " + shot['Review'] + "\n    Label: " + shot['Label'] + "\n")
    pieces.append("\n    Text: " + origin['Review'] + "\n    Label: ")
    input_content = "".join(pieces)

    # Tokenise prompt and answer separately so the prompt positions can be masked.
    inputs_1 = tokenizer(input_content, add_special_tokens=False)
    inputs_2 = tokenizer(origin['Label'], add_special_tokens=False)
    prompt_ids = inputs_1["input_ids"]
    prompt_mask = inputs_1["attention_mask"]
    target_ids = inputs_2["input_ids"] + [tokenizer.eos_token_id]
    target_mask = inputs_2["attention_mask"] + [1]

    # Enforce max_length. Tokens are removed from the *front* of the prompt:
    # cutting from the end would delete the answer the model is trained on.
    overflow = len(prompt_ids) + len(target_ids) - max_length
    if overflow > 0:
        warnings.warn('Got too long input_ids!')
        drop = min(overflow, len(prompt_ids))
        prompt_ids = prompt_ids[drop:]
        prompt_mask = prompt_mask[drop:]

    return {
        "input_ids": prompt_ids + target_ids,
        "attention_mask": prompt_mask + target_mask,
        "labels": [-100] * len(prompt_ids) + target_ids
    }

In [7]:
# Convert a subset of the data into token form.
nums = 10000
mini_train_ds = Dataset.from_pandas(train_df.iloc[:nums])

# Fix the seed of the `random` module before the few-shot examples are drawn.
random.seed(42)
tokenized_id = mini_train_ds.map(
    process_func,
    fn_kwargs=dict(train_ds=train_ds, tokenizer=tokenizer),
    remove_columns=mini_train_ds.column_names,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
# Inspect the result: decode the token ids of the example at index 1 back into text.
print(tokenizer.decode(tokenized_id[1]["input_ids"]))

Please classify the following text into Label 1 or Label 2. And 2 represents positive emotions and 1 represents negative emotions. 

    Text: Great shower head: I bought this to replace the old one in my bathroom that I use to give my toddler a bath. The shower head itself is great, but the cord is much shorter than the old one was which is a bit of a problem. It barely reaches the tub's faucet. I am hoping to find a replacement cord.
    Label: 2

    Text: Great Advice, the kids are all right: This book helped us tremendously, from the time she was born our now 14 month old was constantly crying, whining and difficult. Dr. Dobson's advice to follow God's will and enforce discipline made an almost instant impact on our daughter. Who would have thought that a small pat on the rear of our 3 month old would work so well? We continued this (some call it abuse) for most of the last year and she hardly ever makes a sound!We now have a quiet little girl, who finally keeps to herself.Now we 

四、创建模型

In [9]:
# Create the model: bfloat16 weights, placed on the CUDA device, loaded from local files only.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    device_map='cuda',
    torch_dtype=torch.bfloat16,
)

# With gradient checkpointing enabled, the cache must be turned off and
# enable_input_require_grads() must be called.
model.config.use_cache = False
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Lora 低秩适配器

In [10]:
# LoRA configuration.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,   # training mode
    r=8,                    # LoRA rank
    lora_alpha=32,          # LoRA alpha
    lora_dropout=0.1,       # dropout ratio
    # Names of the modules that receive an adapter.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'up_proj', 'v_proj', 'down_proj', 'o_proj', 'gate_proj', 'k_proj', 'q_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [11]:
# Wrap the base model with the LoRA configuration and print the trainable-parameter summary.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell on its
# own would presumably wrap it a second time — re-run from the model-creation cell instead.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.26047588741133265


五、配置训练参数

In [12]:
# Training hyper-parameters, grouped by concern.
args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="../lora",
    logging_dir="../lora/logging",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Logging and checkpoint cadence.
    logging_steps=10,
    save_steps=50,
    save_total_limit=100,
    save_on_each_node=True,
    # Gradient checkpointing is switched on to reduce GPU memory consumption.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': True},
)

六、创建训练器

In [13]:
# Batch collator built from the tokenizer, with padding enabled.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# Tie together the LoRA-wrapped model, the training arguments and the tokenised dataset.
trainer = Trainer(model=model, args=args, train_dataset=tokenized_id, data_collator=collator)

七、模型训练

In [14]:
# Run training with the trainer built in the previous section.
trainer.train()

Step,Training Loss
10,2.7175
20,1.2446
30,0.2971
40,0.0835
50,0.0502
60,0.0561
70,0.0643
80,0.0563
90,0.0531
100,0.0587


TrainOutput(global_step=3125, training_loss=0.04668684987545013, metrics={'train_runtime': 14312.0955, 'train_samples_per_second': 3.494, 'train_steps_per_second': 0.218, 'total_flos': 1.3015855036340306e+18, 'train_loss': 0.04668684987545013, 'epoch': 5.0})

八、保存结果

In [15]:
# Directory that receives the saved artifacts.
peft_model_id="../lora/result"

# Save the trained model and the tokenizer into the same directory.
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('../lora/result/tokenizer_config.json',
 '../lora/result/special_tokens_map.json',
 '../lora/result/tokenizer.json')