In [3]:
# %pip install -U datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece --user
# %pip install transformers==4.37.2 --user

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch as th
# from torch.utils.tensorboard import SummaryWriter
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification)
from transformers.integrations import TensorBoardCallback
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType)
from trl import SFTTrainer

In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device_name = "cuda" if th.cuda.is_available() else "cpu"
device = th.device(device_name)
print(device)

cuda


In [3]:
# Directory layout: "data" and "model" live next to the project directory,
# i.e. inside the project directory's parent.
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_root = os.path.dirname(path_project)
path_data, path_model = (os.path.join(path_root, sub_dir) for sub_dir in ("data", "model"))

## step-1: 载入数据源

In [4]:
# Parquet file to load; the path is joined onto path_data when the dataset is read.
# Alternative dataset kept for reference:
# filename = "NousResearch/json-mode-eval/train-00000-of-00001.parquet"
filename = "tatsu-lab/alpaca/train-00000-of-00001-a09b74b3ef9c3b56.parquet"

In [5]:
# Read the local parquet file with the generic "parquet" loader.
# split="all" asks for everything in the file as one dataset object.
parquet_file = os.path.join(path_data, filename)
dataset = load_dataset(path="parquet", data_files=parquet_file, split="all")

In [6]:
# Keep a small subset for a quick experiment, then split it 80/20 into train/test.
# The subset size is clamped to the dataset length so a file with fewer than
# 200 rows does not make `select` fail on out-of-range indices.
# NOTE: this cell rebinds `dataset` (first to the subset, then to the split result),
# so re-run the loading cell before re-running this one.
n_samples = min(200, len(dataset))
dataset = dataset.select(range(n_samples))
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
dataset_train, dataset_test = dataset["train"], dataset["test"]

## step-2: tokenizer

In [7]:
# Name of the local checkpoint directory; it is joined onto path_model when loading.
checkpoint = "chatglm3-6b"

In [8]:
# Load the tokenizer from the local checkpoint directory only (no download).
# trust_remote_code=True lets the checkpoint's own tokenizer code be executed.
tokenizer_dir = os.path.join(path_model, checkpoint)
tokenizer_options = {
    "cache_dir": path_model,
    "force_download": False,
    "local_files_only": True,
    "trust_remote_code": True,
}
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, **tokenizer_options)

In [12]:
# tokenizer.pad_token  # '<unk>'
# tokenizer.eos_token  # '</s>'
# tokenizer.pad_token = tokenizer.eos_token  # 半精度训练时需要
# tokenizer.padding_side = "right"  # llama2

## step-3: 配置量化参数

In [9]:
# Quantization settings passed to the model loader: 8-bit weights.
# The 4-bit options are kept (disabled) for reference.
quantization_options = {
    "load_in_8bit": True,
    # "load_in_4bit": True,
    # "bnb_4bit_quant_type": "nf4",
    # "bnb_4bit_compute_dtype": th.bfloat16,
    # "bnb_4bit_use_double_quant": True,
}
config_bnb = BitsAndBytesConfig(**quantization_options)

## step-4: 载入基础/任务大模型

In [10]:
# Load the base causal-LM from the local checkpoint directory (no download),
# quantized according to config_bnb, with bfloat16 requested as the torch dtype
# and device placement left to device_map="auto".
model_dir = os.path.join(path_model, checkpoint)
model_load_options = {
    "cache_dir": path_model,
    "force_download": False,
    "local_files_only": True,
    "trust_remote_code": True,
    "device_map": "auto",
    "torch_dtype": th.bfloat16,
    "quantization_config": config_bnb,
}
model_base = AutoModelForCausalLM.from_pretrained(model_dir, **model_load_options)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [11]:
# Print one line per model parameter: running index, name, shape, dtype and device.
param_index = 0
for param_name, param in model_base.named_parameters():
    print(f"{param_index}  name: {param_name};  shape: {param.shape};  dtype: {param.dtype};  device: {param.device}")
    param_index += 1

0  name: transformer.embedding.word_embeddings.weight;  shape: torch.Size([65024, 4096]);  dtype: torch.bfloat16;  device: cuda:0
1  name: transformer.encoder.layers.0.input_layernorm.weight;  shape: torch.Size([4096]);  dtype: torch.bfloat16;  device: cuda:0
2  name: transformer.encoder.layers.0.self_attention.query_key_value.weight;  shape: torch.Size([9437184, 1]);  dtype: torch.uint8;  device: cuda:0
3  name: transformer.encoder.layers.0.self_attention.query_key_value.bias;  shape: torch.Size([4608]);  dtype: torch.bfloat16;  device: cuda:0
4  name: transformer.encoder.layers.0.self_attention.dense.weight;  shape: torch.Size([8388608, 1]);  dtype: torch.uint8;  device: cuda:0
5  name: transformer.encoder.layers.0.post_attention_layernorm.weight;  shape: torch.Size([4096]);  dtype: torch.bfloat16;  device: cuda:0
6  name: transformer.encoder.layers.0.mlp.dense_h_to_4h.weight;  shape: torch.Size([56098816, 1]);  dtype: torch.uint8;  device: cuda:0
7  name: transformer.encoder.layers.

In [12]:
# Show the dtype the loaded model reports for itself.
print(model_base.dtype)

torch.bfloat16


In [13]:
# Make sure the input embedding matrix has a row for every tokenizer entry:
# resize only when the tokenizer is larger than the current embedding table.
input_embeddings = model_base.get_input_embeddings()
embedding_size = input_embeddings.weight.shape[0]
tokenizer_size = len(tokenizer)
if embedding_size < tokenizer_size:
    model_base.resize_token_embeddings(tokenizer_size)

## step-5: 配置模型参数

In [15]:
# Hyper-parameters shared by the LoRA config, the training arguments and the trainer.
# The sequence-length key is spelled "max_seq_length" so that the lookup
# config_model.get("max_seq_length") in the trainer cell actually finds it
# (a misspelled key makes .get() return None silently).
config_model = {
    # LoRA adapter settings
    "rank": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "use_rslora": True,
    # optimisation settings
    "epochs": 3,
    "batch_size": 2,
    "gradient_steps": 2,
    "learning_rate": 0.001,
    "weight_decay": 0.01,
    # maximum sequence length handed to the trainer
    "max_seq_length": 512
}

## step-6: 配置LoRA模型

In [16]:
# LoRA: Low-Rank Adaptation of Large Language Models
# Alternative ways of choosing the adapted modules, kept for reference:
# config_lora = LoraConfig(target_modules=["0"])
# config_lora = LoraConfig(target_modules=["query_key_value", "dense_4h_to_h"])
# config_lora = LoraConfig(target_modules=[".*\.1.*query_key_value"])
# config_lora = LoraConfig(target_modules=["query_key_value"], modules_to_save=["word_embeddings"])

# Adapter hyper-parameters are taken from config_model; no target_modules are
# given here, so the library's own choice for this model type applies.
lora_settings = {
    "r": config_model.get("rank"),
    "lora_alpha": config_model.get("lora_alpha"),
    "lora_dropout": config_model.get("lora_dropout"),
    "use_rslora": config_model.get("use_rslora"),
}
config_lora = LoraConfig(bias="none", task_type=TaskType.CAUSAL_LM, **lora_settings)

In [17]:
# Wrap the base model with the LoRA adapters described by config_lora.
# model_base = prepare_model_for_int8_training(model_base)
model_lora = get_peft_model(model=model_base, peft_config=config_lora)
# The training arguments enable gradient checkpointing. With the base weights frozen,
# the inputs of the checkpointed blocks must require grad, otherwise the backward
# pass has nothing to differentiate through and training fails.
model_lora.enable_input_require_grads()
# The generation cache is not needed while training.
model_lora.config.use_cache = False

In [19]:
# print_trainable_parameters - 1
# The method prints its own summary and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model_lora.print_trainable_parameters()

# print_trainable_parameters - 2 (manual equivalent, kept for reference)
# trainable_params = 0
# all_params = 0

# for param in model_lora.parameters():
#     if param.requires_grad:
#         trainable_params += param.numel()
#     all_params += param.numel()

# print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 1949696 || all params: 3390261248 || trainable%: 0.0575


## step-7: 模型训练

In [20]:
# Training arguments: hyper-parameters come from config_model; checkpoints are
# written to <path_model>/model_sft. Saving and evaluation both happen once per
# epoch, which load_best_model_at_end=True relies on to pick the best checkpoint.
path_output = os.path.join(path_model, "model_sft")
per_device_batch_size = config_model.get("batch_size")
args_train = TrainingArguments(
    output_dir=path_output,
    # schedule and batch sizes
    num_train_epochs=config_model.get("epochs"),
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    gradient_accumulation_steps=config_model.get("gradient_steps"),
    gradient_checkpointing=True,
    # optimizer
    optim="adamw_torch",
    learning_rate=config_model.get("learning_rate"),
    weight_decay=config_model.get("weight_decay"),
    # checkpointing and evaluation
    save_strategy="epoch",
    evaluation_strategy="epoch",
    save_total_limit=3,
    #metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [21]:
# Batch collator handed to the trainer; mlm=False turns off masked-language-model
# masking so batches are prepared for causal language modelling.
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 
# Other collators that were tried / considered, kept for reference:
# collate_fn = DataCollatorWithPadding(tokenizer)
# collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=True)
# collate_fn = DataCollatorForTokenClassification(tokenizer)

In [22]:
# Supervised fine-tuning trainer over the LoRA-wrapped model.
# The sequence-length limit is read under its correct name and, as a fallback,
# under the misspelling "max_seq_lenght", so the configured value is applied
# either way instead of silently becoming None.
max_seq_length = config_model.get("max_seq_length", config_model.get("max_seq_lenght"))
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    args=args_train,
    peft_config=config_lora,
    data_collator=collate_fn,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="text",  # dataset column that holds the training text
    packing=True,
    max_seq_length=max_seq_length,
    #compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [None]:
# Run fine-tuning; whatever trainer.train() returns is kept in res_train.
res_train = trainer.train()

## step-8: 模型评估

In [None]:
# Evaluate with no explicit dataset argument (the trainer was built with
# eval_dataset=dataset_test); the result is kept in res_eval.
res_eval = trainer.evaluate()
# Variants that pass the evaluation dataset explicitly:
# res_eval = trainer.evaluate(dataset_train)
# res_eval = trainer.evaluate(dataset_test)

## step-9: 模型保存

In [None]:
# Save the trained model to <path_model>/model_sft (the same directory that is
# used as the training output_dir).
trainer.save_model(output_dir=os.path.join(path_model, "model_sft"))

## step-10: 模型加载

In [None]:
# reload model_base

In [None]:
# load model_sft
# `model_base` cannot be reused here: get_peft_model() already injected LoRA layers
# into it in place, and it is 8-bit quantized, which the merge below was noted to
# error on. A fresh, un-quantized copy of the base model is therefore loaded first.
# NOTE(review): this keeps a second copy of the base weights in memory; free the
# training objects beforehand if memory is tight.
model_base_fresh = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=th.bfloat16
)
# Attach the saved adapter in inference mode.
model_sft = PeftModel.from_pretrained(
    model=model_base_fresh,
    model_id=os.path.join(path_model, "model_sft"),
    is_trainable=False
)
# Fold the adapter into the base weights (W + BA) and drop the wrapper to speed up inference.
model_sft = model_sft.merge_and_unload()

## step-11: 模型推理