In [3]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer
from vllm import (LLM, SamplingParams)

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

# Number of CUDA devices visible to this process (0 on CPU-only hosts).
devive_cnt = th.cuda.device_count()

print(f"device = {device}; devive_cnt = {devive_cnt}")
# Report the PyTorch build and the CUDA version it was compiled against.
for _version in (th.__version__, th.version.cuda):
    print(_version)

device = cuda; devive_cnt = 1
2.5.1+cu121
12.1


In [5]:
# Filesystem layout used by the rest of the notebook.
# The two root locations can be overridden through environment variables so the
# notebook can run on another machine; the defaults are the original paths.
path_project = os.environ.get(
    "MLC_PROJECT_DIR",
    "C:/my_project/MyGit/Machine-Learning-Column/hugging_face",
)
path_model = os.environ.get("MLC_MODEL_DIR", "F:/LLM")

# data/ and output/ are siblings of the project folder (one level above it).
path_data = os.path.join(os.path.dirname(path_project), "data")
path_output = os.path.join(os.path.dirname(path_project), "output")

## step-1: 数据源

In [6]:
# Corpus CSV, given relative to path_data (joined with it when the file is read).
filename = "ht/pt_data.csv"

In [7]:
# Load the raw corpus from <path_data>/<filename> into a DataFrame.
df_csv = pd.read_csv(os.path.join(path_data, filename))

In [8]:
# Quick sanity check: column names, non-null counts and dtypes.
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    9 non-null      object
 1   content  9 non-null      object
dtypes: object(2)
memory usage: 276.0+ bytes


## step-2: tokenizer

In [9]:
# Model identifier; it is joined onto path_model to locate the local copy
# used for both the tokenizer and the base model.
checkpoint = "Qwen/Qwen2.5-3B-Instruct"

In [10]:
# Load the tokenizer strictly from the local copy under path_model:
# local_files_only=True and force_download=False keep the call offline.
tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    force_download=False,
    cache_dir=path_model,
)

In [10]:
# Pretty-print the tokenizer's special tokens (eos/pad are used below to build the text).
pp(tokenizer.special_tokens_map)

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
                               '<|im_end|>',
                               '<|object_ref_start|>',
                               '<|object_ref_end|>',
                               '<|box_start|>',
                               '<|box_end|>',
                               '<|quad_start|>',
                               '<|quad_end|>',
                               '<|vision_start|>',
                               '<|vision_end|>',
                               '<|vision_pad|>',
                               '<|image_pad|>',
                               '<|video_pad|>']}


In [45]:
# Build one training string per row:
#   <|im_start|> + title + newline + content + eos token + pad token
df_csv["text"] = (
    "<|im_start|>"
    + df_csv["title"]
    + "\n"
    + df_csv["content"]
    + tokenizer.eos_token
    + tokenizer.pad_token
)

In [46]:
# Wrap the DataFrame as a datasets.Dataset (all rows are used for training).
dataset = Dataset.from_pandas(df_csv)
# Alternative kept for reference: hold out 15% of the rows as a test split.
# dataset = Dataset.from_pandas(df_csv).train_test_split(test_size=0.15, shuffle=True, seed=0)

In [47]:
# Show the dataset's features and row count.
pp(dataset)

Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 9
})


## step-3: 量化参数

In [14]:
# bitsandbytes settings for QLoRA-style loading: 4-bit NF4 weights, bfloat16
# compute dtype and double quantization.
# NOTE: this object is currently unused -- the quantization_config argument is
# commented out where the base model is loaded.
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True
)  # QLoRA

## step-4: 载入基模

In [15]:
# Load the causal-LM base model from the local copy under path_model in bfloat16.
# local_files_only=True / force_download=False keep the call offline;
# device_map="auto" leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16,
    low_cpu_mem_usage=True,
    # Quantized loading is disabled; re-enable the line below to apply config_bnb.
    # quantization_config=(config_bnb if config_bnb else None),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Prepare the model for training: turn on gradient checkpointing, make the
# input embeddings require grads, and switch off the generation KV cache
# (presumably because caching is not useful with checkpointing -- confirm).
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()
base_model.config.use_cache = False

# With more than one visible GPU, flag the model as parallelizable / model-parallel.
if th.cuda.device_count() > 1:
    base_model.is_parallelizable = True
    base_model.model_parallel = True

In [17]:
# Snapshot GPU memory after loading the model: bytes held by live tensors
# versus bytes reserved by the caching allocator; both are printed in GiB.
allocated_memory, cached_memory = th.cuda.memory_allocated(), th.cuda.memory_reserved()
print(
    f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, "
    f"已缓存的GPU内存：{cached_memory / 1024**3:.2f}G"
)

已分配的GPU内存：1.91G, 已缓存的GPU内存：2.06G


In [18]:
# Compare the tokenizer's vocabulary size with the number of rows in the input
# embedding matrix; the embeddings are only grown when the tokenizer is larger.
tokenizer_size = len(tokenizer)
embedding_size = base_model.get_input_embeddings().weight.size(0)
print(f"tokenizer_size = {tokenizer_size}; embedding_size = {embedding_size}")

needs_resize = embedding_size < tokenizer_size
if needs_resize:
    base_model.resize_token_embeddings(tokenizer_size)

tokenizer_size = 151665; embedding_size = 151936


## step-5: 模型参数

In [19]:
# Central hyper-parameter table read by the training cells below.
config_model = dict(
    # LoRA settings
    rank=8,
    lora_alpha=32,
    lora_dropout=0.1,
    use_rslora=True,
    # optimisation schedule
    epochs=2,
    batch_size=1,
    gradient_steps=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    # tokenisation
    max_seq_length=512,
)

## step-6: LoRA参数

## step-7: 模型训练

In [None]:
# Tokenization function applied to the dataset with Dataset.map(batched=True).
def tokenize_function(sample, max_length=None):
    """Tokenize a batch of rows for causal-LM training.

    Args:
        sample: Batch mapping handed over by ``Dataset.map``; ``sample["text"]``
            holds the strings to tokenize.
        max_length: Truncation length in tokens. When ``None`` the value is read
            from ``config_model["max_seq_length"]`` (falling back to 512), so the
            notebook's hyper-parameter table stays the single source of truth.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an extra
        ``labels`` entry holding a copy of ``input_ids``.
    """
    if max_length is None:
        max_length = config_model.get("max_seq_length", 512)

    # No return_tensors here: Dataset.map stores plain lists in Arrow, so
    # building torch tensors first only adds a conversion round trip.
    inputs = tokenizer(
        text=sample["text"],
        max_length=max_length,
        truncation=True,
        padding=True,
    )
    # Copy instead of aliasing so labels and input_ids are independent objects.
    inputs["labels"] = [list(ids) for ids in inputs["input_ids"]]
    return inputs

In [24]:
# Tokenize the whole dataset in batches and drop the original columns so that
# only the columns returned by tokenize_function remain.
dataset_tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [18]:
# Display the tokenized dataset's features and row count.
dataset_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9
})

In [23]:
# Training configuration. Hyper-parameters are read from config_model by
# subscripting (not .get) so a missing or misspelled key fails right here
# instead of silently passing None to the trainer.
args_train = TrainingArguments(
    output_dir=os.path.join(path_output, "model_pt"),
    num_train_epochs=config_model["epochs"],
    per_device_train_batch_size=config_model["batch_size"],
    per_device_eval_batch_size=config_model["batch_size"],
    gradient_accumulation_steps=config_model["gradient_steps"],
    gradient_checkpointing=True,
    optim="adamw_torch",
    learning_rate=config_model["learning_rate"],
    weight_decay=config_model["weight_decay"],
    # Log, evaluate and checkpoint once per epoch; keeping the save and eval
    # schedules aligned is what lets the best checkpoint be reloaded at the end.
    logging_strategy="epoch",
    save_strategy="epoch",
    eval_strategy="epoch",  # replaces the deprecated `evaluation_strategy` name
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True
)

In [24]:
# Batch collator for language modeling; mlm=False selects the causal-LM mode
# (no masked-token objective).
# NOTE(review): the collator presumably rebuilds `labels` from `input_ids` -- confirm.
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 

In [None]:
# Assemble the trainer. The same tokenized dataset is used for both training
# and evaluation, so eval_loss is measured on the training data.
trainer = Trainer(
    args=args_train,
    model=base_model,
    tokenizer=tokenizer,
    train_dataset=dataset_tokenized,
    eval_dataset=dataset_tokenized,
    data_collator=collate_fn,
)

In [None]:
# Run training and keep the returned result object for later inspection.
res_train = trainer.train()