In [None]:
# conda create --name unsloth python=3.11
# conda env list
# activate unsloth
# conda install jupyterlab ipykernel
# python -m ipykernel install --user --name unsloth --display-name "Python unsloth"
# pip install --upgrade --force-reinstall  --no-cache-dir unsloth unsloth_zoo
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
from pprint import pp

import torch as th
# import torch_npu as th_npu

# unsloth is imported ahead of transformers/trl: it patches those libraries on import
# and warns when it is loaded after them.
from unsloth import FastLanguageModel
from unsloth.chat_templates import standardize_sharegpt

from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from trl import (SFTConfig, SFTTrainer)

In [3]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
# device = th.device("npu" if th.npu.is_available() else "cpu")
devive_cnt = th.cuda.device_count()
# devive_cnt = th.npu.device_count()

# Environment report: selected device, number of CUDA devices, torch and CUDA versions.
for report_line in (
    f"device = {device}; devive_cnt = {devive_cnt}",
    f"torch version = {th.__version__}",
    f"cuda version = {th.version.cuda}",
):
    print(report_line)

device = cuda; devive_cnt = 1
torch version = 2.5.1+cu121
cuda version = 12.1


In [4]:
# Machine-specific locations of the project checkout and the model store.
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_model = "F:/LLM"

# The data directory is a sibling of the project directory; outputs go under the model store.
path_data = os.path.join(os.path.split(path_project)[0], "data")
path_output = os.path.join(path_model, "output")

## step-1: 数据源

In [None]:
# Parquet file holding the training data, given relative to path_data.
filename = "alpaca/train-00000-of-00001-a09b74b3ef9c3b56.parquet"

In [None]:
# Read the parquet file through the generic "parquet" loader, requesting split="all".
parquet_file = os.path.join(path_data, filename)
dataset = load_dataset(path="parquet", data_files=parquet_file, split="all")

In [None]:
# Pilot run: keep at most the first 2000 rows, then hold out 20% of them for evaluation.
# min() keeps the selection valid when the source has fewer than 2000 rows
# (selecting indices past the end would raise).
dataset = dataset.select(range(min(2000, len(dataset))))
# Fixed seed so the shuffled split is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

## step-2~4: tokenizer/量化/载入基模

In [None]:
# Load the base model and its tokenizer from a directory under path_model.
# NOTE(review): `checkpoint` is not assigned in any cell visible in this notebook;
# it has to name the model sub-directory before this cell runs — confirm where it is set.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=os.path.join(path_model, checkpoint),
    max_seq_length=2048,
    dtype=th.bfloat16,
    load_in_4bit=False
)

In [None]:
# Register "<|im_start|>" as the tokenizer's BOS token.
tokenizer.add_special_tokens({"bos_token": "<|im_start|>"})

In [None]:
# Show the tokenizer's special tokens and its padding side, one line per attribute.
for attr_name in ("bos_token", "eos_token", "pad_token", "padding_side"):
    pp(f"{attr_name} = {getattr(tokenizer, attr_name)}")

In [None]:
# List every parameter of the base model with its index, shape, dtype and device.
for idx, (name, param) in enumerate(base_model.named_parameters()):
    print(
        f"{idx}  name: {name};  shape: {param.shape};  "
        f"dtype: {param.dtype};  device: {param.device}"
    )

In [None]:
# Prepare the base model for training: turn on gradient checkpointing, make the
# inputs require gradients, and switch the generation cache off in the model config.
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()
base_model.config.use_cache = False

In [None]:
# Report the GPU name, its total memory and the peak memory reserved so far (both in GB).
bytes_per_gb = 1024**3
gpu_stats = th.cuda.get_device_properties(device)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_memory = round(th.cuda.max_memory_reserved() / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB")
print(f"{start_memory} GB of memory reserved.")

GPU = NVIDIA GeForce GTX 1080 Ti. Max memory = 11.0 GB
0.0 GB of memory reserved.


## step-5: 模型参数

In [None]:
# Training configuration for the supervised fine-tuning run.
# Logging, checkpointing and evaluation all happen once per epoch so that
# load_best_model_at_end can compare checkpoints on eval_loss.
train_args = SFTConfig(
    output_dir=os.path.join(path_output, "model_unsloth"),
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,  # effective train batch of 2 per device
    gradient_checkpointing=False,
    optim="adamw_torch",
    learning_rate=0.00005,
    warmup_steps=5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",  # linear, cosine
    logging_strategy="epoch",
    save_strategy="epoch",
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    dataset_text_field="text",  # column the trainer reads the training text from
    report_to="wandb",
)

## step-6: LoRA参数

In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projections
# plus the output head.
lora_model = FastLanguageModel.get_peft_model(
    model=base_model,
    r=16,
    lora_alpha=32,  # the comma was missing here, which made the whole cell a SyntaxError
    lora_dropout=0.1,
    bias="none",
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # self_attn
        "gate_proj", "up_proj", "down_proj",  # mlp
        "lm_head",  # lm_head
    ],
)

In [None]:
# print_trainable_parameters() writes the summary itself and returns None,
# so wrapping it in print() only added a stray "None" line.
lora_model.print_trainable_parameters()

## step-7: 整理函数

In [None]:
# Method 1: build the training text with the tokenizer's chat template.
def apply_sft_template(sample, system_prompt="You are a helpful assistant."):
    """Render one instruction-tuning record as chat-formatted text.

    Args:
        sample: Mapping with "instruction", "input" and "output" entries.
        system_prompt: Content of the system message. It is a parameter with a
            default so the function no longer depends on a global that is only
            assigned in a later cell of the notebook.

    Returns:
        The same ``sample`` with the rendered conversation stored under "text".
    """
    instruction = sample["instruction"]
    extra_input = sample["input"] or ""
    # Only append the optional input when there is one, so records without it
    # do not end the user turn with a dangling newline.
    user_prompt = instruction + "\n" + extra_input if extra_input else instruction
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": sample["output"]},
    ]
    # The assistant answer is already part of the messages, hence no generation prompt.
    sample["text"] = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
        enable_thinking=False,  # for Qwen3
    )
    return sample

# Add the rendered "text" entry to every record, then pull out the two splits.
dataset = dataset.map(apply_sft_template)
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Method 2: pass the dataset through unsloth's standardize_sharegpt helper.
# NOTE(review): this cell reassigns the same names as the apply_sft_template cell
# above; presumably only one of the two methods is meant to be run — confirm.
dataset = standardize_sharegpt(dataset)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

## step-8: 模型训练

In [None]:
# Batch collator for language modeling with masked-LM mode switched off (mlm=False).
collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Assemble the trainer from the config, the LoRA model, the tokenizer,
# the two dataset splits and the collator.
trainer = SFTTrainer(
    args=train_args,
    model=lora_model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
)

In [None]:
# Run the fine-tuning loop; whatever trainer.train() returns is kept in training_result.
training_result = trainer.train()

## step-9: 模型评估

In [None]:
# Evaluate with the trainer (it was built with test_dataset as eval_dataset)
# and pretty-print the result.
evaluating_result = trainer.evaluate()
pp(evaluating_result)

## step-10: 模型保存

In [None]:
# Export the model and tokenizer via save_pretrained_gguf with the "bf16" quantization method.
# NOTE(review): "lora_model_bf16" is a relative path, so the export is not placed
# under path_output — confirm that this is intended.
lora_model.save_pretrained_gguf("lora_model_bf16", tokenizer, quantization_method="bf16")  # q4_k_m

## step-11: 模型推理

In [None]:
# Hand the LoRA model to unsloth's for_inference() before generating text.
FastLanguageModel.for_inference(lora_model)

In [None]:
# Conversation used for the generation check: one system turn and one user turn.
system_prompt = "You are a helpful assistant."
user_prompt = "你好，好久不见！"

messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

In [None]:
# Render the conversation to a prompt string, this time with the generation prompt appended.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)

# Tokenize the prompt as a batch of one and move the result to the selected device.
model_inputs = tokenizer([text], return_tensors="pt")
model_inputs = model_inputs.to(device)

In [None]:
# Greedy decoding. With do_sample=False the sampling knobs (top_p, temperature)
# are not used by generate() and only trigger warnings, so they are left out.
# To sample instead, set "do_sample": True and add e.g. "top_p": 0.5, "temperature": 0.5.
gen_kwargs = {
    "max_new_tokens": 1024,
    "do_sample": False,
}

In [None]:
# Generate under inference mode.
with th.inference_mode():
    complete_ids = lora_model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        use_cache=True,
        **gen_kwargs
    )

# Each generated sequence starts with its prompt tokens; slice them off so that
# only the newly produced tokens are decoded.
input_ids = model_inputs.input_ids
generated_ids = [
    full_seq[len(prompt_seq):]
    for prompt_seq, full_seq in zip(input_ids, complete_ids)
]
decoded = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)
response = decoded[0]
print(response)