<a href="https://colab.research.google.com/github/Genimix/gdp-dashboard/blob/main/TestFinetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets peft accelerate bitsandbytes



In [2]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, default_data_collator
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset

In [3]:
model_name = "Qwen/Qwen2.5-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # 自动放到GPU上
    trust_remote_code=True,
    load_in_4bit=True    # QLoRA 精度优化
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [4]:
# ============ Step 1: 读取 JSON 文件 ============
data = []
with open('./sample_data/sft_dataset.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)  # 注意：不是逐行读取，而是整个文件解析

# 预处理：拼接成 Qwen 的 ChatML 格式
processed_data = []
for item in raw_data:
    instruction = item.get("instruction", "").strip()
    input_text = item.get("input", "").strip()
    output = item.get("output", "").strip()

    prompt = f"<|im_start|>user\n{instruction}\n\n{input_text}\n<|im_end|>\n"
    full_text = f"{prompt}<|im_start|>assistant\n{output}\n<|im_end|>"

    processed_data.append({"text": full_text})

In [5]:
# ============ Step 3: 准备 LoRA 配置 ============
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 921,600 || all params: 3,086,860,288 || trainable%: 0.0299


In [6]:
# 转换为 HuggingFace Dataset
dataset = Dataset.from_list(processed_data)

# Tokenization，并附加 labels（用于 causal LM 微调）
def tokenize(example):
    tokens = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=1024,
    )
    # 注意这里使用 torch.tensor 强制转换
    return {
        "input_ids": torch.tensor(tokens["input_ids"]),
        "attention_mask": torch.tensor(tokens["attention_mask"]),
        "labels": torch.tensor(tokens["input_ids"])  # 通常 labels = input_ids
    }
tokenized_dataset = dataset.map(tokenize, batched=True)
# tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/2004 [00:00<?, ? examples/s]

In [None]:
dataset[:2]

{'text': ['<|im_start|>user\n请将下面的英语短语翻译为中文，并转换为常用中文语序，确保术语符合“固定建筑物”领域专业表达，仅输出 JSON 格式结果。\n\n{Portable strong boxes, e.g. which may be fixed to a wall or the like(E05G1/08, E05G1/14 take precedence; devices to prevent theft or loss of bags, trunks, baskets, or the like A45C13/18; coin boxes for coin freed apparatus G07F9/06)}\n<|im_end|>\n<|im_start|>assistant\n{"translation": "{便携式保险箱,例如可被固定在墙上或类似物上(E05G 1/08,E05G 1/14优先;包,箱子,篮子或类似物品的防盗或防丢失的装置入A45C13/18;硬币释放装置的硬币盒入G07F9/06;投币自动售货机的钱币盒入G07D1/00B)}"}\n<|im_end|>',
  '<|im_start|>user\n请将下面的英语短语翻译为中文，并转换为常用中文语序，确保术语符合“固定建筑物”领域专业表达，仅输出 JSON 格式结果。\n\n{Mechanical features of panels}\n<|im_end|>\n<|im_start|>assistant\n{"translation": "镶板的机械特征"}\n<|im_end|>']}

In [7]:
# ============ Step 5: 训练参数 ============
training_args = TrainingArguments(
    output_dir="./qwen2.5-3b-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    remove_unused_columns=False
)

In [8]:
# ============ Step 6: 开始训练 ============
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kw

Step,Training Loss
10,7.0974
20,2.5974
30,0.6919
40,0.3919
50,0.3381
60,0.2626
70,0.2169
80,0.1801
90,0.1518
100,0.1282


  return fn(*args, **kwargs)


KeyboardInterrupt: 

In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B", trust_remote_code=True, device_map="auto", load_in_4bit=True)
model = PeftModel.from_pretrained(base_model, "./qwen2.5-3b-lora")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B", trust_remote_code=True)

input_text = """### instruction：
请将下面的英语短语翻译为中文，并转换为常用中文语序，确保术语符合“固定建筑物”领域专业表达，仅输出 JSON 格式结果。
### inputs：
for mounting door leaves, window sashes or the like
### outputs：
"""
instruction = "请将下面的英语短语翻译为中文，并转换为常用中文语序，确保术语符合“固定建筑物”领域专业表达，仅输出 JSON 格式结果。\n\n{Conveyors; Paddle wheels; Endless belts(E02B15/101 takes precedence)}"
prompt = f"<|im_start|>user\n{instruction}\n<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)
print('结果为：', tokenizer.decode(outputs[0], skip_special_tokens=True))


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


结果为： user
请将下面的英语短语翻译为中文，并转换为常用中文语序，确保术语符合“固定建筑物”领域专业表达，仅输出 JSON 格式结果。

{Conveyors; Paddle wheels; Endless belts(E02B15/101 takes precedence)}

assistant
prises
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesassistant
prisesa

In [9]:
trainer.save_model("./qwen2.5-3b-lora")
tokenizer.save_pretrained("./qwen2.5-3b-lora")

('./qwen2.5-3b-lora/tokenizer_config.json',
 './qwen2.5-3b-lora/special_tokens_map.json',
 './qwen2.5-3b-lora/vocab.json',
 './qwen2.5-3b-lora/merges.txt',
 './qwen2.5-3b-lora/added_tokens.json',
 './qwen2.5-3b-lora/tokenizer.json')