In [1]:
import os
# Restrict CUDA to device index 6. This cell runs before the cell that imports
# torch, so the setting is already in the environment by then.
# NOTE(review): the index is specific to the original host — adjust elsewhere.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)

In [3]:
# Checkpoint to fine-tune, and the short label used when naming output folders.
model_description = "Breeze"
model_name = "MediaTek-Research/Breeze-7B-Instruct-v0_1"

# Lighter alternative (only 1.1B parameters) — uncomment both lines to switch.
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_description = "TinyLlama"

# Project root, and the raw text file used as the fine-tuning corpus.
root = "/home/coder/projects/test/story_structure"
file_path = f"{root}/data/raw/percy_jackson.txt"

### Fine-tuning the model with PEFT (LoRA)

In [4]:
def load_data(file_path, chunk_size=256):
    """Read a UTF-8 text file and split it into fixed-size character chunks.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Number of characters per chunk; the last chunk may be
            shorter. Defaults to 256, the value that used to be hard-coded.

    Returns:
        A `Dataset` with a single `text` column holding the chunks in file
        order. An empty file yields an empty `text` column.

    Raises:
        ValueError: If `chunk_size` is not a positive integer.
    """
    if not isinstance(chunk_size, int) or chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Slicing is by character count, not by tokens, so a chunk can end mid-word.
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    return Dataset.from_dict({'text': chunks})

In [5]:
def prepare_model_and_tokenizer(model_name):
    """Load a 4-bit quantized causal LM wrapped with LoRA adapters, plus its tokenizer.

    Args:
        model_name: Identifier passed to the tokenizer's and the model's
            `from_pretrained`.

    Returns:
        Tuple `(model, tokenizer)`. `model` is the result of `get_peft_model`;
        `tokenizer` has its pad token set to the EOS token when it did not
        define one.
    """
    # 4-bit NF4 quantization with double quantization; compute dtype is fp16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The notebook pads every example with padding="max_length", which needs a
    # pad token. Fall back to EOS for tokenizers that do not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # LoRA adapters on the q_proj and v_proj modules only.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Prepare the quantized model for training first, then attach the adapters.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    return model, tokenizer


In [6]:
def tokenize_function(examples, tokenizer, max_length=256):
    """Tokenize a batch of text into fixed-length sequences.

    Args:
        examples: Mapping with a `text` entry holding the strings to tokenize.
        tokenizer: Callable tokenizer applied to `examples["text"]`.
        max_length: Length that every sequence is truncated and padded to.
            Defaults to 256, the value that used to be hard-coded.

    Returns:
        Whatever `tokenizer` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )

In [7]:
# Load the raw text file as a dataset of text chunks
dataset = load_data(file_path)

# Load the quantized model with LoRA adapters attached, and its tokenizer
model, tokenizer = prepare_model_and_tokenizer(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Tokenize the dataset in batches. All original columns are removed, so only
# the columns produced by the tokenizer remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=dataset.column_names,
)

Map:   0%|          | 0/3248 [00:00<?, ? examples/s]

In [9]:
# Training configuration.
# Per device, one optimizer step covers 2 sequences x 8 accumulation steps = 16.
training_args = TrainingArguments(
    output_dir=f"{root}/reports/{model_description}",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # Has no effect while max_steps is positive: max_steps takes precedence.
    num_train_epochs=2,
    learning_rate=1e-4,
    fp16=True,
    # Larger than max_steps, so no intermediate checkpoint is written.
    save_steps=200,
    logging_steps=20,
    max_steps=100,
    # Keep warmup well below max_steps. With warmup_steps == max_steps the
    # learning rate would still be ramping up when training stops.
    warmup_steps=10,
    optim="paged_adamw_8bit"
)

In [10]:
# Data collator used to assemble training batches
# (mlm=False: masked-language-model masking is turned off)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [11]:
# Build the trainer from the model, training arguments, tokenized dataset and collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [None]:
# Start training
trainer.train()

  return fn(*args, **kwargs)


In [None]:
# Save the trained model under {root}/models/{model_description}; this is the
# folder load_finetuned_model later reads the adapter from.
# NOTE(review): presumably only the LoRA adapter weights are written, since
# `model` is the PEFT-wrapped model — confirm.
model.save_pretrained(f"{root}/models/{model_description}")

### Test model response

In [None]:
def load_original_model(model_name):
    """Load the pretrained model in fp16 together with its tokenizer.

    Args:
        model_name: Identifier passed to both `from_pretrained` calls.

    Returns:
        Tuple `(model, tokenizer)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # fp16 weights; device placement is left to device_map="auto".
    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    return model, tokenizer

In [None]:
def load_finetuned_model(model_name, model_description):
    """Load the base model in fp16 and attach the saved adapter to it.

    Args:
        model_name: Identifier of the base model and of the tokenizer.
        model_description: Name of the adapter folder under `{root}/models`.

    Returns:
        Tuple `(model, tokenizer)`, where `model` is the base model wrapped by
        `PeftModel.from_pretrained`.
    """
    # Adapter location is derived from the notebook-level `root`.
    adapter_dir = f"{root}/models/{model_description}"

    # fp16 base weights; device placement is left to device_map="auto".
    backbone = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )

    # Wrap the base model with the adapter weights stored in adapter_dir.
    finetuned = PeftModel.from_pretrained(backbone, adapter_dir)

    return finetuned, AutoTokenizer.from_pretrained(model_name)

In [None]:
def generate_response(model, tokenizer, prompt, max_length=512):
    """Sample one completion for `prompt` and return the decoded text.

    Args:
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Input text.
        max_length: Forwarded to `generate` as `max_length`. In Transformers
            this limit covers the prompt tokens plus the generated tokens.

    Returns:
        The first returned sequence, decoded with special tokens skipped. As
        the outputs recorded in this notebook show, the text starts with the
        prompt itself.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Some tokenizers define no pad token. Fall back to EOS so that generate
    # always receives an explicit pad id rather than None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=pad_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Test prompt (in Chinese): asks whether the model has read the Greek-mythology
# novel "Percy Jackson" and, if so, to explain the main storyline.
question = "你看過波西傑克森這本關於希臘神話的小說嗎?看過的話說明一下故事主軸。"

In [None]:
# Generate and print the answer of the model returned by load_original_model.
# NOTE(review): this rebinds the global `model` / `tokenizer` names that the
# training cells assigned earlier.
print("=== 原始模型回應 ===")
model, tokenizer = load_original_model(model_name)
original_response = generate_response(model, tokenizer, question)
print(original_response)

=== 原始模型回應 ===
你看過波西傑克森這本關於希臘神話的小說嗎?看過的話說明一下故事主軸。


In [None]:
# Generate and print the answer of the model returned by load_finetuned_model.
# NOTE(review): this rebinds the global `model` / `tokenizer` names once more.
print("=== 微調後模型回應 ===")
model, tokenizer = load_finetuned_model(model_name, model_description)
finetuned_response = generate_response(model, tokenizer, question)
print(finetuned_response)

=== 微調後模型回應 ===
你看過波西傑克森這本關於希臘神話的小說嗎?看過的話說明一下故事主軸。希臘神話是在希臘遺族繼承的。在尼泊尼爾遺族，我們從希臘遺族歷史中獲得了一些經驗。在希臘遺族中，我們在從希臘遺族的廣大協同世界中拍攝的戲劇中，可以看到很多難事。我們在某些地方做了過的事。很多地方我們做的事，不過很少是我們做的事。在我們做的事中，我們很少是幫助人民的。我們做的事，是盡可能的幫助協力。很多地方我們做的事，是從不賜個幫助。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是幫助人民。我們很少是�
