In [1]:
import os, sys

# Put the parent of the current working directory on sys.path; the later
# `from src.utils import ...` presumably relies on this -- TODO confirm.
root = os.path.dirname(os.getcwd())
sys.path.append(root)
print(f"{root = }")

# Restrict CUDA to physical device 5. This is set here, ahead of the torch
# import in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = '5'

# NOTE(review): `root` is rebound to a hard-coded absolute path, so every later
# f"{root}/..." path uses this value, not the cwd-derived one printed above.
root = '/home/coder/projects/test/story_structure'


In [2]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)

from src.utils import Log, helper_function

In [3]:
# Seed through the project helper (which RNGs it seeds is defined in src.utils,
# not visible in this notebook).
helper_function.set_seed(42)
# Request deterministic cuDNN kernels and turn off the cuDNN autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# Build the project logger; it writes to <root>/logs/log.log.
log = Log()
logger = log.set_logger(
    file_path=f"{root}/logs/log.log",
    level=1,
    freq="D",
    interval=10,
    backup=3,
    name="log",
)

# Project sub-directories, each resolved against `root`.
paths = {
    folder: f'{root}/{folder}'
    for folder in ('data', 'reports', 'models', 'src')
}

# Keyword arguments unpacked into Train(...) via **params["init"].
params = {'init': {'paths': paths}}

# Legacy inline settings, kept for reference. The loaded experiment config
# carries the model name, description, data file path and chunk size.
# model_name = "MediaTek-Research/Breeze-7B-Instruct-v0_1"
# model_description = "Breeze"

# # model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # only 1.1B parameters
# # model_description = "TinyLlama"
# file_path = f"{paths['data']}/raw/percy_jackson.txt"

# chunk_size = 256

In [5]:
# class object
class Train():
    """Plain container for the notebook's shared state (path mapping and logger)."""
    def __init__(self, paths, logger):
        """Store the project path mapping and the logger on the instance."""
        self.paths = paths
        self.logger = logger
        
# NOTE: the instance is bound to the module-level name `self`; the functions
# and cells below read and write attributes on this global object.
self = Train(**params["init"], logger=logger)

In [6]:
# Load the experiment config file (config.yml) and keep it on the shared state object.
self.exp_config = helper_function.load_config(f"{paths['src']}/config/experiment/config.yml")
# Bare expression so the notebook displays the loaded config.
self.exp_config

{'data': {'file_path': '/raw/percy_jackson.txt'},
 'model': {'name': 'MediaTek-Research/Breeze-7B-Instruct-v0_1',
  'description': 'breeze',
  'chunk_size': 256,
  'early_stopping_patience': 20},
 'training_args': {'save_steps': 200,
  'logging_steps': 20,
  'num_train_epochs': 2,
  'per_device_train_batch_size': 2,
  'gradient_accumulation_steps': 8,
  'warmup_steps': 100,
  'learning_rate': 0.0001,
  'optim': 'paged_adamw_8bit',
  'fp16': True}}

### Fine-tuning the model with PEFT (LoRA)

In [7]:
def load_data(file_path):
    """Read a UTF-8 text file and split it into fixed-width character chunks.

    The chunk width is taken from ``self.exp_config["model"]["chunk_size"]`` and
    is also stored on the notebook-level ``self`` as ``self.chunk_size`` (read
    later by ``tokenize_function``).

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``datasets.Dataset`` with a single ``text`` column, one row per chunk.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Chunking is by characters, not tokens; the final chunk may be shorter.
    width = self.exp_config["model"]["chunk_size"]
    self.chunk_size = width

    pieces = []
    for start in range(0, len(raw_text), width):
        pieces.append(raw_text[start:start + width])

    return Dataset.from_dict({'text': pieces})

In [8]:
def prepare_model_and_tokenizer(model_name):
    """Load a 4-bit quantized causal LM with its tokenizer and attach LoRA adapters.

    Args:
        model_name: Model id or path handed to the ``from_pretrained`` loaders.

    Returns:
        tuple: ``(model, tokenizer)``, where ``model`` is the PEFT-wrapped model.
    """
    # 4-bit NF4 weights with double quantization; compute runs in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Tokenizer first, then the quantized base model placed via device_map="auto".
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # LoRA: rank 8, alpha 16, adapters on the attention q/v projections only,
    # light dropout, no bias terms trained.
    adapter_settings = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    kbit_ready = prepare_model_for_kbit_training(base_model)
    peft_model = get_peft_model(kbit_ready, adapter_settings)

    # NOTE(review): the base model was already placed by device_map="auto";
    # confirm this explicit move is still wanted for a quantized model.
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    peft_model.to(target)

    return peft_model, tokenizer


In [9]:
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of text rows to a fixed length.

    Args:
        examples: Batch mapping with a ``"text"`` entry.
        tokenizer: Callable tokenizer applied to the texts.

    Returns:
        Whatever ``tokenizer`` returns for the batch; every sequence is
        truncated and padded to ``self.chunk_size`` (notebook-level ``self``).
    """
    texts = examples["text"]
    fixed_length_options = {
        "truncation": True,            # cut sequences longer than max_length
        "max_length": self.chunk_size,
        "padding": "max_length",       # pad shorter sequences up to max_length
    }
    return tokenizer(texts, **fixed_length_options)

In [10]:
# Load the data: the path is the data directory joined with the config's file_path
dataset = load_data(f'{self.paths["data"]}/{self.exp_config["data"]["file_path"]}')

# Prepare the model and tokenizer for the model name given in the config
model, tokenizer = prepare_model_and_tokenizer(self.exp_config["model"]["name"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Tokenize the dataset
tokenized_dataset = dataset.map(
    lambda x: tokenize_function(x, tokenizer), # convert text into tokens
    batched=True, # process several samples per call
    remove_columns=dataset.column_names # drop the original text column
)

Map:   0%|          | 0/3248 [00:00<?, ? examples/s]

In [12]:
# Create a fresh model checkpoint folder: <models>/<model name>/<description>
model_checkpoint_path = f"{self.paths['models']}/{self.exp_config['model']['name']}/{self.exp_config['model']['description']}"
helper_function.remove_directory(model_checkpoint_path)  # remove old model checkpoints
os.makedirs(model_checkpoint_path, exist_ok=True)  # create new checkpoint folder

# Create a fresh logging folder: <reports>/<model name>/<description>/logs
logging_dir = f'{self.paths["reports"]}/{self.exp_config["model"]["name"]}/{self.exp_config["model"]["description"]}/logs'
# Clear the *logging* directory here. The earlier code removed
# model_checkpoint_path a second time, deleting the folder created just above
# and leaving stale logs in place.
helper_function.remove_directory(logging_dir)  # remove old logs
os.makedirs(logging_dir, exist_ok=True)  # create new logging folder

# Training arguments: output/log locations plus everything under
# `training_args` in the experiment config.
training_args = TrainingArguments(
    output_dir=model_checkpoint_path,
    logging_dir=logging_dir,
    **self.exp_config['training_args']
)

In [13]:
# Data collator for causal language modelling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # no random token masking: train on next-token prediction
)

# Training arguments.
# The model description is read from the experiment config: the standalone
# `model_description` variable is only defined in commented-out code, so
# referencing it raised NameError on a fresh kernel.
# NOTE(review): this rebinds `training_args`, replacing the config-driven
# instance built in the previous cell (different output_dir, max_steps=100).
# Confirm the override is intended.
training_args = TrainingArguments(
    output_dir=f"{root}/reports/{self.exp_config['model']['description']}",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=1e-4,
    fp16=True,
    save_steps=200,
    logging_steps=20,
    max_steps=100,
    warmup_steps=100,
    optim="paged_adamw_8bit"
)

In [14]:
# Create the trainer (training data only; evaluation, metrics, custom loss and
# early stopping are currently disabled below)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    # eval_dataset=dataset_valid,
    # compute_metrics=compute_metrics, # add a metrics function
    # loss_fn=custom_loss,
    # callbacks=[EarlyStoppingCallback(self.exp_config['model']['early_stopping_patience'])]
)

In [15]:
# Ask the project helper which device the model and the trainer args are on, then log it.
# A device_idx of -1 is reported as CPU.
gpu_dict = helper_function.check_device(model, trainer.args)
self.logger.info(f"-- Model is using {'GPU: ' + str(gpu_dict['device_idx']) if gpu_dict['device_idx'] != -1 else 'CPU'}")
# NOTE(review): the key is named 'model_gpu_idx' but is logged as the training_args device -- confirm against check_device.
self.logger.info(f"-- training_args is using {gpu_dict['model_gpu_idx']}")

2024-12-26 18:32:16 INFO -- Model is using GPU: 0
2024-12-26 18:32:16 INFO -- training_args is using cuda:0


In [16]:
# Start training
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
20,3.9674


KeyboardInterrupt: 

# Save the fine-tuned model under <root>/models/<description>.
# The description is read from the experiment config: the standalone
# `model_description` variable is only defined in commented-out code, so
# referencing it raised NameError on a fresh kernel.
model.save_pretrained(f"{root}/models/{self.exp_config['model']['description']}")

### Test model response

In [None]:
def load_original_model(model_name):
    """Load the untouched base checkpoint and its tokenizer.

    Args:
        model_name: Model id or path handed to the ``from_pretrained`` loaders.

    Returns:
        tuple: ``(model, tokenizer)``; the model is loaded in float16 with
        ``device_map="auto"``.
    """
    base_tokenizer = AutoTokenizer.from_pretrained(model_name)

    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
    base_model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    return base_model, base_tokenizer

In [None]:
def load_finetuned_model(model_name, model_description):
    """Load the base model and apply the saved LoRA adapter on top of it.

    Args:
        model_name: Model id or path of the base checkpoint.
        model_description: Sub-folder of ``<root>/models`` holding the adapter.

    Returns:
        tuple: ``(model, tokenizer)``, where ``model`` is the ``PeftModel``
        wrapping the float16 base model.
    """
    # The adapter location is derived from the notebook-level `root`.
    lora_dir = f"{root}/models/{model_description}"

    load_options = dict(torch_dtype=torch.float16, device_map="auto")
    backbone = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    # Attach the LoRA weights, then load the base model's tokenizer.
    tuned_model = PeftModel.from_pretrained(backbone, lora_dir)
    tuned_tokenizer = AutoTokenizer.from_pretrained(model_name)

    return tuned_model, tuned_tokenizer

In [None]:
def generate_response(model, tokenizer, prompt, max_length=512):
    """Sample one completion for ``prompt`` and return it as decoded text.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and ``decode``.
        prompt: Input text.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)

    # Sampling setup: temperature 0.7, nucleus p=0.9, a single sequence.
    sampling_options = dict(
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    generated = model.generate(**encoded, **sampling_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Evaluation prompt (Traditional Chinese): asks whether the model has read the
# Percy Jackson novel about Greek mythology and, if so, to explain the main plot.
question = "你看過波西傑克森這本關於希臘神話的小說嗎?看過的話說明一下故事主軸。"

In [None]:
# Response of the original (not fine-tuned) model.
# The model name is read from the experiment config: the standalone
# `model_name` variable is only defined in commented-out code, so referencing
# it raised NameError on a fresh kernel.
print("=== 原始模型回應 ===")
model, tokenizer = load_original_model(self.exp_config["model"]["name"])
original_response = generate_response(model, tokenizer, question)
print(original_response)

In [None]:
# Response of the fine-tuned model (base model plus saved LoRA adapter).
# Name and description are read from the experiment config: the standalone
# `model_name` / `model_description` variables are only defined in
# commented-out code, so referencing them raised NameError on a fresh kernel.
# NOTE(review): the adapter must have been saved under
# <root>/models/<config description>; confirm the folder name (including case)
# matches what was written at save time.
print("=== 微調後模型回應 ===")
model, tokenizer = load_finetuned_model(
    self.exp_config["model"]["name"],
    self.exp_config["model"]["description"],
)
finetuned_response = generate_response(model, tokenizer, question)
print(finetuned_response)