# Fine-Tuning a Llama Model on the arXiv Dataset

## 1. Setup

In [15]:
# !pip install -r requirements.txt

In [None]:
# Confirm that PyTorch can see a CUDA device before any training is attempted.
import torch

cuda_ready = torch.cuda.is_available()
print(cuda_ready)

#### Download dataset

In [17]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("Cornell-University/arxiv", path='Cornell-University/arxiv')

# print("Path to dataset files:", path)

## 2. Load necessary modules and configuration

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, pipeline
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import bitsandbytes as bnb
import torch
import os

In [19]:
# Experiment settings: base model name and the fraction of the dataset to train on
BASE_MODEL = "Llama-3.2-3B-Instruct"
SUBSET_RATIO = 0.01  # use 1% of the data


# Paths
BASE_MODEL_DIR = os.path.join("./models/", BASE_MODEL)  # local directory holding the base model files
DATASET = "./data/Cornell-University/arxiv"
DATASET_MAP = "./data/Cornell-University/arxiv_tokenized_dataset"  # NOTE(review): not referenced by any cell shown in this notebook — confirm it is still needed
OUTPUT_DIR = os.path.join("./fine_tuned_models/", f"{BASE_MODEL}_{SUBSET_RATIO}")  # one output root per (model, subset ratio) pair


# Load the tokenizer only; the model weights are loaded separately in the LoRA and QLoRA cells
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR)

## 3. Data preprocessing

In [None]:
# Make sure the tokenizer has a padding token so batches can be padded.
#
# Prefer the padding token that Llama 3.1/3.2 tokenizers already reserve.
# Reusing it keeps the vocabulary the same size as the base checkpoint, so no
# new, untrained embedding row is created and the tokenizer stays consistent
# with the unmodified base model that the saved adapters are loaded onto at
# inference time. It is also distinct from the end-of-turn token, so padding
# never hides a real <|eot_id|> from the loss.
#
# If the reserved token is not in the vocabulary (a different tokenizer), fall
# back to registering a new '<|pad|>' token; the model cells then resize the
# embeddings to match.
RESERVED_PAD_TOKEN = "<|finetune_right_pad_id|>"

if tokenizer.pad_token is None:
    if RESERVED_PAD_TOKEN in tokenizer.get_vocab():
        tokenizer.pad_token = RESERVED_PAD_TOKEN
    else:
        tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

print(tokenizer.pad_token)

In [21]:
# Load the "train" split of the dataset stored under the local DATASET directory
dataset = load_dataset(DATASET, split="train")

In [None]:
# Draw a reproducible random sample: shuffle with a fixed seed, then keep the
# first SUBSET_RATIO share of the rows.
subset_size = int(len(dataset) * SUBSET_RATIO)
shuffled_dataset = dataset.shuffle(seed=42)
subset_dataset = shuffled_dataset.select(range(subset_size))
print(f"使用的資料集大小: {len(subset_dataset)} 個樣本")

In [23]:
def formatting_prompts_func(example):
    """Format a batch of arXiv records as Llama 3 chat-style training texts.

    Each text contains a fixed system instruction, the abstract as the user
    turn, and the title as the assistant turn terminated by ``<|eot_id|>``.
    The layout matches the prompt built for inference later in this notebook.

    Abstracts and titles are whitespace-normalized first (runs of spaces and
    newlines collapse to a single space, leading/trailing whitespace is
    dropped). Raw metadata fields are often hard-wrapped and padded; without
    this the training text would differ from the inference prompt and the
    model would learn to emit line breaks inside titles.

    Args:
        example: A batch mapping with parallel sequences under the keys
            ``'abstract'`` and ``'title'``.

    Returns:
        A list of formatted strings, one per (abstract, title) pair.
    """

    def _collapse_whitespace(text):
        # str.split() with no argument splits on any whitespace run.
        return " ".join(text.split())

    return [
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        "You are an AI assistant to summarize scientific abstracts into concise and accurate title.\n"
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
        f"{_collapse_whitespace(abstract)}\n"
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
        f"{_collapse_whitespace(title)}<|eot_id|>"
        for abstract, title in zip(example["abstract"], example["title"])
    ]


## 4. Fine-Tuning

In [24]:
# Completion-only collator setup.
# `response_template` is the marker that precedes the assistant's answer in every
# text produced by formatting_prompts_func; the inference cell reuses it to locate
# the generated answer.
# NOTE(review): the collator is expected to restrict the loss to tokens after this
# marker — confirm against the installed TRL version.
response_template = "<|start_header_id|>assistant<|end_header_id|>\n"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer, mlm=False)

### 4.1 LoRA

In [None]:
# Load a fresh copy of the base model for the LoRA run
lora_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_DIR, device_map="auto")
# Checkpointing is enabled before the PEFT wrapper is applied (order kept from the original cell)
lora_model.gradient_checkpointing_enable()
# Keep the embedding matrix in sync with the tokenizer (a no-op when the vocabulary size is unchanged)
lora_model.resize_token_embeddings(len(tokenizer))

# LoRA adapter configuration: rank-8 adapters on the attention q/k/v projections
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the LoRA adapters
lora_model = get_peft_model(lora_model, lora_config)

# Training arguments for the LoRA run
lora_training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "lora"),
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # accumulate gradients: effective batch size of 8 per device
    num_train_epochs=3,
    logging_dir="./logs/lora",  # per-run log directory so LoRA and QLoRA logs do not mix
    save_strategy="epoch",
    fp16=True,
    push_to_hub=False
)

# Build the trainer. The optimizer is given only the parameters that require
# gradients, so frozen base weights are not carried in its parameter groups.
# The list is built inline on purpose: a notebook-level variable holding the
# parameters would keep them alive after the cleanup cell deletes the model
# and trainer.
lora_trainer = SFTTrainer(
    model=lora_model,
    args=lora_training_args,
    train_dataset=subset_dataset,
    tokenizer=tokenizer,
    data_collator=collator,  # pads each batch dynamically
    formatting_func=formatting_prompts_func,
    optimizers=(
        # 8-bit Adam to reduce optimizer memory (the Trainer default would be AdamW)
        bnb.optim.Adam8bit([p for p in lora_model.parameters() if p.requires_grad], lr=1e-4),
        None,  # let the Trainer create its default learning-rate scheduler
    )
)

# Train, then save the final model where the inference cell expects to find it
lora_trainer.train()
lora_trainer.save_model(os.path.join(OUTPUT_DIR,"lora/final_model"))


In [None]:
# Release the GPU memory held by the LoRA run before the next model is loaded
del lora_model
# del pipe
del lora_trainer
import gc
gc.collect()
# gc.collect() only destroys the unreachable Python objects; PyTorch keeps the
# freed CUDA blocks in its caching allocator. empty_cache() returns them to the
# driver so the memory is actually available (and visible as free in nvidia-smi).
torch.cuda.empty_cache()

### 4.2 QLoRA

In [None]:
# Quantization config for QLoRA: 4-bit NF4 weights with bfloat16 compute
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with 4-bit quantized weights
qlora_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_DIR,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config
)
# Keep the embedding matrix in sync with the tokenizer (a no-op when the vocabulary size is unchanged)
qlora_model.resize_token_embeddings(len(tokenizer))
# Let PEFT prepare the quantized model for training. With
# use_gradient_checkpointing=True it enables gradient checkpointing itself and
# makes the embedding outputs require gradients, so backpropagation reaches
# the adapters through the frozen base model. No separate
# gradient_checkpointing_enable() call is needed.
qlora_model = prepare_model_for_kbit_training(qlora_model, use_gradient_checkpointing=True)

# QLoRA adapter configuration
qlora_config = LoraConfig(
    r=64,  # Typically higher for QLoRA
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the quantized base model with the LoRA adapters.
# get_peft_model leaves only the adapter weights trainable. Do NOT switch
# requires_grad back on for the remaining floating-point parameters: that
# would unfreeze the embeddings, norms and LM head, which defeats the memory
# savings of QLoRA and trains weights that save_model() does not store in the
# adapter checkpoint.
qlora_model = get_peft_model(qlora_model, qlora_config)

# Training arguments for the QLoRA run
qlora_training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "qlora"),
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # accumulate gradients: effective batch size of 8 per device
    num_train_epochs=3,
    logging_dir="./logs/qlora",  # per-run log directory so LoRA and QLoRA logs do not mix
    save_strategy="epoch",
    bf16=True,  # match the bfloat16 load/compute dtype above instead of mixing in fp16 autocast
    push_to_hub=False
)

# Build the trainer. The optimizer is given only the parameters that require
# gradients (the adapter weights). The list is built inline on purpose: a
# notebook-level variable holding the parameters would keep them alive after
# the cleanup cell deletes the model and trainer.
qlora_trainer = SFTTrainer(
    model=qlora_model,
    args=qlora_training_args,
    train_dataset=subset_dataset,
    tokenizer=tokenizer,
    data_collator=collator,  # pads each batch dynamically
    formatting_func=formatting_prompts_func,
    optimizers=(
        # 8-bit Adam to reduce optimizer memory (the Trainer default would be AdamW)
        bnb.optim.Adam8bit([p for p in qlora_model.parameters() if p.requires_grad], lr=1e-4),
        None,  # let the Trainer create its default learning-rate scheduler
    )
)

# Train, then save the final model where the inference cell expects to find it
qlora_trainer.train()
qlora_trainer.save_model(os.path.join(OUTPUT_DIR,"qlora/final_model"))


In [None]:
# Release the GPU memory held by the QLoRA run before inference loads new models
del qlora_model
# del pipe
del qlora_trainer
import gc
gc.collect()
# gc.collect() only destroys the unreachable Python objects; PyTorch keeps the
# freed CUDA blocks in its caching allocator. empty_cache() returns them to the
# driver so the memory is actually available (and visible as free in nvidia-smi).
torch.cuda.empty_cache()

## 5. Inference


In [11]:

# Sample abstract used as the user turn when comparing the base, LoRA and QLoRA models below
abstract = """Creating high-quality True-False (TF) multiple-choice questions (MCQs), with accurate distractors, is a challenging and time-consuming task in education. This paper introduces True-False Distractor Generation (TFDG), a pipeline that leverages pre-trained language models and sentence retrieval techniques to automate the generation of TF-type MCQ distractors. Furthermore, the evaluation of generated TF questions presents a challenge. Traditional metrics like BLEU and ROUGE are unsuitable for this task. To address this, we propose a new evaluation metric called Retrieval-based Accuracy Differential (RAD). RAD assesses the discriminative power of TF questions by comparing model accuracy with and without access to reference texts. It quantitatively evaluates how well questions differentiate between students with varying knowledge levels. This research benefits educators and assessment developers, facilitating the efficient automatic generation of high-quality TF-type MCQs and their reliable evaluation."""

In [12]:
# Inference prompt: the same system and user turns that formatting_prompts_func
# writes for training, ending right after the assistant header so the model
# continues with the title.
prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an AI assistant to summarize scientific abstracts into concise and accurate title.
<|eot_id|><|start_header_id|>user<|end_header_id|>
{abstract}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

In [None]:
# Upper bound on the total sequence length (prompt plus generated tokens) per generation
max_length = 512


def _generate_response(model_path):
    """Generate a completion for `prompt` with the model stored at `model_path`.

    Builds a text-generation pipeline with the shared tokenizer, runs it on
    the notebook-level `prompt`, and returns the text that follows the first
    occurrence of `response_template` in the pipeline output.

    If the template is not found in the output, the whole generated text is
    returned rather than an arbitrary slice of it (`str.find` would return -1
    and shift the slice start to the wrong offset).

    Args:
        model_path: Directory of the model to load.

    Returns:
        The extracted response text.
    """
    generator = pipeline(task="text-generation", model=model_path, tokenizer=tokenizer, max_length=max_length)
    generated_text = generator(prompt)[0]['generated_text']
    _, marker, completion = generated_text.partition(response_template)
    return completion if marker else generated_text


print("[abstract]:", abstract)

print("="*100)

# (label, model directory) for every model being compared, in display order
models_to_compare = (
    ("[Base model]", BASE_MODEL_DIR),
    ("[LoRA model]", os.path.join(OUTPUT_DIR, "lora/final_model")),
    ("[QLoRA model]", os.path.join(OUTPUT_DIR, "qlora/final_model")),
)

for position, (label, model_path) in enumerate(models_to_compare):
    if position:
        # Separator between consecutive models, none after the last one
        print("-"*100)
    print(label)
    response = _generate_response(model_path)
    print("[Response]:")
    print(response)
