In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
import os
import pandas as pd
import torch
from utils import preprocess_qa, RestrictToValidTokens
pd.options.display.max_colwidth = None
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
hf_auth_token = os.getenv("HF_AUTH_TOKEN")
ds = load_dataset("tau/commonsense_qa")
trained_folder = "./fine-tuned/llama-qa-lora_overfit_100"

In [2]:
df = pd.DataFrame(ds['train'])
# df.head(5)

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0  # Adjust threshold for higher precision on sensitive layers
)

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache",
    device_map="auto",  # Automatically maps layers to GPU
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# 1. Add a new pad token
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

# 2. Resize model embeddings to match the new (larger) vocabulary
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# 3. Set pad token + pad_token_id
tokenizer.pad_token = "<|pad|>"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
# tokenizer.pad_token = tokenizer.eos_token
max_length = min(tokenizer.model_max_length, 256)

def create_tokenized_ds_for_finetune(example):
    prompt_text = preprocess_qa(example)['text']

    tokenized_prompt = tokenizer(prompt_text, truncation=True, padding="max_length", return_tensors="pt", max_length=max_length)

    answer_token = tokenizer(example['answerKey'].strip(), return_tensors="pt", add_special_tokens=False)

    input_ids = tokenized_prompt["input_ids"].squeeze(0)
    attention_mask = tokenized_prompt["attention_mask"].squeeze(0)
    answer_input_ids = answer_token["input_ids"].squeeze(0)

    labels = torch.full_like(input_ids, -100)
    if answer_input_ids.numel() == 1:
        next_pos = input_ids.ne(tokenizer.pad_token_id).sum()
        labels[next_pos] = answer_input_ids.item()
    else:
        start_pos = input_ids.ne(tokenizer.pad_token_id).sum()
        labels[start_pos : start_pos + answer_input_ids.size(0)] = answer_input_ids

    pad_length = max_length - input_ids.shape[0]
    
    if pad_length > 0:
        input_ids = torch.cat([input_ids, torch.full((pad_length,), tokenizer.pad_token_id)])
        attention_mask = torch.cat([attention_mask, torch.zeros(pad_length, dtype=torch.long)])
        labels = torch.cat([labels, torch.full((pad_length,), -100)])
    else:
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_ds_finetune = ds.map(create_tokenized_ds_for_finetune, remove_columns=ds['train'].column_names)

In [5]:
# Print the max length and verify sequence lengths
print(f"Max Length Used: {max_length}")

# Iterate correctly through the dataset
for idx in range(3):  # Use indexing directly to access each item
    example = tokenized_ds_finetune['train'][idx]  # Accessing each dictionary correctly
    input_ids = example['input_ids']
    labels = example['labels']

    # Print lengths and ensure the types are correct
    print(f"\nExample {idx + 1}:")
    print("Input Length:", len(input_ids))  
    print("Labels Length:", len(labels))
    
    # Decode the tokens and print (only unmasked tokens for labels)
    print("Input Tokens Decoded:", tokenizer.decode(input_ids))
    print("Labels Tokens Decoded:", tokenizer.decode([t for t in labels if t != -100]))

Max Length Used: 256

Example 1:
Input Length: 256
Labels Length: 256
Input Tokens Decoded: <|begin_of_text|>Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?. Options: A: ignore B: enforce C: authoritarian D: yell at E: avoid. Return only the letter corresponding to the correct answer: A, B, C, D, or E. Answer:<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|

In [6]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    bias="none",      
    lora_dropout=0.0,     
    task_type=TaskType.CAUSAL_LM  
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,677,312 || trainable%: 0.0424


In [7]:
train_dataset = tokenized_ds_finetune["train"].select(range(16))
validation_dataset = tokenized_ds_finetune["validation"].select(range(16))

In [8]:
generation_config = {
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id
}

model.resize_token_embeddings(len(tokenizer))  

if tokenizer.pad_token_id is None:
    tokenizer.pad_token = "<|pad|>"
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")

training_args = TrainingArguments(
    output_dir=trained_folder,     
    evaluation_strategy="epoch",     
    save_strategy="epoch",           
    learning_rate=3e-5,               
    per_device_train_batch_size=4,    
    per_device_eval_batch_size=4,    
    num_train_epochs=10,
    weight_decay=0.01,                
    logging_dir="./logs",            
    logging_steps=1,                
    save_total_limit=3,              
    load_best_model_at_end=True,     
    fp16=False,                       
    gradient_accumulation_steps=1,   
    report_to="none"                 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)


torch.cuda.empty_cache()

trainer.train()

model.save_pretrained(trained_folder)
tokenizer.save_pretrained(trained_folder)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,14.3782,14.269659
2,11.1131,11.523464
3,10.3881,9.988491
4,7.9076,7.652774
5,5.8115,5.934968
6,4.0822,4.732222
7,3.5614,3.871064
8,2.5241,3.319704
9,2.3602,2.994216
10,2.1975,2.872624




('./fine-tuned/llama-qa-lora_overfit_4/tokenizer_config.json',
 './fine-tuned/llama-qa-lora_overfit_4/special_tokens_map.json',
 './fine-tuned/llama-qa-lora_overfit_4/tokenizer.json')