In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
import os
import pandas as pd
import torch
from utils import preprocess_qa, RestrictToValidTokens
pd.options.display.max_colwidth = None
hf_auth_token = os.getenv("HF_AUTH_TOKEN")
ds = load_dataset("tau/commonsense_qa")

## Parameters ##

In [14]:
trained_folder = "./fine-tuned/llama-qa-lora_16"

# bnb config
llm_int8_threshold = 6.0

# Lora Config
rank = 8
lora_alpha = 32
target_modules = ["q_proj", "v_proj"] # ["q_proj", "v_proj", "k_proj", "o_proj"], ["q_proj", "v_proj", "k_proj", "gate_proj", "up_proj", "down_proj"] 
bias = 'none'
lora_dropout = 0.0

# Training
training_size = 16
validation_size = 16

# Learning
learning_rate = 3e-5
train_batch_size = 1
eval_batch_size = 1
train_epochs = 10
weight_decay= 0.01
is_fp16=False
gradient_accumulation_steps=1


In [2]:
df = pd.DataFrame(ds['train'])
df.head(5)

Unnamed: 0,id,question,question_concept,choices,answerKey
0,075e483d21c29a511267ef62bedc0461,"The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?",punishing,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}",A
1,61fe6e879ff18686d7552425a36344c8,Sammy wanted to go to where the people were. Where might he go?,people,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['race track', 'populated areas', 'the desert', 'apartment', 'roadblock']}",B
2,4c1cb0e95b99f72d55c068ba0255c54d,To locate a choker not located in a jewelry box or boutique where would you go?,choker,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['jewelry store', 'neck', 'jewlery box', 'jewelry box', 'boutique']}",A
3,02e821a3e53cb320790950aab4489e85,Google Maps and other highway and street GPS services have replaced what?,highway,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['united states', 'mexico', 'countryside', 'atlas', 'oceans']}",D
4,23505889b94e880c3e89cff4ba119860,"The fox walked from the city into the forest, what was it looking for?",fox,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['pretty flowers.', 'hen house', 'natural habitat', 'storybook', 'dense forest']}",C


In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=llm_int8_threshold  # Adjust threshold for higher precision on sensitive layers
)

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache",
    device_map="auto",  # Automatically maps layers to GPU
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# 1. Add a new pad token
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

# 2. Resize model embeddings to match the new (larger) vocabulary
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# 3. Set pad token + pad_token_id
tokenizer.pad_token = "<|pad|>"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
# tokenizer.pad_token = tokenizer.eos_token
max_length = min(tokenizer.model_max_length, 256)

def create_tokenized_ds_for_finetune(example):
    prompt_text = preprocess_qa(example)['text']

    tokenized_prompt = tokenizer(prompt_text, truncation=True, padding="max_length", return_tensors="pt", max_length=max_length)

    answer_token = tokenizer(example['answerKey'].strip(), return_tensors="pt", add_special_tokens=False)

    input_ids = tokenized_prompt["input_ids"].squeeze(0)
    attention_mask = tokenized_prompt["attention_mask"].squeeze(0)
    answer_input_ids = answer_token["input_ids"].squeeze(0)

    labels = torch.full_like(input_ids, -100)
    if answer_input_ids.numel() == 1:
        next_pos = input_ids.ne(tokenizer.pad_token_id).sum()
        labels[next_pos] = answer_input_ids.item()
    else:
        start_pos = input_ids.ne(tokenizer.pad_token_id).sum()
        labels[start_pos : start_pos + answer_input_ids.size(0)] = answer_input_ids

    pad_length = max_length - input_ids.shape[0]
    
    if pad_length > 0:
        input_ids = torch.cat([input_ids, torch.full((pad_length,), tokenizer.pad_token_id)])
        attention_mask = torch.cat([attention_mask, torch.zeros(pad_length, dtype=torch.long)])
        labels = torch.cat([labels, torch.full((pad_length,), -100)])
    else:
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_ds_finetune = ds.map(create_tokenized_ds_for_finetune, remove_columns=ds['train'].column_names)

In [5]:
# Print the max length used in tokenization
print(f"Max Length Used: {max_length}")

# Check the first 3 examples from the training dataset
for idx in range(3):  
    example = tokenized_ds_finetune['train'][idx]  
    input_ids = example['input_ids']
    labels = example['labels']

    # Convert to tensors (optional for clarity)
    input_ids_tensor = torch.tensor(input_ids)
    labels_tensor = torch.tensor(labels)

    # Print token lengths
    print(f"\nExample {idx + 1}:")
    print("Input Length:", len(input_ids))  
    print("Labels Length:", len(labels))

    # Decode input and target tokens to check proper alignment
    print("Input Tokens Decoded:", tokenizer.decode(input_ids))
    print("Labels Tokens Decoded:", tokenizer.decode([t for t in labels if t != -100]))

    # Check how many tokens are actually contributing to the loss (non -100)
    valid_loss_tokens = len([t for t in labels if t != -100])
    print(f"Number of Tokens Contributing to Loss: {valid_loss_tokens}")

    # Ensure padding tokens are masked properly
    num_padding_tokens = labels.count(-100)
    print(f"Number of Ignored Tokens (Padding or Input): {num_padding_tokens}")

Max Length Used: 256

Example 1:
Input Length: 256
Labels Length: 256
Input Tokens Decoded: <|begin_of_text|>Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?. Options: A: ignore B: enforce C: authoritarian D: yell at E: avoid. Return only the letter corresponding to the correct answer: A, B, C, D, or E. Answer:<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|

In [6]:
lora_config = LoraConfig(
    r=rank,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    bias=bias,      
    lora_dropout=lora_dropout,     
    task_type=TaskType.CAUSAL_LM  
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,677,312 || trainable%: 0.0424


In [7]:
train_dataset = tokenized_ds_finetune["train"].select(range(training_size))
validation_dataset = tokenized_ds_finetune["validation"].select(range(validation_size))

In [8]:
generation_config = {
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id
}

model.resize_token_embeddings(len(tokenizer))  

if tokenizer.pad_token_id is None:
    tokenizer.pad_token = "<|pad|>"
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")

training_args = TrainingArguments(
    output_dir=trained_folder,     
    evaluation_strategy="epoch",     
    save_strategy="epoch",           
    learning_rate=learning_rate,               
    per_device_train_batch_size=train_batch_size,    
    per_device_eval_batch_size=eval_batch_size,    
    num_train_epochs=train_epochs,
    weight_decay=weight_decay,                
    logging_dir="./logs",            
    logging_steps=1,                
    save_total_limit=3,              
    load_best_model_at_end=True,     
    fp16=is_fp16,                       
    gradient_accumulation_steps=gradient_accumulation_steps,   
    report_to="none"                 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)


torch.cuda.empty_cache()

trainer.train()

model.save_pretrained(trained_folder)
tokenizer.save_pretrained(trained_folder)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,8.9487,7.585327
2,2.4146,1.807538
3,0.6809,1.075743
4,2.2675,2.280303
5,0.0001,3.129434
6,0.0,3.184811
7,0.0,3.200341
8,0.0,3.41913
9,0.0,3.506366
10,0.0,3.612757




('./fine-tuned/llama-qa-lora_16/tokenizer_config.json',
 './fine-tuned/llama-qa-lora_16/special_tokens_map.json',
 './fine-tuned/llama-qa-lora_16/tokenizer.json')

In [13]:
# Convert target_modules list into a quoted CSV-friendly string
target_modules_str = '"' + ", ".join(target_modules) + '"'  # Joins elements with commas and wraps in quotes

# Print all values as a CSV string with target_modules converted properly
values_string = ",".join([
    str(llm_int8_threshold),
    str(rank),
    str(lora_alpha),
    target_modules_str,  # Now properly formatted
    str(bias),
    str(lora_dropout),
    str(training_size),
    str(validation_size),
    str(learning_rate),
    str(train_batch_size),
    str(eval_batch_size),
    str(train_epochs),
    str(weight_decay),
    str(is_fp16),
    str(gradient_accumulation_steps)
])

# Print CSV values for Google Sheets
print(values_string)

6.0,8,32,"q_proj, v_proj",none,0.0,16,16,3e-05,1,1,10,0.01,False,1


In [10]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,           # We are doing causal LM, not MLM
    pad_to_multiple_of=8 # or None
)

# Make a small subset to test
train_dataset_small = tokenized_ds_finetune['train'].select(range(5))

# Create a simple DataLoader
train_dataloader = DataLoader(
    train_dataset_small,
    batch_size=2,
    shuffle=False,
    collate_fn=data_collator
)

# Iterate through the DataLoader
for batch_idx, batch in enumerate(train_dataloader):
    print(f"=== Batch {batch_idx} ===")
    for k, v in batch.items():
        print(k, v.shape if hasattr(v, 'shape') else type(v))
    print()
    # Optionally, do a quick forward pass if the model is loaded:
    # outputs = model(**batch)
    # print("Loss:", outputs.loss.item())

=== Batch 0 ===
input_ids torch.Size([2, 256])
attention_mask torch.Size([2, 256])
labels torch.Size([2, 256])

=== Batch 1 ===
input_ids torch.Size([2, 256])
attention_mask torch.Size([2, 256])
labels torch.Size([2, 256])

=== Batch 2 ===
input_ids torch.Size([1, 256])
attention_mask torch.Size([1, 256])
labels torch.Size([1, 256])

