In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
import os
import bitsandbytes as bnb  # For 8-bit quantization
from evaluate import load
from tqdm import tqdm
import torch
import pandas as pd
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from utils import preprocess_qa, RestrictToValidTokens
pd.options.display.max_colwidth = None
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
hf_auth_token = os.getenv("HF_AUTH_TOKEN")
ds = load_dataset("tau/commonsense_qa")

In [2]:
df = pd.DataFrame(ds['train'])
df.head(5)

Unnamed: 0,id,question,question_concept,choices,answerKey
0,075e483d21c29a511267ef62bedc0461,"The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?",punishing,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}",A
1,61fe6e879ff18686d7552425a36344c8,Sammy wanted to go to where the people were. Where might he go?,people,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['race track', 'populated areas', 'the desert', 'apartment', 'roadblock']}",B
2,4c1cb0e95b99f72d55c068ba0255c54d,To locate a choker not located in a jewelry box or boutique where would you go?,choker,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['jewelry store', 'neck', 'jewlery box', 'jewelry box', 'boutique']}",A
3,02e821a3e53cb320790950aab4489e85,Google Maps and other highway and street GPS services have replaced what?,highway,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['united states', 'mexico', 'countryside', 'atlas', 'oceans']}",D
4,23505889b94e880c3e89cff4ba119860,"The fox walked from the city into the forest, what was it looking for?",fox,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['pretty flowers.', 'hen house', 'natural habitat', 'storybook', 'dense forest']}",C


In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0  # Adjust threshold for higher precision on sensitive layers
)

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache",
    device_map="auto",  # Automatically maps layers to GPU
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
trained_folder = "./llama-qa-lora_overfit"

In [None]:
# 1. Add a new pad token
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

# 2. Resize model embeddings to match the new (larger) vocabulary
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# 3. Set pad token + pad_token_id
tokenizer.pad_token = "<|pad|>"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
# tokenizer.pad_token = tokenizer.eos_token
max_length = min(tokenizer.model_max_length, 256)

def create_tokenized_ds_for_finetune(example):
    import torch

    prompt_text = preprocess_qa(example)['text']

    tokenized_prompt = tokenizer(prompt_text, truncation=True, padding="max_length", return_tensors="pt", max_length=max_length)

    answer_token = tokenizer(example['answerKey'].strip(), return_tensors="pt", add_special_tokens=False)

    input_ids = tokenized_prompt["input_ids"].squeeze(0)
    attention_mask = tokenized_prompt["attention_mask"].squeeze(0)
    answer_input_ids = answer_token["input_ids"].squeeze(0)

    labels = torch.full_like(input_ids, -100)
    if answer_input_ids.numel() == 1:
        next_pos = input_ids.ne(tokenizer.pad_token_id).sum()
        labels[next_pos] = answer_input_ids.item()
    else:
        start_pos = input_ids.ne(tokenizer.pad_token_id).sum()
        labels[start_pos : start_pos + answer_input_ids.size(0)] = answer_input_ids

    pad_length = max_length - input_ids.shape[0]
    
    if pad_length > 0:
        input_ids = torch.cat([input_ids, torch.full((pad_length,), tokenizer.pad_token_id)])
        attention_mask = torch.cat([attention_mask, torch.zeros(pad_length, dtype=torch.long)])
        labels = torch.cat([labels, torch.full((pad_length,), -100)])
    else:
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_ds_finetune = ds.map(create_tokenized_ds_for_finetune, remove_columns=ds['train'].column_names)

In [None]:
# Print the max length and verify sequence lengths
print(f"Max Length Used: {max_length}")

# Iterate correctly through the dataset
for idx in range(3):  # Use indexing directly to access each item
    example = tokenized_ds_finetune['train'][idx]  # Accessing each dictionary correctly
    input_ids = example['input_ids']
    labels = example['labels']

    # Print lengths and ensure the types are correct
    print(f"\nExample {idx + 1}:")
    print("Input Length:", len(input_ids))  
    print("Labels Length:", len(labels))
    
    # Decode the tokens and print (only unmasked tokens for labels)
    print("Input Tokens Decoded:", tokenizer.decode(input_ids))
    print("Labels Tokens Decoded:", tokenizer.decode([t for t in labels if t != -100]))

Max Length Used: 256

Example 1:
Input Length: 256
Labels Length: 256
Input Tokens Decoded: <|begin_of_text|>Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? Options: A: ignore B: enforce C: authoritarian D: yell at E: avoid. Return only the letter corresponding to the correct answer: A, B, C, D, or E. Answer:<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|p

In [9]:
lora_config = LoraConfig(
    r=8,                   # Rank for low-rank matrices
    lora_alpha=32,         # Scaling factor for LoRA updates
    target_modules=["q_proj", "v_proj"],  # Fine-tune only query and value layers
    bias="none",      
    lora_dropout=0.0,     
    task_type=TaskType.CAUSAL_LM  
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,677,312 || trainable%: 0.0424


In [10]:
# Assuming you already processed tokenized_ds with text and target_text columns
train_dataset = tokenized_ds_finetune["train"].select(range(16))
validation_dataset = tokenized_ds_finetune["validation"].select(range(16))
# train_dataset['labels']
# print(tokenized_ds['train'][0])
# Should show input_ids, attention_mask, and labels where only the target tokens are supervised.

In [12]:
generation_config = {
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id
}

model.resize_token_embeddings(len(tokenizer))  

if tokenizer.pad_token_id is None:
    tokenizer.pad_token = "<|pad|>"
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")

training_args = TrainingArguments(
    output_dir=trained_folder,     
    evaluation_strategy="epoch",     
    save_strategy="epoch",           
    learning_rate=3e-5,               
    per_device_train_batch_size=1,    
    per_device_eval_batch_size=1,    
    num_train_epochs=10,
    weight_decay=0.01,                
    logging_dir="./logs",            
    logging_steps=1,                
    save_total_limit=3,              
    load_best_model_at_end=True,     
    fp16=False,                       
    gradient_accumulation_steps=1,   
    report_to="none"                 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)

import torch
torch.cuda.empty_cache()

trainer.train()

model.save_pretrained(trained_folder)
tokenizer.save_pretrained(trained_folder)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0,7.684482
2,0.0,6.823765
3,0.0,5.774842
4,0.0,7.686615
5,0.0,7.336946
6,0.0,10.226562
7,0.0,11.721681
8,0.0,11.933595
9,0.0,9.493792
10,0.0,11.276869




('./llama-qa-lora_overfit/tokenizer_config.json',
 './llama-qa-lora_overfit/special_tokens_map.json',
 './llama-qa-lora_overfit/tokenizer.json')

In [7]:
from transformers import LogitsProcessorList, LogitsProcessor
# Custom LogitsProcessor for valid tokens restriction
class RestrictToValidTokens(LogitsProcessor):
    def __init__(self, valid_tokens):
        self.valid_tokens = valid_tokens

    def __call__(self, input_ids, scores):
        # Mask all logits except the valid tokens
        mask = torch.full_like(scores, float('-inf'))
        mask[..., self.valid_tokens] = 0
        return scores + mask

# Convert valid tokens to token IDs
valid_tokens = [tokenizer.convert_tokens_to_ids(token) for token in ['A', 'B', 'C', 'D', 'E']]

# Instantiate the logits processor with valid tokens
logits_processor = LogitsProcessorList([RestrictToValidTokens(valid_tokens)])

correct_predictions = 0
for idx in range(16):
    example = train_dataset[idx]  # Ensure using the same preprocessed dataset
    inputs = {k: torch.tensor(v).unsqueeze(0).to(model.device) for k, v in example.items() if k != "labels"}

    with torch.no_grad():
        outputs = model.generate(**inputs, 
                                 max_new_tokens=1, 
                                 eos_token_id=tokenizer.eos_token_id, 
                                 top_k=5,
                                 logits_processor=logits_processor
                                 )
        generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
        prediction_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        
        # Decode ground truth from tokenized labels
        ground_truth = tokenizer.decode([token for token in example['labels'] if token != -100])
        
        if prediction_text.strip() == ground_truth.strip():
            correct_predictions += 1

        print(f"Prediction: {prediction_text}, Ground Truth: {ground_truth}")

accuracy = correct_predictions / 16 * 100
print(f"\nAccuracy: {accuracy:.2f}%")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: D, Ground Truth: A
Prediction: E, Ground Truth: B


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: E, Ground Truth: A
Prediction: A, Ground Truth: D


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: A, Ground Truth: C
Prediction: A, Ground Truth: D


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: B, Ground Truth: E
Prediction: E, Ground Truth: B


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: D, Ground Truth: E
Prediction: A, Ground Truth: D


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: E, Ground Truth: B
Prediction: A, Ground Truth: C


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: E, Ground Truth: C
Prediction: A, Ground Truth: A


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Prediction: E, Ground Truth: C
Prediction: A, Ground Truth: D

Accuracy: 6.25%


In [None]:
example = tokenized_ds["train"][0]
print("Input IDs:", example["input_ids"])
print("Labels:   ", example["labels"])

print("Decoded Input:", tokenizer.decode(example["input_ids"]))
print("Decoded Label:", tokenizer.decode([x for x in example["labels"] if x != -100]))

In [None]:
import torch

example = tokenized_ds["train"][0]
input_ids = torch.tensor(example["input_ids"])
labels = torch.tensor(example["labels"])

label_positions = (labels != -100).nonzero(as_tuple=True)[0]
print("Label positions:", label_positions.tolist())

non_pad_count = int((input_ids != tokenizer.pad_token_id).sum())
print("Non-pad count:", non_pad_count)

# Compare them
print("Decoded input:", tokenizer.decode(input_ids))
print("Decoded label:", tokenizer.decode(labels[labels != -100]))

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)

model.train()
batch = next(iter(train_loader))

# Ensure tensors are properly shaped and moved to the correct device
for k in batch:
    batch[k] = torch.tensor(batch[k]).unsqueeze(0).to(model.device)  # Adding batch dimension

# Forward pass in training mode
outputs = model(**batch)
print("Manual training-mode loss:", outputs.loss.item())

# Forward pass in evaluation mode
model.eval()
with torch.no_grad():
    outputs_eval = model(**batch)
print("Manual eval-mode loss:", outputs_eval.loss.item())


In [None]:
# Check exactly how many labels are non-masked
for idx in range(3):
    example = train_dataset[idx]
    input_ids = torch.tensor(example['input_ids'])
    labels = torch.tensor(example['labels'])
    non_pad_tokens = (labels != -100).sum().item()
    print(f"Example {idx}: Non-Pad Tokens: {non_pad_tokens}, Label Positions:", (labels != -100).nonzero(as_tuple=True)[0].tolist())