# Login to huggingface:

In [None]:
# login to huggingface snippet
import os

from huggingface_hub import login

# Access tokens must never be committed to source. The token is read from the
# HF_TOKEN environment variable; when it is unset, token=None makes login()
# fall back to its interactive prompt.
# NOTE(review): the token that was previously hard-coded on this line has been
# exposed and should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

# Required installation and imports:

In [None]:
pip install rouge_score pycocoevalcap

In [None]:
pip install evaluate

In [None]:
pip install nltk

In [None]:
import torch
import datasets
from torch import nn

from datasets import load_dataset
from transformers import (AutoModel,
                          AutoModelForCausalLM, 
                          AutoTokenizer,
                          AutoModelForQuestionAnswering,
                          Trainer, TrainingArguments,
                          DataCollatorWithPadding
                          )
import evaluate
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score

print("imports done")

# Setting up the GPU:

In [None]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)

# Loading the model - both the base model and the model with question-answering head

In [None]:
# Checkpoint identifier passed to every from_pretrained() call below.
model_name = "meta-llama/Llama-3.2-1B"

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# setting the padding token manually:
# when the tokenizer defines no pad token, the end-of-sequence token is reused
# so that padding="max_length" in the preprocessing step has a token to pad with
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("tokenizer loaded")

# model without question-answering layer:
# (used below for the parameter listing of the plain base model)
base_model = AutoModel.from_pretrained(model_name)
print("base model loaded")

# model with question-answering layer:
# (this is the model that is evaluated, fine-tuned and uploaded further down)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
print("model with question answering layer loaded")



# Counting the model parameters:

In [None]:
# function to count model parameters:
def print_model_parameters_tabular(model):
    """Print every named parameter of ``model`` and a parameter-count summary.

    For each entry of ``model.named_parameters()`` the name, shape,
    ``requires_grad`` flag and element count are printed, followed by the
    total, trainable and non-trainable parameter counts.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
    """
    # Gather one record per parameter before printing anything.
    records = [
        (label, list(tensor.shape), tensor.requires_grad, tensor.numel())
        for label, tensor in model.named_parameters()
    ]
    total_params = sum(size for _, _, _, size in records)
    trainable_params = sum(size for _, _, trainable, size in records if trainable)

    divider = "-" * 50  # separator line between parameters

    # Per-parameter details, one field per line.
    print(f"Model Parameters for {type(model).__name__}:\n")
    for label, dims, trainable, size in records:
        print(f"Parameter Name: {label}")
        print(f"  Shape: {dims}")
        print(f"  Requires Grad: {trainable}")
        print(f"  Total Elements: {size:,}")
        print(divider)

    # Summary of the totals.
    print(f"\nTotal Parameters: {total_params:,}")
    print(f"Trainable Parameters: {trainable_params:,}")
    print(f"Non-Trainable Parameters: {total_params - trainable_params:,}")


## Without question-answering layer:

In [None]:
# printing parameters of base model:
print_model_parameters_tabular(base_model)

## With question-answering layer:

In [None]:
print_model_parameters_tabular(model)

### As can be seen, adding a question-answering head to make the model suitable for a question-answering task adds 4,098 extra parameters to the base model. During fine-tuning, we train only these parameters and leave the parameters of the base model unchanged (due to memory and computation constraints, and also because the parameters relevant to question answering are only those added in the final layer).

### We will also check the number of parameters after fine-tuning, to show that fine-tuning by itself doesn't alter the number of parameters, but adding an extra layer does.

# Loading the dataset:

In [None]:
# Load only the "train" split of SQuAD v2 from the Hugging Face Hub.
dataset = load_dataset("rajpurkar/squad_v2", split="train")
print(dataset)

# Split the loaded 'train' data 80:20 into new train/test subsets; seed=1 makes
# the shuffle reproducible. The result is indexed as split_dataset['train'] and
# split_dataset['test'] below.
split_dataset = dataset.train_test_split(test_size=0.2, seed=1)
print(split_dataset)



In [None]:
# training/testing sample:
print(split_dataset['train'][1])

# Tokenizing the dataset:

In [None]:
# function to tokenize the data (takes the sample printed above as input)
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer spans.

    Each context is truncated to ``max_length=128`` tokens with a stride of 64,
    so a long context produces several overlapping features. For every feature
    the token indices of the answer start and end are computed from the
    character offsets.

    A feature is labelled ``(0, 0)`` when:
      * the question is unanswerable (SQuAD v2 stores empty ``text`` /
        ``answer_start`` lists for those), or
      * the answer is not fully contained in the feature's context window.

    Args:
        examples: Batch dict with the keys ``"question"``, ``"context"``,
            ``"id"`` and ``"answers"`` (each answer is a dict with ``"text"``
            and ``"answer_start"`` lists).

    Returns:
        The tokenizer output with ``"example_id"``, ``"start_positions"`` and
        ``"end_positions"`` added, and the offset / overflow mappings removed.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        stride=64,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # One example can yield several features; remember which example each came from.
    inputs["example_id"] = [examples["id"][i] for i in sample_map]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]

        # Unanswerable question: there is no span to locate, so indexing [0]
        # into the empty lists would raise IndexError. Use the (0, 0) label.
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context (tokens whose sequence id is 1)
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # No context tokens in this feature, or the answer is not fully inside
        # the context window: label is (0, 0)
        if (
            context_start > context_end
            or offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Otherwise it's the start and end token positions.
        # Walk forward to the last token that starts at or before the answer start.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Walk backward to the first token that ends at or after the answer end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Run the preprocessing over both splits. The original columns are dropped so
# that only the tokenizer outputs and labels remain (the format the model expects).
train_split = split_dataset['train']
tokenized_train_data = train_split.map(
    preprocess_function, batched=True, remove_columns=train_split.column_names
)
print("train data tokenized")

test_split = split_dataset['test']
tokenized_test_data = test_split.map(
    preprocess_function, batched=True, remove_columns=test_split.column_names
)
print("test data tokenized")

### Verifying tokenized data:

In [None]:
print(tokenized_train_data[0])
print(tokenized_test_data[0])

In [None]:
# data collator instance:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Defining training arguments and creating trainer instance (before fine-tuning for pre-fine-tuning evaluation):

In [None]:
model.to(device)

In [None]:
# defining training arguments:
# This configuration backs the trainer used for the pre-fine-tuning evaluation;
# no training hyperparameters are set here.
training_args = TrainingArguments(
    output_dir="./results",     
    per_device_eval_batch_size=8, # due to hardware constraints
    do_eval=False,                  
    logging_dir='./logs',         
    report_to="none", 
)

# trainer instance:
# No train_dataset is passed; this instance is only used through
# trainer.predict() inside evaluate_function before fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,      
    tokenizer=tokenizer,
    data_collator=data_collator
) 


# Functions to compute various evaluation metrics and compare pre- and post-fine-tuning performance:

In [None]:
def compare_predictions(test_data, squad_dataset_test, start_idx, end_idx, tokenizer):
    """Pair each feature's reference answer with the decoded predicted span.

    Args:
        test_data: Tokenized features; must provide the ``"input_ids"`` and
            ``"example_id"`` columns.
        squad_dataset_test: Iterable of the original examples, each with an
            ``"id"`` and an ``"answers"`` dict holding a ``"text"`` list.
        start_idx: Predicted start token index per feature.
        end_idx: Predicted end token index per feature.
        tokenizer: Object whose ``decode`` turns token ids into text.

    Returns:
        A list of ``(original_answer, predicted_answer)`` tuples, one per
        feature. The original answer is ``""`` for unanswerable questions; the
        predicted answer is ``""`` when either predicted index is 0.
    """
    # Unanswerable SQuAD v2 questions have an empty "text" list, so taking
    # element [0] unconditionally would raise IndexError.
    original_answers_map = {}
    for example in squad_dataset_test:
        texts = example["answers"]["text"]
        original_answers_map[example["id"]] = texts[0] if texts else ""

    # Fetch each column once instead of re-reading it on every iteration.
    input_ids = test_data["input_ids"]
    example_ids = test_data["example_id"]

    comparisons = []
    for i, example_id in enumerate(example_ids):
        start, end = start_idx[i], end_idx[i]
        if start != 0 and end != 0:
            predicted = tokenizer.decode(input_ids[i][start:end + 1]).strip()
        else:
            # Index 0 is the label used for "no answer".
            predicted = ""
        comparisons.append((original_answers_map[example_id], predicted))

    return comparisons


In [None]:
def compute_average_meteor_scores(references, predictions):
    """Return the mean METEOR score over paired references and predictions.

    Each reference/prediction string is tokenized with ``word_tokenize`` and
    scored with ``meteor_score``; the scores are summed and divided by the
    number of predictions.

    NOTE(review): ``word_tokenize`` and ``meteor_score`` presumably need the
    NLTK tokenizer and WordNet data to be downloaded beforehand - confirm the
    environment provides them.

    Args:
        references: Sequence of reference answer strings.
        predictions: Sequence of predicted answer strings, aligned with
            ``references``.

    Returns:
        ``{"score": average}``; the average is ``0.0`` when there are no
        predictions.
    """
    count = len(predictions)
    if count == 0:
        # Nothing to score: report 0.0 instead of dividing by zero.
        return {"score": 0.0}

    total = 0
    for ref, pred in zip(references, predictions):
        total += meteor_score([word_tokenize(ref)], word_tokenize(pred))

    return {
        "score": total / count,
    }


In [None]:
def evaluate_function(test_data, tokenizer, squad_dataset_test, trainer):
    """Run the trainer on the test features and compute the evaluation metrics.

    The predicted span of each feature is the argmax of the start and end
    logits. Predictions and references are then scored with the SQuAD v2, BLEU,
    ROUGE and METEOR metrics.

    Args:
        test_data: Tokenized test features passed to ``trainer.predict``.
        tokenizer: Tokenizer used to decode predicted spans.
        squad_dataset_test: Original (untokenized) test examples holding the
            reference answers.
        trainer: Trainer wrapping the model to evaluate.

    Returns:
        Dict with the keys ``"squad_v2"``, ``"bleu"``, ``"rouge-2"``,
        ``"rouge-L"``, ``"rouge-1"`` and ``"meteor"``.
    """
    # Imported locally so the function does not rely on another cell having
    # imported numpy before it is called.
    import numpy as np

    predictions, _, _ = trainer.predict(test_data)
    start_logits, end_logits = predictions
    start_idx = np.argmax(start_logits, axis=1)
    end_idx = np.argmax(end_logits, axis=1)

    comparisons = compare_predictions(test_data, squad_dataset_test, start_idx, end_idx, tokenizer)
    original_answers = [original for original, _ in comparisons]
    predicted_answers = [predicted for _, predicted in comparisons]

    # An empty prediction is reported as "no answer" with full confidence.
    squad_predictions = [
        {
            "id": str(i),
            "prediction_text": pred,
            "no_answer_probability": 1.0 if len(pred.strip()) == 0 else 0.0,
        }
        for i, pred in enumerate(predicted_answers)
    ]

    # An unanswerable reference must be an empty text list: a list holding an
    # empty string would be counted as an answerable question by the metric.
    squad_references = [
        {"id": str(i), "answers": {"text": [orig] if orig else [], "answer_start": []}}
        for i, orig in enumerate(original_answers)
    ]
    squad_metric = evaluate.load("squad_v2")
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")

    squad_results = squad_metric.compute(
        predictions=squad_predictions,
        references=squad_references
    )

    bleu_results = bleu_metric.compute(
        predictions=predicted_answers,
        references=[[orig] for orig in original_answers]
    )

    rouge_results = rouge_metric.compute(
        predictions=predicted_answers,
        references=original_answers
    )

    meteor_results = compute_average_meteor_scores(original_answers, predicted_answers)

    # final output - returning all metrics:
    results = {
        "squad_v2": squad_results,
        "bleu": bleu_results["bleu"],
        "rouge-2": rouge_results["rouge2"],
        "rouge-L": rouge_results["rougeL"],
        "rouge-1": rouge_results["rouge1"],
        "meteor": meteor_results,
    }

    return results

In [None]:
import numpy as np
results_before = evaluate_function(tokenized_test_data, tokenizer, split_dataset["test"], trainer)
print(results_before)

# Freezing all model parameters except the last question-answering layer added to the base model:

In [None]:
# Freeze the backbone (model.transformer) so that gradient updates only reach
# the parameters outside it.
for param in model.transformer.parameters():
    param.requires_grad = False
# verify if the top layer is still trainable
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

# training arguments:
# - No evaluation runs during training. "no" is already the default strategy,
#   so the argument is omitted: its old name `evaluation_strategy` is
#   deprecated in newer transformers releases.
# - Mixed precision is enabled only when a CUDA device is present, matching
#   the CPU fallback used when `device` is selected; fp16 needs a GPU.
training_args = TrainingArguments(
    output_dir="./post_finetuning_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # set to a smaller number due to memory and compute constraints
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    lr_scheduler_type="linear",
    load_best_model_at_end=False,
)

# instantiating trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    data_collator=data_collator,
    tokenizer=tokenizer
)



In [None]:
# Fine-tune the model
trainer.train()

In [None]:
results_after = evaluate_function(tokenized_test_data, tokenizer, split_dataset["test"], trainer)
print(results_after)

In [None]:

import matplotlib.pyplot as plt
import numpy as np

def create_single_table(results_before, results_after):
    """Show a matplotlib table comparing two result dictionaries side by side.

    Every metric key found in either dictionary becomes one row, or one row per
    sub-metric when the value is itself a dictionary. Floats are shown with
    four decimals and missing values as "N/A".

    Args:
        results_before: Metrics gathered before fine-tuning.
        results_after: Metrics gathered after fine-tuning.
    """

    def _as_cell(value):
        """Format one value for display in a table cell."""
        if value is None:
            return "N/A"
        return f"{value:.4f}" if isinstance(value, float) else value

    # Build the rows: [metric, sub-metric, before, after].
    rows = []
    metric_names = set(results_before.keys()).union(results_after.keys())
    for metric in sorted(metric_names):
        before_entry = results_before.get(metric)
        after_entry = results_after.get(metric)

        # Scalar metric: a single row with "-" as the sub-metric.
        if not isinstance(before_entry, dict) and not isinstance(after_entry, dict):
            rows.append([metric, "-", _as_cell(before_entry), _as_cell(after_entry)])
            continue

        # Nested metric: one row per sub-metric found on either side.
        before_group = results_before.get(metric, {})
        after_group = results_after.get(metric, {})
        sub_metrics = set(before_group.keys()).union(after_group.keys())
        for sub_metric in sorted(sub_metrics):
            rows.append(
                [
                    metric,
                    sub_metric,
                    _as_cell(before_group.get(sub_metric)),
                    _as_cell(after_group.get(sub_metric)),
                ]
            )

    # The figure height grows with the number of rows; the axes are hidden so
    # only the table is visible.
    figure, axes = plt.subplots(figsize=(10, len(rows) * 0.5))
    axes.axis("tight")
    axes.axis("off")

    figure.suptitle(
        "Comparison of Results Before and After Fine-Tuning",
        fontsize=14,
        fontweight="bold",
        y=0.98,
    )

    headers = ["Metric", "Sub-Metric", "Before Fine-Tuning", "After Fine-Tuning"]
    rendered = axes.table(
        cellText=rows,
        colLabels=headers,
        cellLoc="center",
        loc="center",
    )

    # Fixed font size, with column widths fitted to their content.
    rendered.auto_set_font_size(False)
    rendered.set_fontsize(10)
    rendered.auto_set_column_width(col=list(range(len(headers))))

    plt.show()


In [None]:

# Upload the model to the Hugging Face Hub under this repository id.
# NOTE(review): presumably relies on the login at the top of the file having
# write access to this repository - confirm.
repo_name = "jiya14desai/Llama-3.2-1B_fine-tuned_on_questionanswering"
model.push_to_hub(repo_name)
print(f"Model uploaded to Hugging Face Hub under repository: {repo_name}")

In [None]:
def count_parameters(model):
    """Return the total number of elements across ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total


# Compare the parameter count of `model` with that of `trainer.model`.
# NOTE(review): `trainer` was constructed with model=model, so both counts are
# presumably taken from the same object after training - confirm whether a
# count captured before fine-tuning was intended for the "pre-trained" figure.
pretrained_params = count_parameters(model)
finetuned_params = count_parameters(trainer.model)

print(f"Number of parameters in the pre-trained model: {pretrained_params}")
print(f"Number of parameters in the fine-tuned model: {finetuned_params}")
# Report whether the two counts match.
if pretrained_params == finetuned_params:
    print("The number of parameters remains the same after fine-tuning.")
else:
    print("The number of parameters differs after fine-tuning.")