In [1]:
# !pip install transformers DataLoader huggingface_hub datasets -q

In [1]:
import os
from huggingface_hub import login

import numpy as np

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DataCollatorWithPadding
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from tqdm import tqdm

from datasets import load_dataset

from sklearn.model_selection import train_test_split

In [2]:
current_dir = os.getcwd()

### Model

In [4]:
# login(token=os.environ['HUGGINGFACE_API_KEY'])

In [5]:
# model_name = "meta-llama/Llama-3.2-1B"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForQuestionAnswering.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
# )

# model.resize_token_embeddings(len(tokenizer))

# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# model.config.pad_token_id = tokenizer.pad_token_id

# model.save_pretrained("./Model_QA")
# tokenizer.save_pretrained("./Tokenizer_QA")

In [6]:
model = AutoModelForQuestionAnswering.from_pretrained(current_dir+"/Model_QA")
tokenizer = AutoTokenizer.from_pretrained(current_dir+"/Tokenizer_QA")

### Data

In [3]:
dataset = load_dataset("rajpurkar/squad_v2")

In [4]:
dataset = load_dataset("rajpurkar/squad_v2")

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):


    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answers"]

    tokenized_inputs = tokenizer(
        questions,
        contexts,
        truncation=True,
        padding=False,
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        answer = answers[i]
        start_char = answer["answer_start"][0] if len(answer["answer_start"]) > 0 else None
        end_char = (
            start_char + len(answer["text"][0]) if start_char is not None else None
        )

        if start_char is None:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_pos, end_pos = None, None
            for j, (start, end) in enumerate(offsets):
                if start <= start_char < end:
                    start_pos = j
                if start < end_char <= end:
                    end_pos = j
            start_positions.append(start_pos)
            end_positions.append(end_pos)

    tokenized_inputs.pop("offset_mapping")
    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions

    return tokenized_inputs

tokenized_train_dataset = dataset["train"].select(range(20000)).map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_validation_dataset = dataset["validation"].select(range(1000)).map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)

print(tokenized_train_dataset.shape)
print(tokenized_validation_dataset.shape)

(20000, 5)
(1000, 5)


In [5]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

tokenized_train_dataset.set_format("torch")
tokenized_validation_dataset.set_format("torch")

In [6]:
train_dataloader = DataLoader(
    tokenized_train_dataset, batch_size=128, shuffle=True, collate_fn=data_collator
)

validation_dataloader = DataLoader(
    tokenized_validation_dataset, batch_size=32, shuffle=False, collate_fn=data_collator
)

### Training

In [11]:
for name, param in model.named_parameters():
    
    if name not in ["qa_outputs.weight", "qa_outputs.bias"]:
        param.requires_grad = False

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
optimizer = AdamW([model.qa_outputs.weight, model.qa_outputs.bias],
              lr=0.001)

epochs = 3
epoch_loss = []

for epoch in range(epochs):

    model.train()
    model.to(device)
    
    total_loss = 0

    for batch in tqdm(train_dataloader):

        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            start_positions=batch["start_positions"].to(device).long(),
            end_positions=batch["end_positions"].to(device).long(),
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    epoch_loss.append(avg_loss)
    print(f"Average Loss for Epoch {epoch + 1}: {avg_loss}")

    checkpoint_dir = f"Model_checkpoints/checkpoint_epoch_{epoch + 1}"
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)
    print(f"Checkpoint saved at {checkpoint_dir}")


model.save_pretrained("fine_tuned_bert_qa")
tokenizer.save_pretrained("fine_tuned_bert_qa")

100%|██████████| 157/157 [34:15<00:00, 13.09s/it]


Average Loss for Epoch 1: 4.556686753679992
Checkpoint saved at Model_checkpoints/checkpoint_epoch_1


100%|██████████| 157/157 [34:31<00:00, 13.19s/it]


Average Loss for Epoch 2: 4.323548414145305
Checkpoint saved at Model_checkpoints/checkpoint_epoch_2


100%|██████████| 157/157 [34:20<00:00, 13.12s/it]


Average Loss for Epoch 3: 4.297109225753006
Checkpoint saved at Model_checkpoints/checkpoint_epoch_3


('fine_tuned_bert_qa/tokenizer_config.json',
 'fine_tuned_bert_qa/special_tokens_map.json',
 'fine_tuned_bert_qa/vocab.txt',
 'fine_tuned_bert_qa/added_tokens.json',
 'fine_tuned_bert_qa/tokenizer.json')

### Testing

In [18]:
# !pip install evaluate nltk rouge_score -q

In [7]:
model = AutoModelForQuestionAnswering.from_pretrained("/teamspace/studios/this_studio/Model_checkpoints/checkpoint_epoch_3")

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from evaluate import load as load_metric
import nltk
from nltk.translate import meteor_score
nltk.download('punkt')

model.to(device)

# Metrics
f1_metric = load_metric("f1")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
exact_match_metric = load_metric("exact_match")

# Helper functions
def predict_answer(model, question, context):

    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores[0])
    end_idx = torch.argmax(end_scores[0])

    if start_idx <= end_idx:

        predicted_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1])
    else:
        predicted_answer = ""

    return predicted_answer

def compute_metrics(predictions, references):

    # Exact Match
    exact_matches = [1 if pred.strip() == ref.strip() else 0 for pred, ref in zip(predictions, references)]
    exact_match_score = sum(exact_matches) / len(exact_matches) * 100

    # BLEU Score
    bleu = bleu_metric.compute(predictions=predictions,
                                references=references)['bleu'] * 100

    # ROUGE Score
    rouge = rouge_metric.compute(predictions=predictions, references=references, use_aggregator=True)
    rouge_l = rouge['rougeL'] * 100

    tokenized_predictions = [nltk.word_tokenize(pred) for pred in predictions]
    tokenized_references = [nltk.word_tokenize(ref) for ref in references]

    # Compute METEOR score
    meteor = sum([meteor_score.single_meteor_score(ref, pred) for pred, ref in zip(tokenized_predictions, tokenized_references)]) / len(predictions) * 100

    return {
        "Exact Match": exact_match_score,
        "BLEU Score": bleu,
        "ROUGE-L Score": rouge_l,
        "METEOR Score": meteor
    }

#Generate predictions and evaluate
predictions = []
references = []

for sample in dataset["validation"].select(range(1000)):
    question = sample['question']
    context = sample['context']
    reference = sample['answers']['text'][0] if sample['answers']['text'] else ""  # Use empty string for unanswerable

    prediction = predict_answer(model, question, context)
    predictions.append(prediction)
    references.append(reference)

# Compute metrics
metrics = compute_metrics(predictions, references)
print(metrics)


[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'Exact Match': 0.0, 'BLEU Score': 0.0, 'ROUGE-L Score': 0.0, 'METEOR Score': 0.0}


### Zeroshot

In [14]:
zero_shot_model = AutoModelForQuestionAnswering.from_pretrained("/teamspace/studios/this_studio/Model_QA").to(device)

In [15]:
#Generate predictions and evaluate
predictions = []
references = []

for sample in dataset["validation"].select(range(1000)):
    question = sample['question']
    context = sample['context']
    reference = sample['answers']['text'][0] if sample['answers']['text'] else ""  # Use empty string for unanswerable

    prediction = predict_answer(zero_shot_model, question, context)
    predictions.append(prediction)
    references.append(reference)

# Compute metrics
metrics = compute_metrics(predictions, references)
print(metrics)

{'Exact Match': 30.3, 'BLEU Score': 0.14726663803867085, 'ROUGE-L Score': 1.0665124093610086, 'METEOR Score': 2.0532087725175967}


Q5

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

# Model parameters

# Load fine-tuned model (if available)
fine_tuned_params = count_parameters(model)

print(f"Fine-Tuned Model Parameters: {fine_tuned_params}")


Fine-Tuned Model Parameters: 1235818498


Total parameters in pre-trained meta-llama/Llama-3.2-1B: 1235.82M (1235824640 parameters)
Fine-Tuned Model Parameters: 1235814400
So the number of parameters does not match in fine-tuned and pre-trained model

Q7
Lower or Higher Scores in the Metrics [10 pts]
Higher Scores: Higher metric scores (like F1, BLEU, ROUGE, and Exact Match) indicate better alignment of the model's predictions with the ground truth. This reflects the model's ability to accurately understand and generate contextually relevant answers. Lower Scores: Lower scores signify either a lack of understanding or issues in the model’s ability to generalize. This could be due to: Insufficient fine-tuning on the task-specific data. Lack of diverse data during fine-tuning or pretraining. Poor handling of ambiguous or adversarial questions. Rationale:

F1 Score: Indicates precision and recall balance. A high score suggests that the predicted answers capture both relevance and completeness. BLEU/ROUGE: These measure the overlap between predicted and reference answers. Low scores here might highlight issues with phrasing or inability to capture subtle nuances. Exact Match: Useful for factual datasets like SQuAD, where the answer must exactly match. Lower EM might indicate paraphrasing or misunderstanding. 2. Understanding the Number of Parameters Between Pretraining and Fine-Tuning [05 pts]

Pretraining Parameters: Large models (e.g., LLaMA) have billions of parameters during pretraining, enabling them to learn general representations from diverse data. This is crucial for capturing broad language understanding. Fine-Tuning Parameters: Fine-tuning often involves updating only a subset of parameters (e.g., LoRA or adapters) to adapt the general knowledge for specific tasks. In some cases, all parameters may be fine-tuned, but this is computationally expensive. Rationale:

The number of trainable parameters during fine-tuning directly affects adaptability. Fine-tuning a small subset is computationally efficient but might limit task-specific improvements. Pretrained models with larger parameter counts usually generalize better but may need extensive fine-tuning to specialize. 3. Performance Difference for Zero-Shot and Fine-Tuned Models [05 pts]

Zero-Shot Performance: A pre-trained model is evaluated directly on a downstream task without any task-specific training. It relies on general language understanding and may perform well on tasks with close alignment to pretraining data. Fine-Tuned Performance: Fine-tuned models are trained on specific datasets, improving their performance on the task. They learn task-specific nuances, which is critical for structured datasets like SQuAD.