In [1]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
%pip install -q peft




In [2]:
import os
from functools import partial
from typing import Dict, List

from datasets import load_dataset
from huggingface_hub import notebook_login, login
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForMultipleChoice
from transformers import AutoTokenizer
from transformers import DataCollatorForMultipleChoice
from transformers import EarlyStoppingCallback
from transformers import PreTrainedTokenizerBase


## P1

In [3]:
# SECURITY: an access token was previously hardcoded in this cell. Anything committed in a
# notebook must be considered leaked, so that token should be revoked at
# https://huggingface.co/settings/tokens and a new one issued.
#
# The token is now read from the HF_TOKEN environment variable; when it is not set we fall
# back to the interactive login widget so nothing secret is ever stored in the notebook.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## P2

In [None]:

# Fetch the "regular" configuration of the SWAG dataset and show its splits and columns.
swag = load_dataset(path="swag", name="regular")
print(swag)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

regular/train-00000-of-00001.parquet:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

regular/validation-00000-of-00001.parque(…):   0%|          | 0.00/4.81M [00:00<?, ?B/s]

regular/test-00000-of-00001.parquet:   0%|          | 0.00/4.78M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/73546 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20006 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20005 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 73546
    })
    validation: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20006
    })
    test: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20005
    })
})


In [5]:
# Inspect the schema (column names and feature types) of the training split.
swag["train"].features

{'video-id': Value('string'),
 'fold-ind': Value('string'),
 'startphrase': Value('string'),
 'sent1': Value('string'),
 'sent2': Value('string'),
 'gold-source': Value('string'),
 'ending0': Value('string'),
 'ending1': Value('string'),
 'ending2': Value('string'),
 'ending3': Value('string'),
 'label': ClassLabel(names=['0', '1', '2', '3'])}

In [6]:
# Look at the first training record to see what a single example contains.
swag["train"][0]

{'video-id': 'anetv_jkn6uvmqwh4',
 'fold-ind': '3416',
 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
 'sent2': 'A drum line',
 'gold-source': 'gold',
 'ending0': 'passes by walking down the street playing their instruments.',
 'ending1': 'has heard approaching them.',
 'ending2': "arrives and they're outside dancing and asleep.",
 'ending3': 'turns the lead singer watches the performance.',
 'label': 0}

In [None]:
# Collect only the columns that preprocessing needs from the training split,
# keyed by column name (the context sentences, the four endings and the label).
examples = {
    column: swag["train"][column]
    for column in (
        "sent1",
        "sent2",
        "ending0",
        "ending1",
        "ending2",
        "ending3",
        "label",
    )
}

# P3: SWAG Dataset Columns: Meaning and Purpose

## Columns Description

| Column Name   | Meaning & Purpose                                                                                       |
|---------------|-------------------------------------------------------------------------------------------------------|
| `video-id`    | Identifier for the original video clip from which the example was derived. Useful for traceability.    |
| `fold-ind`    | Fold index used internally for data splitting or cross-validation purposes.                            |
| `startphrase` | The full context as a single string: `sent1` followed by `sent2` (the premise the endings continue).   |
| `sent1`       | The first sentence providing the initial context of the scene or situation.                           |
| `sent2`       | The beginning of the second sentence (e.g. "A drum line"); each candidate ending completes it.        |
| `ending0`     | The first candidate ending or possible continuation of the context (multiple-choice option 0).         |
| `ending1`     | The second candidate ending (option 1).                                                                |
| `ending2`     | The third candidate ending (option 2).                                                                 |
| `ending3`     | The fourth candidate ending (option 3).                                                                |
| `label`       | The correct answer index (integer 0-3), indicating which `ending` is the most plausible continuation. |
| `gold-source` | The origin of the label. `"gold"` indicates the label is human-annotated ground truth.                |

---

## Purpose of Each Column

- **Context Columns (`sent1`, `sent2`, `startphrase`)**:  
  These provide the situational setup or premise that the multiple-choice endings continue. They help the model understand the scene or scenario.

- **Ending Columns (`ending0` to `ending3`)**:  
  These are the possible continuations of the context. Only one is correct (most commonsense plausible), and the others serve as distractors.

- **Label (`label`)**:  
  This is the key supervised signal representing the human-verified correct answer. It is used during training and evaluation.

- **Metadata (`video-id`, `fold-ind`)**:  
  Additional info to support dataset management, reproducibility, and referencing original video sources.

- **Label Source (`gold-source`)**:  
  Indicates the provenance of the label, typically `"gold"` for human-verified answers ensuring dataset reliability.

---

## Notes

- The dataset focuses on **commonsense reasoning** by forcing the model to choose the best ending grounded in realistic and logical continuation.
- Labels are **human-annotated**, ensuring high quality and meaningful evaluation.



## P4

In [None]:
def unflatten(feature, batch_size, num_choices=4):
    """Regroup a flat sequence into consecutive, equally sized chunks.

    Args:
        feature: Flat, sliceable sequence (e.g. a list) whose items are laid out
            example by example, ``num_choices`` items per example.
        batch_size: Number of chunks (examples) to produce.
        num_choices: Number of items per chunk. Defaults to 4, the number of
            candidate endings per example used in this notebook.

    Returns:
        A list of ``batch_size`` slices of ``feature``; slice ``i`` covers the
        positions ``i * num_choices`` up to (not including) ``(i + 1) * num_choices``.
    """
    return [
        feature[i * num_choices:(i + 1) * num_choices]
        for i in range(batch_size)
    ]

def preprocess_swag_examples(examples: Dict[str, List[str]], tokenizer: PreTrainedTokenizerBase, max_length: int = 128):
    """Tokenize a batch of SWAG examples for multiple-choice modelling.

    For every example the context (``sent1`` and ``sent2`` joined by a space) is
    concatenated with each of the four endings (``ending0`` .. ``ending3``),
    producing four candidate sequences. All candidates are tokenized in a single
    call and the tokenizer outputs are regrouped four-per-example.

    Args:
        examples: Batch as a mapping from column name to column values. Must
            provide ``sent1``, ``sent2`` and ``ending0`` .. ``ending3``.
        tokenizer: Tokenizer applied to the candidate sequences.
        max_length: Every sequence is truncated and padded to this length.

    Returns:
        A dict with one entry per tokenizer output field; each value is a list
        with one item per example, holding that example's four encoded choices.
    """
    # One context string per example: "<sent1> <sent2>".
    contexts = [
        f"{first} {second}"
        for first, second in zip(examples["sent1"], examples["sent2"])
    ]
    batch_size = len(contexts)

    # Build the candidates already flattened, example by example, so the
    # tokenizer receives a plain list of strings (4 consecutive items per example).
    flat_sequences = []
    for row, context in enumerate(contexts):
        for choice in range(4):
            flat_sequences.append(context + " " + examples[f"ending{choice}"][row])

    tokenized = tokenizer(
        flat_sequences,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None
    )

    # Restore the per-example grouping for every tokenizer output field.
    tokenized_output = {}
    for field, values in tokenized.items():
        tokenized_output[field] = unflatten(values, batch_size)
    return tokenized_output


## P5

In [9]:
# Load the uncased BERT tokenizer, then tokenize the extracted training columns
# with the preprocessing function to check its output before mapping it over the dataset.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
tokenized_inputs = preprocess_swag_examples(
    examples,
    tokenizer=tokenizer,
    max_length=128,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [10]:
# Report the nesting of the tokenized ids: examples -> choices -> tokens.
print(
    f"Number of examples: {len(tokenized_inputs['input_ids'])}",
    f"Number of choices per example: {len(tokenized_inputs['input_ids'][0])}",
    f"Sequence length: {len(tokenized_inputs['input_ids'][0][0])}",
    sep="\n",
)


Number of examples: 73546
Number of choices per example: 4
Sequence length: 128


## P6

In [None]:

# Bind the tokenizer and sequence length so that `map` only has to supply the batch.
preprocess_fn = partial(
    preprocess_swag_examples,
    tokenizer=tokenizer,
    max_length=128,
)

# Apply the preprocessing to every split in batches, then keep a handle on the training split.
tokenized_swag = swag.map(function=preprocess_fn, batched=True)
train_dataset = tokenized_swag["train"]


Map:   0%|          | 0/73546 [00:00<?, ? examples/s]

Map:   0%|          | 0/20006 [00:00<?, ? examples/s]

Map:   0%|          | 0/20005 [00:00<?, ? examples/s]

## P7

In [None]:

# Batch collator for multiple-choice inputs, built around the same tokenizer used for preprocessing.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)


## P8

In [13]:

# Load BERT with a multiple-choice head. The checkpoint has no weights for the classifier
# layer, so they are newly initialized (see the warning printed below) and need training.
model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## P9

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
%pip install -q evaluate


In [None]:
import torch
from transformers import AutoTokenizer

# Sanity check: run the model on one validation example and compare its choice with the label.
# Before fine-tuning the classifier head is randomly initialized, so the prediction is not
# expected to be meaningful yet.
example = swag["validation"][0]

# Build the four candidate sequences the same way as in preprocessing: context + ending.
context = f"{example['sent1']} {example['sent2']}"
choices = [context + " " + example[f"ending{i}"] for i in range(4)]

tokenized = tokenizer(
    choices,
    return_tensors="pt",
    padding=True,
    truncation=True
)

# Add a leading batch dimension (the four choices form ONE example) and move the tensors to
# the model's device, so the cell also works when re-run after training moved the model to a GPU.
input_batch = {k: v.unsqueeze(0).to(model.device) for k, v in tokenized.items()}

# Make sure dropout is disabled for inference, whatever state earlier cells left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**input_batch)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

true_label = example["label"]

print("Context:", context)
print("\nCandidate Endings:")
for i in range(4):
    print(f"Option {i}: {example[f'ending{i}']}")
print("\nPredicted Label:", predicted_label)
print("True Label:", true_label)
print("\n✅ Correct Prediction" if predicted_label == true_label else "❌ Incorrect Prediction")


In [None]:
from transformers import TrainingArguments, Trainer
from evaluate import load as load_evaluate
import numpy as np

# Accuracy metric loaded through the `evaluate` library; used by `compute_metrics` below.
accuracy_metric = load_evaluate("accuracy")
# Learning-rate schedule settings. The earlier `warmup_steps=100,` carried a trailing comma,
# which bound the name to the tuple `(100,)` instead of the integer 100.
# NOTE: these module-level names have no effect on training by themselves; they only matter
# if they are passed to `TrainingArguments` explicitly.
warmup_steps = 100
lr_scheduler_type = "linear"

def compute_metrics(eval_pred):
    """Compute accuracy and perplexity for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds one score
            per candidate ending for each example (the arg-max over axis 1 is the
            predicted choice); ``labels`` holds the index of the correct ending.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"perplexity"``.
    """
    predictions, labels = eval_pred

    # The predicted choice is the ending with the highest score.
    preds = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]

    # Delegate to `compute_perplexity` instead of duplicating its log-softmax logic here,
    # so both entry points are guaranteed to report the same number.
    perplexity = compute_perplexity((predictions, labels))["perplexity"]

    return {"accuracy": acc, "perplexity": perplexity}

import torch.nn.functional as F
import math

def compute_perplexity(eval_pred):
    """Compute perplexity as the exponential of the mean negative log-likelihood.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` contains one row
            of raw scores per example and ``labels`` the index of the correct column.

    Returns:
        A dict with the single key ``"perplexity"``.
    """
    scores, gold = eval_pred

    # Normalize the raw scores of each row into log-probabilities.
    row_log_probs = F.log_softmax(torch.tensor(scores), dim=1)

    # For every row pick the log-probability assigned to its gold label.
    row_index = torch.arange(len(gold))
    gold_index = torch.as_tensor(gold, dtype=torch.long)
    gold_log_probs = row_log_probs[row_index, gold_index]

    mean_nll = -gold_log_probs.mean().item()
    return {"perplexity": math.exp(mean_nll)}

# Training configuration.
# - Evaluation and checkpointing both happen once per epoch: `load_best_model_at_end`
#   requires the save strategy to match the evaluation strategy.
# - `EarlyStoppingCallback` needs `metric_for_best_model` (and is meant to be used with
#   `load_best_model_at_end=True`); "accuracy" refers to the key returned by `compute_metrics`.
# - Warm-up and scheduler type are passed explicitly here; defining them as loose
#   module-level variables has no effect on the Trainer.
# - `eval_strategy` replaces the deprecated `evaluation_strategy` argument name.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # effective train batch size per device: 8 * 4 = 32
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    warmup_steps=100,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    learning_rate=2e-5,
    report_to="none",
)

# `processing_class` replaces the deprecated `tokenizer` argument of `Trainer`.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_swag["train"],
    eval_dataset=tokenized_swag["validation"],
    compute_metrics=compute_metrics,
    # Stop when validation accuracy has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
trainer.train()

# Final evaluation on the validation split (with the best checkpoint restored).
eval_results = trainer.evaluate()
print("\n📊 Evaluation Results:")
print(eval_results)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Predict on the validation split and reduce the per-choice scores to a predicted label.
predictions = trainer.predict(tokenized_swag["validation"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Pin the label set so the matrix is always 4x4 and lines up with the tick labels,
# even if one of the choices is never predicted.
class_labels = [0, 1, 2, 3]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# Draw on an explicit axes: `disp.plot()` opens its own figure when no `ax` is given,
# which previously left the `plt.figure(figsize=...)` figure empty and its size unused.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap="Blues", values_format="d")
ax.set_title("Confusion Matrix - SWAG Validation")
plt.show()
