In [1]:
# Cell 1 — Install required packages
!pip install -q transformers datasets sentencepiece accelerate evaluate rouge_score bert-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
# Cell 2 — Imports and device check
import os, gc, re
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
import evaluate

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Report the installed Torch build and, when CUDA is usable, the first GPU's name.
cuda_ok = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", cuda_ok)
if cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))


Torch: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB


In [3]:
# Mount Google Drive so files stored there are reachable from this runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [4]:
# Cell 3 — Import libraries
import json
import pandas as pd


In [5]:
# List the dataset folder on the mounted drive to confirm the JSON files are visible
!ls /content/drive/MyDrive/nlp_project_dataset


dev-v1.1.json  train-v1.1.json


In [6]:
# Cell 4 — Load the JSON files
train_path = "/content/drive/MyDrive/nlp_project_dataset/train-v1.1.json"
dev_path   = "/content/drive/MyDrive/nlp_project_dataset/dev-v1.1.json"

# Pass the encoding explicitly: without it, open() decodes with the platform's
# default locale encoding, which can corrupt or reject non-ASCII text.
with open(train_path, 'r', encoding='utf-8') as f:
    train_json = json.load(f)

with open(dev_path, 'r', encoding='utf-8') as f:
    dev_json = json.load(f)


In [7]:
# Cell 5 — Flatten JSON into Pandas DataFrame
def squad_json_to_df(squad_dict):
    """Flatten a nested SQuAD-style dict into a three-column DataFrame.

    Walks ``squad_dict['data']`` -> ``'paragraphs'`` -> ``'qas'`` and produces
    one row per question, paired with the context of its paragraph.

    Args:
        squad_dict: Mapping with a ``'data'`` list. Each entry has a
            ``'paragraphs'`` list; each paragraph has a ``'context'`` string
            and a ``'qas'`` list whose items carry ``'question'`` and
            ``'answers'`` (a list of dicts with a ``'text'`` key).

    Returns:
        A pandas DataFrame with columns ``context``, ``question`` and
        ``answer``. ``answer`` is the text of the first listed answer, or an
        empty string when the question has no answers.
    """
    def _iter_rows():
        # Yield (context, question, answer) triples in document order.
        for article in squad_dict['data']:
            for para in article['paragraphs']:
                passage = para['context']
                for qa in para['qas']:
                    prompt = qa['question']
                    # Only the first answer is kept; unanswered items get "".
                    reply = qa['answers'][0]['text'] if len(qa['answers']) > 0 else ""
                    yield passage, prompt, reply

    rows = list(_iter_rows())
    return pd.DataFrame({
        "context": [passage for passage, _, _ in rows],
        "question": [prompt for _, prompt, _ in rows],
        "answer": [reply for _, _, reply in rows],
    })

# Flatten both splits, report their shapes, then preview the first training rows.
train_df, dev_df = squad_json_to_df(train_json), squad_json_to_df(dev_json)

print(f"Train size: {train_df.shape}")
print(f"Dev size: {dev_df.shape}")
train_df.head(3)


Train size: (87599, 3)
Dev size: (10570, 3)


Unnamed: 0,context,question,answer
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building


In [8]:
# Cell 7 — Custom PyTorch Dataset using Pandas
from torch.utils.data import Dataset, DataLoader

class QGDataset(Dataset):
    """Map-style dataset that turns (context, answer, question) rows into model features.

    For each row the encoder input is the prompt
    ``"generate question: context: <context> answer: <answer>"`` and the target
    is the row's question. Both are truncated/padded to fixed lengths.

    Args:
        dataframe: DataFrame with ``context``, ``answer`` and ``question``
            columns. Rows are addressed by position, so any index (filtered,
            shuffled, non-contiguous) is accepted.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` and which has a ``pad_token_id`` attribute.
        max_input_len: Length the tokenized prompt is truncated/padded to.
        max_target_len: Length the tokenized question is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_input_len=512, max_target_len=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the feature dict for the row at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` for the prompt and
            ``labels`` for the question, where padding positions in ``labels``
            are replaced by ``-100``.
        """
        # Positional lookup: label-based .loc would raise KeyError whenever the
        # DataFrame index is not exactly 0..N-1 (e.g. after filtering).
        row = self.data.iloc[idx]
        context = row["context"]
        answer = row["answer"]
        question = row["question"]

        # Input format: "generate question: context: ... answer: ..."
        # NOTE(review): the answer comes after the context, so if the tokenizer
        # truncates from the end, a prompt longer than max_input_len may lose
        # the answer — confirm this is acceptable for long contexts.
        input_text = f"generate question: context: {context} answer: {answer}"
        target_text = question

        # Tokenize inputs
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Tokenize targets
        targets = self.tokenizer(
            target_text,
            max_length=self.max_target_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # squeeze(0) drops only the leading batch axis; a bare squeeze() would
        # also collapse a length-1 sequence into a 0-d tensor.
        labels = targets.input_ids.squeeze(0)
        labels[labels == self.tokenizer.pad_token_id] = -100  # ignore pad in loss

        return {
            "input_ids": inputs.input_ids.squeeze(0),
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels
        }


In [9]:
# Load tokenizer and model
from transformers import T5Tokenizer, T5ForConditionalGeneration

# One checkpoint name drives both loads: tokenizer first, then the model.
model_name = "t5-small"
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

# Copy the tokenizer's padding id onto the model configuration.
model.config.pad_token_id = tokenizer.pad_token_id


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
# Cell 8 — Create Datasets and DataLoaders
from torch.utils.data import random_split, DataLoader

# Reset index so pandas rows go 0..N-1 (avoids KeyError)
train_df = train_df.reset_index(drop=True)

# Train / Validation split sizes
train_size = int(0.9 * len(train_df))
val_size   = len(train_df) - train_size

# Seed the split so the same rows land in train/val on every run; with an
# unseeded split, validation losses from different sessions are measured on
# different rows and cannot be compared.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# random_split is used here only to draw shuffled row positions (.indices)
train_subset, val_subset = random_split(
    train_df, [train_size, val_size], generator=split_generator
)

# Convert subsets back into DataFrames
train_split_df = train_df.iloc[train_subset.indices].reset_index(drop=True)
val_split_df   = train_df.iloc[val_subset.indices].reset_index(drop=True)

# Build datasets
train_dataset = QGDataset(train_split_df, tokenizer)
val_dataset   = QGDataset(val_split_df, tokenizer)
test_dataset  = QGDataset(dev_df.reset_index(drop=True), tokenizer)

# DataLoaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=4)
test_loader  = DataLoader(test_dataset, batch_size=4)

print("Train batches:", len(train_loader), "Val batches:", len(val_loader), "Test batches:", len(test_loader))


Train batches: 19710 Val batches: 2190 Test batches: 2643


In [11]:
# Cell 9 — Define Training Loop
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_

# Prefer the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = model.to(device)

# AdamW over every model parameter with a fixed learning rate.
optimizer = AdamW(model.parameters(), lr=5e-5)

def train_one_epoch(loader, model, optimizer, device=None):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        loader: Iterable of batches supporting ``len()``. Each batch is a
            mapping with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            tensors.
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a ``.loss`` tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            notebook-level ``device`` variable being defined.

    Returns:
        The sum of per-batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    if device is None:
        # Follow the model rather than a global: batches must live where the
        # parameters do for the forward pass to work.
        device = next(model.parameters()).device

    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        clip_grad_norm_(model.parameters(), 1.0)  # avoid exploding gradients
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(loader, model, device=None):
    """Compute the mean batch loss of ``model`` over ``loader`` without gradients.

    NOTE(review): defining this function rebinds the name ``evaluate``, which
    this notebook also imports as a module in its import cell; once this cell
    has run, that module is no longer reachable under this name.

    Args:
        loader: Iterable of batches supporting ``len()``. Each batch is a
            mapping with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            tensors.
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a ``.loss`` tensor. It is left
            in eval mode when the function returns.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            notebook-level ``device`` variable being defined.

    Returns:
        The sum of per-batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    if device is None:
        # Follow the model rather than a global: batches must live where the
        # parameters do for the forward pass to work.
        device = next(model.parameters()).device

    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    return total_loss / len(loader)


In [12]:
# Cell 10 — Train model
from tqdm.notebook import tqdm

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Progress bar over the training batches, labelled with the current epoch.
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f"Epoch {epoch+1}/{epochs}")

    for batch in loop:
        optimizer.zero_grad()

        # Move the three tensors the model consumes onto the training device.
        features = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }
        loss = model(**features).loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(batch_loss=batch_loss)

    # Epoch summary: mean training loss plus loss on the validation loader.
    avg_train_loss = total_loss / len(train_loader)
    val_loss = evaluate(val_loader, model)
    print(f"Epoch {epoch+1}/{epochs} — Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f}")



  0%|          | 0/19710 [00:00<?, ?it/s]

Epoch 1/5 — Train Loss: 1.9200 | Val Loss: 1.6399


  0%|          | 0/19710 [00:00<?, ?it/s]

Epoch 2/5 — Train Loss: 1.7215 | Val Loss: 1.5756


  0%|          | 0/19710 [00:00<?, ?it/s]

Epoch 3/5 — Train Loss: 1.6309 | Val Loss: 1.5518


  0%|          | 0/19710 [00:00<?, ?it/s]

Epoch 4/5 — Train Loss: 1.5609 | Val Loss: 1.5356


  0%|          | 0/19710 [00:00<?, ?it/s]

Epoch 5/5 — Train Loss: 1.5047 | Val Loss: 1.5181


In [13]:
# Cell 11 — Generate Question function
def generate_question(context, answer, num_beams=4, max_length=64):
    """Generate a question for ``answer`` given ``context``.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``. The prompt
    is tokenized with truncation at 512 tokens and decoded with beam search.

    Args:
        context: Passage text inserted into the prompt.
        answer: Answer text inserted into the prompt after the context.
        num_beams: Beam count passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    model.eval()

    prompt = f"generate question: context: {context} answer: {answer}"
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)

    generation_kwargs = {
        "max_length": max_length,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    sequences = model.generate(**encoded, **generation_kwargs)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [14]:
# Cell 12 — compare reference and generated questions for the first dev rows
for i in range(3):
    sample = dev_df.loc[i]
    ctx, ans, ref_question = sample["context"], sample["answer"], sample["question"]

    print("=== SAMPLE", i, "===")
    print("Context (truncated):", ctx[:200], "...")
    print("Answer:", ans)
    print("True Question:", ref_question)
    print("Generated:", generate_question(ctx, ans))
    print()


=== SAMPLE 0 ===
Context (truncated): Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated ...
Answer: Denver Broncos
True Question: Which NFL team represented the AFC at Super Bowl 50?
Generated: Who won the Super Bowl?

=== SAMPLE 1 ===
Context (truncated): Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated ...
Answer: Carolina Panthers
True Question: Which NFL team represented the NFC at Super Bowl 50?
Generated: Who did the Denver Broncos defeat?

=== SAMPLE 2 ===
Context (truncated): Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated ...
Ans

In [15]:
# Cell 13 — Save model
save_dir = "/content/qg_t5_small_squad_manual"
os.makedirs(save_dir, exist_ok=True)

# Model first, then tokenizer, both written into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Model saved to", save_dir)


Model saved to /content/qg_t5_small_squad_manual
