<a href="https://colab.research.google.com/github/MSadatAnik/QuestionGeneration/blob/Sadat_Branch/QG_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#cell 1
# Install required packages
!pip install transformers==4.53.0 datasets nltk rouge_score

# Import and download NLTK punkt tokenizer
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
#cell 2
import os
import json
import logging
import numpy as np
import torch
from itertools import chain
from dataclasses import dataclass
from typing import Optional
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, HfArgumentParser,
    set_seed, AutoConfig, cache_utils,
)
from datasets import Dataset as HFDataset
from datasets import load_metric
from nltk.tokenize import word_tokenize

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
#cell 3
from google.colab import files

# Upload train and validation JSON files
print("Please upload qg_train_v0.json")
uploaded = files.upload()
print("Please upload qg_valid_v0.json")
uploaded = files.upload()

# Load JSON data
with open("qg_train_v0.json", "r") as f:
    train_data = json.load(f)

with open("qg_valid_v0.json", "r") as f:
    valid_data = json.load(f)

# Print dataset sizes
print("Train examples:", len(train_data))
print("Valid examples:", len(valid_data))

Please upload qg_train_v0.json


Saving qg_train_v0.json to qg_train_v0 (1).json
Please upload qg_valid_v0.json


Saving qg_valid_v0.json to qg_valid_v0 (1).json
Train examples: 192
Valid examples: 49


In [34]:
#cell 4
def tokenized_data(source_text, target_text, data_args, tokenizer):
    padding = "max_length" if data_args.pad_to_max_length else False
    model_inputs = tokenizer(source_text, max_length=data_args.max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=target_text, max_length=data_args.max_target_length, padding=padding, truncation=True)

    if data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]

    # extra: check for corrupted values
    for example in model_inputs["labels"]:
        if any(l < -100 or l > tokenizer.vocab_size * 2 for l in example):
            raise ValueError(f"Invalid label token found: {example}")

    return model_inputs

In [24]:
#cell 5
class CustomDS(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.data.items()}


In [35]:
#cell 6
def process_qg_agno_openstax(data, data_args, tokenizer):
    all_questions = list(chain(*[d['questions'] for d in data]))
    source_text = []
    target_text = []

    for q in all_questions:
        inp_txt = f"context {q['hl_context'].replace('<hl>', '')}: "
        out_txt = q['question']['normal_format']
        source_text.append(inp_txt)
        target_text.append(out_txt)

    # Optional: debug mode slice
    if data_args.is_debug_mode > 0:
        source_text = source_text[:50]
        target_text = target_text[:50]

    model_inputs = tokenized_data(source_text, target_text, data_args, tokenizer)
    return HFDataset.from_dict(model_inputs)


In [36]:
#cell 7
@dataclass
class DataTrainingArguments:
    max_source_length: int = 512
    max_target_length: int = 48
    pad_to_max_length: bool = False
    ignore_pad_token_for_loss: bool = True
    is_debug_mode: int = -1  # set 1 to slice small data

data_args = DataTrainingArguments()

In [37]:
#cell 8
set_seed(42)
model_name = "t5-small"  # or "google/mt5-small" for multilingual
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [38]:
#cell 9
train_dataset = process_qg_agno_openstax(train_data, data_args, tokenizer)
valid_dataset = process_qg_agno_openstax(valid_data, data_args, tokenizer)

In [39]:
#cell 10
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.1,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    warmup_steps=500,
    lr_scheduler_type="linear",
    predict_with_generate=True,
    push_to_hub=False,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [42]:
#cell 11
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Clip predictions to valid token ID range
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    # Check for invalid token IDs
    if np.any((preds < 0) | (preds >= tokenizer.vocab_size)):
        logger.warning("Invalid token IDs found in predictions, clipping to valid range")
        preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
    return result


In [43]:
#cell12
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.5688,2.423033,26.6376,8.1311,22.736,22.7598
2,2.5031,2.383021,28.7985,10.2146,25.1281,25.1118
3,2.4694,2.345735,30.4627,11.7091,26.695,26.7544
4,2.3953,2.311426,31.2237,11.8773,27.0646,27.0687
5,2.191,2.280252,32.0166,12.2152,27.5253,27.5059
6,2.3294,2.25784,31.7883,11.8847,27.239,27.2156
7,2.4013,2.243422,32.0716,12.3378,27.5826,27.5676
8,2.3193,2.234182,32.5989,12.5478,27.9231,27.8834
9,2.1855,2.230451,32.5764,12.5357,27.9418,27.9057
10,2.0743,2.229208,32.6416,12.6106,28.0599,28.0151


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=860, training_loss=2.3685054069341613, metrics={'train_runtime': 1260.4684, 'train_samples_per_second': 21.627, 'train_steps_per_second': 0.682, 'total_flos': 3277109283520512.0, 'train_loss': 2.3685054069341613, 'epoch': 10.0})

In [44]:
#cell13
results = trainer.evaluate()
print("Evaluation Results:", results)

# Predictions
predictions = trainer.predict(valid_dataset)
decoded_preds = tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)
print("\nSample Predictions:")
for i, p in enumerate(decoded_preds[:5]):
    print(f"{i+1}: {p}")


Evaluation Results: {'eval_loss': 2.229207754135132, 'eval_rouge1': 32.6416, 'eval_rouge2': 12.6106, 'eval_rougeL': 28.0599, 'eval_rougeLsum': 28.0151, 'eval_runtime': 31.2399, 'eval_samples_per_second': 21.479, 'eval_steps_per_second': 2.689, 'epoch': 10.0}


OverflowError: out of range integral type conversion attempted