In [1]:
!pip install evaluate sentencepiece transformers



In [2]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments
)
import numpy as np
import evaluate

In [3]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [None]:
train_file = "train_subset_0.10_percent.csv"
test_file  = "test_subset_0.10_percent.csv"
train_df = pd.read_csv(train_file).dropna(subset=["question_my","context_my","answer_my"])
test_df  = pd.read_csv(test_file).dropna(subset=["question_my","context_my","answer_my"])

print(f"Train rows: {len(train_df)} | Eval rows: {len(test_df)}")

# Convert to HuggingFace datasets
train_ds = Dataset.from_pandas(train_df[["question_my","context_my","answer_my"]])
train_ds = train_ds.shuffle(seed=42)
train_ds = train_ds.select(range(min(1000, len(train_ds))))

eval_ds  = Dataset.from_pandas(test_df[["question_my","context_my","answer_my"]])
eval_ds = eval_ds.shuffle(seed=42)
eval_ds = eval_ds.select(range(min(100, len(eval_ds))))

#only using a sample set for the test run. 

Train rows: 7007 | Eval rows: 1752


In [None]:

model_name = "google/mt5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

max_input_len = 512-128
max_target_len = 128


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


True
1
NVIDIA A100-SXM4-40GB


In [None]:

def preprocess_fn(batch):
    # build input like: "question: ... context: ..."
    inputs = [
        f"question: {q} context: {c}"
        for q,c in zip(batch["question_my"], batch["context_my"])
    ]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_len,
        truncation="only_second",   # truncate context first
        stride=64,                  # allow overlap if context too long
        return_overflowing_tokens=True,
        padding="max_length",
    )

    # tokenize answers
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["answer_my"],
            max_length=max_target_len,
            truncation=True,
            padding="max_length",
        )

    # replace pad token id with -100 so loss ignores itu
    labels_ids = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels_ids

    return model_inputs

# Map preprocessing
train_ds = train_ds.map(preprocess_fn, batched=True, remove_columns=train_ds.column_names)
eval_ds  = eval_ds.map(preprocess_fn, batched=True, remove_columns=eval_ds.column_names)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
import numpy as np
from transformers import EvalPrediction

def normalize_text(text):
    return text.strip().lower()

def compute_f1(pred, truth):
    pred_tokens = pred.split()
    truth_tokens = truth.split()
    common = set(pred_tokens) & set(truth_tokens)
    if len(common) == 0:
        return 0.0
    prec = len(common) / len(pred_tokens)
    rec = len(common) / len(truth_tokens)
    return 2 * prec * rec / (prec + rec)

def compute_metrics(eval_pred: EvalPrediction):
    preds, labels = eval_pred

    # Some trainers return tuple (logits, ...) → take argmax
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert to numpy
    preds = np.argmax(preds, axis=-1) if preds.ndim == 3 else preds

    # Replace -100 in labels with pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalize
    decoded_preds = [normalize_text(p) for p in decoded_preds]
    decoded_labels = [normalize_text(l) for l in decoded_labels]

    # Metrics
    exact_matches = [int(p == l) for p, l in zip(decoded_preds, decoded_labels)]
    f1s = [compute_f1(p, l) for p, l in zip(decoded_preds, decoded_labels)]

    return {
        "exact_match": float(np.mean(exact_matches)),
        "f1": float(np.mean(f1s))
    }


In [11]:
# bleu = evaluate.load("bleu")
# import numpy as np


# def compute_metrics(eval_pred):
#     preds, labels = eval_pred

#     if isinstance(preds, tuple):
#         preds = preds[0]

#     preds = np.array(preds)

#     if preds.ndim == 3:
#         preds = preds[:, 0, :]

#     preds = np.where(
#         (preds < 0) | (preds >= tokenizer.vocab_size),
#         tokenizer.pad_token_id,
#         preds
#     )

#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # strip & filter out empties
#     decoded_preds = [p.strip() for p in decoded_preds]
#     decoded_labels = [l.strip() for l in decoded_labels]

#     filtered = [(p, l) for p, l in zip(decoded_preds, decoded_labels) if l != ""]
#     if len(filtered) == 0:
#         return {"exact_match": 0.0, "bleu": 0.0}

#     decoded_preds, decoded_labels = zip(*filtered)

#     exact = sum(p == l for p, l in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

#     try:
#         bleu_score = bleu.compute(
#             predictions=list(decoded_preds),
#             references=[[l] for l in decoded_labels]
#         )["bleu"]
#     except ZeroDivisionError:
#         bleu_score = 0.0

#     return {"exact_match": exact, "bleu": bleu_score}



# ----------------------------
# 5. Training setup
# ----------------------------
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="./mt5_qa_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    num_train_epochs=5,
    per_device_train_batch_size=8,               # ✅ safe default for mT5-base on GPU
    per_device_eval_batch_size=4,                #   Evaluation is more memory-intensive, e.g., due to predict_with_generate=True which runs autoregressive decoding
    gradient_accumulation_steps=2,               # ✅ effective batch size = 16
    eval_accumulation_steps=2,
    learning_rate=3e-5,                          # ✅ slightly lower for stability
    weight_decay=0.01,
    warmup_ratio=0.1,
    predict_with_generate=True,
    generation_max_length=32,
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,
    fp16=False   ,      
    bf16= True,
    
    report_to="none",
)


collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)


In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)



#THIS IS ONLY A TEST RUN. 
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Exact Match,F1
1,9.6699,4.890985,0.0,0.00182
2,6.9734,3.07356,0.0,0.005089
3,4.7988,2.5667,0.0,0.016494
4,3.8408,2.449344,0.0,0.050093
5,3.6928,2.42001,0.0,0.058981


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=315, training_loss=5.412277778746589, metrics={'train_runtime': 291.4424, 'train_samples_per_second': 17.156, 'train_steps_per_second': 1.081, 'total_flos': 5537401122299904.0, 'train_loss': 5.412277778746589, 'epoch': 5.0})

In [None]:
trainer.save_model("./mt5_qa_output")
tokenizer.save_pretrained("./mt5_qa_output")

In [None]:
eval_results = trainer.evaluate()
print("Final evaluation:", eval_results)