In [1]:
!pip install transformers
!pip install evaluate
!pip install rouge


import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


2024-02-05 15:27:37.873526: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-05 15:27:37.873622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-05 15:27:38.041059: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256   # Question Length
T_LEN = 32    # Target Length
BATCH_SIZE = 4
DEVICE = "cuda:0"

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
df = pd.read_csv("/kaggle/input/squad-v11/SQuAD-v1.1.csv")
df


Unnamed: 0,title,context,question,answer,answer_start,answer_end
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541
1,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213
2,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296
3,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,420
4,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,126
...,...,...,...,...,...,...
87594,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon,229,235
87595,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon,414,421
87596,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,476,481
87597,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,199,203


In [4]:

selected_columns = ["context", "question", "answer"]
df = df[selected_columns]

In [5]:
df=df[:8000]

In [8]:
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data["answer"]
        
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        if idx < 0 or idx >= len(self.questions):
            raise IndexError(f"Index {idx} is out of bounds for the dataset.")
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]
        
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length", 
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)
        
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100
        
        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [9]:
# Dataloader

train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Get the indices of the DataFrames
train_indices = train_data.index.tolist()
val_indices = val_data.index.tolist()

# Create RandomSamplers using the indices
train_sampler = RandomSampler(train_indices)
val_sampler = RandomSampler(val_indices)

qa_dataset = QA_Dataset(TOKENIZER, df, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=20, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=20, sampler=val_sampler)

In [10]:
# Move the model to the same device if it's not already
MODEL = MODEL.to(DEVICE)

In [11]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

for epoch in range(20):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1
    
    #Evaluation
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        val_loss += outputs.loss.item()
        val_batch_count += 1
        
    print(f"{epoch+1}/{20} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

Training batches: 100%|██████████| 320/320 [03:45<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


1/20 -> Train loss: 1.9833295404911042	Validation loss: 0.7666220560669899


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


2/20 -> Train loss: 1.2898366898298264	Validation loss: 0.5625027752947063


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


3/20 -> Train loss: 0.9963923059248676	Validation loss: 0.4627503800516327


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


4/20 -> Train loss: 0.8349140839127358	Validation loss: 0.4001928806188516


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


5/20 -> Train loss: 0.7318453164026141	Validation loss: 0.3553768846066669


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


6/20 -> Train loss: 0.6579105808710058	Validation loss: 0.320658737514168


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


7/20 -> Train loss: 0.60052766638941	Validation loss: 0.2932661844084838


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


8/20 -> Train loss: 0.5559711159323342	Validation loss: 0.27044888896343766


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


9/20 -> Train loss: 0.5194112801174116	Validation loss: 0.25106152777637664


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


10/20 -> Train loss: 0.4884010719449725	Validation loss: 0.2344073460472282


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


11/20 -> Train loss: 0.46183824891711334	Validation loss: 0.22000330240698532


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


12/20 -> Train loss: 0.4386711457404696	Validation loss: 0.20705602573192058


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.47it/s]


13/20 -> Train loss: 0.4178405742778873	Validation loss: 0.19553708884363563


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.43it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.47it/s]


14/20 -> Train loss: 0.3994108837126987	Validation loss: 0.1851660078148208


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.43it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.47it/s]


15/20 -> Train loss: 0.3829889475934518	Validation loss: 0.1758473947510356


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.43it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


16/20 -> Train loss: 0.36800854487210016	Validation loss: 0.16727515068614593


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.43it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.47it/s]


17/20 -> Train loss: 0.35433572971053173	Validation loss: 0.15947384606843115


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.43it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


18/20 -> Train loss: 0.3419055717851734	Validation loss: 0.15233039504358506


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]


19/20 -> Train loss: 0.33044844371018517	Validation loss: 0.14574570566496972


Training batches: 100%|██████████| 320/320 [03:44<00:00,  1.42it/s]
Validation batches: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]

20/20 -> Train loss: 0.31962701897355145	Validation loss: 0.13967859704272997





In [None]:
def predict_answer(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
  
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    
    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer], 
                            references=[ref_answer])
    
        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer, 
            "Predicted Answer: ": predicted_answer, 
            "BLEU Score: ": score
        }
    else:
        return predicted_answer