In [1]:
from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration  

from transformers import AdamW
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.utils.rnn import pad_sequence
# from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

pl.seed_everything(100)
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_excel('/kaggle/input/mine12/Conversation_Chatbot.xlsx')

In [3]:
print("No of rows:" ,data.shape[0])

No of rows: 130


Here **PyTorch-lightning** is used: PyTorch Lightning is a lightweight interface for PyTorch that simplifies the process of training deep learning models. It provides pre-built components and features for common tasks, making the code more modular and reusable. PyTorch Lightning also provides various features such as automatic checkpointing, distributed training, and multi-GPU training. It follows a strict design pattern and provides hooks and callbacks for customization. It is compatible with various hardware platforms such as CPUs and GPUs.

In [4]:

print("No of rows:" ,data.shape[0])

No of rows: 130


The task is to create a conversational model that can generate natural and engaging responses to a given input text. The model should be able to understand the context of the conversation and generate appropriate responses that are relevant to the topic and flow of the conversation. Additionally, the model should be able to handle open-ended conversations, where the topic can change dynamically, and maintain coherence throughout the conversation.

In [5]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_MAX_LEN = 128 #input length
OUTPUT_MAX_LEN = 128 # output length
TRAIN_BATCH_SIZE = 8 # batch size of training
VAL_BATCH_SIZE = 2 # batch size for validation
EPOCHS = 5 # number of epoch

In [6]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=512)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Example of how T5 Tokenizer actually work.

In [7]:
text = "Hello, how are you today?"    # assume the text that is to be tokenized 

input_tokenize = tokenizer( 
             text,
            add_special_tokens=True,        #Add Special tokens like [CLS] and [SEP]
            max_length=128,
            padding = 'max_length',         #for padding to max_length for equal sequence length
            truncation = True,              #truncate the text if it is greater than max_length
            return_attention_mask=True,     #will return attention mask
            return_tensors="pt"             #return tensor formate
        )

In [8]:
print("input_ids: ", input_tokenize['input_ids'].flatten())
print("-----------------------------------------------------------------------------")
print("Attention Mask: ", input_tokenize['attention_mask'].flatten())

input_ids:  tensor([8774,    6,  149,   33,   25,  469,   58,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
-----------------------------------------------------------------------------
Attention Mask:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0

In [9]:
class T5Dataset:
    
  def __init__(self,question,answer):   
    
    self.question = question
    self.answer = answer
    self.tokenizer = tokenizer
    self.input_max_len = INPUT_MAX_LEN
    self.output_max_len = OUTPUT_MAX_LEN
  
  def __len__(self):                      # This method retrives the number of item from the dataset
    return len(self.question)

  def __getitem__(self,item):             # This method retrieves the item at the specified index item. 

    question = str(self.question[item])
    question = ''.join(question.split())

    answer = str(self.answer[item])
    answer = ''.join(answer.split())

    input_tokenize = self.tokenizer(      
            question,
            add_special_tokens=True,
            max_length=self.input_max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
        )
    output_tokenize = self.tokenizer(
            answer,
            add_special_tokens=True,
            max_length=self.output_max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
            
        )
    

    input_ids = input_tokenize["input_ids"].flatten()
    attention_mask = input_tokenize["attention_mask"].flatten()
    labels = output_tokenize['input_ids'].flatten()

    out = {
            'question':question,      
            'answer':answer,
            'input_ids': input_ids,
            'attention_mask':attention_mask,
            'target':labels
        }
        
    return out

In [10]:
class T5DataLoad(pl.LightningDataModule):
    
    def __init__(self,df_train,df_test):
        super().__init__()
        self.df_train = df_train
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.input_max_len = INPUT_MAX_LEN
        self.out_max_len = OUTPUT_MAX_LEN
    
    def setup(self, stage=None):
        
        self.train_data = T5Dataset(
            question = self.df_train.question.values,
            answer = self.df_train.answer.values
        )
        
        self.valid_data = T5Dataset(
            question = self.df_test.question.values,
            answer = self.df_test.answer.values
        )
    def train_dataloader(self):
        return torch.utils.data.DataLoader(
         self.train_data,
         batch_size= TRAIN_BATCH_SIZE,
         shuffle=True, 
         num_workers=2
        )
    def val_dataloader(self):
        return torch.utils.data.DataLoader(
        self.valid_data,
        batch_size= VAL_BATCH_SIZE,
        num_workers = 2
        )

In [11]:
class T5Model(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

        
    def forward(self, input_ids, attention_mask, labels=None):
        
        output = self.model(
        input_ids=input_ids, 
        attention_mask=attention_mask, 
        labels=labels
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):

        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels= batch["target"]
        loss, logits = self(input_ids , attention_mask, labels)

        
        self.log("train_loss", loss, prog_bar=True, logger=True)

        return {'loss': loss}
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels= batch["target"]
        loss, logits = self(input_ids, attention_mask, labels)
        print(loss)
        print(logits)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        
        return {'val_loss': loss}

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

# Final Training Step

In [12]:
def run():
    df_train, df_test = train_test_split(data,test_size = 0.2, random_state=100)
    dataload = T5DataLoad(df_train,df_test)
    dataload.setup()
    device = DEVICE
    model = T5Model()
    model.to(device)
    
    checkpoint = ModelCheckpoint(
        dirpath="/kaggle/working",
        filename='best-model',
        save_top_k=2,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )
    trainer = pl.Trainer(
        callbacks = checkpoint,
        max_epochs= 1,
        gpus=1,
        accelerator="gpu"
    )
    trainer.fit(model, dataload)
run()

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Sanity Checking: 0it [00:00, ?it/s]

tensor(13.5540, device='cuda:0')
tensor([[[-28.8281, -22.5195, -24.5901,  ..., -60.4692, -60.3163, -60.4748],
         [-30.5674, -10.5226,  -2.1703,  ..., -37.1412, -36.9600, -36.9380],
         [-22.1129,  -9.0042,  -7.2849,  ..., -37.7123, -37.6336, -37.7268],
         ...,
         [-15.1206, -13.9283, -14.7396,  ..., -43.6271, -43.4374, -43.5475],
         [-15.1341, -14.0693, -14.7095,  ..., -43.7636, -43.5674, -43.6765],
         [-15.1459, -14.1073, -14.7553,  ..., -43.8822, -43.6805, -43.7898]],

        [[-27.0475, -21.3786, -21.9397,  ..., -58.6131, -58.6086, -58.8206],
         [-24.5903,  -7.7264,  -5.4532,  ..., -35.2017, -35.1940, -35.2603],
         [-26.0001, -10.5760,  -8.8694,  ..., -38.8787, -38.8642, -38.9490],
         ...,
         [-15.5826, -13.6136, -13.8259,  ..., -44.0200, -43.9822, -44.1078],
         [-16.0070, -13.9721, -14.1506,  ..., -44.6510, -44.6120, -44.7436],
         [-16.4195, -14.3096, -14.4565,  ..., -45.2765, -45.2372, -45.3736]]],
       devi

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor(0.7100, device='cuda:0')
tensor([[[ -6.5540, -12.5442, -13.8147,  ..., -39.1567, -39.0477, -39.0855],
         [-21.1136,  -8.8276,  -1.0379,  ..., -35.6668, -35.5168, -35.4629],
         [-13.2914,  -7.6071,  -5.7386,  ..., -35.2460, -35.1324, -35.1784],
         ...,
         [  2.5282,  -8.7571,  -8.3377,  ..., -31.9145, -31.7654, -31.8451],
         [  2.2837,  -8.9166,  -8.2933,  ..., -32.0150, -31.8637, -31.9422],
         [  2.1888,  -8.8612,  -8.2882,  ..., -31.9897, -31.8367, -31.9145]],

        [[ -1.5027,  -9.7580, -10.2607,  ..., -34.1582, -34.0939, -34.1474],
         [-15.7506,  -8.0395,  -3.8009,  ..., -35.1546, -35.0969, -35.0910],
         [-15.5355,  -9.6283,  -6.6959,  ..., -36.3320, -36.3034, -36.2730],
         ...,
         [  2.6859,  -7.5692,  -6.8998,  ..., -30.0998, -30.0425, -30.0638],
         [  2.6583,  -7.5704,  -6.9327,  ..., -30.0379, -29.9802, -30.0033],
         [  2.6283,  -7.5703,  -6.9653,  ..., -29.9929, -29.9347, -29.9595]]],
       devic

In [13]:
train_model = T5Model.load_from_checkpoint('/kaggle/working/best-model.ckpt')
train_model.freeze()

def generate_question(question):

    inputs_encoding =  tokenizer(
        question,
        add_special_tokens=True,
        max_length= INPUT_MAX_LEN,
        padding = 'max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
        )

    
    generate_ids = train_model.model.generate(
        input_ids = inputs_encoding["input_ids"],
        attention_mask = inputs_encoding["attention_mask"],
        max_length = INPUT_MAX_LEN,
        num_beams = 4,
        num_return_sequences = 1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        )

    preds = [
        tokenizer.decode(gen_id,
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]

    return "".join(preds)

We have taken the test dataset manually, While evaluating the model we have randomly choosen three questions from our test dataset and evaulated our model.

# Model Evaluation

In [27]:
ques = "what are the popular programs in that fields?"
print("Ques: ",ques)
print("BOT: ",generate_question(ques))

Ques:  what are the popular programs in that fields?
BOT:  are the most popular programs in that field?


In [28]:
ques = "Im looking for a graduate program in the LAW. Does Yale offer any programs in that area? "
print("Ques: ",ques)
print("BOT: ",generate_question(ques))

Ques:  Im looking for a graduate program in the LAW. Does Yale offer any programs in that area? 
BOT:  i am looking for a law degree.


In [29]:
ques = "What is the application process like for Yale graduate programs?"
print("Ques: ",ques)
print("BOT: ",generate_question(ques))

Ques:  What is the application process like for Yale graduate programs?
BOT:  What is the application process like for Yale graduate programs?
