In [1]:
import numpy as np 
import pandas as pd
import os
#from datasets import load_dataset, Dataset


from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AdamW

import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.utils.rnn import pad_sequence
#from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

# Fix the global random seed through Lightning so repeated runs are comparable.
pl.seed_everything(100)
import warnings
# Silence every Python warning for the rest of the notebook. Note that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Load the Bitext customer-support CSV from the Kaggle input directory.
# The path is Kaggle-specific; adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/bitext-gen-ai-chatbot-customer-support-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

In [None]:
# Summary statistics of the raw frame (use df.head(5) instead to peek at sample rows).
df.describe()

In [None]:
# List the available columns; 'instruction' and 'response' are selected in the next cell.
df.columns

In [None]:
# Keep only the two text columns read by CustomDataset: the model input
# ('instruction') and the target text ('response').
tmp  = df[['instruction','response']]

In [None]:
# Summary of the selected text columns.
tmp.describe()

In [None]:
# Load the tokenizer and the pre-trained weights from the same "google/flan-t5-small"
# checkpoint so that token ids match the model's vocabulary.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
# T5Model.__init__ reads this module-level object, so it must exist before
# T5Model is instantiated or restored from a checkpoint.
model_pre_trained = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")



In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_MAX_LEN = 256  # max token length used when tokenizing the instruction text
OUTPUT_MAX_LEN = 256  # max token length used when tokenizing the response text
TRAIN_BATCH_SIZE = 16  # batch size of the training DataLoader
VAL_BATCH_SIZE = 16  # batch size of the validation DataLoader
# EPOCHS = 5  (disabled: not referenced anywhere in the visible notebook)

In [None]:
class CustomDataset:
    """Map-style dataset that tokenizes instruction/response pairs on access.

    Each item is built from one row of ``df`` (columns ``'instruction'`` and
    ``'response'``). Both texts are padded/truncated to a fixed length, and the
    padded positions of the target are replaced with ``IGNORE_INDEX`` so that
    they do not contribute to the training loss.

    Args:
        df: pandas DataFrame holding the ``'instruction'`` and ``'response'`` columns.
        tokenizer: callable tokenizer returning ``"input_ids"`` and
            ``"attention_mask"`` tensors (called with ``return_tensors="pt"``).
        INPUT_MAX_LEN: fixed token length of the encoded instruction.
        OUTPUT_MAX_LEN: fixed token length of the encoded response.

    Raises:
        ValueError: if ``df`` is not a pandas DataFrame.
    """

    # Label value that Hugging Face seq2seq models (and torch's cross-entropy
    # default ``ignore_index``) skip when computing the loss.
    IGNORE_INDEX = -100

    def __init__(self,df,tokenizer,INPUT_MAX_LEN,OUTPUT_MAX_LEN):
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df should be a pandas DataFrame")
        self.df = df
        self.tokenizer = tokenizer
        self.input_max_len = INPUT_MAX_LEN
        self.output_max_len = OUTPUT_MAX_LEN

    def get_item(self, col):
        """Return ``(instruction, response)`` as strings from a row-like mapping.

        Values are passed through ``str`` so non-string cells (e.g. NaN) do not
        break tokenization.
        """
        return str(col['instruction']), str(col['response'])

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` ids (padded or truncated)."""
        return self.tokenizer(text, add_special_tokens=True,
                              max_length=max_length, padding='max_length',
                              truncation=True, return_attention_mask=True,
                              return_tensors="pt")

    def __getitem__(self, index):
        """Build the training example stored at positional ``index``.

        Returns:
            dict with the raw ``'instruction'`` and ``'response'`` strings, the
            1-D ``'input_ids'`` and ``'attention_mask'`` tensors of the
            instruction, and ``'target'``: the 1-D response token ids with
            padding positions set to ``IGNORE_INDEX``.
        """
        instruction, response = self.get_item(self.df.iloc[index])

        input_tokenize = self._encode(instruction, self.input_max_len)
        output_tokenize = self._encode(response, self.output_max_len)

        # The tokenizer returns (1, max_len) tensors; drop the batch dimension.
        input_ids = input_tokenize["input_ids"].flatten()
        attention_mask = input_tokenize["attention_mask"].flatten()
        labels = output_tokenize['input_ids'].flatten()

        # Without this the loss is also computed on every padding position,
        # which dominates the signal because targets are padded to max length.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        return {'instruction': instruction, 'response': response,
                'input_ids': input_ids, 'attention_mask': attention_mask,
                'target': labels}

    def __len__(self):
        """Number of rows (examples) in the underlying DataFrame."""
        return len(self.df)
        
        
        
        
        
    

In [None]:
class CustomDataLoad(pl.LightningDataModule):
    """Lightning data module exposing train and validation loaders.

    It stores the two DataFrames plus the tokenization and batching settings,
    wraps each frame in a ``CustomDataset`` during ``setup`` and serves them
    through ``torch.utils.data.DataLoader`` objects.
    """

    def __init__(self,train_data,test_data,tokenizer,INPUT_MAX_LEN,OUTPUT_MAX_LEN,TRAIN_BATCH_SIZE,VAL_BATCH_SIZE):
        super().__init__()

        # Raw frames; ``test_data`` is the one served by the validation loader.
        self.train_data, self.test_data = train_data, test_data
        self.tokenizer = tokenizer
        # Sequence-length limits forwarded to every CustomDataset.
        self.input_max_len, self.out_max_len = INPUT_MAX_LEN, OUTPUT_MAX_LEN
        # Batch sizes of the two loaders.
        self.train_bs, self.val_bs = TRAIN_BATCH_SIZE, VAL_BATCH_SIZE

    def setup(self,stage = None):
        """Create the train and validation datasets; ``stage`` is not used."""
        common = dict(tokenizer=self.tokenizer,
                      INPUT_MAX_LEN=self.input_max_len,
                      OUTPUT_MAX_LEN=self.out_max_len)

        self.train_data_ = CustomDataset(df=self.train_data, **common)
        self.valid_data_ = CustomDataset(df=self.test_data, **common)

    def train_dataloader(self):
        """Return the shuffled training loader (2 worker processes)."""
        make_loader = torch.utils.data.DataLoader
        return make_loader(self.train_data_,
                           batch_size=self.train_bs,
                           shuffle=True,
                           num_workers=2)

    def val_dataloader(self):
        """Return the validation loader (2 worker processes, shuffle not enabled)."""
        make_loader = torch.utils.data.DataLoader
        return make_loader(self.valid_data_,
                           batch_size=self.val_bs,
                           num_workers=2)


In [None]:
class T5Model(pl.LightningModule):
    """Lightning wrapper used to fine-tune the notebook's T5 model.

    The wrapped network is the module-level ``model_pre_trained`` object, so
    that name must be defined before this class is instantiated (including when
    it is rebuilt through ``load_from_checkpoint``).
    """

    def __init__(self):
        super().__init__()
        self.model = model_pre_trained

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        Args:
            input_ids: token ids of the encoded inputs.
            attention_mask: attention mask matching ``input_ids``.
            labels: optional target token ids forwarded to the model.

        Returns:
            Tuple of the ``loss`` and ``logits`` attributes of the model output.
        """
        output = self.model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
        return output.loss, output.logits

    def _batch_loss(self, batch):
        """Compute the loss for one batch dict yielded by the data loaders."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["target"])
        return loss

    def training_step(self, batch, batch_idx):
        """Log and return the training loss of ``batch``."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss of ``batch``.

        ``val_loss`` is the metric monitored by the checkpoint callback built
        in ``train_model``.
        """
        loss = self._batch_loss(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return {'val_loss': loss}

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters (lr=5e-4).

        The PyTorch implementation is used because the ``AdamW`` class shipped
        with ``transformers`` is deprecated in favour of it. ``eps`` and
        ``weight_decay`` are set explicitly to the defaults of the deprecated
        class, since the PyTorch defaults differ.
        """
        return torch.optim.AdamW(self.parameters(), lr=0.0005,
                                 eps=1e-6, weight_decay=0.0)

    def train_model(self,train_data,test_data,
              tokenizer,INPUT_MAX_LEN,OUTPUT_MAX_LEN,
              TRAIN_BATCH_SIZE,VAL_BATCH_SIZE,
              max_epochs=10,accumulate_grad_batches=4):
        """Fine-tune the wrapped model and checkpoint the best validation loss.

        Args:
            train_data: DataFrame used for training.
            test_data: DataFrame used for validation.
            tokenizer: tokenizer forwarded to the datasets.
            INPUT_MAX_LEN: fixed token length of the encoded inputs.
            OUTPUT_MAX_LEN: fixed token length of the encoded targets.
            TRAIN_BATCH_SIZE: batch size of the training loader.
            VAL_BATCH_SIZE: batch size of the validation loader.
            max_epochs: number of training epochs (default 10, as before).
            accumulate_grad_batches: batches accumulated per optimizer step
                (default 4, as before).

        Returns:
            None. The best checkpoint is written to
            ``/kaggle/working/best-model.ckpt``.
        """
        # No manual ``setup()`` call: the trainer invokes the data module's
        # setup hook itself during ``fit``.
        dataload = CustomDataLoad(train_data,test_data,tokenizer,
                                   INPUT_MAX_LEN,OUTPUT_MAX_LEN,
                                   TRAIN_BATCH_SIZE,VAL_BATCH_SIZE)

        # Keep exactly one checkpoint. With a fixed filename, keeping several
        # makes Lightning add version suffixes to the extra files, so
        # 'best-model.ckpt' (the path the notebook loads afterwards) would not
        # be guaranteed to hold the best model, or to exist at all.
        # NOTE(review): a stale best-model.ckpt left by an earlier run in the
        # same directory can still trigger a suffix; clear it before re-training.
        checkpoint = ModelCheckpoint(dirpath="/kaggle/working",
                                     filename='best-model',
                                     save_top_k=1,
                                     verbose=True,
                                     monitor="val_loss",
                                     mode="min")

        # "auto" uses a GPU when one is present and otherwise falls back to the
        # CPU; a hard-coded "gpu" raises on CPU-only hosts.
        trainer = pl.Trainer(callbacks=[checkpoint],
                             max_epochs=max_epochs,
                             accelerator="auto",
                             accumulate_grad_batches=accumulate_grad_batches)

        # The trainer moves the module to the selected device on its own.
        trainer.fit(self, datamodule=dataload)
        # Release cached GPU memory held by the allocator after training.
        torch.cuda.empty_cache()
        
        
        
        
        

In [None]:
# Wrap the pre-trained network (module-level `model_pre_trained`) in the Lightning module.
model_fine_tuned = T5Model()

In [None]:
# NOTE(review): this import sits mid-notebook; it belongs with the imports in the first cell.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state pins the split.
train_data, test_data = train_test_split(tmp,test_size = 0.2, random_state=1)

In [None]:
# Launch fine-tuning with the split frames and the settings from the configuration cell.
model_fine_tuned.train_model(train_data,test_data,
                       tokenizer,INPUT_MAX_LEN,OUTPUT_MAX_LEN,
                       TRAIN_BATCH_SIZE,VAL_BATCH_SIZE)

In [None]:
# Rebuild the Lightning module from the checkpoint file written during training.
# NOTE(review): assumes the best checkpoint lives at exactly this path — confirm
# against the ModelCheckpoint settings used for the training run.
train_model = T5Model.load_from_checkpoint('/kaggle/working/best-model.ckpt')
# Lightning's freeze() prepares the module for inference (presumably disabling
# gradients and switching to eval mode — see the Lightning docs).
train_model.freeze()
# From here on `train_model` is the wrapped `.model` network moved to DEVICE, not the
# LightningModule; generate_question() calls .generate() on it.
train_model = train_model.model.to(DEVICE)
def generate_question(instruction):
    """Generate the fine-tuned model's reply to ``instruction``.

    Despite its name, this function produces the response text for a customer
    instruction. It relies on the module-level ``tokenizer``, ``train_model``,
    ``DEVICE``, ``INPUT_MAX_LEN`` and ``OUTPUT_MAX_LEN``.

    Args:
        instruction: the user's request as a string.

    Returns:
        The decoded generated sequences concatenated into one string (a single
        sequence, since ``num_return_sequences`` is 1).
    """
    # Encode exactly as during training: padded/truncated to INPUT_MAX_LEN.
    inputs_encoding = tokenizer(instruction, add_special_tokens=True,
                                max_length=INPUT_MAX_LEN, padding='max_length',
                                truncation='only_first',
                                return_attention_mask=True,
                                return_tensors="pt").to(DEVICE)

    # The generated text is a response, so it is capped by the output length
    # limit rather than the input one.
    generate_ids = train_model.generate(input_ids=inputs_encoding["input_ids"],
                                        attention_mask=inputs_encoding["attention_mask"],
                                        max_length=OUTPUT_MAX_LEN,
                                        num_beams=4,
                                        num_return_sequences=1,
                                        no_repeat_ngram_size=2,
                                        early_stopping=True)

    preds = [tokenizer.decode(gen_id, skip_special_tokens=True,
                              clean_up_tokenization_spaces=True)
             for gen_id in generate_ids]

    return "".join(preds)

In [None]:
# Smoke test: send one sample customer question through the fine-tuned model.
instruction = "How could I track the compensation?"
print("instruction: ",instruction)
print("Bot: ",generate_question(instruction))

In [None]:
#all_layers = list(train_model.children())


#last_layer = all_layers[-1]

#print("Last layer:", last_layer)

In [None]:
#print(train_model)

In [None]:
#print(all_layers)