In [None]:
!pip install evaluate

# Import Tools

In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


# Load Datasets

In [2]:
# Load dataset 
df = pd.read_csv("domain_specific_chatbot_data.csv")

# Display a sample of the dataset
df.head()

Unnamed: 0,query,response,intent,domain
0,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
1,How can I schedule an appointment with my doctor?,You can schedule an appointment by calling our...,appointment booking,healthcare
2,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
3,How can I check my account balance?,You can check your balance by logging into you...,balance inquiry,finance
4,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance


# Data Preprocessing

In [3]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.shape, val_df.shape

((2408, 4), (602, 4))

In [4]:
train_data = train_df.reset_index(drop=True)
validation_data = val_df.reset_index(drop=True)

validation_data

Unnamed: 0,query,response,intent,domain
0,How do I apply for a student loan?,You can apply for a student loan by visiting o...,student loan application,finance
1,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance
2,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
3,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
4,How can I schedule an appointment with my doctor?,You can schedule an appointment by calling our...,appointment booking,healthcare
...,...,...,...,...
597,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
598,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
599,How do I apply for a student loan?,You can apply for a student loan by visiting o...,student loan application,finance
600,How do I update my contact details on my account?,"To update your contact details, log into your ...",contact update,finance


In [5]:
# Clean the text by removing unwanted characters
import re

def clean_text(text):
    text = re.sub(r'\r\n', ' ', text)  # Remove carriage returns and line breaks
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'<.*?>', '', text)  # Remove any XML tags
    text = text.strip().lower()  # Strip and convert to lower case
    return text

# Apply cleaning to dialogue and summary columns
train_data['query'] = train_data['query'].apply(clean_text)
train_data['response'] = train_data['response'].apply(clean_text)

validation_data['query'] = validation_data['query'].apply(clean_text)
validation_data['response'] = validation_data['response'].apply(clean_text)


# Display a sample after cleaning
train_data

Unnamed: 0,query,response,intent,domain
0,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance
1,what should i do if i miss a dose of my medica...,"if you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
2,"i lost my credit card, what should i do?",please contact our customer service immediatel...,lost card reporting,finance
3,what should i do if i miss a dose of my medica...,"if you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
4,"i lost my credit card, what should i do?",please contact our customer service immediatel...,lost card reporting,finance
...,...,...,...,...
2403,can i make changes to my loan repayment schedule?,changes to your loan repayment schedule can be...,loan repayment adjustment,finance
2404,"i lost my credit card, what should i do?",please contact our customer service immediatel...,lost card reporting,finance
2405,what are the side effects of the covid-19 vacc...,common side effects of the covid-19 vaccine in...,side effects inquiry,healthcare
2406,what is the interest rate for a personal loan?,the current interest rate for personal loans i...,loan inquiry,finance


# Tokenization

In [6]:
tokenizer= T5Tokenizer.from_pretrained("t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
from datasets import Dataset
train_dataset_hf = Dataset.from_pandas(train_data)
val_dataset_hf = Dataset.from_pandas(validation_data)
def preprocess_function(examples):
    # Tokenize inputs and targets
    model_inputs = tokenizer(examples["query"], padding="max_length", truncation=True, max_length=250)
    labels = tokenizer(examples["response"], padding="max_length", truncation=True, max_length=250)["input_ids"]

    # Replace padding token IDs with -100 in labels
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels
    ]

    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing
train_dataset = train_dataset_hf.map(preprocess_function, batched=True)
val_dataset = val_dataset_hf.map(preprocess_function, batched=True)

Map: 100%|██████████| 2408/2408 [00:04<00:00, 522.31 examples/s]
Map: 100%|██████████| 602/602 [00:01<00:00, 545.74 examples/s]


In [8]:
sample = train_dataset[0]
print("Labels:", sample['labels'])
train_data['response'][0]

Labels: [12, 2270, 39, 574, 1030, 6, 4303, 139, 39, 905, 11, 281, 12, 8, 3, 31, 18816, 31, 1375, 5, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

"to update your contact details, log into your account and go to the 'profile' section."

In [9]:
train_dataset[0]

{'query': 'how do i update my contact details on my account?',
 'response': "to update your contact details, log into your account and go to the 'profile' section.",
 'intent': 'contact update',
 'domain': 'finance',
 'input_ids': [149,
  103,
  3,
  23,
  2270,
  82,
  574,
  1030,
  30,
  82,
  905,
  58,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

# Fine Tuning Model

In [12]:
import transformers
print(transformers.__version__)


4.51.0


In [13]:
# Model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=6,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=50,                # how often to log training info
    save_steps=500,                  # how often to save a model checkpoint
    eval_steps=50,                   # how often to run evaluation
    # evaluation_strategy="epoch",     # Ensure evaluation happens every `epoch`
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,3.8872
100,3.3823
150,2.7347
200,2.0825
250,1.4909
300,0.9874
350,0.5874
400,0.3247
450,0.1592
500,0.1128


TrainOutput(global_step=1806, training_loss=0.45362569522091983, metrics={'train_runtime': 2083.9164, 'train_samples_per_second': 6.933, 'train_steps_per_second': 0.867, 'total_flos': 954794115072000.0, 'train_loss': 0.45362569522091983, 'epoch': 6.0})

# Save and Load Model


In [None]:
model.save_pretrained("./chatbot_model")
tokenizer.save_pretrained("./chatbot_model")


model = T5ForConditionalGeneration.from_pretrained("./chatbot_model")
tokenizer = T5Tokenizer.from_pretrained("./chatbot_model")

# Chatbot System

In [None]:
device = model.device


def chatbot(query):
    query = clean_text(query)
    input_ids = tokenizer(query,return_tensors="pt",max_length=250,truncation=True)

    inputs = {key: value.to(device) for key, value in input_ids.items()}

    outputs = model.generate(
        input_ids["input_ids"],
        max_length=250,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break
    response = chatbot(user_input)
    print("Chatbot:", response)

# Mandatory to-be-run after fresh restart of kernel

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

In [5]:
#Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./chatbot_model")
tokenizer = T5Tokenizer.from_pretrained("./chatbot_model")
model.eval()

# Clean the text by removing unwanted characters
import re

def clean_text(text):
    text = re.sub(r'\r\n', ' ', text)  # Remove carriage returns and line breaks
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'<.*?>', '', text)  # Remove any XML tags
    text = text.strip().lower()  # Strip and convert to lower case
    return text