<a href="https://colab.research.google.com/github/M-Pascal/Chatbot_Summative/blob/main/Notebook/Healthcare_Chatbot_%5BSummative%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Healthcare Assistance Chatbot**
Within this task we will be building assistance chatbot which is fine-tune from pre-trained model using T5-small. with the purpose of building a healthcare information chatbot that can answer patients’ questions about hospital services.

In [1]:
# Installing transformers dependences
!pip install -U datasets
!pip install -U transformers
!!pip install -U evaluate
!pip install rouge_score

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

# 1. Importing libraries

In [2]:
# Importing necessary library
from datasets import load_dataset # for huggingface dataset
import re
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
import evaluate


# 2. Loading data

In [3]:
# Loading the datset from Huggingface
data = load_dataset("Ram20307/HealthCareChatbot")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/147 [00:00<?, ?B/s]

medical_dataset.csv:   0%|          | 0.00/22.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16413 [00:00<?, ? examples/s]

In [4]:
# Exploring data we downloaded
print(data)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 16413
    })
})


In [5]:
data['train'][:1]

{'input': ['What services does your healthcare facility offer?'],
 'output': ['We offer a wide range of services, including primary care, specialty care, diagnostic imaging, laboratory services, mental health support, physical therapy, and preventive care. You can find a detailed list on our website under the "Services" section.']}

In [6]:
# # Checking for the duplicates
# data_1 = data['train'].to_pandas()
# duplicates = data_1.duplicated(subset=['input', 'output'])
# print("Rows marked as duplicates: \n")
# # Apply the boolean mask to the 'train' split of the dataset
# print(data['train'][duplicates])

# # total number of duplicates
# num_duplicates = duplicates.sum()
# print(f"\nNumber of duplicate rows: {num_duplicates}")
# Convert dataset to pandas DataFrame

# 3. Data Preprocessing
## Convert to Pandas and Clean the Dataset
We are going to handle missing data, and removing duplicate to reduce the noise in the dataset

In [7]:
df = data['train'].to_pandas()

# Showing initial row count
print("Initial rows:", len(df))

# Check for missing values
missing_input = df['input'].isna().sum()
missing_output = df['output'].isna().sum()
print(f"Missing values on; input: {missing_input}, output: {missing_output}")

# Drop rows with missing input or output
df.dropna(subset=['input', 'output'], inplace=True)

# Checking for duplicates before dropping
duplicates = df.duplicated(subset=['input', 'output'], keep=False)
print(f"Number of duplicate rows (before drop): {duplicates.sum()}")

# Drop exact duplicate input-output pairs
df.drop_duplicates(subset=['input', 'output'], inplace=True)

# Final report
print("Cleaned data rows:", len(df))
print("Total rows removed during cleaning:", 16413 - len(df))

Initial rows: 16413
Missing values on; input: 0, output: 5
Number of duplicate rows (before drop): 80
Cleaned data rows: 16360
Total rows removed during cleaning: 53


## 3.1. Text Normalization

In [8]:
# Normalize text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'[^a-z0-9\s.,?!]', '', text)  # Keep only basic symbols
    return text.strip()

df['input'] = df['input'].apply(clean_text)
df['output'] = df['output'].apply(clean_text)

## 3.2. Splitting the dataset (Train and Validation)

In [9]:
# Splitting 90% for training, 10% for validation
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Wrap in Dataset Dictionary
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# 3.3. Tokenization
**Tokenization Method:** SentencePiece, used by T5

In [10]:
# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Preprocessing function
def preprocess(example):
    input_text = "question: " + example["input"]
    target_text = example["output"]

    model_inputs = tokenizer(input_text, max_length=128, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_text, max_length=128, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess, remove_columns=["input", "output"])

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/14724 [00:00<?, ? examples/s]



Map:   0%|          | 0/1636 [00:00<?, ? examples/s]

# 4. Loading pretrained T5 model

In [11]:
# pretrained model
model = T5ForConditionalGeneration.from_pretrained("t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
# Defining training Arguments

# Creating function for training and evaluation
def train_and_evaluate(model, tokenized_data, learning_rate,
                       output_dir, num_train_epochs,
                       per_device_train_batch_size,
                       per_device_eval_batch_size):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_steps=100,
        weight_decay=0.01,
        learning_rate=learning_rate,
        logging_dir=f"{output_dir}/logs",
        logging_steps=10,
        # Changed evaluation_strategy to eval_strategy
        eval_strategy="epoch",
        save_strategy="epoch",
        save_steps=200,
        save_total_limit=1,
        load_best_model_at_end=True,
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,
        report_to="none"
    )

    # Initialiazing trainer and training model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["validation"]
    )

    trainer.train()
    eval_loss = trainer.evaluate()["eval_loss"]

    return trainer, eval_loss


### Training baseline model

In [13]:
# Train baseline model
print("Training baseline model...")
baseline_trainer, baseline_loss = train_and_evaluate(
    model=model,
    tokenized_data=tokenized_dataset,
    learning_rate=2e-5,
    output_dir="./results_baseline",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16
)

Training baseline model...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.6982,2.398354
2,2.5156,2.265203
3,2.2906,2.236768


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


### Training fine-tuned model

In [14]:
# Training tuned model
print("\nTraining fine-tuned model with higher learning rate...")
tuned_trainer, tuned_loss = train_and_evaluate(
    model=model,
    tokenized_data=tokenized_dataset,
    learning_rate=5e-5,
    output_dir="./results_tuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16
)


Training fine-tuned model with higher learning rate...


Epoch,Training Loss,Validation Loss
1,2.2909,2.077021
2,2.2244,2.024785
3,2.0025,2.010356


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


### Result

In [15]:
# Printing results
print(f"\nLoss Comparison: ")
print(f"Baseline Loss (With lr=2e-5): {baseline_loss:.4f}")
print(f"Fine-Tuned Loss (With lr=5e-5): {tuned_loss:.4f}")


Loss Comparison: 
Baseline Loss (With lr=2e-5): 2.2368
Fine-Tuned Loss (With lr=5e-5): 2.0104


# 5. Model Evaluation using ROUGE
We will be exploring two mixed ways:
- Quantitative evaluation
- Qualitative evaluation

In [16]:
# Use evaluate.load instead of load_metric
rouge = evaluate.load("rouge")

# Run evaluation on a few sample
preds, refs = [], []
sample = val_df.sample(100).itertuples()

for row in sample:
    input_text = "question: " + row.input
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    # Ensure model is on the correct device (CPU or GPU)
    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        model.to("cuda")

    output = model.generate(**inputs)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    preds.append(decoded_output)
    refs.append(row.output)

# Compute ROUGE
results = rouge.compute(predictions=preds, references=refs)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': np.float64(0.13135546512883267), 'rouge2': np.float64(0.07772215702036342), 'rougeL': np.float64(0.11925402580913422), 'rougeLsum': np.float64(0.11941889598451311)}


## Manual Testing (Qualitative)

In [17]:
def ask_bot(question):
    input_text = "question: " + question
    inputs = tokenizer(input_text, return_tensors="tf")
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(ask_bot("What kind of therapy do you provide?"))

RuntimeError: Invalid device string: '/job:localhost/replica:0/task:0/device:GPU:0'