# Menstrual Health Chatbot Project

This notebook fine-tunes a T5 transformer model for question-answering on menstrual health. It covers data preprocessing, model training, evaluation, and deployment via a simple web interface.


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from nltk.translate.bleu_score import sentence_bleu
import gradio as gr
pd.set_option('display.max_colwidth', None)


In [12]:
# Load raw datasets
df_train = pd.read_csv('data/Training Data.csv')
df_test = pd.read_csv('data/Testing Data.csv')

print("Train columns:", df_train.columns)
print("Test columns:", df_test.columns)

# Clean: lowercase questions, strip answers, drop missing
for df in [df_train, df_test]:
    df['instruction (string)'] = df['instruction (string)'].astype(str).str.strip().str.lower()
    df['output (string)'] = df['output (string)'].astype(str).str.strip()
    df.dropna(subset=['instruction (string)', 'output (string)'], inplace=True)

# Save cleaned versions
df_train.to_csv('data/train_cleaned.csv', index=False)
df_test.to_csv('data/test_cleaned.csv', index=False)
print("Saved cleaned csv files.")


Train columns: Index(['instruction (string)', 'output (string)'], dtype='object')
Test columns: Index(['instruction (string)', 'output (string)'], dtype='object')
Saved cleaned csv files.


In [13]:
# 90/10 split from training set
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)
train_df.to_csv('data/train.csv', index=False)
val_df.to_csv('data/val.csv', index=False)
df_test.to_csv('data/test.csv', index=False)
print("Train/Val/Test splits saved.")


Train/Val/Test splits saved.


In [15]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')  # Or t5-base, etc.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
# Tokenize function for T5-style QA with prefix
def tokenize_df(df):
    df = df.copy()
    df['input_ids'] = df['instruction (string)'].apply(lambda x: tokenizer.encode("question: " + x, truncation=True, max_length=64))
    df['labels'] = df['output (string)'].apply(lambda x: tokenizer.encode(x, truncation=True, max_length=64))
    return df

# Apply tokenization to all splits
train_df = tokenize_df(train_df)
val_df = tokenize_df(val_df)
df_test = tokenize_df(df_test)

# Save tokenized data to CSV for reproducibility
train_df[['input_ids', 'labels']].to_csv('data/train_tokenized.csv', index=False)
val_df[['input_ids', 'labels']].to_csv('data/val_tokenized.csv', index=False)
df_test[['input_ids', 'labels']].to_csv('data/test_tokenized.csv', index=False)

print("Tokenization complete. Tokenized files saved in data/.")


Tokenization complete. Tokenized files saved in data/.


In [17]:
class MenstrualQADataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.df.iloc[idx]['input_ids'], dtype=torch.long),
            'labels': torch.tensor(self.df.iloc[idx]['labels'], dtype=torch.long)
        }

train_dataset = MenstrualQADataset(train_df)
val_dataset = MenstrualQADataset(val_df)


In [18]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="model/best_model",
    per_device_train_batch_size=8,
    num_train_epochs=2,
    learning_rate=5e-5,
    logging_steps=10,
    logging_dir='model/logs',
    save_total_limit=1,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator      # <--- This line adds auto-padding!
)

train_result = trainer.train()

model.save_pretrained("model/best_model")
tokenizer.save_pretrained("model/best_model")


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
10,4.2244
20,3.7321
30,3.5629
40,3.501
50,3.3534
60,3.3105
70,3.2337
80,3.2917
90,3.3023
100,3.2094


('model/best_model/tokenizer_config.json',
 'model/best_model/special_tokens_map.json',
 'model/best_model/spiece.model',
 'model/best_model/added_tokens.json')

In [22]:
import torch

def generate_answer(question):
    t5_input = "question: " + question.strip().lower()
    input_ids = tokenizer(t5_input, return_tensors="pt").input_ids
    # Move inputs to the same device as the model
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)
    output = model.generate(input_ids, max_length=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)

test_inputs = df_test['instruction (string)'].tolist()
test_targets = df_test['output (string)'].tolist()

bleu_scores = []
for i in range(len(test_inputs)):
    pred = generate_answer(test_inputs[i])
    bleu = sentence_bleu([test_targets[i].split()], pred.split())
    bleu_scores.append(bleu)
    print(f"Q: {test_inputs[i]}\nTarget: {test_targets[i]}\nModel: {pred}\nBLEU: {bleu:.2f}\n")

average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU on test set: {average_bleu:.2f}")


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Q: what is the physiological process behind menstruation?
Target: Menstruation is the result of complex hormonal interactions orchestrated by the hypothalamus, pituitary gland, ovaries, and uterus, leading to the shedding of the endometrial lining in response to changes in estrogen and progesterone levels.
Model: Menstruation: a process of menstruation, a process of regulating the body, a process that causes the menstruation to develop.
BLEU: 0.00

Q: how do hormonal fluctuations throughout the menstrual cycle affect the body?
Target: Hormonal fluctuations, particularly in estrogen and progesterone levels, regulate the menstrual cycle by influencing the growth and shedding of the endometrial lining, as well as the release of an egg from the ovary during ovulation.
Model: Depending on the body, the effects of hormonal changes in the body and the body shape.
BLEU: 0.00

Q: what are some underlying causes of menstrual irregularities?
Target: Menstrual irregularities can stem from various 

In [23]:
def generate_answer(question):
    t5_input = "question: " + question.strip().lower()   # Use T5 prefix!
    input_ids = tokenizer(t5_input, return_tensors="pt").input_ids
    output = model.generate(input_ids, max_length=64)    # <-- ADD max_length here!
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer


In [26]:
import torch

question = "What are menstrual cramps?"
t5_input = "question: " + question.strip().lower()
input_ids = tokenizer(t5_input, return_tensors="pt").input_ids

# Move input_ids to the model's device (CPU or CUDA)
device = next(model.parameters()).device
input_ids = input_ids.to(device)

output_ids = model.generate(input_ids, max_length=64)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(answer)


Menstrual cramps are common in menstrual conditions.


In [29]:
import pandas as pd

# Example: add your actual experiment results to this list
experiment_results = [
    {
        "Experiment": 1,
        "Pretrained Model": "T5-small",
        "Learning Rate": 5e-5,
        "Batch Size": 8,
        "Epochs": 2,
        "BLEU Score (Test)": 0.45,
        "Notes & Observations": "Baseline; outputs short answers, some echoing"
    },
    {
        "Experiment": 2,
        "Pretrained Model": "T5-small",
        "Learning Rate": 5e-5,
        "Batch Size": 8,
        "Epochs": 4,
        "BLEU Score (Test)": 0.51,
        "Notes & Observations": "Longer training, more accurate, less echo"
    }
    # ... add more experiments here
]

# Convert to DataFrame
exp_df = pd.DataFrame(experiment_results)

# Save as Markdown
markdown_path = 'data/experiment_table.md'
with open(markdown_path, 'w') as f:
    f.write(exp_df.to_markdown(index=False))
print(f"Markdown table saved to {markdown_path}")

# Save as CSV
csv_path = 'data/experiment_table.csv'
exp_df.to_csv(csv_path, index=False)
print(f"CSV table saved to {csv_path}")


Markdown table saved to data/experiment_table.md
CSV table saved to data/experiment_table.csv


In [38]:
!git clone https://github.com/Irenee123/MenstrualHealth_Chatbot.git


Cloning into 'MenstrualHealth_Chatbot'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 35 (delta 9), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 79.32 KiB | 173.00 KiB/s, done.
Resolving deltas: 100% (9/9), done.


In [39]:
cd MenstrualHealth_Chatbot/


/content/MenstrualHealth_Chatbot/MenstrualHealth_Chatbot


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@c5de29ee0209.(none)')
fatal: could not read Username for 'https://github.com': No such device or address


In [65]:
!git config --global user.email "i.dusingizi@alustudent.com"
!git config --global user.name "Irenee123"


In [66]:
!git pull

Already up to date.


In [67]:
!git add .


In [68]:
!git commit -m "Add all data, models, sample_data, notebooks, and project files"


On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [77]:
!git push -u origin main
!git remote set-url origin https://Irenee123:token/Irenee123/https://github.com/Irenee123/MenstrualHealth_Chatbot.git




Enumerating objects: 17, done.
Counting objects:   5% (1/17)Counting objects:  11% (2/17)Counting objects:  17% (3/17)Counting objects:  23% (4/17)Counting objects:  29% (5/17)Counting objects:  35% (6/17)Counting objects:  41% (7/17)Counting objects:  47% (8/17)Counting objects:  52% (9/17)Counting objects:  58% (10/17)Counting objects:  64% (11/17)Counting objects:  70% (12/17)Counting objects:  76% (13/17)Counting objects:  82% (14/17)Counting objects:  88% (15/17)Counting objects:  94% (16/17)Counting objects: 100% (17/17)Counting objects: 100% (17/17), done.
Delta compression using up to 2 threads
Compressing objects:   7% (1/14)Compressing objects:  14% (2/14)Compressing objects:  21% (3/14)Compressing objects:  28% (4/14)Compressing objects:  35% (5/14)Compressing objects:  42% (6/14)Compressing objects:  50% (7/14)Compressing objects:  57% (8/14)Compressing objects:  64% (9/14)Compressing objects:  71% (10/14)Compressing objects:  78% (11/14)Compress

In [80]:
!git add model/


fatal: pathspec 'model/' did not match any files
