In [4]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "Mental_Health_FAQ.csv"

df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "narendrageek/mental-health-faq-for-chatbot",
  file_path,
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/narendrageek/mental-health-faq-for-chatbot?dataset_version_number=1&file_name=Mental_Health_FAQ.csv...


100%|██████████| 160k/160k [00:00<00:00, 3.44MB/s]

First 5 records:    Question_ID                                          Questions  \
0      1590140        What does it mean to have a mental illness?   
1      2110618                    Who does mental illness affect?   
2      6361820                        What causes mental illness?   
4      7657263            Can people with mental illness recover?   

                                             Answers  
0  Mental illnesses are health conditions that di...  
1  It is estimated that mental illness affects 1 ...  
2  It is estimated that mental illness affects 1 ...  
3  Symptoms of mental health disorders vary depen...  
4  When healing from mental illness, early identi...  





In [5]:
df.dropna(inplace=True)  # Remove incomplete entries

In [6]:
import re
df['Questions'] = df['Questions'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

In [10]:
from huggingface_hub import login
login(token="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")

In [11]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [13]:
print("Dataset columns:", df.columns)
print("\nSample entry:")
print(df.iloc[0])  # Show first Q&A pair

Dataset columns: Index(['Question_ID', 'Questions', 'Answers'], dtype='object')

Sample entry:
Question_ID                                              1590140
Questions             what does it mean to have a mental illness
Answers        Mental illnesses are health conditions that di...
Name: 0, dtype: object


In [14]:
import re

def clean_medical_text(text):
    # Remove non-alphanumeric characters except medical symbols
    text = re.sub(r'[^a-zA-Z0-9\s\-/()]', '', text)
    # Convert to lowercase except medical abbreviations
    text = text.lower()
    return text.strip()

df['Cleaned_Questions'] = df['Questions'].apply(clean_medical_text)
df['Cleaned_Answers'] = df['Answers'].apply(clean_medical_text)

In [15]:
# Create question-answer pairs
pairs = list(zip(df['Cleaned_Questions'], df['Cleaned_Answers']))

# Split into training (80%) and validation (20%)
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(pairs, test_size=0.2, random_state=42)

In [16]:
# First 3 training examples for demonstration
sample_questions = [pair[0] for pair in train_pairs[:3]]
sample_answers = [pair[1] for pair in train_pairs[:3]]

# Tokenize questions
question_encodings = tokenizer(
    sample_questions,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Tokenize answers
answer_encodings = tokenizer(
    sample_answers,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt"
)

print("\nTokenized Question Example:")
print(question_encodings['input_ids'][0])


Tokenized Question Example:
tensor([  101,  1187,  1169,   178,  1525,  2191, 18809,  1643,  3881,  1111,
         7560,   102,     0,     0,     0,     0,     0,     0,     0])


In [17]:
# Encode a test question
test_question = "What are symptoms of depression?"
inputs = tokenizer(test_question, return_tensors="pt")

# Get model output
with torch.no_grad():
    outputs = model(**inputs)

print("\nModel output shape:", outputs.last_hidden_state.shape)


Model output shape: torch.Size([1, 8, 768])


In [18]:
from torch.utils.data import Dataset

class MedicalQADataset(Dataset):
    def __init__(self, pairs, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.pairs = pairs
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        question, answer = self.pairs[idx]

        # Tokenize both question and answer
        inputs = self.tokenizer(
            question,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer(
            answer,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": targets["input_ids"].flatten()
        }

In [19]:
from torch.utils.data import DataLoader

# Create datasets
train_dataset = MedicalQADataset(train_pairs, tokenizer)
val_dataset = MedicalQADataset(val_pairs, tokenizer)

# Create data loaders
BATCH_SIZE = 8  # Reduced for Colab's GPU memory
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True  # Accelerates data transfer to GPU
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    pin_memory=True
)

In [20]:
import torch.nn as nn

class MedicalChatModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.bert = base_model
        self.classifier = nn.Linear(768, tokenizer.vocab_size)  # 768 = BioBERT hidden size

        # Initialize classifier weights
        nn.init.normal_(self.classifier.weight, std=0.02)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        sequence_output = outputs.last_hidden_state
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
            loss = loss_fct(logits.view(-1, tokenizer.vocab_size), labels.view(-1))

        return {"loss": loss, "logits": logits}

model = MedicalChatModel(model)  # Wrap our base model

In [21]:
from transformers import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Scheduler
scheduler = CosineAnnealingLR(optimizer, T_max=100)

# Enable GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



In [24]:
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()

  scaler = GradScaler()


In [25]:
model.bert.config.gradient_checkpointing = True

In [26]:
best_val_loss = float("inf")
patience = 2

In [27]:
from tqdm import tqdm

EPOCHS = 3  # Start small for testing
GRAD_CLIP = 1.0  # Prevent exploding gradients

for epoch in range(EPOCHS):
    # Training Phase
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} Training")

    for batch in progress_bar:
      optimizer.zero_grad()

      with autocast():
          inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
            }

          outputs = model(**inputs)
          loss = outputs["loss"]

      scaler.scale(loss).backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
      scaler.step(optimizer)
      scaler.update()
      scheduler.step()

    # Validation Phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
                "labels": batch["labels"].to(device)
            }

            outputs = model(**inputs)
            val_loss += outputs["loss"].item()

    print(f"\nEpoch {epoch+1} | "
          f"Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Val Loss: {val_loss/len(val_loader):.4f}\n")

  with autocast():
Epoch 1 Training: 100%|██████████| 10/10 [02:18<00:00, 13.88s/it]
Validating: 100%|██████████| 3/3 [00:10<00:00,  3.61s/it]



Epoch 1 | Train Loss: 0.0000 | Val Loss: 9.4404



Epoch 2 Training: 100%|██████████| 10/10 [02:16<00:00, 13.64s/it]
Validating: 100%|██████████| 3/3 [00:10<00:00,  3.63s/it]



Epoch 2 | Train Loss: 0.0000 | Val Loss: 9.3229



Epoch 3 Training: 100%|██████████| 10/10 [02:21<00:00, 14.14s/it]
Validating: 100%|██████████| 3/3 [00:10<00:00,  3.63s/it]


Epoch 3 | Train Loss: 0.0000 | Val Loss: 9.2455






In [28]:
import os

SAVE_PATH = "/content/medical_chatbot"
os.makedirs(SAVE_PATH, exist_ok=True)

# Save full model
torch.save(model.state_dict(), os.path.join(SAVE_PATH, "pytorch_model.bin"))
# Save tokenizer
tokenizer.save_pretrained(SAVE_PATH)

print(f"Model saved to {SAVE_PATH}")

Model saved to /content/medical_chatbot


In [31]:
import os

SAVE_PATH = "/content/medical_chatbot"
os.makedirs(SAVE_PATH, exist_ok=True)

# Save full model
torch.save(model.state_dict(), os.path.join(SAVE_PATH, "pytorch_model.bin"))

# Save tokenizer
tokenizer.save_pretrained(SAVE_PATH)

# Save the model config
model.bert.config.save_pretrained(SAVE_PATH) # Assuming 'model.bert' accesses the base model's config

print(f"Model saved to {SAVE_PATH}")

Model saved to /content/medical_chatbot


In [33]:
#For colab users
!pip install pyngrok --quiet

In [40]:
#For colab users
from pyngrok import ngrok

# Authenticate (only needed once per session)
!ngrok config add-authtoken 2uU6McoAccmroZOncSzMuTfYeiH_5YBg47TufHEzJAiUP82q4
# Using 'config add-authtoken' instead of just 'authtoken'

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [41]:
# Expose your Flask app's port
public_url = ngrok.connect(5000).public_url  # Flask's default port is 5000
print(f" * Public URL: {public_url}")

 * Public URL: https://01f7-34-42-55-0.ngrok-free.app


In [None]:
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModel, AutoConfig

app = Flask(__name__)

# Load saved model and tokenizer
SAVE_PATH = "/content/medical_chatbot"  # Define the save path
tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
config = AutoConfig.from_pretrained(SAVE_PATH)  # Load the config
base_model = AutoModel.from_config(config) # Create base model from config
model = MedicalChatModel(base_model) # Wrap with your custom class

# Load the state dictionary
model.load_state_dict(torch.load(os.path.join(SAVE_PATH, "pytorch_model.bin"), map_location=device))
model.eval()  # Set model to evaluation mode

# Move model to CPU or GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


@app.route('/chat', methods=['POST'])
def chat():
    user_input = request.json['query']

    # Tokenize user input
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        max_length=128,
        padding="max_length",
        truncation=True
    ).to(device)

    # Generate response
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs["logits"]

    # Convert logits to text
    response_ids = torch.argmax(logits, dim=-1)
    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)

    return jsonify({"response": response})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000, debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
