<a href="https://colab.research.google.com/github/NayabNoor01/Symptom-to-Disease-Diagnosis-and-Medicine-Recommendation-System/blob/main/Stage_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1 - attach Google Drive so the previously saved model folder is reachable.
from google.colab import drive

drive.mount('/content/drive')

# Step 2 - restore the saved classifier together with its tokenizer.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Both artefacts are read from the same Drive folder.
model_path = "/content/drive/MyDrive/saved_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 3. Upload dataset from laptop
from google.colab import files
import pandas as pd

# The upload result is a mapping keyed by file name; guard against the case
# where nothing was selected so the failure is explicit rather than an
# IndexError on an empty key list.
uploaded = files.upload()  # Choose your Excel file
if not uploaded:
    raise ValueError("No file was uploaded; please choose the dataset Excel file.")

# Only the first uploaded file is used as the dataset.
uploaded_name = next(iter(uploaded))
dataset = pd.read_excel(uploaded_name)

# 4. Prepare multi-label encoding
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# Each row's target set is the pair [Cause, Disease] taken from that row.
dataset['Cause_Disease'] = dataset.apply(lambda row: [row['Cause'], row['Disease']], axis=1)

# NOTE: the binarizer is re-fitted on this file, so its column order is derived
# from the labels found here, not from the training run. Ideally the classes
# used during training (`multilabel.classes_`) should be saved and reloaded
# instead of being re-derived.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(dataset['Cause_Disease']).astype('float32')

# Fail early with a clear message when the label space of this file cannot
# match the classifier head, instead of a shape error during scoring.
# (An equal count does not prove the column order matches - see NOTE above.)
num_model_labels = model.config.num_labels
if labels.shape[1] != num_model_labels:
    raise ValueError(
        f"Dataset yields {labels.shape[1]} distinct labels but the loaded model "
        f"predicts {num_model_labels}; the label encoding does not match training."
    )

# 5. Tokenize symptoms
# The saved tokenizer carries no predefined maximum length, so truncation=True
# alone is ignored ("Default to no truncation"). Pass an explicit limit:
# the model's position-embedding size when the config exposes it, capped at 512.
max_length = min(getattr(model.config, "max_position_embeddings", 512), 512)
inputs = tokenizer(
    dataset['Symptom'].tolist(),
    padding=True,          # pad to the longest sequence in the batch
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)

# 6. Predictions
# Switch the model to evaluation mode and run one forward pass without
# tracking gradients.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Per-label sigmoid probabilities; a label is predicted when its
# probability exceeds 0.5.
probs = outputs.logits.sigmoid()
preds = probs.gt(0.5).int()

# 7. Evaluate
from sklearn.metrics import accuracy_score, f1_score

# Score the thresholded predictions against the binarized targets:
# accuracy over the full label rows, and F1 averaged per sample.
acc = accuracy_score(y_true=labels, y_pred=preds)
f1 = f1_score(y_true=labels, y_pred=preds, average="samples")

# Report both scores as percentages with two decimals.
print(f"Accuracy: {acc*100:.2f}%")
print(f"F1 Score: {f1*100:.2f}%")


Mounted at /content/drive


Saving dataset.xlsx to dataset.xlsx


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Accuracy: 99.60%
F1 Score: 99.60%


In [None]:
# Automatically detect disease classes from your dataset
disease_classes = dataset['Disease'].unique().tolist()
disease_class_set = set(disease_classes)  # constant-time membership tests below

# Convert predictions back to label names (one tuple of names per row)
predicted_labels = mlb.inverse_transform(preds.numpy())

# Extract only disease labels.
# The loop names are chosen so they do not overwrite the ground-truth `labels`
# matrix built in the evaluation cell.
disease_labels_only = [
    [name for name in row_labels if name in disease_class_set]
    for row_labels in predicted_labels
]

# Add predictions to dataframe
dataset["Predicted_Disease"] = disease_labels_only

# Save to Excel if needed
# NOTE(review): each cell of Predicted_Disease is a Python list; it is
# presumably stored in the workbook in its text form - confirm before relying
# on the round-trip.
dataset.to_excel("predicted_diseases.xlsx", index=False)
from google.colab import files
files.download("predicted_diseases.xlsx")

print("Saved with predicted diseases only.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved with predicted diseases only.


In [None]:
# Reload the workbook written by the export cell ("predicted_diseases.xlsx").
# NOTE(review): Predicted_Disease held Python lists before the export; after
# this round-trip the column presumably contains their text form - confirm.
dataset=pd.read_excel("predicted_diseases.xlsx")

## **Label mapping**

In [None]:
# Give every distinct Medicine value an integer id, numbered in the order
# the values are returned by unique().
medicine_values = dataset["Medicine"].unique()
label_mapping = dict(zip(medicine_values, range(len(medicine_values))))

# Integer target column used for classification.
dataset["label"] = dataset["Medicine"].map(label_mapping)

# Show the mapping and a small sample for a quick visual check.
print('Label mapping >> ', label_mapping)
print(dataset[["Medicine", 'label']].head())

Label mapping >>  {"['Antihistamines (e.g., Loratadine)', 'Decongestants (e.g., Pseudoephedrine)', 'Epinephrine auto-injectors', 'Corticosteroids', 'Immunotherapy (allergy shots)']": 0, "['Acetaminophen','Corticosteroids (prescription anti-inflammatory medicine, including cortisone shots)','Disease-modifying antirheumatic drugs (DMARDs)']": 1, "['Inhaled corticosteroids (e.g., Fluticasone)', 'Beta-agonists (e.g., Albuterol)', 'Leukotriene modifiers (e.g., Montelukast)', 'Anticholinergics (e.g., Ipratropium)', 'Omalizumab']": 2, "['Acetaminophen','Avoid(e.g.,Aspirin,Ibuprofen)']": 3, "['Aspirin', 'Nitroglycerin', 'Beta-blockers (e.g., Metoprolol)', 'ACE inhibitors', 'Thrombolytics or PCI (percutaneous coronary intervention)']": 4, "['Triptans (5-hydroxytryptamine)','Ditans (lasmiditan)','Gepants (rimegepant and ubrogepant)','Dihydroergotamine (prochlorperazine)','Antiemetic medications (metoclopramide)']": 5, "['Amoxicillin','Tinidazole','Rabeprazole','Sucralfate']": 6, "['Antibiotics (

In [None]:
from datasets import Dataset

## **Converting to a Hugging Face Dataset**

In [None]:
# Build a Hugging Face Dataset from the two columns needed for training.
# A separate name is used so that `dataset` remains the pandas DataFrame:
# rebinding it would make this cell (and the label-mapping cell) fail when
# re-run, because they index `dataset` like a DataFrame.
hf_dataset = Dataset.from_pandas(dataset[["Predicted_Disease", 'label']])

# 80/20 train/evaluation split with a fixed seed for reproducibility.
split_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']


## **Tokenization**

In [None]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

# This tokenizer has no predefined maximum length, so padding='max_length' and
# truncation=True were both silently ignored. An explicit limit makes
# truncation effective; 512 is the usual BERT input limit.
MAX_LENGTH = 512

def tokenize_function(example):
  """Tokenize the "Predicted_Disease" text of a batch of examples.

  Sequences longer than MAX_LENGTH tokens are truncated. No padding is applied
  here (as was effectively the case before); batches are expected to be padded
  later by the data collator used for training.
  """
  return tokenizer(example["Predicted_Disease"],truncation=True,max_length=MAX_LENGTH)

train_dataset=train_dataset.map(tokenize_function,batched=True)
eval_dataset=eval_dataset.map(tokenize_function,batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## **Metric**

In [None]:
!pip install -q evaluate
import evaluate
import numpy as np
accuracy_metric=evaluate.load('accuracy')
f1_metric=evaluate.load('f1')
def compute_metric(eval_pred):
  """Compute accuracy and weighted F1 from an evaluation result.

  `eval_pred` unpacks into (logits, labels). The predicted class of each
  example is the arg-max over the last axis of the logits.

  Returns a dict with the keys 'accuracy' and 'f1'.
  """
  logits, references = eval_pred
  predicted_classes = np.argmax(logits, axis=-1)

  accuracy_result = accuracy_metric.compute(
      predictions=predicted_classes, references=references
  )
  f1_result = f1_metric.compute(
      predictions=predicted_classes, references=references, average='weighted'
  )
  return {'accuracy': accuracy_result['accuracy'], 'f1': f1_result['f1']}

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## **Load Model**

In [None]:
from transformers import AutoModelForSequenceClassification
num_labels=len(label_mapping)

# Store the class-index <-> medicine mapping in the model config so it is
# saved with the checkpoint; otherwise it exists only in this notebook's
# memory and a reloaded model could not translate predicted indices back
# into medicines.
id2label={idx:str(label) for label,idx in label_mapping.items()}
label2id={name:idx for idx,name in id2label.items()}

# A fresh classification head with `num_labels` outputs is placed on top of
# the pretrained BioBERT encoder.
model=AutoModelForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Training Setup**

In [None]:
from transformers import TrainingArguments,Trainer

# Settings for the fine-tuning run: 2 epochs, evaluation and checkpointing
# once per epoch, loss logged every 10 steps, no external reporting.
training_args=TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    report_to='none',
)

# The tokenizer is passed as `processing_class`; the `tokenizer` keyword of
# Trainer is deprecated and triggers a warning on this transformers version.
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metric,
)


  trainer=Trainer(


## **Train Model**

In [None]:
# Run the fine-tuning loop with the settings held by `trainer`; the returned
# training summary is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.2357,2.0307,0.33,0.187657
2,1.0114,0.635522,0.98,0.980045


TrainOutput(global_step=50, training_loss=1.8505613136291503, metrics={'train_runtime': 273.9359, 'train_samples_per_second': 2.92, 'train_steps_per_second': 0.183, 'total_flos': 5583288717888.0, 'train_loss': 1.8505613136291503, 'epoch': 2.0})

In [None]:
# Make sure Google Drive is mounted at /content/drive before writing to it
from google.colab import drive
drive.mount('/content/drive')

# Write the trained model and its tokenizer to the Drive folder "stage_2"
# so both can be reloaded later from the same path
save_path = "/content/drive/MyDrive/stage_2"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/stage_2/tokenizer_config.json',
 '/content/drive/MyDrive/stage_2/special_tokens_map.json',
 '/content/drive/MyDrive/stage_2/vocab.txt',
 '/content/drive/MyDrive/stage_2/added_tokens.json',
 '/content/drive/MyDrive/stage_2/tokenizer.json')

## **Evaluate Model**

In [None]:
# Score the trained model on the evaluation split and list every returned
# metric, rounded to two decimals.
eval_results = trainer.evaluate()

print('\n***Evaluation Result***')
for metric_name, metric_value in eval_results.items():
  print(f'{metric_name} : {metric_value:.2f}')


***Evaluation Result***
eval_loss : 0.64
eval_accuracy : 0.98
eval_f1 : 0.98
eval_runtime : 5.80
eval_samples_per_second : 17.23
eval_steps_per_second : 0.34
epoch : 2.00
