In [1]:
# Install required libraries (Colab)
!pip install pandas scikit-learn matplotlib seaborn numpy transformers tensorflow




Upload & Load Datasets

In [3]:
import pandas as pd
from google.colab import files



# Load datasets
df_patient = pd.read_csv("/content/patient_input_2000.csv")
df_treatment_en = pd.read_csv("/content/skin_diagnosis_en_10000.csv")
df_treatment_si = pd.read_csv("/content/skin_diagnosis_si_10000.csv")
df_chatbot = pd.read_csv("/content/chatbot_dataset_bilingual.csv")
df_monitor = pd.read_csv("/content/patient_monitoring_2000.csv")

# Quick check
print(df_patient.head())
print(df_treatment_en.head())
print(df_treatment_si.head())
print(df_chatbot.head())
print(df_monitor.head())

  patient_id      name  age gender           symptoms_text symptom_duration  \
0      P0001    Ruwani   29      F           White patches          28 days   
1      P0002   Shanika   48      F  Circular itchy patches           6 days   
2      P0003  Sewwandi   31      F  Circular itchy patches          29 days   
3      P0004     Sahan   39      F  Circular itchy patches          28 days   
4      P0005   Kavindi   60      M              Dark spots          14 days   

  image_provided location previous_conditions   label_disease  
0             No    Kandy            Diabetes            Acne  
1             No   Jaffna                 NaN    Pigmentation  
2            Yes   Jaffna            Diabetes  Herpes Simplex  
3             No  Colombo                 NaN       Psoriasis  
4            Yes   Matara             Obesity       Psoriasis  
   id                                        description  \
0   1  Red itchy thick plaques on my knees for a long...   
1   2  Several small 

Preprocess Symptoms & Encode Labels

In [4]:
from sklearn.preprocessing import LabelEncoder

# Lowercase symptoms
df_patient['symptoms_text'] = df_patient['symptoms_text'].str.lower()

# Encode disease labels
le = LabelEncoder()
df_patient['label_encoded'] = le.fit_transform(df_patient['label_disease'])

print(df_patient[['symptoms_text','label_disease','label_encoded']].head())


            symptoms_text   label_disease  label_encoded
0           white patches            Acne              0
1  circular itchy patches    Pigmentation              9
2  circular itchy patches  Herpes Simplex              6
3  circular itchy patches       Psoriasis             10
4              dark spots       Psoriasis             10


Split Data

In [5]:
from sklearn.model_selection import train_test_split

X = df_patient['symptoms_text']   # Features: symptoms text
y = df_patient['label_encoded']    # Target: disease labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Pipeline
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LinearSVC())
])

# Train model
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))



Accuracy: 0.0875
                    precision    recall  f1-score   support

              Acne       0.10      0.09      0.09        34
Contact Dermatitis       0.09      0.09      0.09        33
          Dandruff       0.06      0.09      0.07        22
          Dry Skin       0.00      0.00      0.00        42
            Eczema       0.00      0.00      0.00        29
  Fungal Infection       0.15      0.08      0.11        37
    Herpes Simplex       0.11      0.27      0.16        26
             Hives       0.06      0.06      0.06        32
 Keratosis Pilaris       0.08      0.16      0.10        31
      Pigmentation       0.04      0.03      0.03        32
         Psoriasis       0.10      0.26      0.14        23
           Sunburn       0.08      0.09      0.08        35
          Vitiligo       0.00      0.00      0.00        24

          accuracy                           0.09       400
         macro avg       0.07      0.09      0.07       400
      weighted avg   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Multilingual BERT Model

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_) )



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class BertDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label)
        }

# Create datasets with reset indices
train_dataset = BertDataset(X_train.reset_index(drop=True), y_train.reset_index(drop=True))
test_dataset = BertDataset(X_test.reset_index(drop=True), y_test.reset_index(drop=True))

# ===============================
# STEP 5 — DataLoader
# ===============================
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

# ===============================
# STEP 6 — Training Loop
# ===============================
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

# ===============================
# STEP 7 — Prediction Function
# ===============================
def predict_disease_bert(text):
    model.eval()
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors="pt"
    )

    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        output = model(**encoding)

    pred_label = torch.argmax(output.logits).item()
    disease = le.inverse_transform([pred_label])[0]
    return disease

print("Done — BERT model ready!")

Epoch 1 Loss: 515.2860
Epoch 2 Loss: 514.2223
Epoch 3 Loss: 514.1580
Done — BERT model ready!


Treatment Suggestion

In [9]:
def get_treatment_dataset(language='en'):
    if language.lower() == 'en':
        return df_treatment_en
    else:
        return df_treatment_si

In [15]:
def suggest_treatment(user_input, language='en'):
    treatment_df = get_treatment_dataset(language)

    # Predict disease using the BERT prediction function
    pred_label = predict_disease_bert(user_input)

    # Fetch treatment info using 'label' column
    filtered_df = treatment_df[treatment_df['label'] == pred_label]

    if filtered_df.empty:
        # Handle cases where the predicted label doesn't have a direct treatment entry
        return {
            "diagnosis": pred_label,
            "treatment": "No specific treatment found for this diagnosis.",
            "home_remedies": "Consult a doctor for further advice.",
            "cause": "Unknown",
            "doctor_advice": "Seek professional medical advice."
        }
    else:
        treat_info = filtered_df.iloc[0] # Take the first matching entry

        return {
            "diagnosis": pred_label,
            "treatment": treat_info["treatment"], # Use 'treatment' column
            "home_remedies": "N/A (No specific home remedies listed in dataset)", # Placeholder
            "cause": "N/A (Cause explanation not available in dataset)", # Placeholder
            "doctor_advice": treat_info["doctor_advice"] # Use 'doctor_advice' column
        }

Monitoring / Follow-Up

In [11]:
def check_followup(patient_id):
    records = df_monitor[df_monitor['patient_id']==patient_id]
    if records.empty:
        print("No monitoring data available for this patient.")
    else:
        for _, row in records.iterrows():
            print(f"Disease: {row['disease']}, Follow-up: {row['follow_up_date']}, Improvement: {row['improvement_level']}, Doctor visit: {row['doctor_visit_recommended']}")


Dataset-Driven Chatbot


In [12]:
def get_bot_response(intent, language='en'):
    """
    Fetch bot response from chatbot dataset based on intent and language
    """
    subset = df_chatbot[(df_chatbot['intent']==intent) & (df_chatbot['language']==language)]
    if not subset.empty:
        return subset['bot_response_example'].values[0]
    else:
        return "No response available."

Run Chatbot Session

In [16]:
# Choose language
language = input("Choose language (en/si): ").lower()

# Ask Name
name_question = get_bot_response("ask_name", language)
name = input(name_question + " ")

# Ask Age
age_question = get_bot_response("ask_age", language)
age = input(age_question + " ")

# Ask Symptoms
symptoms_question = get_bot_response("ask_symptoms", language)
symptoms = input(symptoms_question + " ")

# Predict Disease & Suggest Treatment
result = suggest_treatment(symptoms, language)

# Show Treatment
treatment_response = get_bot_response("treatment_suggestion", language)
print(f"\n{treatment_response}")
print(f"Diagnosis: {result['diagnosis']}")
print(f"OTC Treatment: {result['treatment']}")
print(f"Home Remedies: {result['home_remedies']}")
print(f"Cause: {result['cause']}")
print(f"Doctor Advice: {result['doctor_advice']}")

# Ask if condition is severe
severe = input("Is your condition severe? (yes/no) " if language=='en' else "ඔබේ තත්ත්වය දැඩියිද? (ඔව්/නැත) ").lower()
if severe in ['yes','ඔව්']:
    doctor_response = get_bot_response("doctor_referral", language)
    print(doctor_response)

# Thank You
thank_response = get_bot_response("thank_you", language)
print(thank_response)

# Optional: Patient Monitoring
patient_id = input("\nEnter your patient ID to see monitoring info (or skip): ")
if patient_id.strip() != "":
    check_followup(patient_id)

Choose language (en/si): en
Hello! What is your name? ruwani
Please enter your age. 29
Thanks, can you describe the severity? i have itchy with fever

Based on your symptoms, use the suggested OTC treatment and home remedies.
Diagnosis: Herpes Simplex
OTC Treatment: No specific treatment found for this diagnosis.
Home Remedies: Consult a doctor for further advice.
Cause: Unknown
Doctor Advice: Seek professional medical advice.
Is your condition severe? (yes/no) no
You’re welcome! Take care.

Enter your patient ID to see monitoring info (or skip): p0001
No monitoring data available for this patient.
