In [13]:
import os
import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Read the preprocessed label / symptom table from the working directory.
data = pd.read_csv("preprocessed_dataset.csv")

# Verification: overall size, the five most and five least frequent labels, and a preview.
label_counts = data["label"].value_counts()
print(f"Dataset shape: {data.shape}")
print("Class distribution:\n", label_counts.head(5), "...", label_counts.tail(5))
print("Sample rows:\n", data.head())

Dataset shape: (477, 2)
Class distribution:
 label
Dengue         48
Typhoid        43
Chicken pox    36
allergy        28
Psoriasis      25
Name: count, dtype: int64 ... label
Hypertension             10
Dimorphic Hemorrhoids     9
Migraine                  7
Acne                      5
Arthritis                 4
Name: count, dtype: int64
Sample rows:
        label                      extracted_symptoms
0  Psoriasis  dry scaly patches itchy rash skin rash
1  Psoriasis              peeling stinging sensation
2  Psoriasis                         joint pain pain
3  Psoriasis                     silver like dusting
4  Psoriasis                                     NaN


In [3]:
# Location of the pre-trained BioWordVec binary. The original machine-specific
# path stays as the default so existing runs are unaffected; set the
# BIOWORDVEC_PATH environment variable to point at the file on another machine.
BIOWORDVEC_PATH = os.environ.get(
    "BIOWORDVEC_PATH",
    r"C:\Users\ACER\Downloads\BioWordVec_PubMed_MIMICIII_d200.vec.bin",
)

# Load the word2vec-format binary into memory and sanity-check the vocabulary.
bio_word_vec = KeyedVectors.load_word2vec_format(BIOWORDVEC_PATH, binary=True)
print(f"Loaded embeddings with {len(bio_word_vec)} terms")
print("Sample check: 'fever' in vocab?", "fever" in bio_word_vec)

Loaded embeddings with 16545452 terms
Sample check: 'fever' in vocab? True


In [4]:
def symptoms_to_embedding(symptoms, model):
    """Embed a whitespace-separated symptom string as one averaged word vector.

    Parameters
    ----------
    symptoms : str or any
        Space-separated symptom words. Anything that is not a string
        (``NaN``, ``None``, lists, arrays, ...) is treated as "no symptoms".
    model : mapping-like
        Word-vector lookup supporting ``word in model`` and ``model[word]``.
        If it exposes ``vector_size`` that dimension is used for the
        fallback vector; otherwise 200 is assumed.

    Returns
    -------
    numpy.ndarray
        1-D float64 vector: the unweighted mean of the vectors of all
        in-vocabulary words, or all zeros when there are none.
    """
    dim = getattr(model, "vector_size", 200)

    # A plain isinstance check also covers NaN/None, and unlike
    # `pd.isna(x) or ...` it does not raise on list/array input.
    if not isinstance(symptoms, str):
        return np.zeros(dim)

    # Every word carries the same weight, so the result is a simple mean.
    word_vecs = [model[word] for word in symptoms.split() if word in model]
    if not word_vecs:
        return np.zeros(dim)

    # Accumulate in float64 so the result dtype matches the zero fallback.
    return np.mean(np.asarray(word_vecs, dtype=np.float64), axis=0)

# Embed each row's symptom string; rows with missing symptoms come back as all-zero vectors.
data["symptom_embedding"] = data["extracted_symptoms"].apply(symptoms_to_embedding, model=bio_word_vec)

# Stack the per-row vectors into one 2-D feature matrix and pull out the targets.
X = np.vstack(data["symptom_embedding"].values)
y = data["label"].values

missing_symptom_rows = data["extracted_symptoms"].isna().sum()
print(f"X shape: {X.shape}")
print("Sample embedding (first 5 dims):", X[0][:5])
print("Rows with empty symptoms:", missing_symptom_rows)

X shape: (477, 200)
Sample embedding (first 5 dims): [ 0.07757777 -0.00226628 -0.133824    0.61743713 -0.35856715]
Rows with empty symptoms: 20


In [5]:
# Oversample the embedding matrix with SMOTE (seeded, 3 nearest neighbours).
# NOTE(review): resampling is applied to the full dataset before any
# train/test split, so synthetic rows may be interpolated from rows that
# later land in a test split - confirm this is intended.
smote = SMOTE(k_neighbors=3, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

resampled_counts = pd.Series(y_resampled).value_counts()
print(f"Augmented X shape: {X_resampled.shape}")
print("Augmented class distribution:\n", resampled_counts)

[WinError 2] The system cannot find the file specified
  File "c:\Users\ACER\gitClones\DoctorSathi\env\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\ACER\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ACER\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\ACER\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Augmented X shape: (1152, 200)
Augmented class distribution:
 Psoriasis                          48
Varicose Veins                     48
peptic ulcer disease               48
drug reaction                      48
gastroesophageal reflux disease    48
allergy                            48
urinary tract infection            48
Malaria                            48
Jaundice                           48
Cervical spondylosis               48
Migraine                           48
Hypertension                       48
Bronchial Asthma                   48
Acne                               48
Arthritis                          48
Dimorphic Hemorrhoids              48
Pneumonia                          48
Common Cold                        48
Fungal infection                   48
Dengue                             48
Impetigo                           48
Chicken pox                        48
Typhoid                            48
diabetes                           48
Name: count, dtype: int64


In [6]:
# Use your full 141-term symptom list (simplified here)
known_symptoms = [  # Replace with full list
    "skin rash",
    "joint pain",
    "fever",
    "dry scaly patches",
]
single_word_symptoms = [
    "fever",
    "nausea",
    "pain",
    "rash",
]

# Disease-specific symptom priors (example subset), keyed by disease label.
disease_symptom_priors = {
    "Psoriasis": [
        "skin rash",
        "dry scaly patches",
        "joint pain",
    ],
    "Dengue": [
        "fever",
        "rash",
        "headache",
    ],
    # Add for all 24 diseases
}

def embedding_to_symptoms(embedding, label, model, known_symptoms, top_n=4,
                          prior_boost=1.2, min_similarity=0.3):
    """Decode an embedding into the symptom phrases it is most similar to.

    Parameters
    ----------
    embedding : array-like
        1-D vector in the same space as the word vectors of ``model``.
    label : str
        Disease label; symptoms listed for it in the module-level
        ``disease_symptom_priors`` have their similarity multiplied by
        ``prior_boost``.
    model : object
        Word-vector model passed to ``symptoms_to_embedding`` and providing
        ``cosine_similarities(vector, vectors)``.
    known_symptoms : list of str
        Candidate symptom phrases; the module-level ``single_word_symptoms``
        are appended to them.
    top_n : int, default 4
        Maximum number of symptoms to return.
    prior_boost : float, default 1.2
        Multiplier applied to the similarity of prior symptoms.
    min_similarity : float, default 0.3
        Symptoms must score strictly above this value to be returned.

    Returns
    -------
    str
        Selected symptom phrases, best first, joined by single spaces
        (empty when nothing qualifies).
    """
    # An all-zero embedding has zero norm, so every cosine similarity would be
    # 0/0 = NaN (with a RuntimeWarning) and nothing could pass the threshold.
    if not np.any(embedding):
        return ""

    # De-duplicate candidates while keeping first-seen order.
    candidates = dict.fromkeys(known_symptoms + single_word_symptoms)

    similarities = {}
    for symptom in candidates:
        # Embed each candidate once; skip those with no in-vocabulary words.
        vector = symptoms_to_embedding(symptom, model)
        if vector.any():
            similarities[symptom] = model.cosine_similarities(embedding, [vector])[0]

    # Boost disease-specific symptoms.
    for symptom in disease_symptom_priors.get(label, ()):
        if symptom in similarities:
            similarities[symptom] *= prior_boost

    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return " ".join(symptom for symptom, score in ranked if score > min_similarity)

# Decode every resampled embedding back into symptom text, using its label for the priors.
decoded_symptoms = [
    embedding_to_symptoms(emb, lab, bio_word_vec, known_symptoms)
    for emb, lab in zip(X_resampled, y_resampled)
]

# One row per resampled sample: label, raw embedding vector, decoded text.
augmented_data = pd.DataFrame(
    {
        "label": y_resampled,
        "symptom_embedding": list(X_resampled),
        "extracted_symptoms": decoded_symptoms,
    }
)

# Distinct whitespace-separated tokens across all decoded symptom strings.
unique_tokens = set(" ".join(augmented_data["extracted_symptoms"]).split())
print(f"Augmented dataset shape: {augmented_data.shape}")
print("Sample augmented rows:\n", augmented_data.head())
print("Unique symptoms in augmented data:", len(unique_tokens))

  similarities = dot_products / (norm * all_norms)


Augmented dataset shape: (1152, 3)
Sample augmented rows:
        label                                  symptom_embedding  \
0  Psoriasis  [0.07757776949022498, -0.0022662838122674395, ...   
1  Psoriasis  [-0.13429934158921242, 0.1195533275604248, -0....   
2  Psoriasis  [-0.1081300030152003, 0.4614596540729205, -0.2...   
3  Psoriasis  [-0.0423600027958552, 0.004344666997591655, -0...   
4  Psoriasis  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                            extracted_symptoms  
0  skin rash dry scaly patches rash joint pain  
1  dry scaly patches skin rash joint pain rash  
2             joint pain pain nausea skin rash  
3  dry scaly patches skin rash joint pain rash  
4                                               
Unique symptoms in augmented data: 9


In [7]:
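# Disabled experiment: TF-IDF (1-2 gram) features built from the decoded symptom text, fed to a random forest.
# tfidf = TfidfVectorizer(ngram_range=(1, 2))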
# X = tfidf.fit_transform(augmented_data["extracted_symptoms"])
# y = augmented_data["label"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# model = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=20, min_samples_split=5)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# f1 = f1_score(y_test, y_pred, average="weighted")

# print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
# print(f"Vocabulary size: {X.shape[1]}")
# print(f"Weighted F1-Score: {f1}")

In [8]:
# Features are the stored embedding vectors; targets are the disease labels.
X = np.vstack(augmented_data["symptom_embedding"].values)
y = augmented_data["label"]

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a random forest and score it on the held-out rows.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"Weighted F1-Score (embeddings): {f1}")

X_train shape: (921, 200), X_test shape: (231, 200)
Weighted F1-Score (embeddings): 0.7574164176327338


In [9]:
# Map disease names to integer codes for XGBoost compatibility.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_resampled)

# The resampled matrix already holds embeddings, so it is used as-is.
X_final = X_resampled

n_classes = len(np.unique(y_encoded))
print(f"X_final shape: {X_final.shape}")
print("Number of unique classes:", n_classes)
print("Sample encoded label:", y_encoded[0], "->", y_resampled[0])

X_final shape: (1152, 200)
Number of unique classes: 24
Sample encoded label: 15 -> Psoriasis


In [14]:
# Hold out 20% of the resampled, integer-encoded data (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_encoded, random_state=42, test_size=0.2
)

# Hyper-parameter values to search: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# 5-fold grid search on the training split, scored by weighted F1, using all cores.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the refit best estimator on the held-out split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f"Best parameters: {grid_search.best_params_}")
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"Weighted F1-Score (optimized RF): {f1}")
print("Classification report:\n", report)

Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
X_train shape: (921, 200), X_test shape: (231, 200)
Weighted F1-Score (optimized RF): 0.7302483033678611
Classification report:
                                  precision    recall  f1-score   support

                           Acne       0.50      0.80      0.62         5
                      Arthritis       0.81      0.93      0.87        14
               Bronchial Asthma       1.00      0.64      0.78        11
           Cervical spondylosis       0.89      0.73      0.80        11
                    Chicken pox       0.70      0.64      0.67        11
                    Common Cold       0.71      0.71      0.71         7
                         Dengue       0.14      0.14      0.14         7
          Dimorphic Hemorrhoids       0.73      0.85      0.79        13
               Fungal infection       0.67      0.50      0.57         8
                   Hypertension       1.00      1.00      1.

In [18]:
# Fit XGBoost on the current training split and score the held-out rows.
xgb = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")

# 5-fold cross-validation of the same estimator over the whole resampled set.
cv_scores = cross_val_score(
    xgb,
    X_final,
    y_encoded,
    scoring="f1_weighted",
    cv=5,
)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"Weighted F1-Score (XGBoost): {f1}")
print(f"Cross-validated F1-Score: {cv_scores.mean()} (± {cv_scores.std()})")


X_train shape: (921, 200), X_test shape: (231, 200)
Weighted F1-Score (XGBoost): 0.7270171075509957
Cross-validated F1-Score: 0.7504655447896187 (± 0.1025140321845226)


In [17]:
# Boolean mask of held-out rows whose prediction differs from the true label.
is_wrong = y_pred != y_test

# Feature rows and decoded (string) labels for just the misclassified rows.
misclassified = X_test[is_wrong]
true_labels = label_encoder.inverse_transform(y_test[is_wrong])
pred_labels = label_encoder.inverse_transform(y_pred[is_wrong])

# Count how often each (true, predicted) pair occurs.
misclass_df = pd.DataFrame({"True": true_labels, "Predicted": pred_labels})
pair_counts = misclass_df.groupby(["True", "Predicted"]).size()
print("Misclassification summary:\n", pair_counts)

Misclassification summary:
 True                             Predicted                      
Arthritis                        Migraine                           1
Bronchial Asthma                 Arthritis                          1
                                 Common Cold                        1
Cervical spondylosis             Arthritis                          1
                                 Bronchial Asthma                   1
                                 Migraine                           1
Chicken pox                      Dengue                             1
                                 Fungal infection                   2
                                 Impetigo                           1
                                 Pneumonia                          1
Common Cold                      Varicose Veins                     1
                                 diabetes                           1
Dengue                           Chicken pox                       