In [6]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"  # Add this at the top

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
data = pd.read_csv("preprocessed_dataset.csv")
data["extracted_symptoms"] = data["extracted_symptoms"].fillna("")  # Replace NaN with empty string
data = data[data["extracted_symptoms"] != ""]  # Filter out empty strings

print(f"Cleaned dataset shape: {data.shape}")
print("Class distribution:\n", data["label"].value_counts().head(5), "...", data["label"].value_counts().tail(5))
print("Rows with empty symptoms after cleaning:", data["extracted_symptoms"].isna().sum())

Cleaned dataset shape: (457, 2)
Class distribution:
 label
Dengue         47
Typhoid        43
Chicken pox    35
allergy        27
Jaundice       25
Name: count, dtype: int64 ... label
Hypertension             9
Dimorphic Hemorrhoids    8
Migraine                 6
Acne                     4
Arthritis                3
Name: count, dtype: int64
Rows with empty symptoms after cleaning: 0


In [3]:
bio_word_vec = KeyedVectors.load_word2vec_format(r"C:\Users\ACER\Downloads\BioWordVec_PubMed_MIMICIII_d200.vec.bin", binary=True)
print(f"Loaded embeddings with {len(bio_word_vec)} terms")

Loaded embeddings with 16545452 terms


In [4]:
def symptoms_to_embedding(symptoms, model):
    if pd.isna(symptoms) or not isinstance(symptoms, str):
        return np.zeros(200)
    symptom_list = symptoms.split()
    vectors = []
    for symptom in symptom_list:
        words = symptom.split()
        word_vecs = [model[word] for word in words if word in model]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
    return np.mean(vectors, axis=0) if vectors else np.zeros(200)

data["symptom_embedding"] = data["extracted_symptoms"].apply(lambda x: symptoms_to_embedding(x, bio_word_vec))
X = np.vstack(data["symptom_embedding"].values)
y = data["label"].values

print(f"X shape: {X.shape}")

X shape: (457, 200)


In [7]:
smote = SMOTE(random_state=42, k_neighbors=2)  # Reduced from 3
X_resampled, y_resampled = smote.fit_resample(X, y)
print(f"Augmented X shape: {X_resampled.shape}")
print("Augmented class distribution:\n", pd.Series(y_resampled).value_counts())

Augmented X shape: (1128, 200)
Augmented class distribution:
 Psoriasis                          47
Varicose Veins                     47
peptic ulcer disease               47
drug reaction                      47
gastroesophageal reflux disease    47
allergy                            47
urinary tract infection            47
Malaria                            47
Jaundice                           47
Cervical spondylosis               47
Migraine                           47
Hypertension                       47
Bronchial Asthma                   47
Acne                               47
Arthritis                          47
Dimorphic Hemorrhoids              47
Pneumonia                          47
Common Cold                        47
Fungal infection                   47
Dengue                             47
Impetigo                           47
Chicken pox                        47
Typhoid                            47
diabetes                           47
Name: count, dtype: int64


In [8]:
import pandas as pd
import numpy as np
from collections import Counter

# Load dataset
data = pd.read_csv("preprocessed_dataset.csv")
data["extracted_symptoms"] = data["extracted_symptoms"].fillna("")  # Handle NaN

# Step 1: Extract known_symptoms
all_symptoms = " ".join(data["extracted_symptoms"]).split()
known_symptoms = sorted(set(all_symptoms) - {""})  # Remove empty strings and deduplicate
print(f"Number of unique symptoms: {len(known_symptoms)}")
print("Sample known_symptoms:", known_symptoms[:10])

# Step 2: Extract disease_symptom_priors
disease_symptom_priors = {}
for disease in data["label"].unique():
    # Get symptoms for this disease
    disease_rows = data[data["label"] == disease]["extracted_symptoms"]
    symptom_list = " ".join(disease_rows).split()
    if not symptom_list:  # Skip if no symptoms
        continue
    
    # Count symptom frequencies
    symptom_counts = Counter(symptom_list)
    # Select top 5 symptoms (adjust as needed)
    top_symptoms = [symptom for symptom, count in symptom_counts.most_common(5)]
    disease_symptom_priors[disease] = top_symptoms

# Verification
print(f"Number of diseases with priors: {len(disease_symptom_priors)}")
for disease, symptoms in list(disease_symptom_priors.items())[:3]:  # Sample 3
    print(f"{disease}: {symptoms}")

Number of unique symptoms: 141
Sample known_symptoms: ['abdominal', 'aches', 'anemia', 'appetite', 'back', 'bad', 'belching', 'belly', 'blistering', 'bloating']
Number of diseases with priors: 24
Psoriasis: ['rash', 'pain', 'peeling', 'skin', 'joint']
Varicose Veins: ['cramps', 'rash', 'fatigue', 'pain', 'itchy']
Typhoid: ['pain', 'fever', 'constipation', 'headache', 'vomiting']


In [9]:
# Use extracted known_symptoms and disease_symptom_priors
single_word_symptoms = [s for s in known_symptoms if " " not in s]  # Filter single words

def embedding_to_symptoms(embedding, label, model, known_symptoms, top_n=5):
    all_symptoms = list(set(known_symptoms + single_word_symptoms))
    symptom_vectors = {s: symptoms_to_embedding(s, model) for s in all_symptoms if symptoms_to_embedding(s, model).any()}
    similarities = {s: model.cosine_similarities(embedding, [v])[0] for s, v in symptom_vectors.items()}
    if label in disease_symptom_priors:
        for s in disease_symptom_priors[label]:
            if s in similarities:
                similarities[s] *= 1.5  # Stronger boost for priors
    selected = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return " ".join([s[0] for s in selected if s[1] > 0.25])

augmented_data = pd.DataFrame({"label": y_resampled})
augmented_data["symptom_embedding"] = list(X_resampled)
augmented_data["extracted_symptoms"] = [embedding_to_symptoms(emb, lab, bio_word_vec, known_symptoms) 
                                        for emb, lab in zip(X_resampled, y_resampled)]

print(f"Augmented dataset shape: {augmented_data.shape}")
print("Unique symptoms:", len(set(" ".join(augmented_data["extracted_symptoms"]).split())))
print("Sample augmented rows:\n", augmented_data.head())

Augmented dataset shape: (1128, 3)
Unique symptoms: 115
Sample augmented rows:
        label                                  symptom_embedding  \
0  Psoriasis  [0.07757777, -0.0022662845, -0.133824, 0.61743...   
1  Psoriasis  [-0.13429934, 0.11955333, -0.226954, 0.5925967...   
2  Psoriasis  [-0.10813, 0.46145964, -0.23083667, -0.1078816...   
3  Psoriasis  [-0.042359997, 0.004344667, -0.07560567, 0.005...   
4  Psoriasis  [-0.154185, 0.32754, -0.036485016, 0.097795, -...   

                  extracted_symptoms  
0      rash skin peeling scaly itchy  
1    peeling rash pain skin stinging  
2  pain joint rash numbness headache  
3   peeling dusting silver skin like  
4   peeling deep cracks skin flaking  


In [10]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_resampled)
X_final = X_resampled

X_train, X_test, y_train, y_test = train_test_split(X_final, y_encoded, test_size=0.2, random_state=42)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.1, 0.3],
    "subsample": [0.8, 1.0]
}
xgb = XGBClassifier(random_state=42, eval_metric="mlogloss")
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring="f1_weighted", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")
cv_scores = cross_val_score(best_xgb, X_final, y_encoded, cv=5, scoring="f1_weighted")

print(f"Best parameters: {grid_search.best_params_}")
print(f"Weighted F1-Score (XGBoost): {f1}")
print(f"Cross-validated F1-Score: {cv_scores.mean()} (± {cv_scores.std()})")
print("Classification report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Best parameters: {'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Weighted F1-Score (XGBoost): 0.775575773708233
Cross-validated F1-Score: 0.7733049207204985 (± 0.08251768980453678)
Classification report:
                                  precision    recall  f1-score   support

                           Acne       0.67      0.83      0.74        12
                      Arthritis       1.00      1.00      1.00        12
               Bronchial Asthma       0.83      1.00      0.91         5
           Cervical spondylosis       1.00      0.85      0.92        13
                    Chicken pox       0.64      0.78      0.70         9
                    Common Cold       0.88      0.64      0.74        11
                         Dengue       0.33      0.67      0.44         9
          Dimorphic Hemorrhoids       0.75      0.75      0.75         4
               Fungal infection       0.50      0.50      0.50         8
                   Hypertension   