In [23]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Verification: Check if imports succeeded (no output expected if successful)
print("Libraries imported successfully")

Libraries imported successfully


In [24]:
import re
# Load dataset
data = pd.read_csv("Symptom2Disease.csv")

# Reuse your cleaning and symptom extraction functions (assumed defined earlier)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.replace("vomitting", "vomiting").replace("apetite", "appetite").replace("experince", "experience")
    text = text.replace("dischromic", "discolored").replace("accompained", "accompanied")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Define symptom lists (shortened for brevity; use your full lists)
known_symptoms = ["skin rash", "joint pain", "fever", "dry scaly patches"]
single_word_symptoms = ["fever", "nausea", "pain", "rash"]

def extract_symptoms_from_text(text, known_symptoms, single_word_symptoms):
    symptoms_found = set()
    words = text.split()
    for symptom in known_symptoms:
        if symptom in text:
            symptoms_found.add(symptom)
    for word in words:
        if word in single_word_symptoms:
            symptoms_found.add(word)
    return " ".join(sorted(symptoms_found)) if symptoms_found else ""

# Apply preprocessing
data["cleaned_text"] = data["text"].apply(clean_text)
data["extracted_symptoms"] = data["cleaned_text"].apply(
    lambda x: extract_symptoms_from_text(x, known_symptoms, single_word_symptoms)
)
data = data[["label", "extracted_symptoms"]]

# Verification: Check dataset shape and sample
print(f"Dataset shape: {data.shape}")
print("Sample rows:\n", data.head())

Dataset shape: (1200, 2)
Sample rows:
        label                extracted_symptoms
0  Psoriasis  dry scaly patches rash skin rash
1  Psoriasis                                  
2  Psoriasis                   joint pain pain
3  Psoriasis                                  
4  Psoriasis                                  


In [26]:
# Load BioWordVec embeddings
bio_word_vec = KeyedVectors.load_word2vec_format(r"C:\Users\ACER\Downloads\BioWordVec_PubMed_MIMICIII_d200.vec.bin", binary=True)

# Verification: Check vocabulary size and sample terms
print(f"Loaded embeddings with {len(bio_word_vec)} terms")
print("Is 'fever' in vocabulary?", "fever" in bio_word_vec)
print("Similar to 'fever':", bio_word_vec.most_similar("fever", topn=3))

Loaded embeddings with 16545452 terms
Is 'fever' in vocabulary? True
Similar to 'fever': [('fevery', 0.8861067891120911), ('feverr', 0.8792648315429688), ("fever'", 0.8753470182418823)]


In [27]:
# Function to convert symptoms to embeddings
def symptoms_to_embedding(symptoms, model):
    symptom_list = symptoms.split()
    vectors = []
    for symptom in symptom_list:
        words = symptom.split()
        word_vecs = [model[word] for word in words if word in model]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
    return np.mean(vectors, axis=0) if vectors else np.zeros(200)

# Apply to dataset
data["symptom_embedding"] = data["extracted_symptoms"].apply(lambda x: symptoms_to_embedding(x, bio_word_vec))
X = np.vstack(data["symptom_embedding"].values)
y = data["label"].values

# Verification: Check embedding shape and sample
print(f"X shape: {X.shape}")
print("Sample embedding (first 5 dimensions):", X[0][:5])
print("Sample label:", y[0])

X shape: (1200, 200)
Sample embedding (first 5 dimensions): [ 0.0637474  -0.00534367 -0.19498634  0.57476169 -0.39963672]
Sample label: Psoriasis


In [28]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Verification: Check new shape and class distribution
print(f"Augmented X shape: {X_resampled.shape}")
print("Augmented class distribution:\n", pd.Series(y_resampled).value_counts())

Augmented X shape: (1200, 200)
Augmented class distribution:
 Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: count, dtype: int64


In [29]:
# Function to map embeddings back to symptoms
def embedding_to_symptoms(embedding, model, known_symptoms, top_n=3):
    all_symptoms = known_symptoms + single_word_symptoms
    symptom_vectors = {s: symptoms_to_embedding(s, model) for s in all_symptoms if symptoms_to_embedding(s, model).any()}
    similarities = {s: model.cosine_similarities(embedding, [v])[0] for s, v in symptom_vectors.items()}
    selected = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return " ".join([s[0] for s in selected if s[1] > 0.5])

# Create augmented DataFrame
augmented_data = pd.DataFrame({"label": y_resampled})
augmented_data["symptom_embedding"] = list(X_resampled)
augmented_data["extracted_symptoms"] = augmented_data["symptom_embedding"].apply(
    lambda x: embedding_to_symptoms(x, bio_word_vec, known_symptoms)
)

# Verification: Check shape and sample augmented data
print(f"Augmented dataset shape: {augmented_data.shape}")
print("Sample augmented rows:\n", augmented_data.head())

  similarities = dot_products / (norm * all_norms)


Augmented dataset shape: (1200, 3)
Sample augmented rows:
        label                                  symptom_embedding  \
0  Psoriasis  [0.06374739855527878, -0.0053436667658388615, ...   
1  Psoriasis  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  Psoriasis  [-0.10813000053167343, 0.4614596366882324, -0....   
3  Psoriasis  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4  Psoriasis  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                 extracted_symptoms  
0  skin rash dry scaly patches rash  
1                                    
2            joint pain pain nausea  
3                                    
4                                    


In [30]:
# Vectorize augmented symptoms
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X = tfidf.fit_transform(augmented_data["extracted_symptoms"])
y = augmented_data["label"]

# Split and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42, n_estimators=200)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")

# Verification: Check shapes and performance
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"Weighted F1-Score: {f1}")

X_train shape: (960, 31), X_test shape: (240, 31)
Weighted F1-Score: 0.10515435983332848


In [31]:
# Train directly on embeddings
X = np.vstack(augmented_data["symptom_embedding"].values)
y = augmented_data["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42, n_estimators=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")

# Verification: Check shapes and performance
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"Weighted F1-Score (embeddings): {f1}")

X_train shape: (960, 200), X_test shape: (240, 200)
Weighted F1-Score (embeddings): 0.09848769316666181
