In [1]:
import pandas as pd
import numpy as np
from collections import Counter

# Read the preprocessed dataset into a DataFrame.
data = pd.read_csv("preprocessed_dataset.csv")

# Replace missing symptom entries with empty strings so the column holds
# plain text that can be joined and split further down.
data = data.assign(extracted_symptoms=data["extracted_symptoms"].fillna(""))

# Step 1: Build the vocabulary of known symptoms.
# Every row's symptom string is glued together with single spaces and the
# result is split on whitespace, giving one flat token list for the dataset.
symptom_text = " ".join(data["extracted_symptoms"])
all_symptoms = symptom_text.split()

# Deduplicate, then sort. str.split() with no separator never produces empty
# tokens, so no explicit filtering of "" is required.
known_symptoms = list(set(all_symptoms))
known_symptoms.sort()

print(f"Number of unique symptoms: {len(known_symptoms)}")
print("Sample known_symptoms:", known_symptoms[:10])

# Step 2: Extract disease_symptom_priors
# Number of most frequent symptom tokens kept per disease (adjust as needed).
TOP_N_SYMPTOMS = 5

# Maps each disease label to its most frequent symptom tokens, most common
# first.
disease_symptom_priors = {}

# Group once instead of re-filtering the whole frame for every label.
# sort=False keeps labels in order of first appearance (the same order
# Series.unique() yields), and groupby drops rows with a missing label by
# default, which a `== label` mask would never have matched either.
symptoms_by_disease = data.groupby("label", sort=False)["extracted_symptoms"]
for disease, disease_rows in symptoms_by_disease:
    # Flatten all symptom strings of this disease into one token list.
    symptom_list = " ".join(disease_rows).split()
    if not symptom_list:  # Skip diseases without any symptom tokens
        continue

    # Count token frequencies; most_common() orders ties by first occurrence.
    symptom_counts = Counter(symptom_list)
    top_symptoms = [
        symptom for symptom, _ in symptom_counts.most_common(TOP_N_SYMPTOMS)
    ]
    disease_symptom_priors[disease] = top_symptoms

# Verification: report how many diseases received priors, then show the
# first three entries (in dict insertion order) as a spot check.
print(f"Number of diseases with priors: {len(disease_symptom_priors)}")
sample_diseases = list(disease_symptom_priors)[:3]
for disease in sample_diseases:
    symptoms = disease_symptom_priors[disease]
    print(f"{disease}: {symptoms}")

Number of unique symptoms: 141
Sample known_symptoms: ['abdominal', 'aches', 'anemia', 'appetite', 'back', 'bad', 'belching', 'belly', 'blistering', 'bloating']
Number of diseases with priors: 24
Psoriasis: ['rash', 'pain', 'peeling', 'skin', 'joint']
Varicose Veins: ['cramps', 'rash', 'fatigue', 'pain', 'itchy']
Typhoid: ['pain', 'fever', 'constipation', 'headache', 'vomiting']
