In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv("../data/dataset.csv")
print(f"Original shape: {df.shape}")
df.head()

Original shape: (4920, 18)


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:
df.isnull().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [5]:
# Fill missing symptoms
df.fillna("None", inplace=True)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [6]:
symptom_columns = df.columns[:-1]  # Exclude 'Disease' and 'ID' columns
all_symptoms = set()

for col in symptom_columns:
    all_symptoms.update(df[col].unique())

all_symptoms.discard("None")
all_symptoms = sorted(list(all_symptoms))
print(f"Total unique symptoms: {len(all_symptoms)}")

Total unique symptoms: 172


In [7]:
# Initialize dataframe with binary symptom columns
binary_df = pd.DataFrame(0, index=np.arange(len(df)), columns=all_symptoms)

# Fill binary columns
for index, row in df.iterrows():
    symptoms = row[:-1].values
    for symptom in symptoms:
        if symptom != "None":
            binary_df.at[index, symptom] = 1

# Add disease column
binary_df['Disease'] = df['Disease']
binary_df.head()


Unnamed: 0,abdominal_pain,abnormal_menstruation,acidity,acute_liver_failure,altered_sensorium,anxiety,back_pain,belly_pain,blackheads,bladder_discomfort,...,Peptic ulcer diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary tract infection,Varicose veins,hepatitis A,itching,Disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Fungal infection
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Fungal infection
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Fungal infection
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Fungal infection


In [8]:
# Label encode disease
le = LabelEncoder()
binary_df['Disease_Label'] = le.fit_transform(binary_df['Disease'])

# Save mapping for prediction later
disease_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(disease_mapping)


{'(vertigo) Paroymsal  Positional Vertigo': np.int64(0), 'AIDS': np.int64(1), 'Acne': np.int64(2), 'Alcoholic hepatitis': np.int64(3), 'Allergy': np.int64(4), 'Arthritis': np.int64(5), 'Bronchial Asthma': np.int64(6), 'Cervical spondylosis': np.int64(7), 'Chicken pox': np.int64(8), 'Chronic cholestasis': np.int64(9), 'Common Cold': np.int64(10), 'Dengue': np.int64(11), 'Diabetes ': np.int64(12), 'Dimorphic hemmorhoids(piles)': np.int64(13), 'Drug Reaction': np.int64(14), 'Fungal infection': np.int64(15), 'GERD': np.int64(16), 'Gastroenteritis': np.int64(17), 'Heart attack': np.int64(18), 'Hepatitis B': np.int64(19), 'Hepatitis C': np.int64(20), 'Hepatitis D': np.int64(21), 'Hepatitis E': np.int64(22), 'Hypertension ': np.int64(23), 'Hyperthyroidism': np.int64(24), 'Hypoglycemia': np.int64(25), 'Hypothyroidism': np.int64(26), 'Impetigo': np.int64(27), 'Jaundice': np.int64(28), 'Malaria': np.int64(29), 'Migraine': np.int64(30), 'Osteoarthristis': np.int64(31), 'Paralysis (brain hemorrhag

In [9]:
# Save features and labels
X = binary_df.drop(['Disease', 'Disease_Label'], axis=1)
y = binary_df['Disease_Label']

X.to_csv("../data/X_processed.csv", index=False)
y.to_csv("../data/y_processed.csv", index=False)

print("✅ Preprocessing complete. Ready for ML training.")


✅ Preprocessing complete. Ready for ML training.
