In [3]:
import pandas as pd

# Read the main table that maps symptoms → disease: one row per sample, with a
# 'Disease' label followed by numbered 'Symptom_<n>' columns.
df = pd.read_csv(filepath_or_buffer=r"../data/dataset.csv")
# Preview the first five rows as plain text.
print(df.head(n=5))

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [4]:

# Read the per-disease free-text descriptions ('Disease', 'Description' columns).
desc_df = pd.read_csv(filepath_or_buffer=r"../data/symptom_Description.csv")
# Preview the first five rows as plain text.
print(desc_df.head(n=5))

          Disease                                        Description
0   Drug Reaction  An adverse drug reaction (ADR) is an injury ca...
1         Malaria  An infectious disease caused by protozoan para...
2         Allergy  An allergy is an immune system response to a f...
3  Hypothyroidism  Hypothyroidism, also called underactive thyroi...
4       Psoriasis  Psoriasis is a common skin disorder that forms...


In [5]:

# Read the per-disease precaution table ('Disease' plus numbered 'Precaution_<n>' columns).
precaution_df = pd.read_csv(filepath_or_buffer=r"../data/symptom_precaution.csv")
# Preview the first five rows as plain text.
print(precaution_df.head(n=5))


          Disease                      Precaution_1  \
0   Drug Reaction                   stop irritation   
1         Malaria          Consult nearest hospital   
2         Allergy                    apply calamine   
3  Hypothyroidism                     reduce stress   
4       Psoriasis  wash hands with warm soapy water   

                   Precaution_2        Precaution_3  \
0      consult nearest hospital    stop taking drug   
1               avoid oily food  avoid non veg food   
2       cover area with bandage                 NaN   
3                      exercise         eat healthy   
4  stop bleeding using pressure      consult doctor   

                  Precaution_4  
0                    follow up  
1           keep mosquitos out  
2  use ice to compress itching  
3             get proper sleep  
4                   salt baths  


In [6]:
# Read the symptom severity weights ('Symptom', 'weight' columns) — optional;
# no later cell in this notebook reads severity_df.
severity_df = pd.read_csv(filepath_or_buffer=r"../data/Symptom-severity.csv")
# Preview the first five rows as plain text.
print(severity_df.head(n=5))

                Symptom  weight
0               itching       1
1             skin_rash       3
2  nodal_skin_eruptions       4
3   continuous_sneezing       4
4             shivering       5


In [7]:
# Fill missing symptoms
# Select only the raw, numbered symptom columns (Symptom_1, Symptom_2, ...).
# Matching on the bare 'Symptom' prefix would also pick up the derived
# 'Symptom_List' column once it exists, so re-running this cell after that
# column has been added would feed Python lists into the string handling
# (.strip()) that builds it.
symptom_cols = [
    col for col in df.columns
    if col.startswith('Symptom_') and col[len('Symptom_'):].isdigit()
]
# Samples with fewer symptoms have NaN in the unused columns; replace those
# with the 'None' sentinel that the list-building step filters out.
df[symptom_cols] = df[symptom_cols].fillna('None')

In [8]:
# Collapse the per-sample symptom columns into one list per row: drop the
# 'None' placeholders and trim surrounding whitespace from each remaining name.
df['Symptom_List'] = df[symptom_cols].apply(
    lambda symptom_row: [
        entry.strip()
        for entry in symptom_row
        if entry != 'None'
    ],
    axis='columns',
)


In [9]:
# Build the symptom vocabulary: the union of every sample's symptom list,
# sorted so each symptom gets a stable position in the feature vector.
all_symptoms = sorted(set().union(*df['Symptom_List']))
print(f"Unique symptoms found: {len(all_symptoms)}")

Unique symptoms found: 131


In [10]:
# Encode each sample as binary vector
import numpy as np

# One row per sample, one column per entry of all_symptoms (in that order):
# 1 when the sample lists the symptom, 0 otherwise. Each sample's symptoms are
# turned into a set once, so the per-vocabulary-entry membership test is a hash
# lookup rather than a scan of the sample's list.
X = np.array([
    [1 if s in present else 0 for s in all_symptoms]
    for present in map(set, df['Symptom_List'])
])
# Target labels, aligned row-for-row with X.
y = df['Disease']

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# One seed for both the split and the model so a Restart & Run All reproduces
# the same fitted forest, the same score and the same pickled artifact.
SEED = 84

# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# RandomForestClassifier draws bootstrap samples and feature subsets at random;
# without random_state every run fits a different model.
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# NOTE(review): a perfect hold-out score can mean identical symptom rows appear
# in both the train and test partitions — check df.duplicated() (TODO confirm)
# before treating this number as a measure of generalization.

Accuracy: 1.0


In [12]:
import joblib

# Persist the fitted classifier.
joblib.dump(value=model, filename="../model.pkl")
# Persist the symptom vocabulary alongside it: its order defines the column
# layout of the feature vectors the model was trained on.
joblib.dump(value=all_symptoms, filename="../symptoms_list.pkl")


['../symptoms_list.pkl']

: 