In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,classification_report,confusion_matrix,precision_score,roc_curve
import seaborn as sns
from sklearn.utils import shuffle
# from pandas_profiling import ProfileReport
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

**Read and shuffle the dataset**

In [210]:
df = pd.read_csv('dataset.csv')
df = shuffle(df,random_state=42)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
373,Acne,skin_rash,blackheads,scurring,,,,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
1550,Hyperthyroidism,fatigue,mood_swings,weight_loss,restlessness,sweating,diarrhoea,fast_heart_rate,excessive_hunger,muscle_weakness,irritability,abnormal_menstruation,,,,,,
3081,AIDS,muscle_wasting,patches_in_throat,high_fever,extra_marital_contacts,,,,,,,,,,,,,
3857,Chronic cholestasis,itching,vomiting,yellowish_skin,nausea,loss_of_appetite,abdominal_pain,yellowing_of_eyes,,,,,,,,,,


**Removing Hyphen from strings**

In [211]:
# for col in df.columns:
    
#     df[col] = df[col].str.replace('_',' ')
# df.head()

**Dataset characteristics**

In [212]:
df.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Acne,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,stomach_bleeding,chest_pain,chest_pain,loss_of_smell,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


**Check for null and NaN values**

In [213]:
null_checker = df.apply(lambda x: sum(x.isnull())).to_frame(name='count')
print(null_checker)

            count
Disease         0
Symptom_1       0
Symptom_2       0
Symptom_3       0
Symptom_4     348
Symptom_5    1206
Symptom_6    1986
Symptom_7    2652
Symptom_8    2976
Symptom_9    3228
Symptom_10   3408
Symptom_11   3726
Symptom_12   4176
Symptom_13   4416
Symptom_14   4614
Symptom_15   4680
Symptom_16   4728
Symptom_17   4848


**Remove the trailing space from the symptom columns**

In [214]:
cols = df.columns
data = df[cols].values.flatten()

s = pd.Series(data)
s = s.str.strip()
s = s.values.reshape(df.shape)

df = pd.DataFrame(s, columns=df.columns)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,skin_rash,blackheads,scurring,,,,,,,,,,,,,,
1,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
2,Hyperthyroidism,fatigue,mood_swings,weight_loss,restlessness,sweating,diarrhoea,fast_heart_rate,excessive_hunger,muscle_weakness,irritability,abnormal_menstruation,,,,,,
3,AIDS,muscle_wasting,patches_in_throat,high_fever,extra_marital_contacts,,,,,,,,,,,,,
4,Chronic cholestasis,itching,vomiting,yellowish_skin,nausea,loss_of_appetite,abdominal_pain,yellowing_of_eyes,,,,,,,,,,


**Fill the NaN values with zero**

In [215]:
df = df.fillna(0)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,skin_rash,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,fatigue,mood_swings,weight_loss,restlessness,sweating,diarrhoea,fast_heart_rate,excessive_hunger,muscle_weakness,irritability,abnormal_menstruation,0,0,0,0,0,0
3,AIDS,muscle_wasting,patches_in_throat,high_fever,extra_marital_contacts,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,itching,vomiting,yellowish_skin,nausea,loss_of_appetite,abdominal_pain,yellowing_of_eyes,0,0,0,0,0,0,0,0,0,0


In [216]:
df1 = pd.read_csv('Symptom-severity.csv')
x=df1['Symptom']
x

0                   itching
1                 skin_rash
2      nodal_skin_eruptions
3       continuous_sneezing
4                 shivering
               ...         
128      inflammatory_nails
129                 blister
130    red_sore_around_nose
131       yellow_crust_ooze
132               prognosis
Name: Symptom, Length: 133, dtype: object

**Symptom severity rank**

In [217]:
dfx=pd.DataFrame()
dfx["Disease"]=df["Disease"]
y=0
dfx[x]=0
for index, row in df.iterrows():
    for symptom in df.columns[1:]:
        if row[symptom] != 0:
            dfx.loc[index, row[symptom]] = 1
dfx = dfx.fillna(0)
dfx[dfx.columns[1:]]=dfx[dfx.columns[1:]].astype('int')

  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx[x]=0
  dfx.loc[index, row[symptom]] = 1
  dfx.loc[index, row[symptom]] = 1
  dfx.loc[index, row[symptom]] = 1


In [218]:
dfx.columns = dfx.columns.str.strip()

In [220]:
dfx

Unnamed: 0,Disease,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,foul_smell_of urine,spotting_ urination,dischromic _patches
0,Acne,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acne,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,Psoriasis,0,1,0,0,0,0,1,0,0,...,1,1,1,0,0,0,0,0,0,0
4916,Peptic ulcer diseae,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,Dengue,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [221]:

dfx.drop(dfx.columns[-4:], axis=1, inplace=True)
# dfx.drop('prognosis', axis=1, inplace=True)
# dfx.drop('spotting urination', axis=1, inplace=True)
# dfx.drop('dischromic patches', axis=1, inplace=True)
dfx

Unnamed: 0,Disease,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,Acne,0,1,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,Acne,0,1,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
2,Hyperthyroidism,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,Psoriasis,0,1,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,0,0,0
4916,Peptic ulcer diseae,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,Dengue,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [224]:
columns_to_drop = ['foul_smell_ofurine', 'dischromic_patches', 'spotting_urination']
dfx = dfx.drop(columns=columns_to_drop)


In [225]:

dfx[dfx.columns[1:]].sum(axis=0).sort_values()

blackheads             108
pus_filled_pimples     108
watering_from_eyes     108
patches_in_throat      108
dehydration            108
                      ... 
nausea                1146
loss_of_appetite      1152
high_fever            1362
vomiting              1914
fatigue               1932
Length: 128, dtype: int64

In [226]:
y=df['Disease'].unique()
y

array(['Acne', 'Hyperthyroidism', 'AIDS', 'Chronic cholestasis',
       'Hypertension', 'Hypoglycemia', 'Arthritis', 'Hepatitis B',
       'Migraine', 'Urinary tract infection', 'Diabetes', 'Hepatitis D',
       'Psoriasis', 'Alcoholic hepatitis', 'Dimorphic hemmorhoids(piles)',
       'Hepatitis E', 'Cervical spondylosis', 'Bronchial Asthma',
       'hepatitis A', 'Allergy', 'Hepatitis C', 'Pneumonia',
       'Hypothyroidism', 'Gastroenteritis', 'Varicose veins', 'Jaundice',
       'Drug Reaction', '(vertigo) Paroymsal  Positional Vertigo',
       'Heart attack', 'Tuberculosis', 'Typhoid', 'Common Cold',
       'Peptic ulcer diseae', 'Paralysis (brain hemorrhage)',
       'Fungal infection', 'Impetigo', 'GERD', 'Dengue', 'Malaria',
       'Chicken pox', 'Osteoarthristis'], dtype=object)

In [228]:
data = dfx.iloc[:,1:].values
labels = dfx['Disease'].values

In [229]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size = 0.7,random_state=42)
x_train, x_val, y_train,y_val=train_test_split(data,labels,test_size=0.3,random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, x_val.shape,y_val.shape)

(3444, 128) (1476, 128) (3444,) (1476,) (1476, 128) (1476,)


In [230]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_val=le.transform(y_val)


In [232]:
y=le.classes_
y

array(['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne',
       'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma',
       'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis',
       'Common Cold', 'Dengue', 'Diabetes',
       'Dimorphic hemmorhoids(piles)', 'Drug Reaction',
       'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Hypertension', 'Hyperthyroidism', 'Hypoglycemia',
       'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine',
       'Osteoarthristis', 'Paralysis (brain hemorrhage)',
       'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis',
       'Typhoid', 'Urinary tract infection', 'Varicose veins',
       'hepatitis A'], dtype=object)

In [233]:
# import pandas as pd
# from sklearn.model_selection import train_test_split, cross_val_score, KFold
# from sklearn.metrics import f1_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
# from sklearn.svm import SVC
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# import pickle
# classifiers = {
#     # 'Logistic Regression': LogisticRegression(),
#     'Random Forest': RandomForestClassifier(),
#     # 'SVM': SVC(),
#     'XGBoost': XGBClassifier(),
#     'LightGBM': LGBMClassifier(verbose=-1),
#     'CatBoost': CatBoostClassifier(silent=True),
#     'GradientBoost': GradientBoostingClassifier(),
#     'ExtraTrees': ExtraTreesClassifier()
# }

# # Define the K-fold Cross Validator
# kfold = KFold(n_splits=10, shuffle=True, random_state=1)

# # K-fold Cross Validation model evaluation
# for name, clf in classifiers.items():
#     cv_scores = cross_val_score(clf, x_train, y_train, cv=kfold, scoring='f1_weighted')
#     print(f'{name} cross-validation mean F1 score: %.3f' % cv_scores.mean())
    
#     # Train and test each classifier
#     clf.fit(x_train, y_train)
    
#     test_predictions = clf.predict(x_train)
#     test_f1 = f1_score(y_train, test_predictions, average='weighted')
#     print(f'{name} train F1 Score: {test_f1}')
    
#     test_predictions = clf.predict(x_test)
#     test_f1 = f1_score(y_test, test_predictions, average='weighted')
#     print(f'{name} test F1 Score: {test_f1}')
    
#     val_predictions = clf.predict(x_val)
#     val_f1 = f1_score(y_val, val_predictions, average='weighted')
#     print(f'{name} validation F1 Score: {val_f1}')
    
#     # Format the F1 score to remove the period
#     formatted_f1 = "{:.3f}".format(val_f1).replace('.', '_')
    
#     # Save the classifier to a pickle file
#     with open(f"{name}_{formatted_f1}.pkl", 'wb') as f:
#         pickle.dump(clf, f)


In [234]:
desc = pd.read_csv("symptom_Description.csv")

In [235]:
desc.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [236]:
prec = pd.read_csv("symptom_precaution.csv")

In [237]:
prec.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [238]:
def predd(m, X):
    # Get probabilities for each class
    proba = m.predict_proba(X)

    # Get the indices and probabilities of the top 5 classes
    top5_idx = np.argsort(proba[0])[-5:][::-1]
    top5_proba = np.sort(proba[0])[-5:][::-1]

    # Get the names of the top 5 diseases
    top5_diseases = y[top5_idx]

    for i in range(5):
        
        disease = top5_diseases[i]
        probability = top5_proba[i]
        # print(f"{disease}={probability}" )
        
        print("Disease Name: ", disease)
        print("Probability: ", probability)
        if(disease in desc["Disease"].unique()):
            disp = desc[desc['Disease'] == disease]
            disp = disp.values[0][1]
            print("Disease Description: ", disp)
        
        if(disease in prec["Disease"].unique()):
            c = np.where(prec['Disease'] == disease)[0][0]
            precuation_list = []
            for j in range(1, len(prec.iloc[c])):
                precuation_list.append(prec.iloc[c, j])
            print("Recommended Things to do at home: ")
            for precaution in precuation_list:
                print(precaution)
        
        print("\n")


In [239]:
prec

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
5,GERD,avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise
6,Chronic cholestasis,cold baths,anti itch medicine,consult doctor,eat healthy
7,hepatitis A,Consult nearest hospital,wash hands through,avoid fatty spicy food,medication
8,Osteoarthristis,acetaminophen,consult nearest hospital,follow up,salt baths
9,(vertigo) Paroymsal Positional Vertigo,lie down,avoid sudden change in body,avoid abrupt head movment,relax


In [240]:
x=dfx.columns[1:]


In [241]:
x


Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=128)

In [242]:
y

array(['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne',
       'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma',
       'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis',
       'Common Cold', 'Dengue', 'Diabetes',
       'Dimorphic hemmorhoids(piles)', 'Drug Reaction',
       'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Hypertension', 'Hyperthyroidism', 'Hypoglycemia',
       'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine',
       'Osteoarthristis', 'Paralysis (brain hemorrhage)',
       'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis',
       'Typhoid', 'Urinary tract infection', 'Varicose veins',
       'hepatitis A'], dtype=object)

In [243]:
import pickle

In [244]:
t=pd.Series([0]*128, index=x)
# t.columns=t.columns.astype("str")
# print(t)
m=ExtraTreesClassifier()
with open("LightGBM_1_000.pkl", 'rb') as f:
    m =  pickle.load(f)
t.loc["chest_pain"]=1
t.loc["phlegm"]=1
t.loc["runny_nose"]=1
t.loc["high_fever"]=1
t.loc["throat_irritation"]=1
# t.loc["red spots over body"]=1
t.loc["congestion"]=1
t.loc["redness_of_eyes"]=1
# print(t)
t=t.to_numpy()
print(t.shape)
t=t.reshape(1,-1)
predd(m,t)
# preds=m.predict_proba(t)
# preds


(128,)
Disease Name:  Common Cold
Probability:  0.9999931595644647
Disease Description:  The common cold is a viral infection of your nose and throat (upper respiratory tract). It's usually harmless, although it might not feel that way. Many types of viruses can cause a common cold.
Recommended Things to do at home: 
drink vitamin c rich drinks
take vapour
avoid cold food
keep fever in check


Disease Name:  AIDS
Probability:  1.0136285004795123e-06
Disease Description:  Acquired immunodeficiency syndrome (AIDS) is a chronic, potentially life-threatening condition caused by the human immunodeficiency virus (HIV). By damaging your immune system, HIV interferes with your body's ability to fight infection and disease.
Recommended Things to do at home: 
avoid open cuts
wear ppe if possible
consult doctor
follow up


Disease Name:  Heart attack
Probability:  5.494442747756038e-07
Disease Description:  The death of heart muscle due to the loss of blood supply. The loss of blood supply is usu