In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/dataset.csv')
data_severity = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/Symptom-severity.csv')

In [3]:
df.head(2)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,


In [4]:
data_severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [5]:
for i in data_severity.index:
    print(data_severity['Symptom'][i], data_severity['weight'][i])

itching 1
skin_rash 3
nodal_skin_eruptions 4
continuous_sneezing 4
shivering 5
chills 3
joint_pain 3
stomach_pain 5
acidity 3
ulcers_on_tongue 4
muscle_wasting 3
vomiting 5
burning_micturition 6
spotting_urination 6
fatigue 4
weight_gain 3
anxiety 4
cold_hands_and_feets 5
mood_swings 3
weight_loss 3
restlessness 5
lethargy 2
patches_in_throat 6
irregular_sugar_level 5
cough 4
high_fever 7
sunken_eyes 3
breathlessness 4
sweating 3
dehydration 4
indigestion 5
headache 3
yellowish_skin 3
dark_urine 4
nausea 5
loss_of_appetite 4
pain_behind_the_eyes 4
back_pain 3
constipation 4
abdominal_pain 4
diarrhoea 6
mild_fever 5
yellow_urine 4
yellowing_of_eyes 4
acute_liver_failure 6
fluid_overload 6
swelling_of_stomach 7
swelled_lymph_nodes 6
malaise 6
blurred_and_distorted_vision 5
phlegm 5
throat_irritation 4
redness_of_eyes 5
sinus_pressure 4
runny_nose 5
congestion 5
chest_pain 7
weakness_in_limbs 7
fast_heart_rate 5
pain_during_bowel_movements 5
pain_in_anal_region 6
bloody_stool 5
irritation

In [6]:
#convert data_severity to dictionnary
data_dict = data_severity.set_index('Symptom').T.to_dict()

In [7]:
data_dict

{'itching': {'weight': 1},
 'skin_rash': {'weight': 3},
 'nodal_skin_eruptions': {'weight': 4},
 'continuous_sneezing': {'weight': 4},
 'shivering': {'weight': 5},
 'chills': {'weight': 3},
 'joint_pain': {'weight': 3},
 'stomach_pain': {'weight': 5},
 'acidity': {'weight': 3},
 'ulcers_on_tongue': {'weight': 4},
 'muscle_wasting': {'weight': 3},
 'vomiting': {'weight': 5},
 'burning_micturition': {'weight': 6},
 'spotting_urination': {'weight': 6},
 'fatigue': {'weight': 4},
 'weight_gain': {'weight': 3},
 'anxiety': {'weight': 4},
 'cold_hands_and_feets': {'weight': 5},
 'mood_swings': {'weight': 3},
 'weight_loss': {'weight': 3},
 'restlessness': {'weight': 5},
 'lethargy': {'weight': 2},
 'patches_in_throat': {'weight': 6},
 'irregular_sugar_level': {'weight': 5},
 'cough': {'weight': 4},
 'high_fever': {'weight': 7},
 'sunken_eyes': {'weight': 3},
 'breathlessness': {'weight': 4},
 'sweating': {'weight': 3},
 'dehydration': {'weight': 4},
 'indigestion': {'weight': 5},
 'headache'

In [8]:
df.shape

(4920, 18)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB


In [10]:
def remove_space_between_words(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip().str.replace(" ", "_")
    return df

In [11]:
df = remove_space_between_words(df)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal_infection,itching,skin_rash,nodal_skin_eruptions,dischromic__patches,,,,,,,,,,,,,
1,Fungal_infection,skin_rash,nodal_skin_eruptions,dischromic__patches,,,,,,,,,,,,,,
2,Fungal_infection,itching,nodal_skin_eruptions,dischromic__patches,,,,,,,,,,,,,,
3,Fungal_infection,itching,skin_rash,dischromic__patches,,,,,,,,,,,,,,
4,Fungal_infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [12]:
df[df['Disease']=='Acne'].values

array([['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'pus_filled_pimples', 'blackheads', ..., nan, nan, nan],
       ...,
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan]],
      dtype=object)

In [13]:
def encode_symptoms(df, data_severity):
    for i in data_severity.index:
        symptom = data_severity["Symptom"][i]
        weight = data_severity["weight"][i]
        df = df.replace(symptom, weight)

    # Replace missing values with 0
    df = df.fillna(0)

    # Additional hardcoded replacements
    df = df.replace("foul_smell_of_urine", 5)
    df = df.replace("dischromic__patches", 6)
    df = df.replace("spotting__urination", 6)
    
    return df

In [14]:
new_df = encode_symptoms(df, data_severity)

In [15]:
new_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal_infection,1,3,4,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Fungal_infection,3,4,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Fungal_infection,1,4,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Fungal_infection,1,3,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Fungal_infection,1,3,4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
names = []

# Iterate through columns except for "Disease"
for col in new_df.columns:
    if col != "Disease":
        # Iterate through rows in the column
        for symptom in new_df[col]:
            # Check if the value is a string and not in the 'names' list
            if isinstance(symptom, str) and symptom not in names:
                names.append(symptom)

# Check if all symptoms have been replaced
all_replaced = all(symptom not in names for symptom in data_severity["Symptom"])

if all_replaced:
    print("All symptoms have been replaced.")
else:
    print("The following symptoms were not replaced:", names)


All symptoms have been replaced.


In [17]:
# separating the data and labels
X = new_df.drop(columns='Disease', axis=1)
Y = new_df['Disease']

In [18]:
print(X)

      Symptom_1  Symptom_2  Symptom_3  Symptom_4  Symptom_5  Symptom_6  \
0             1          3          4          6          0        0.0   
1             3          4          6          0          0        0.0   
2             1          4          6          0          0        0.0   
3             1          3          6          0          0        0.0   
4             1          3          4          0          0        0.0   
...         ...        ...        ...        ...        ...        ...   
4915          5          3          5          6          4        4.0   
4916          3          2          2          2          0        0.0   
4917          6          4          5          6          0        0.0   
4918          3          3          3          2          2        2.0   
4919          3          7          4          2          3        0.0   

      Symptom_7  Symptom_8  Symptom_9  Symptom_10  Symptom_11  Symptom_12  \
0           0.0        0.0        

In [19]:
print(Y)

0                              Fungal_infection
1                              Fungal_infection
2                              Fungal_infection
3                              Fungal_infection
4                              Fungal_infection
                         ...                   
4915    (vertigo)_Paroymsal__Positional_Vertigo
4916                                       Acne
4917                    Urinary_tract_infection
4918                                  Psoriasis
4919                                   Impetigo
Name: Disease, Length: 4920, dtype: object


# Data Standardization

In [20]:
scaler = StandardScaler()

In [21]:
scaler.fit(X)

In [22]:
standardized_data = scaler.transform(X)

In [23]:
print(standardized_data)

[[-1.83180372 -0.96557578 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 [-1.83180372 -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 ...
 [ 1.96708109 -0.13736225  0.66193895 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.96557578 -0.93921132 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498   2.34727837 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]]


In [24]:
X = standardized_data
Y = new_df['Disease']

In [25]:
print(X)

[[-1.83180372 -0.96557578 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 [-1.83180372 -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 ...
 [ 1.96708109 -0.13736225  0.66193895 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.96557578 -0.93921132 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498   2.34727837 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]]


In [26]:
print(Y)

0                              Fungal_infection
1                              Fungal_infection
2                              Fungal_infection
3                              Fungal_infection
4                              Fungal_infection
                         ...                   
4915    (vertigo)_Paroymsal__Positional_Vertigo
4916                                       Acne
4917                    Urinary_tract_infection
4918                                  Psoriasis
4919                                   Impetigo
Name: Disease, Length: 4920, dtype: object


#  Random Forest Classifier

In [27]:
from sklearn.ensemble import RandomForestClassifier
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rfc_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rfc_classifier.fit(X_train, Y_train)

# Predict disease labels on the testing data
Y_pred = rfc_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)

precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("Random Forest")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

Random Forest
Accuracy: 0.991869918699187
Precision: 0.9928005598737307
Recall: 0.9909757027776227
F1-Score: 0.9914564591749411
976
Confusion matrix
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


# Decision Tree Classifier

In [28]:
from sklearn.tree import DecisionTreeClassifier

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
dt_classifier.fit(X_train, Y_train)

# Predict disease labels on the testing data
Y_pred = dt_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)

precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("Decision Tree")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

Decision Tree
Accuracy: 0.991869918699187
Precision: 0.9928005598737307
Recall: 0.9909757027776227
F1-Score: 0.9914564591749411
976
Confusion matrix
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


# Gaussian Naive Bayes classifier

In [29]:
from sklearn.naive_bayes import GaussianNB

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create a Decision Tree classifier
NB_classifier = GaussianNB()

# Fit the model on the training data
NB_classifier.fit(X_train, Y_train)

# Predict disease labels on the testing data
Y_pred = NB_classifier.predict(X_test)

precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("Gaussian NB")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

Gaussian NB
Accuracy: 0.991869918699187
Precision: 0.8794765795368534
Recall: 0.8733242259602846
F1-Score: 0.8628509936790025
854
Confusion matrix
[[15  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  5  0 ... 21  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 33]]


# K-Nearest Neighbors Classifier

In [30]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into training and testing sets with a random state
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
   
# Create a k-NN classifier and specify the number of neighbors e.g. 3  
k=3
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Fit the model on the training data
knn_classifier.fit(X_train, Y_train)

# Predict the labels for the test data
Y_pred = knn_classifier.predict(X_test)

precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("KNN")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

KNN
Accuracy: 0.991869918699187
Precision: 0.9933217189314751
Recall: 0.9909350523711186
F1-Score: 0.9915720052790706
976
Confusion matrix
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


# K-Fold Cross Validation

In [31]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC  

svm_classifier = SVC()

# Specify the number of folds for cross-validation
num_folds = 10  # You can choose any number of folds you prefer

# Create a K-Fold cross-validation object
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation and compute scores using accuracy, precision, recall
scores = cross_val_score(svm_classifier, X, Y, cv=kf, scoring='accuracy')

# Print the cross-validation scores
print(f'Cross-Validation Scores: {scores}')

# Calculate and print the mean and standard deviation of the scores  
# High mean accuracy and a relatively low standard deviation in cross-validation scores are good indicators of consistent and reliable model performance across different subsets of the data.
mean_score = scores.mean()
std_deviation = scores.std()
precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')
print(f'Mean Accuracy: {mean_score}')
print(f'Standard Deviation: {std_deviation}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Cross-Validation Scores: [0.96544715 0.95934959 0.98373984 0.95121951 0.97764228 0.95528455
 0.96138211 0.97764228 0.95325203 0.96544715]
Mean Accuracy: 0.9650406504065041
Standard Deviation: 0.010670247762932275
Accuracy: 0.991869918699187
Precision: 0.9933217189314751
Recall: 0.9909350523711186
F1-Score: 0.9915720052790706


# SVM

In [32]:
from sklearn.svm import SVC

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

# Create an instance of the SVM classifier
svm_classifier = SVC(kernel='linear')

# Train the model using the training data
svm_classifier.fit(X_train, Y_train)

# Predict disease labels on the testing data
Y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)

precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("SVM")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

SVM
Accuracy: 0.9705284552845529
Precision: 0.9735474552547722
Recall: 0.9712520287698774
F1-Score: 0.971132617679643
955
Confusion matrix
[[30  0  0 ...  0  0  0]
 [ 0 23  0 ...  0  0  0]
 [ 0  0 34 ...  0  0  0]
 ...
 [ 0  0  0 ... 27  0  0]
 [ 0  0  0 ...  0 25  0]
 [ 0  0  0 ...  0  0 26]]
