# IMPORTING THE LIBRARIES

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from joblib import dump
from joblib import load
import joblib

In [2]:
# Mount Google Drive inside the Colab runtime; every CSV and the saved model
# referenced later in this notebook are read from / written under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# LOADING THE "SYMPTOM LVL" FILE

In [4]:
# Read the symptom/disease table: one 0/1 indicator column per symptom plus
# the DISEASES label column.
symptoms = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/2. SYMPTOMS DATA.csv'
)
# Bare last expression so the notebook renders the (truncated) frame.
symptoms

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,DISEASES
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,1,0,0,0,...,0,0,1,1,1,1,0,0,0,Psoriasis


# DATA PREPROCESSING
# EXPLORATORY DATA ANALYSIS


**# CHECKING THE TYPE OF DATA PRESENT IN THE DATASET**

In [5]:
# Show the dtype pandas inferred for every column of the symptoms frame.
symptoms.dtypes

Unnamed: 0,0
itching,int64
skin_rash,int64
nodal_skin_eruptions,int64
continuous_sneezing,int64
shivering,int64
...,...
inflammatory_nails,int64
blister,int64
red_sore_around_nose,int64
yellow_crust_ooze,int64


**# CHECKING THE NUMBER OF ROWS AND COLUMNS**

In [6]:
# (rows, columns) of the symptoms frame.
symptoms.shape

(4920, 133)

**# CHECKING IF THE DATA SET "SYMPTOM LVL" CONTAINS ANY DUPLICATES**

In [7]:
# Count fully identical rows so the statement about duplicates can be
# reproduced on a fresh Restart & Run All instead of relying on a past run.
# keep=False flags every member of a duplicate group, not only the repeats.
duplicates = symptoms.duplicated(keep=False)

# Only the total is printed; dumping the full boolean Series adds thousands
# of lines of output without helping the reader.
duplicate_count = int(duplicates.sum())
print(f'Total duplicate rows: {duplicate_count}')


**# SINCE THE TOTAL DUPLICATES ROWS : 0 THERE IS NO NEED TO DROP DUPLICATES**

In [8]:
#symptoms.drop_duplicates(inplace=True)
#symptoms

**# CHECKING IF THERE ARE ANY MISSING VALUES IN THE DATASET**

In [9]:
# Per-column count of missing entries (isna is the pandas alias of isnull).
missing_values = symptoms.isna().sum()
print(missing_values)

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
DISEASES                0
Length: 133, dtype: int64


# ENCODING THE DATA


**LABEL ENCODING FOR DISEASES COLUMN**

In [10]:
# Encode the DISEASES column as integer class codes for the classifiers.
#
# The column is overwritten in place, so this cell must be safe to re-run:
# fitting a fresh LabelEncoder on already-encoded integers would replace the
# disease names in `label_encoder.classes_` with numbers, and every later
# `inverse_transform` call would then print codes instead of disease names.
# Encoding therefore only happens while the column still holds the raw labels;
# on a re-run the previously fitted `label_encoder` is left untouched.
if not pd.api.types.is_numeric_dtype(symptoms['DISEASES']):
    label_encoder = LabelEncoder()
    symptoms['DISEASES'] = label_encoder.fit_transform(symptoms['DISEASES'])
print(symptoms.head())

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0         0             

In [11]:
# symptoms_encoded.to_csv('symptomsencoded.csv', index=False)

In [12]:
# from google.colab import files
# files.download('symptomsencoded.csv')

# SPLITTING THE DATA INTO TRAINING AND TESTING SETS

In [13]:
# Target = encoded disease label; features = every remaining column.
Y = symptoms['DISEASES']
X = symptoms.drop(columns=['DISEASES'])

# 80/20 split, stratified on the label and seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

# Display the shape of each partition as a 4-tuple.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((3936, 132), (984, 132), (3936,), (984,))

# TRAINING THE MODEL

# 1. SUPPORT VECTOR CLASSIFIER

In [14]:
# Support vector classifier with library defaults, seeded for repeatability.
SVM_model = SVC(random_state=42)
# fit() is the last expression so the notebook shows the fitted estimator.
SVM_model.fit(X=X_train, y=Y_train)

**ACCURACY OF SUPPORT VECTOR CLASSIFIER**

In [15]:
# Score the fitted SVC on the test split.
Y_pred_SVM = SVM_model.predict(X=X_test)
accuracy_SVM = accuracy_score(y_true=Y_test, y_pred=Y_pred_SVM)
print(f'ACCURACY OF SUPPORT VECTOR MACHINE {accuracy_SVM:.2f}')

ACCURACY OF SUPPORT VECTOR MACHINE 1.00


# 2. DECISION TREE CLASSIFIER

In [16]:
# Single decision tree with library defaults, seeded for repeatability.
DT_model = DecisionTreeClassifier(random_state=42)
# fit() is the last expression so the notebook shows the fitted estimator.
DT_model.fit(X=X_train, y=Y_train)

**ACCURACY OF DECISION TREE CLASSIFIER**

In [17]:
# Score the fitted decision tree on the test split.
Y_pred_DT = DT_model.predict(X=X_test)
accuracy_DT = accuracy_score(y_true=Y_test, y_pred=Y_pred_DT)
print(f'ACCURACY OF DECISION TREE CLASSIFIER: {accuracy_DT:.2f}')

ACCURACY OF DECISION TREE CLASSIFIER: 1.00


# 3. RANDOM FOREST CLASSIFIER


In [18]:
# Random forest with library defaults, seeded for repeatability. This is the
# estimator that gets persisted to Drive further down the notebook.
RF_model = RandomForestClassifier(random_state=42)
# fit() is the last expression so the notebook shows the fitted estimator.
RF_model.fit(X=X_train, y=Y_train)

**ACCURACY OF RANDOM FOREST**

In [19]:
# Score the fitted random forest on the test split.
Y_pred_RF = RF_model.predict(X=X_test)
accuracy_RF = accuracy_score(y_true=Y_test, y_pred=Y_pred_RF)
print(f'ACCURACY OF RANDOM FOREST CLASSIFIER: {accuracy_RF:.2f}')

ACCURACY OF RANDOM FOREST CLASSIFIER: 1.00


# 4. K-NEAREST NEIGHBOR CLASSIFIER

In [20]:
# k-nearest-neighbours classifier with library defaults.
KNN_model = KNeighborsClassifier()
# fit() is the last expression so the notebook shows the fitted estimator.
KNN_model.fit(X=X_train, y=Y_train)

**ACCURACY OF K-NEAREST NEIGHBOR CLASSIFIER**

In [21]:
# Score the fitted k-NN classifier on the test split.
Y_pred_KNN = KNN_model.predict(X=X_test)
accuracy_KNN = accuracy_score(y_true=Y_test, y_pred=Y_pred_KNN)
print(f'ACCURACY OF K-NEAREST NEIGHBOR CLASSIFIER: {accuracy_KNN:.2f}')

ACCURACY OF K-NEAREST NEIGHBOR CLASSIFIER: 1.00


# 5. LOGISTIC REGRESSION

In [22]:
# Logistic regression with the solver iteration cap set explicitly to 100.
LR_model = LogisticRegression(max_iter=100)
# fit() is the last expression so the notebook shows the fitted estimator.
LR_model.fit(X=X_train, y=Y_train)

**ACCURACY OF LOGISTIC REGRESSION**

In [23]:
# Score logistic regression on the TEST split, like every other model in this
# notebook. Predicting on X_train and comparing against Y_train reports
# training accuracy, which is optimistic and not comparable with the SVM,
# decision tree, random forest and k-NN figures above.
Y_pred_LR = LR_model.predict(X_test)
accuracy_LR = accuracy_score(Y_test, Y_pred_LR)
print(f'ACCURACY OF LOGISTIC REGRESSION : {accuracy_LR:.2f}')

ACCURACY OF LOGISTIC REGRESSION : 1.00


# SAVING THE TRAINED MODEL { RANDOM FOREST MODEL }

In [24]:
# Persist the fitted random forest to Drive with joblib, then confirm the path.
dump(
    value=RF_model,
    filename='/content/drive/MyDrive/Colab Notebooks/trained_random_forest_model.joblib',
)
print("Model saved to /content/drive/MyDrive/Colab Notebooks/trained_random_forest_model.joblib")

Model saved to /content/drive/MyDrive/Colab Notebooks/trained_random_forest_model.joblib


**CHECKING IF THE RANDOM FOREST MODEL IS LOADED OR NOT**

In [25]:
# Round-trip check: read the saved file back and print the type of the
# restored object.
loaded_model = load(
    filename='/content/drive/MyDrive/Colab Notebooks/trained_random_forest_model.joblib'
)
print(type(loaded_model))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>


**SAVING THE LOGISTIC REGRESSION MODEL**

In [26]:
# dump(LR_model, '/content/drive/MyDrive/Colab Notebooks/trained_logistic_regression_model.joblib')
# print("Model saved to /content/drive/MyDrive/Colab Notebooks/trained_logistic_regression_model.joblib")

In [27]:
# loaded_model = load('/content/drive/MyDrive/Colab Notebooks/trained_logistic_regression_model.joblib')
# print(type(loaded_model))

# LOADING THE METADATA FILES

**LOADING THE DESCRIPTION FILE**

In [28]:
# Lookup table with DISEASES and DESCRIPTION columns.
description = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/1. DESCRIPTION.csv'
)
# Preview the first five rows.
description.head(n=5)

Unnamed: 0,DISEASES,DESCRIPTION
0,(vertigo) Paroymsal Positional Vertigo,(Vertigo) Paroxysmal Positional Vertigo is a t...
1,Acne,Acne is a skin condition that occurs when hair...
2,AIDS,AIDS (Acquired Immunodeficiency Syndrome) is a...
3,Alcoholic hepatitis,Alcoholic hepatitis is inflammation of the liv...
4,Allergy,Allergy is an immune system reaction to a subs...


**LOADING THE MEDICATION FILE**

In [29]:
# Lookup table mapping each disease to its medication list.
med = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/3. MEDICATION.csv'
)
# Preview the first five rows.
med.head(n=5)

Unnamed: 0,DISEASES,Medication
0,(vertigo) Paroymsal Positional Vertigo,"['Topical treatments', 'Antibiotics', 'Oral me..."
1,Acne,"['Antibiotics', 'Pain relievers', 'Antihistami..."
2,AIDS,"['Antiretroviral drugs', 'Protease inhibitors'..."
3,Alcoholic hepatitis,"['Antibiotics', 'Isoniazid', 'Rifampin', 'Etha..."
4,Allergy,"['Antihistamines', 'Decongestants', 'Epinephri..."


**LOADING THE PRECAUTIONS FILE**

In [30]:
# Lookup table with DISEASES and PRECAUTION columns (numbered precaution text).
precaution = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/4. Numbered_Precautions.csv'
)
# Preview the first five rows.
precaution.head(n=5)

Unnamed: 0,DISEASES,PRECAUTION
0,(vertigo) Paroymsal Positional Vertigo,1. lie down\n2. avoid sudden change in body\n3...
1,Acne,1. bath twice\n2. avoid fatty spicy food\n3. d...
2,AIDS,1. avoid open cuts\n2. wear ppe if possible\n3...
3,Alcoholic hepatitis,1. stop alcohol consumption\n2. consult doctor...
4,Allergy,1. apply calamine\n2. cover area with bandage\...


**LOADING THE DIET PLAN FILE**

In [31]:
# Lookup table with DISEASES and Diet columns.
diet = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/5. DIET.csv'
)
# Preview the first five rows.
diet.head(n=5)

Unnamed: 0,DISEASES,Diet
0,(vertigo) Paroymsal Positional Vertigo,"['Vertigo Diet', 'Low-Salt Diet', 'Hydration',..."
1,Acne,"['Acne Diet', 'Low-Glycemic Diet', 'Hydration'..."
2,AIDS,"['Balanced Diet', 'Protein-rich foods', 'Fruit..."
3,Alcoholic hepatitis,"['Liver-Healthy Diet', 'Low-fat Diet', 'Fruits..."
4,Allergy,"['Elimination Diet', 'Omega-3-rich foods', 'Vi..."


# LOADING AND TESTING THE MODEL

In [32]:
# Restore the persisted random forest for the interactive prediction cell.
# `load` is the same joblib function imported by name at the top of the notebook.
model = load('/content/drive/MyDrive/Colab Notebooks/trained_random_forest_model.joblib')

In [33]:
def parse_symptoms(raw_text):
    """Split a comma-separated answer into recognised and unrecognised symptoms.

    Each name is stripped and lower-cased; blank entries and repeats are
    dropped. A name counts as recognised when it is both a feature the loaded
    model was fitted on and a column of the `symptoms` frame.

    Returns:
        (known, unknown): two lists of names, in the order they were typed.
    """
    valid_names = set(model.feature_names_in_) & set(symptoms.columns)
    known, unknown = [], []
    for token in raw_text.split(','):
        name = token.strip().lower()
        if not name:
            continue
        bucket = known if name in valid_names else unknown
        if name not in bucket:
            bucket.append(name)
    return known, unknown


def build_input_frame(selected_symptoms):
    """Return a one-row 0/1 frame whose columns match the model's training features."""
    chosen = set(selected_symptoms)
    flags = {feature: int(feature in chosen) for feature in model.feature_names_in_}
    return pd.DataFrame([flags], columns=model.feature_names_in_)


def suggest_related_symptoms(symptom_name):
    """Print the diseases recorded with `symptom_name`; return the other symptoms seen with them.

    The DISEASES column is excluded from the scan: it holds encoded labels, so
    a disease whose code happens to be 1 would otherwise be reported as a
    "symptom". Every row of each candidate disease is considered, not only the
    first one, and the result is sorted so the prompt is stable between runs.
    """
    feature_columns = [col for col in symptoms.columns if col != 'DISEASES']
    disease_codes = symptoms.loc[symptoms[symptom_name] == 1, 'DISEASES'].unique()
    possible_diseases = label_encoder.inverse_transform(disease_codes)

    print(f"These are the possible diseases related to '{symptom_name}':")
    for disease_name in possible_diseases:
        print(f"- {disease_name}")

    related_rows = symptoms.loc[symptoms['DISEASES'].isin(disease_codes), feature_columns]
    present = (related_rows == 1).any(axis=0)
    return sorted(col for col in present.index[present] if col != symptom_name)


def report_prediction(selected_symptoms):
    """Predict a disease for the given symptoms, print its metadata, and return its name.

    Sections whose lookup table has no row for the predicted disease are skipped.
    """
    predicted_disease_encoded = model.predict(build_input_frame(selected_symptoms))
    disease = label_encoder.inverse_transform(predicted_disease_encoded)[0]
    print(f'Predicted Disease: {disease}')

    # (lookup table, column to read, label printed before the disease name).
    # NOTE: 'Medication ' is kept verbatim, trailing space included; the
    # recorded run of this cell printed medications using exactly this key.
    lookups = (
        (description, 'DESCRIPTION', 'Description of'),
        (med, 'Medication ', 'Recommended Medications for'),
        (precaution, 'PRECAUTION', 'Precautions for'),
        (diet, 'Diet', 'Diet Recommendations for'),
    )
    for table, column, label in lookups:
        values = table.loc[table['DISEASES'] == disease, column].values
        if values.size > 0:
            print(f"\n{label} {disease}: {values[0]}")
    return disease


input_line = input("Enter symptoms, separated by commas (e.g., itching, skin_rash): ")
entered_symptoms, unknown_symptoms = parse_symptoms(input_line)
if unknown_symptoms:
    # Tell the user instead of silently predicting from a partial symptom set.
    print(f"Ignoring unrecognised symptoms: {', '.join(unknown_symptoms)}")

if not entered_symptoms:
    # Nothing usable was typed: an all-zero input would still yield a
    # prediction, but a meaningless one, so stop here.
    print("No recognised symptoms were entered. Please use the symptom names from the dataset columns.")
elif len(entered_symptoms) == 1:
    # One symptom is too little to predict from: list the candidate diseases
    # and offer the user the symptoms that co-occur with them.
    symptom_name = entered_symptoms[0]
    suggested_symptoms = suggest_related_symptoms(symptom_name)
    print(f"\nWould you like to refine your prediction with any of these additional symptoms? {', '.join(suggested_symptoms)}")

    add_symptom = input("Do you have any of the above symptoms? (yes/no): ").strip().lower()

    if add_symptom in ('no', 'n'):
        print("It's recommended to consult a doctor for a more accurate diagnosis.")
    else:
        additional_symptom = input("Enter additional symptoms from the above list (comma-separated): ")
        new_symptoms, unknown_new = parse_symptoms(additional_symptom)
        if unknown_new:
            print(f"Ignoring unrecognised symptoms: {', '.join(unknown_new)}")
        entered_symptoms.extend(sym for sym in new_symptoms if sym not in entered_symptoms)
        disease = report_prediction(entered_symptoms)
else:
    disease = report_prediction(entered_symptoms)


Enter symptoms, separated by commas (e.g., itching, skin_rash): itching, vomiting, fatigue, weight_loss
Predicted Disease: Jaundice

Description of Jaundice: Jaundice is a yellow discoloration of the skin and eyes, often indicating liver problems.

Recommended Medications for Jaundice: ['IV fluids', 'Blood transfusions', 'Liver transplant', 'Medications for itching', 'Antiviral medications']

Precautions for Jaundice: 1. drink plenty of water
2. consume milk thistle
3. eat fruits and high fiberous food
4. medication

Diet Recommendations for Jaundice: ['Liver-Healthy Diet', 'Low-fat Diet', 'Fruits and vegetables', 'Whole grains', 'Lean proteins']
