In [8]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**# Load datasets**

In [10]:

description = pd.read_csv("/content/description.csv")
diets = pd.read_csv("/content/diets.csv")
medications = pd.read_csv("/content/medications.csv")
precautions = pd.read_csv("/content/precautions_df.csv")
symptom_severity = pd.read_csv("/content/Symptom-severity.csv")
symptoms = pd.read_csv("/content/symptoms_df.csv")
training = pd.read_csv("/content/Training.csv")
workout = pd.read_csv("/content/workout_df.csv")

**# Initialize NLP tools**

In [11]:

ps = PorterStemmer()

In [13]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

def preprocess_text(text):
    if pd.isna(text):
        return ""
    words = word_tokenize(text.lower())
    stemmed_words = [ps.stem(word) for word in words if word.isalnum()]
    return stemmed_words

description_nlp = description.select_dtypes(include=['object']).applymap(preprocess_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
  description_nlp = description.select_dtypes(include=['object']).applymap(preprocess_text)


In [14]:
# Apply NLP preprocessing
!pip install nltk
import nltk
nltk.download('punkt_tab')
description_nlp = description.select_dtypes(include=['object']).applymap(preprocess_text)
diets_nlp = diets.select_dtypes(include=['object']).applymap(preprocess_text)
medications_nlp = medications.select_dtypes(include=['object']).applymap(preprocess_text)
precautions_nlp = precautions.select_dtypes(include=['object']).applymap(preprocess_text)
symptom_severity_nlp = symptom_severity.select_dtypes(include=['object']).applymap(preprocess_text)
symptoms_nlp = symptoms.select_dtypes(include=['object']).applymap(preprocess_text)
training_nlp = training.select_dtypes(include=['object']).applymap(preprocess_text)
workout_nlp = workout.select_dtypes(include=['object']).applymap(preprocess_text)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  description_nlp = description.select_dtypes(include=['object']).applymap(preprocess_text)
  diets_nlp = diets.select_dtypes(include=['object']).applymap(preprocess_text)
  medications_nlp = medications.select_dtypes(include=['object']).applymap(preprocess_text)
  precautions_nlp = precautions.select_dtypes(include=['object']).applymap(preprocess_text)
  symptom_severity_nlp = symptom_severity.select_dtypes(include=['object']).applymap(preprocess_text)
  symptoms_nlp = symptoms.select_dtypes(include=['object']).applymap(preprocess_text)
  training_nlp = training.select_dtypes(include=['object']).applymap(preprocess_text)
  workout_nlp = workout.select_dtypes(include=['object']).applymap(preprocess_text)


In [15]:
print("Dataset: description")
print("Head:\n", description.head(), "\n")


Dataset: description
Head:
                Disease                                        Description
0     Fungal infection  Fungal infection is a common skin condition ca...
1              Allergy  Allergy is an immune system reaction to a subs...
2                 GERD  GERD (Gastroesophageal Reflux Disease) is a di...
3  Chronic cholestasis  Chronic cholestasis is a condition where bile ...
4        Drug Reaction  Drug Reaction occurs when the body reacts adve... 



In [16]:
print("Info:")
print(description.info(), "\n")

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      41 non-null     object
 1   Description  41 non-null     object
dtypes: object(2)
memory usage: 788.0+ bytes
None 



In [None]:
print("Describe:\n", description.describe(include='all'), "\n")

Describe:
                  Disease                                        Description
count                 41                                                 41
unique                41                                                 41
top     Fungal infection  Fungal infection is a common skin condition ca...
freq                   1                                                  1 



In [None]:
print("Dataset: diets")
print("Head:\n", diets.head(), "\n")

Dataset: diets
Head:
                Disease                                               Diet
0     Fungal infection  ['Antifungal Diet', 'Probiotics', 'Garlic', 'C...
1              Allergy  ['Elimination Diet', 'Omega-3-rich foods', 'Vi...
2                 GERD  ['Low-Acid Diet', 'Fiber-rich foods', 'Ginger'...
3  Chronic cholestasis  ['Low-Fat Diet', 'High-Fiber Diet', 'Lean prot...
4        Drug Reaction  ['Antihistamine Diet', 'Omega-3-rich foods', '... 



In [None]:
print("Info:")
print(diets.info(), "\n")

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Disease  41 non-null     object
 1   Diet     41 non-null     object
dtypes: object(2)
memory usage: 788.0+ bytes
None 



In [None]:
print("Describe:\n", diets.describe(include='all'), "\n")


Describe:
                  Disease                                               Diet
count                 41                                                 41
unique                41                                                 36
top     Fungal infection  ['Arthritis Diet', 'Anti-Inflammatory Diet', '...
freq                   1                                                  3 



In [None]:
print("Dataset: medications")
print("Head:\n", medications.head(), "\n")

Dataset: medications
Head:
                Disease                                         Medication
0     Fungal infection  ['Antifungal Cream', 'Fluconazole', 'Terbinafi...
1              Allergy  ['Antihistamines', 'Decongestants', 'Epinephri...
2                 GERD  ['Proton Pump Inhibitors (PPIs)', 'H2 Blockers...
3  Chronic cholestasis  ['Ursodeoxycholic acid', 'Cholestyramine', 'Me...
4        Drug Reaction  ['Antihistamines', 'Epinephrine', 'Corticoster... 



In [None]:
print("Info:")
print(medications.info(), "\n")


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     41 non-null     object
 1   Medication  41 non-null     object
dtypes: object(2)
memory usage: 788.0+ bytes
None 



In [None]:
print("Describe:\n", medications.describe(include='all'), "\n")

Describe:
                  Disease                                         Medication
count                 41                                                 41
unique                41                                                 38
top     Fungal infection  ['Antiviral drugs', 'IV fluids', 'Blood transf...
freq                   1                                                  3 



In [None]:
print("Dataset: precautions")
print("Head:\n", precautions.head(), "\n")

Dataset: precautions
Head:
    Unnamed: 0         Disease                      Precaution_1  \
0           0   Drug Reaction                   stop irritation   
1           1         Malaria          Consult nearest hospital   
2           2         Allergy                    apply calamine   
3           3  Hypothyroidism                     reduce stress   
4           4       Psoriasis  wash hands with warm soapy water   

                   Precaution_2        Precaution_3  \
0      consult nearest hospital    stop taking drug   
1               avoid oily food  avoid non veg food   
2       cover area with bandage                 NaN   
3                      exercise         eat healthy   
4  stop bleeding using pressure      consult doctor   

                  Precaution_4  
0                    follow up  
1           keep mosquitos out  
2  use ice to compress itching  
3             get proper sleep  
4                   salt baths   



In [None]:
print("Info:")
print(precautions.info(), "\n")

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    41 non-null     int64 
 1   Disease       41 non-null     object
 2   Precaution_1  41 non-null     object
 3   Precaution_2  41 non-null     object
 4   Precaution_3  40 non-null     object
 5   Precaution_4  40 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.1+ KB
None 



In [None]:
print("Describe:\n", precautions.describe(include='all'), "\n")

Describe:
         Unnamed: 0        Disease              Precaution_1 Precaution_2  \
count    41.000000             41                        41           41   
unique         NaN             41                        32           34   
top            NaN  Drug Reaction  Consult nearest hospital     exercise   
freq           NaN              1                         3            3   
mean     20.000000            NaN                       NaN          NaN   
std      11.979149            NaN                       NaN          NaN   
min       0.000000            NaN                       NaN          NaN   
25%      10.000000            NaN                       NaN          NaN   
50%      20.000000            NaN                       NaN          NaN   
75%      30.000000            NaN                       NaN          NaN   
max      40.000000            NaN                       NaN          NaN   

          Precaution_3 Precaution_4  
count               40           40  


In [None]:
print("Dataset: workout")
print("Head:\n", workout.head(), "\n")



Dataset: workout
Head:
    Unnamed: 0.1  Unnamed: 0           disease                    workout
0             0           0  Fungal infection         Avoid sugary foods
1             1           1  Fungal infection         Consume probiotics
2             2           2  Fungal infection  Increase intake of garlic
3             3           3  Fungal infection     Include yogurt in diet
4             4           4  Fungal infection      Limit processed foods 



In [None]:
print("Info:")
print(workout.info(), "\n")

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  410 non-null    int64 
 1   Unnamed: 0    410 non-null    int64 
 2   disease       410 non-null    object
 3   workout       410 non-null    object
dtypes: int64(2), object(2)
memory usage: 12.9+ KB
None 



In [None]:
print("Describe:\n", workout.describe(include='all'), "\n")

Describe:
         Unnamed: 0.1  Unnamed: 0           disease        workout
count     410.000000  410.000000               410            410
unique           NaN         NaN                41            140
top              NaN         NaN  Fungal infection  Stay hydrated
freq             NaN         NaN                10             42
mean      204.500000  204.500000               NaN            NaN
std       118.501055  118.501055               NaN            NaN
min         0.000000    0.000000               NaN            NaN
25%       102.250000  102.250000               NaN            NaN
50%       204.500000  204.500000               NaN            NaN
75%       306.750000  306.750000               NaN            NaN
max       409.000000  409.000000               NaN            NaN 



In [None]:
print("Dataset: training")
print("Head:\n", training.head(), "\n")

Dataset: training
Head:
    itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0          

In [None]:
print("Info:")
print(training.info(), "\n")


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 5.0+ MB
None 



In [None]:
print("Describe:\n", training.describe(include='all'), "\n")

Describe:
             itching    skin_rash  nodal_skin_eruptions  continuous_sneezing  \
count   4920.000000  4920.000000           4920.000000          4920.000000   
unique          NaN          NaN                   NaN                  NaN   
top             NaN          NaN                   NaN                  NaN   
freq            NaN          NaN                   NaN                  NaN   
mean       0.137805     0.159756              0.021951             0.045122   
std        0.344730     0.366417              0.146539             0.207593   
min        0.000000     0.000000              0.000000             0.000000   
25%        0.000000     0.000000              0.000000             0.000000   
50%        0.000000     0.000000              0.000000             0.000000   
75%        0.000000     0.000000              0.000000             0.000000   
max        1.000000     1.000000              1.000000             1.000000   

          shivering       chills   joint

In [17]:

X = training.drop('prognosis', axis=1)
y = training['prognosis']


In [18]:

le = LabelEncoder()
Y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=20)

In [19]:
# Train Models
models = {
    'SVC': SVC(kernel='linear'),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'MultinomialNB': MultinomialNB()
}

In [20]:
best_model = None
best_accuracy = 0

In [21]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

SVC Accuracy: 1.0000
RandomForest Accuracy: 1.0000
GradientBoosting Accuracy: 1.0000
KNeighbors Accuracy: 1.0000
MultinomialNB Accuracy: 1.0000


In [22]:

svc = best_model

In [23]:

diseases_list = le.classes_

In [24]:

symptoms_dict = {symptom: idx for idx, symptom in enumerate(X.columns)}

In [25]:
def get_predicted_value(patient_symptoms):
    input_vector = np.zeros(len(X.columns), dtype=int)
    matched_symptoms = []

    for symptom in patient_symptoms:
        if symptom in symptoms_dict:
            input_vector[symptoms_dict[symptom]] = 1
            matched_symptoms.append(symptom)

    if not matched_symptoms:
        return "No matching symptoms found. Please check your input.", []

    predicted_disease = diseases_list[svc.predict([input_vector])[0]]
    return predicted_disease, matched_symptoms

In [26]:
def helper(dis):
    desc = description.loc[description['Disease'] == dis, 'Description'].values
    desc = desc[0] if len(desc) > 0 else "No description available."

    pre = precautions.loc[precautions['Disease'] == dis, ['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']].dropna().values.flatten()
    med = medications.loc[medications['Disease'] == dis, 'Medication'].dropna().values.flatten()
    die = diets.loc[diets['Disease'] == dis, 'Diet'].dropna().values.flatten()
    wrkout = workout.loc[workout['disease'] == dis, 'workout'].dropna().values.flatten()

    return desc, pre.tolist(), med.tolist(), die.tolist(), wrkout.tolist()

In [28]:

symptoms_input = input("Enter your symptoms (comma-separated): ")
user_symptoms = [s.strip() for s in symptoms_input.split(',')]

predicted_disease, matched_symptoms = get_predicted_value(user_symptoms)

if isinstance(predicted_disease, str) and "No matching" in predicted_disease:
    print(predicted_disease)
else:
    desc, pre, med, die, wrkout = helper(predicted_disease)

    print(f"\nMatched Symptoms: {', '.join(matched_symptoms)}")
    print(f"\nPredicted Disease: {predicted_disease}")
    print(f"\nDescription: {desc}")

    print("\nPrecautions:")
    for i, p in enumerate(pre, start=1):
        print(f"{i}: {p}")

    print("\nMedications:")
    for i, m in enumerate(med, start=1):
        print(f"{i}: {m}")

    print("\nWorkout Suggestions:")
    for i, w in enumerate(wrkout, start=1):
        print(f"{i}: {w}")

    print("\nDiet Suggestions:")
    for i, d in enumerate(die, start=1):
        print(f"{i}: {d}")

Enter your symptoms (comma-separated): itching, skin_rash, nodal_skin_eruptions 

Matched Symptoms: itching, skin_rash, nodal_skin_eruptions

Predicted Disease: Fungal infection

Description: Fungal infection is a common skin condition caused by fungi.

Precautions:
1: bath twice
2: use detol or neem in bathing water
3: keep infected area dry
4: use clean cloths

Medications:
1: ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']

Workout Suggestions:
1: Avoid sugary foods
2: Consume probiotics
3: Increase intake of garlic
4: Include yogurt in diet
5: Limit processed foods
6: Stay hydrated
7: Consume green tea
8: Eat foods rich in zinc
9: Include turmeric in diet
10: Eat fruits and vegetables

Diet Suggestions:
1: ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']




In [29]:


symptoms_input = input("Enter your symptoms (comma-separated): ")
user_symptoms = [s.strip() for s in symptoms_input.split(',')]

predicted_disease, matched_symptoms = get_predicted_value(user_symptoms)

if isinstance(predicted_disease, str) and "No matching" in predicted_disease:
    print(predicted_disease)
else:
    desc, pre, med, die, wrkout = helper(predicted_disease)


    print(f"\nMatched Symptoms: {', '.join(matched_symptoms)}")
    print(f"\nPredicted Disease: {predicted_disease}")
    print(f"\nDescription: {desc}")

    print("\nPrecautions:")
    for i, p in enumerate(pre, start=1):
        print(f"{i}: {p}")

    print("\nMedications:")
    for i, m in enumerate(med, start=1):
        print(f"{i}: {m}")

    print("\nWorkout Suggestions:")
    for i, w in enumerate(wrkout, start=1):
        print(f"{i}: {w}")

    print("\nDiet Suggestions:")
    for i, d in enumerate(die, start=1):
        print(f"{i}: {d}")

Enter your symptoms (comma-separated): sweating, palpitations, anxiety

Matched Symptoms: sweating, palpitations, anxiety

Predicted Disease: Heart attack

Description: Heart attack is a sudden and severe reduction in blood flow to the heart muscle.

Precautions:

Medications:
1: ['Compression stockings', 'Exercise', 'Elevating the legs', 'Sclerotherapy', 'Laser treatments']

Workout Suggestions:
1: Follow a heart-healthy diet
2: Limit sodium intake
3: Include fiber-rich foods
4: Consume healthy fats
5: Include lean proteins
6: Limit sugary foods and beverages
7: Stay hydrated
8: Consult a healthcare professional
9: Follow medical recommendations
10: Engage in regular exercise

Diet Suggestions:
1: ['Heart-Healthy Diet', 'Low-sodium foods', 'Fruits and vegetables', 'Whole grains', 'Lean proteins']




In [None]:

symptoms_input = input("Enter your symptoms (comma-separated): ")
user_symptoms = [s.strip() for s in symptoms_input.split(',')]

predicted_disease, matched_symptoms = get_predicted_value(user_symptoms)

if isinstance(predicted_disease, str) and "No matching" in predicted_disease:
    print(predicted_disease)
else:
    desc, pre, med, die, wrkout = helper(predicted_disease)

    print(f"\nMatched Symptoms: {', '.join(matched_symptoms)}")
    print(f"\nPredicted Disease: {predicted_disease}")
    print(f"\nDescription: {desc}")

    print("\nPrecautions:")
    for i, p in enumerate(pre, start=1):
        print(f"{i}: {p}")

    print("\nMedications:")
    for i, m in enumerate(med, start=1):
        print(f"{i}: {m}")

    print("\nWorkout Suggestions:")
    for i, w in enumerate(wrkout, start=1):
        print(f"{i}: {w}")

    print("\nDiet Suggestions:")
    for i, d in enumerate(die, start=1):
        print(f"{i}: {d}")

Enter your symptoms (comma-separated): continuous_sneezing

Matched Symptoms: continuous_sneezing

Predicted Disease: Allergy

Description: Allergy is an immune system reaction to a substance in the environment.

Precautions:

Medications:
1: ['Antihistamines', 'Decongestants', 'Epinephrine', 'Corticosteroids', 'Immunotherapy']

Workout Suggestions:
1: Avoid allergenic foods
2: Consume anti-inflammatory foods
3: Include omega-3 fatty acids
4: Stay hydrated
5: Eat foods rich in vitamin C
6: Include quercetin-rich foods
7: Consume local honey
8: Limit processed foods
9: Include ginger in diet
10: Avoid artificial additives

Diet Suggestions:
1: ['Elimination Diet', 'Omega-3-rich foods', 'Vitamin C-rich foods', 'Quercetin-rich foods', 'Probiotics']




In [None]:
import pickle


with open('disease_prediction_model.pkl', 'wb') as model_file:
    pickle.dump(svc, model_file)

with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(le, le_file)


In [None]:

with open('disease_prediction_model.pkl', 'rb') as model_file:
    svc = pickle.load(model_file)

with open('label_encoder.pkl', 'rb') as le_file:
    le = pickle.load(le_file)
diseases_list = le.classes_

print("Model and Label Encoder Loaded Successfully!")


Model and Label Encoder Loaded Successfully!
