In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
dataset=pd.read_csv('symtoms_df.csv')

In [4]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4
0,0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches
1,1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,
2,2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,
3,3,Fungal infection,itching,skin_rash,dischromic _patches,
4,4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,


In [5]:
dataset.shape

(4920, 6)

In [6]:
dataset['Symptom_4'].fillna('',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Symptom_4'].fillna('',inplace=True)


In [7]:
dataset['Symptoms']=dataset['Symptom_1']+','+dataset['Symptom_2']+','+dataset['Symptom_3']+','+dataset['Symptom_4']

In [8]:
dataset=dataset[['Symptoms','Disease']]

In [9]:
dataset['Symptoms']=dataset['Symptoms'].str.replace('_',' ')

In [10]:
print(dataset.tail(10).to_markdown())

|      | Symptoms                                                                               | Disease                                 |
|-----:|:---------------------------------------------------------------------------------------|:----------------------------------------|
| 4910 | fatigue, weight gain, cold hands and feets, mood swings                                | Hypothyroidism                          |
| 4911 | fatigue, mood swings, weight loss, restlessness                                        | Hyperthyroidism                         |
| 4912 | vomiting, fatigue, anxiety, sweating                                                   | Hypoglycemia                            |
| 4913 | joint pain, neck pain, knee pain, hip joint pain                                       | Osteoarthristis                         |
| 4914 | muscle weakness, stiff neck, swelling joints, movement stiffness                       | Arthritis                               |
| 4915 | vomiting, h

In [11]:
dataset['Disease'].nunique()

41

In [12]:
dataset.head()

Unnamed: 0,Symptoms,Disease
0,"itching, skin rash, nodal skin eruptions, disc...",Fungal infection
1,"skin rash, nodal skin eruptions, dischromic ...",Fungal infection
2,"itching, nodal skin eruptions, dischromic pat...",Fungal infection
3,"itching, skin rash, dischromic patches,",Fungal infection
4,"itching, skin rash, nodal skin eruptions,",Fungal infection


In [13]:
print(dataset[dataset['Disease']=='Diabetes '].head().to_markdown())

|    | Symptoms                                                   | Disease   |
|---:|:-----------------------------------------------------------|:----------|
| 70 | fatigue, weight loss, restlessness, lethargy               | Diabetes  |
| 71 | fatigue, weight loss, restlessness, lethargy               | Diabetes  |
| 72 | weight loss, restlessness, lethargy, irregular sugar level | Diabetes  |
| 73 | fatigue, restlessness, lethargy, irregular sugar level     | Diabetes  |
| 74 | fatigue, weight loss, lethargy, irregular sugar level      | Diabetes  |


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [15]:
df = pd.DataFrame(dataset)

# Tokenize the symptom texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Symptoms'])
sequences = tokenizer.texts_to_sequences(df['Symptoms'])
word_index = tokenizer.word_index

# Pad sequences to ensure uniform input length
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# Encode Disease labels into one-hot vectors
label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(df['Disease'])
categorical_labels = to_categorical(integer_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, categorical_labels, test_size=0.2, random_state=42)

# --- Step 2: Build the Model ---
vocab_size = len(word_index) + 1  # +1 for the padding token
embedding_dim = 100  # Larger embedding dimension for richer representation

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# --- Step 3: Train the Model ---
# Define callbacks for early stopping and model checkpointing
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint("best_model.keras", monitor='val_loss', save_best_only=True)
]

history = model.fit(
    X_train, y_train,
    epochs=50,                # Use more epochs for a real dataset
    batch_size=4,             # Adjust batch size based on dataset and resources
    validation_split=0.2,     # Reserve a portion of training data for validation
    callbacks=callbacks,
    verbose=1
)

# --- Step 4: Evaluate the Model ---
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)




Epoch 1/50
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.1543 - loss: 3.1677 - val_accuracy: 0.6637 - val_loss: 1.3536
Epoch 2/50
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.6376 - loss: 1.3706 - val_accuracy: 0.8820 - val_loss: 0.6006
Epoch 3/50
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.8040 - loss: 0.7881 - val_accuracy: 0.9391 - val_loss: 0.2775
Epoch 4/50
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.9051 - loss: 0.5181 - val_accuracy: 0.9822 - val_loss: 0.1601
Epoch 5/50
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.9257 - loss: 0.4128 - val_accuracy: 0.9772 - val_loss: 0.1513
Epoch 6/50
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.9394 - loss: 0.3041 - val_accuracy: 0.9848 - val_loss: 0.0867
Epoch 7/50
[1m787/787

In [17]:
# --- Step 5: Predict on a Sample Input ---
# Example sample input symptoms (note: this string is preprocessed in the same way as training data)
sample_input = "Fungal infection,itching, skin_rash, nodal_skin_eruptions"

# Tokenize and pad the sample input
sample_seq = tokenizer.texts_to_sequences([sample_input])
sample_padded = pad_sequences(sample_seq, maxlen=max_seq_length, padding='post')

# Predict the probabilities for each disease
predictions = model.predict(sample_padded)

# Get the index of the highest probability disease
predicted_class = np.argmax(predictions, axis=1)

# Convert the predicted index back to the disease label
predicted_disease = label_encoder.inverse_transform(predicted_class)

print("Input Symptoms:", sample_input)
print("Predicted Disease:", predicted_disease[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Input Symptoms: Fungal infection,itching, skin_rash, nodal_skin_eruptions
Predicted Disease: Fungal infection


In [18]:
symptoms = pd.read_csv("symtoms_df.csv")
precautions = pd.read_csv("precautions_df.csv")
workout = pd.read_csv("workout_df.csv")
description = pd.read_csv("description.csv")
medications = pd.read_csv('medications.csv')
diets = pd.read_csv("diets.csv")

In [34]:
description

Unnamed: 0,Disease,Description
0,Fungal infection,Fungal infection is a common skin condition ca...
1,Allergy,Allergy is an immune system reaction to a subs...
2,GERD,GERD (Gastroesophageal Reflux Disease) is a di...
3,Chronic cholestasis,Chronic cholestasis is a condition where bile ...
4,Drug Reaction,Drug Reaction occurs when the body reacts adve...
5,Peptic ulcer disease,Peptic ulcer disease involves sores that devel...
6,AIDS,AIDS (Acquired Immunodeficiency Syndrome) is a...
7,Diabetes,Diabetes is a chronic condition that affects h...
8,Gastroenteritis,Gastroenteritis is an inflammation of the stom...
9,Bronchial Asthma,Bronchial Asthma is a respiratory condition ch...


In [38]:
print(type(predicted_disease))
print(description['Disease'].dtype)


<class 'numpy.ndarray'>
object


In [36]:
def mapping(sample_dis):
    desc = description[description['Disease'] == sample_dis]['Description']
    desc = " ".join([w for w in desc])

    pre = precautions[precautions['Disease'] == sample_dis][['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']]
    pre = [col for col in pre.values]

    med = medications[medications['Disease'] == sample_dis]['Medication']
    med = [med for med in med.values]

    die = diets[diets['Disease'] == sample_dis]['Diet']
    die = [die for die in die.values]

    wrkout = workout[workout['disease'] == sample_dis] ['workout']


    return desc,pre,med,die,wrkout

In [None]:
desc, pre, med, die, wrkout = mapping(predicted_disease[0])

print("predicted disease:")
print(predicted_disease[0])
print("description:")
print(desc)

print("precautions:")
for i in pre[0]:
    print(i)   

print("medications:")
for i in med:
    print( i)
    

print("workout:")
for i in wrkout:
    print(i)
    

print("diets:")
for i in die:
    print(i)
    

predicted disease:
Fungal infection
description:
Fungal infection is a common skin condition caused by fungi.
precautions:
bath twice
use detol or neem in bathing water
keep infected area dry
use clean cloths
medications:
['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']
workout:
Avoid sugary foods
Consume probiotics
Increase intake of garlic
Include yogurt in diet
Limit processed foods
Stay hydrated
Consume green tea
Eat foods rich in zinc
Include turmeric in diet
Eat fruits and vegetables
diets:
['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']
