In [1]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score



In [2]:
# Load the two CSV inputs: per-disease precautions and per-row disease/symptom records.
PRECAUTION_CSV = "Disease precaution.csv"
SYMPTOMS_CSV = "DiseaseAndSymptoms.csv"

df_precaution, df_symptoms = (pd.read_csv(csv_path) for csv_path in (PRECAUTION_CSV, SYMPTOMS_CSV))

In [3]:
# Normalize column names and every string cell (strip whitespace, lowercase)
df_symptoms.columns = df_symptoms.columns.str.strip().str.lower()


def _normalize_cell(value):
    """Strip and lowercase string cells; return any other value (e.g. NaN) unchanged."""
    return value.strip().lower() if isinstance(value, str) else value


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Look the method up on the class (not the instance) so a column that happens to
# be called "map" cannot shadow it, and fall back to applymap on older pandas.
_elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
df_symptoms = _elementwise(df_symptoms, _normalize_cell)

  df_symptoms = df_symptoms.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)


In [4]:
# Function to clean symptom names
def clean_symptom(symptom):
    """Return a canonical form of a symptom name.

    The text is lowercased, underscores and any whitespace run become a single
    space, every character other than ``a-z``, ``0-9`` and space is removed,
    and leading/trailing spaces are stripped.

    Args:
        symptom: Raw symptom text (str), e.g. ``" Skin_Rash"``.

    Returns:
        The cleaned symptom string, e.g. ``"skin rash"``.
    """
    # Lowercase first: the character filter below only keeps a-z, so without
    # this an input such as "Fever" would silently lose its capital letter.
    text = re.sub(r'[\s_]+', ' ', symptom.lower())
    text = re.sub(r'[^a-z0-9 ]', '', text)
    # Removing punctuation can leave adjacent spaces ("a - b" -> "a  b").
    return re.sub(r' +', ' ', text).strip()

In [5]:
# Collect the distinct cleaned symptom names from every column after the first.
# Non-string cells (e.g. NaN padding) are ignored.
unique_symptoms = set()
for raw_value in df_symptoms.iloc[:, 1:].values.flatten():
    if isinstance(raw_value, str):
        unique_symptoms.add(clean_symptom(raw_value))

In [6]:
# Convert symptoms to feature matrix columns.
# Sort the names: iterating a set of strings can yield a different order on every
# interpreter run, which would make the feature-column layout (and therefore the
# saved model's expected input) irreproducible.
symptom_list = sorted(unique_symptoms)
# Maps each symptom name to its column position in the feature matrix.
symptom_index = {symptom: i for i, symptom in enumerate(symptom_list)}

In [7]:
# Prepare data for model training:
#   X[r, c] == 1 when row r of df_symptoms lists symptom_list[c]; otherwise 0.
#   y holds the raw 'disease' value of each row.
X = np.zeros((len(df_symptoms), len(symptom_list)), dtype=int)
y = df_symptoms['disease'].values

# Enumerate positionally instead of using iterrows() index labels, so rows land
# in the right place in X even if the frame's index is not 0..n-1.
symptom_cells = df_symptoms.iloc[:, 1:].to_numpy()
for row_pos, row_values in enumerate(symptom_cells):
    for value in row_values:
        # Skip non-string cells (e.g. NaN) rather than looking up the text "nan".
        if not isinstance(value, str):
            continue
        column = symptom_index.get(clean_symptom(value))
        if column is not None:
            X[row_pos, column] = 1


In [8]:
# Remove NaN disease entries.
# Select the surviving rows with a boolean mask: slicing X and y to the new length
# would only chop off the tail and leave features and labels misaligned with the
# rows that dropna() actually kept.
has_disease = df_symptoms['disease'].notna().to_numpy()
df_symptoms_cleaned = df_symptoms.dropna(subset=['disease']).reset_index(drop=True)
X_cleaned = X[has_disease]
y_cleaned = y[has_disease]
assert len(X_cleaned) == len(y_cleaned) == len(df_symptoms_cleaned)

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Encode labels as integers.
# Fit on the labels of both splits: an encoder fitted on y_train alone raises
# ValueError in transform() if a disease happens to occur only in the test split.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate([y_train, y_test]))
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [11]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train_encoded)

In [12]:
# Score the fitted model on the held-out split and report accuracy as a percentage.
test_predictions = model.predict(X_test)
accuracy = accuracy_score(y_test_encoded, test_predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 100.00%


In [13]:
# Function to match user symptoms
def match_symptom(user_symptom, threshold=80):
    """Map a user-typed symptom onto the closest known symptom name.

    Args:
        user_symptom: Symptom text to look up in ``symptom_list``.
        threshold: Fuzzy-match score that must be strictly exceeded for the
            match to be accepted. Defaults to 80.

    Returns:
        The best-matching entry of ``symptom_list`` when its score is above
        ``threshold``; otherwise ``user_symptom`` unchanged.
    """
    # Nothing to match (empty query or empty vocabulary): return the input as-is
    # instead of failing while unpacking a missing result.
    if not user_symptom or not symptom_list:
        return user_symptom

    best = process.extractOne(user_symptom, symptom_list)
    if best is None:
        return user_symptom

    match, score = best[0], best[1]
    return match if score > threshold else user_symptom

In [14]:
# Function to predict disease from user symptoms
def predict_disease(user_symptoms):
    """Predict a disease from symptom names and look up its precautions.

    Args:
        user_symptoms: Iterable of symptom strings. A single comma-separated
            string is also accepted and split on commas.

    Returns:
        Tuple ``(predicted_disease, precautions)`` where ``precautions`` is a
        list of the string cells that follow the disease name in the first
        matching row of ``df_precaution`` (empty when no row matches).

    Note:
        Symptoms that are not in ``symptom_index`` after cleaning and fuzzy
        matching are ignored. If none are recognised, the model is still
        queried with an all-zero feature vector.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(user_symptoms, str):
        user_symptoms = user_symptoms.split(",")

    cleaned = (clean_symptom(symptom) for symptom in user_symptoms)
    # Drop entries that are empty after cleaning (e.g. from a trailing comma).
    user_symptoms_cleaned = [match_symptom(symptom) for symptom in cleaned if symptom]

    # Create feature vector
    user_input_vector = np.zeros(len(symptom_list), dtype=int)
    for symptom in user_symptoms_cleaned:
        column = symptom_index.get(symptom)
        if column is not None:
            user_input_vector[column] = 1

    # Predict disease
    predicted_label = model.predict(user_input_vector.reshape(1, -1))[0]
    predicted_disease = label_encoder.inverse_transform([predicted_label])[0]

    # Get precautions from the first matching row only: flattening every matching
    # row would mix later rows' disease names into the precaution list.
    disease_names = df_precaution["Disease"].str.strip().str.lower()
    matching_rows = df_precaution[disease_names == predicted_disease.strip().lower()]
    if matching_rows.empty:
        precautions = []
    else:
        precautions = [p for p in matching_rows.iloc[0, 1:].tolist() if isinstance(p, str)]

    return predicted_disease, precautions

In [15]:
# User input function
def get_user_input_and_predict():
    """Prompt for comma-separated symptoms, then print the prediction.

    Reads one line from standard input, lowercases it, splits it on commas,
    passes the non-empty symptoms to ``predict_disease`` and prints the
    predicted disease followed by a numbered list of precautions.
    """
    user_input = input("Enter symptoms separated by commas (e.g., fever, headache, nausea): ").strip().lower()
    user_symptoms = [symptom.strip() for symptom in user_input.split(",") if symptom.strip()]

    if not user_symptoms:
        print("No symptoms entered.")
        return

    predicted_disease, precautions = predict_disease(user_symptoms)

    print(f"\nPredicted Disease: {predicted_disease}")
    print("Recommended Precautions:")
    for i, precaution in enumerate(precautions, start=1):
        print(f"{i}. {precaution}")


# Run the prediction system.
# Left commented out so that "Restart & Run All" does not block waiting on stdin;
# uncomment to try it interactively.
# get_user_input_and_predict()

Enter symptoms separated by commas (e.g., fever, headache, nausea):  skin rash, vomating, acidity



Predicted Disease: acne
Recommended Precautions:
1. bath twice
2. avoid fatty spicy food
3. drink plenty of water
4. avoid too many products


In [None]:
import pickle

# Persist the trained classifier. Use a context manager so the file handle is
# flushed and closed even if pickling fails.
# NOTE(review): only `model` is saved; a consumer of this file presumably also
# needs `symptom_list` and `label_encoder` to build inputs and decode predictions
# -- confirm how the pickle is loaded downstream.
with open('DiseasePredictionAndPrecaution.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)