## IMPORTING MODULES

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation and data-quality warnings;
# consider narrowing it to a specific category once the noisy source is known.
warnings.filterwarnings("ignore")

## LOAD DATA

In [9]:
# Read the four CSV inputs. A missing file is re-raised with a hint about the
# expected folder; calling exit() here would shut the notebook kernel down and
# throw away the underlying error.
try:
    df = pd.read_csv("datasets/categorized_dataset.csv")
    df_weights = pd.read_csv("datasets/Symptom_Weights.csv", header=None, names=["Symptom", "Weight"])
    df_desc = pd.read_csv("datasets/symptom_Description.csv")
    df_prec = pd.read_csv("datasets/symptom_precaution.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: Files not found. Ensure all CSV files are in the 'datasets' folder. "
        f"Original error: {err}"
    ) from err

# Clean Strings
# Every column of the main table is cast to str before stripping, so missing
# cells become the literal text 'nan'; the training-data loop filters on that
# exact sentinel.
for col in df.columns:
    df[col] = df[col].astype(str).str.strip()

df_weights["Symptom"] = df_weights["Symptom"].str.strip()
df_desc["Disease"] = df_desc["Disease"].str.strip()
df_prec["Disease"] = df_prec["Disease"].str.strip()

# Create Maps
# symptom name -> numeric weight
weight_map = dict(zip(df_weights["Symptom"], df_weights["Weight"]))
# disease name -> free-text description
description_map = dict(zip(df_desc["Disease"], df_desc["Description"]))
# disease name -> [Precaution_1, ..., Precaution_4]
# NOTE(review): empty precaution cells presumably stay NaN in these lists — confirm
# whether consumers expect them to be filtered out.
precaution_map = {
    row["Disease"]: [row[f"Precaution_{i}"] for i in range(1, 5)]
    for _, row in df_prec.iterrows()
}

## FEATURE ENGINEERING: SORTED WEIGHTED VECTOR

In [10]:
def get_encoded_vector(symptoms):
    """Encode symptoms as a single row of 17 weights, heaviest first.

    Each symptom is stripped of surrounding whitespace and looked up in the
    module-level ``weight_map``; names without an entry are ignored. The
    weights found are sorted in descending order, cut to at most 17 values and
    right-padded with zeros.

    Args:
        symptoms: Iterable of symptom name strings.

    Returns:
        NumPy array of shape (1, 17).
    """
    stripped = (name.strip() for name in symptoms)
    ranked = sorted(
        (weight_map[name] for name in stripped if name in weight_map),
        reverse=True,
    )

    # Keep the 17 heaviest weights and zero-fill whatever slots remain.
    head = ranked[:17]
    padded = head + [0] * (17 - len(head))
    return np.array(padded).reshape(1, -1)

# Build the training matrix, the label vector and a per-disease symptom index.
# Column layout of df: Disease, Severity, Symptom_1 ... Symptom_17
symptom_columns = df.columns[2:]

X_data, y_data = [], []

# Disease -> set of every symptom seen for it (used later to validate predictions)
disease_symptom_lookup = {}

for _, record in df.iterrows():
    disease = record["Disease"]
    # 'nan' is the string left behind by astype(str) for empty cells.
    present = [record[c] for c in symptom_columns if record[c] != 'nan']

    X_data.append(get_encoded_vector(present)[0])
    y_data.append(disease)
    disease_symptom_lookup.setdefault(disease, set()).update(present)

X = np.array(X_data)
y = np.array(y_data)

# Map disease names to integer class ids for the classifiers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

## SEVERITY LOGIC

In [12]:
# Total symptom weight of each training row; the 33rd and 66th percentiles of
# these totals are the cut-offs that get_severity compares a score against.
row_scores = [sum(features) for features in X]
t1, t2 = (np.percentile(row_scores, q) for q in (33, 66))

def get_severity(symptom_list):
    """Classify the combined weight of a symptom list.

    Each name is stripped and looked up in ``weight_map`` (unknown names count
    as 0). The summed score is compared with the module-level thresholds
    ``t1`` and ``t2``.

    Returns:
        Tuple ``(label, score)`` where label is "Unknown" (score of 0),
        "Mild" (score <= t1), "Moderate" (score <= t2) or "Severe".
    """
    score = sum(weight_map.get(name.strip(), 0) for name in symptom_list)

    if score == 0:
        return "Unknown", 0

    if score <= t1:
        label = "Mild"
    elif score <= t2:
        label = "Moderate"
    else:
        label = "Severe"
    return label, score

## MODEL TRAINING

In [13]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): X_test / y_test are not used by any cell visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# k-NN over the 5 nearest training rows, measured with Manhattan distance.
knn = KNeighborsClassifier(n_neighbors=5, metric='manhattan').fit(X_train, y_train)

# Entropy-based decision tree with depth capped at 15.
# fit() stays as the final expression so the cell still displays the estimator.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=15, random_state=42)
dt.fit(X_train, y_train)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


## PREDICTION WITH VALIDATION (Top 3 Logic)

In [14]:
def _aligned_probs(model, vector, n_classes):
    """Return ``model``'s class probabilities indexed by encoded label.

    ``predict_proba`` orders its columns by ``model.classes_``, which holds only
    the labels that were present when the model was fitted. Scattering the
    columns into a zero-filled array of length ``n_classes`` makes position
    ``i`` always mean encoded label ``i``, so the result can be averaged across
    models and handed to ``le.inverse_transform``.
    """
    aligned = np.zeros(n_classes)
    aligned[np.asarray(model.classes_, dtype=int)] = model.predict_proba(vector)[0]
    return aligned


def predict_disease(user_symptoms):
    """Predict the most likely diseases for a list of symptom names.

    Steps:
      1. Normalise the input (strip whitespace, drop duplicates, ignore
         non-strings) and keep only names present in ``weight_map``.
      2. Encode the symptoms with ``get_encoded_vector``.
      3. Average the class probabilities of the decision tree and k-NN models.
      4. Walk the diseases from most to least probable and keep the first
         three whose known symptom set shares at least one symptom with the
         input; fall back to the top-ranked disease if none overlap.
      5. Attach description, precautions and severity for the first pick.

    Args:
        user_symptoms: Iterable of symptom names. A single string is treated
            as a one-element list; ``None`` is treated as empty.

    Returns:
        dict with keys "Top 3 Predictions", "Final Predicted Disease",
        "Severity", "Score", "Description", "Precautions", "Note",
        "kNN Prediction" and "Decision Tree Prediction"; or
        ``{"Error": ...}`` when no supplied symptom is known.
    """
    # 1. Input Validation
    if user_symptoms is None:
        user_symptoms = []
    elif isinstance(user_symptoms, str):
        # A bare string would otherwise be iterated character by character.
        user_symptoms = [user_symptoms]

    # Strip here as well so validation agrees with get_encoded_vector and
    # get_severity, which both strip; dict.fromkeys removes repeats (keeping
    # order) so a symptom entered twice is not weighted twice.
    cleaned = (s.strip() for s in user_symptoms if isinstance(s, str))
    valid_symptoms = list(dict.fromkeys(s for s in cleaned if s in weight_map))
    if not valid_symptoms:
        return {"Error": "No matching symptoms found. Check spelling."}

    # 2. Vectorize
    vector = get_encoded_vector(valid_symptoms)

    # 3. Get Model Predictions, aligned to the label encoder's index space
    n_classes = len(le.classes_)
    dt_probs = _aligned_probs(dt, vector, n_classes)
    knn_probs = _aligned_probs(knn, vector, n_classes)

    # Raw Model Labels (for reference)
    knn_pred_label = str(le.inverse_transform([knn_probs.argmax()])[0])
    dt_pred_label = str(le.inverse_transform([dt_probs.argmax()])[0])

    # Combined Probabilities (Voting)
    final_probs = (dt_probs + knn_probs) / 2

    # Get All Candidates Sorted by Probability (most likely first)
    top_indices = final_probs.argsort()[::-1]
    top_diseases = [str(name) for name in le.inverse_transform(top_indices)]

    # 4. LOGICAL VALIDATION (Find Top 3 Valid)
    # Scan the ranked list and keep the first 3 diseases that actually list
    # at least one of the user's symptoms.
    top_3_diseases = []
    for disease in top_diseases:
        disease_symptoms = disease_symptom_lookup.get(disease, set())
        if any(s in disease_symptoms for s in valid_symptoms):
            top_3_diseases.append(disease)
            if len(top_3_diseases) >= 3:
                break

    # Fallback: if no candidate overlaps, default to the top model prediction
    if not top_3_diseases:
        top_3_diseases.append(top_diseases[0])

    final_disease = top_3_diseases[0]

    # 5. Get Details
    description = description_map.get(final_disease, "No description available")
    precautions = precaution_map.get(final_disease, ["No precautions available"])
    severity_label, score = get_severity(valid_symptoms)

    note = ""
    if len(valid_symptoms) < 3:
        note = "Note: Few symptoms provided. Diagnosis might be generic (e.g., Mild Cold)."

    return {
        "Top 3 Predictions": top_3_diseases,
        "Final Predicted Disease": final_disease,
        "Severity": severity_label,
        "Score": score,
        "Description": description,
        "Precautions": precautions,
        "Note": note,
        "kNN Prediction": knn_pred_label,  # Included for reference as requested
        "Decision Tree Prediction": dt_pred_label
    }

## TESTING

In [15]:
# Smoke tests: print the full prediction dict for a few hand-picked inputs.
smoke_cases = (
    ("\n--- Test: Headache (Should NOT be UTI) ---", ["headache"]),
    ("\n--- Test: Fever (Ambiguous) ---", ["high_fever"]),
    ("\n--- Test: Complex ---", ["headache", "chest_pain", "dizziness"]),
)

for banner, symptoms in smoke_cases:
    print(banner)
    print(predict_disease(symptoms))


--- Test: Headache (Should NOT be UTI) ---
{'Top 3 Predictions': ['Common Cold', 'Dengue', 'Chicken pox'], 'Final Predicted Disease': 'Common Cold', 'Severity': 'Mild', 'Score': 3, 'Description': "The common cold is a viral infection of your nose and throat (upper respiratory tract). It's usually harmless, although it might not feel that way. Many types of viruses can cause a common cold.", 'Precautions': ['drink vitamin c rich drinks', 'take vapour', 'avoid cold food', 'keep fever in check'], 'Note': 'Note: Few symptoms provided. Diagnosis might be generic (e.g., Mild Cold).', 'kNN Prediction': 'Fungal infection', 'Decision Tree Prediction': 'Fungal infection'}

--- Test: Fever (Ambiguous) ---
{'Top 3 Predictions': ['Common Cold', 'Dengue', 'Chicken pox'], 'Final Predicted Disease': 'Common Cold', 'Severity': 'Mild', 'Score': 3, 'Description': "The common cold is a viral infection of your nose and throat (upper respiratory tract). It's usually harmless, although it might not feel tha

In [16]:
import os

import joblib

# Persist the trained models and every lookup table built above.
# joblib.dump does not create missing folders, so make sure 'models/' exists
# first; exist_ok keeps re-running this cell harmless.
os.makedirs("models", exist_ok=True)

artifacts = {
    "models/knn_model.pkl": knn,
    "models/decision_tree_model.pkl": dt,
    "models/label_encoder.pkl": le,
    # The symptom -> weight table is stored under the name 'severity_map'.
    "models/severity_map.pkl": weight_map,
    "models/description_map.pkl": description_map,
    "models/precaution_map.pkl": precaution_map,
    "models/disease_symptom_lookup.pkl": disease_symptom_lookup,
    "models/severity_thresholds.pkl": {"t1": t1, "t2": t2},
}
for path, obj in artifacts.items():
    joblib.dump(obj, path)

print("✅ Models, mappings, and validation data saved successfully to 'models/' folder!")

✅ Models, mappings, and validation data saved successfully to 'models/' folder!


In [18]:
# Ad-hoc check: run the full prediction pipeline on a single symptom.
test_symptoms = ["mild_fever"]

# predict_disease returns a dict of predictions and details (or an "Error" entry).
result = predict_disease(test_symptoms)
print(result)

{'Top 3 Predictions': ['hepatitis A', 'Chicken pox', 'Tuberculosis'], 'Final Predicted Disease': 'hepatitis A', 'Severity': 'Mild', 'Score': 3, 'Description': "Hepatitis A is a highly contagious liver infection caused by the hepatitis A virus. The virus is one of several types of hepatitis viruses that cause inflammation and affect your liver's ability to function.", 'Precautions': ['Consult nearest hospital', 'wash hands through', 'avoid fatty spicy food', 'medication'], 'Note': 'Note: Few symptoms provided. Diagnosis might be generic (e.g., Mild Cold).', 'kNN Prediction': 'Fungal infection', 'Decision Tree Prediction': 'Fungal infection'}
