In [None]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# === Load CSV ===
# Forward slashes are portable (they also work on Windows) and match the other
# relative paths used in this script; a backslash path only resolves on
# Windows and contains invalid escape sequences such as "\d".
df = pd.read_csv("../data/dataset.csv")

# === Get all unique symptoms ===
# Assumes the first column holds the label ('Disease') and every remaining
# column is a symptom slot.
symptom_cols = df.columns[1:]
all_symptoms = set()
for col in symptom_cols:
    # astype(str) keeps the .str accessor usable for all-NaN (float) or
    # non-string columns, and mirrors the str(s) normalisation that
    # encode_symptoms applies to each cell.
    all_symptoms.update(df[col].dropna().astype(str).str.strip().str.lower())
# Sorted so the feature column order is deterministic across runs.
all_symptoms = sorted(all_symptoms)

# === Create binary encoded symptom columns ===
def encode_symptoms(row, symptoms=None):
    """Return a 0/1 indicator list marking which known symptoms occur in ``row``.

    Args:
        row: Iterable of raw symptom cells (for example one DataFrame row).
            Missing cells (``None``/``NaN``) are skipped; every other cell is
            stringified, stripped and lower-cased before matching.
        symptoms: Ordered symptom vocabulary to encode against. When omitted,
            the module-level ``all_symptoms`` list is used, so existing
            single-argument callers (e.g. ``DataFrame.apply``) keep working.

    Returns:
        A list with one entry per vocabulary symptom, in vocabulary order:
        1 if the symptom is present in ``row``, otherwise 0.
    """
    if symptoms is None:
        symptoms = all_symptoms
    # Build the set once per row so each vocabulary lookup is O(1).
    present_symptoms = {str(s).strip().lower() for s in row if pd.notna(s)}
    return [int(symptom in present_symptoms) for symptom in symptoms]

# Build the binary feature matrix: one row per sample, one 0/1 column per symptom.
X = df[symptom_cols].apply(encode_symptoms, axis=1, result_type="expand")
X.columns = all_symptoms

# === Encode target ===
le = LabelEncoder()
y = le.fit_transform(df['Disease'])

# === Split data ===
# Split BEFORE scaling so the held-out rows never influence the scaler's
# statistics (fitting the scaler on the full dataset leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === Feature Scaling ===
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics
# Kept for any later cell that expects the fully scaled matrix.
X_scaled = scaler.transform(X)

# === Hyperparameter Tuning ===
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
clf = grid_search.best_estimator_

# === Cross-validation Score ===
# NOTE: this runs on the same training data that was used to select the
# hyperparameters, so the estimate is optimistic; the held-out test metrics
# below are the independent check.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

# === Predict & Evaluate ===
y_pred = clf.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# === Save model & label encoder ===
# Create the output directory up front so a missing folder cannot discard the
# result of the (expensive) grid search at the very last step.
os.makedirs("../models", exist_ok=True)
joblib.dump(clf, "../models/random_forest.pkl")
joblib.dump(le, "../models/label_encoder.pkl")
joblib.dump(all_symptoms, "../models/symptom_list.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Cross-validation scores: [1. 1. 1. 1. 1.]
Mean CV score: 1.0
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        23
           6       1.00      1.00      1.00        33
           7       1.00      1.00      1.00        23
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00        23
          11       1.00      1.00      1.00        26
          12       1.00      1.00      1.00        21

['../models/scaler.pkl']