# In [None]:
# =====================================================
# Member 3 — Random Forest Model
# Diabetes Prediction Project (Supervised Learning)
# =====================================================

# 🧠 Objective:
# Predict whether a person has diabetes (1) or not (0)
# using the Pima Indians Diabetes Dataset.
#
# Model used here: Random Forest Classifier
# -----------------------------------------------------
# Works both on Local Machine and Google Colab
# =====================================================

# -----------------------------------------------------
# Step 1: Import required libraries
# -----------------------------------------------------
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score
)

import joblib
sns.set_style("whitegrid")

RANDOM_STATE = 42
MEMBER_NAME = "member3_rf"  # your personal folder name

# Create the output folders at the exact locations the later steps write to:
# the tuned model is dumped under ../models/<member>/ and the figures and the
# results table are saved under ../reports/<member>/. Creating them anywhere
# else makes the first save fail with FileNotFoundError on a fresh checkout.
os.makedirs(f"../models/{MEMBER_NAME}", exist_ok=True)
os.makedirs(f"../reports/{MEMBER_NAME}/figures", exist_ok=True)

# -----------------------------------------------------
# Step 2: Load the dataset
# -----------------------------------------------------
# Use the project-relative CSV when it exists (local run); otherwise fall
# back to a file in the working directory (e.g. uploaded to Google Colab).
local_csv = "../data/raw/diabetes.csv"
data_path = local_csv if os.path.exists(local_csv) else "diabetes.csv"

df = pd.read_csv(data_path)
print("✅ Dataset loaded successfully!")
print("Shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it explicitly so the preview is actually shown here.
print(df.head())

# -----------------------------------------------------
# Step 3: Quick data understanding
# -----------------------------------------------------
# Share of each Outcome class, expressed as a percentage of all rows.
outcome_pct = df["Outcome"].value_counts(normalize=True) * 100
print("\nClass distribution (Outcome):")
print(outcome_pct)

# Bar chart of the raw per-class row counts.
sns.countplot(data=df, x="Outcome")
plt.title("Class Distribution (0 = Non-Diabetic, 1 = Diabetic)")
plt.show()

# -----------------------------------------------------
# Step 4: Check for zeros that represent missing values
# -----------------------------------------------------
# Per-column count of cells that are exactly zero (all columns, target included).
zero_counts = df.eq(0).sum()
print("\nNumber of zero values in each column:\n", zero_counts)

# In these medical fields a zero is treated as "not recorded", so it is
# converted to NaN and left for the imputer in a later step.
zero_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
medical_fields = df[zero_missing]
df[zero_missing] = medical_fields.replace(to_replace=0, value=np.nan)

# Per-column count of missing cells after the conversion.
missing_counts = df.isna().sum()
print("\nMissing values after replacement:")
print(missing_counts)

# -----------------------------------------------------
# Step 5: Split the dataset into Train/Validation/Test
# -----------------------------------------------------
# First cut: 70% train, 30% held out; stratified on the target so the class
# ratio is kept in both parts.
train, temp = train_test_split(
    df,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=df['Outcome'],
)
# Second cut: the held-out 30% is halved into validation and test.
val, test = train_test_split(
    temp,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=temp['Outcome'],
)

print(f"Train size: {len(train)}, Validation size: {len(val)}, Test size: {len(test)}")

# Every column except the target is used as a model input.
feature_cols = df.columns.drop('Outcome').tolist()

# -----------------------------------------------------
# Step 6: Preprocess (impute missing + scale features)
# -----------------------------------------------------
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Both transformers are fitted on the training split only; the validation and
# test splits are transformed with the statistics learned from training data.
train_imputed = imputer.fit_transform(train[feature_cols])
X_train = scaler.fit_transform(train_imputed)

val_imputed = imputer.transform(val[feature_cols])
X_val = scaler.transform(val_imputed)

test_imputed = imputer.transform(test[feature_cols])
X_test = scaler.transform(test_imputed)

# Target vectors, one per split.
y_train = train["Outcome"]
y_val = val["Outcome"]
y_test = test["Outcome"]

print("✅ Preprocessing completed!")

# -----------------------------------------------------
# Step 7: Train a baseline Random Forest
# -----------------------------------------------------
# Untuned reference model: 200 trees, class weights balanced against the
# class frequencies, trained on all available cores.
baseline_params = {
    "n_estimators": 200,
    "class_weight": "balanced",
    "n_jobs": -1,
    "random_state": RANDOM_STATE,
}
rf_base = RandomForestClassifier(**baseline_params)
rf_base.fit(X_train, y_train)

# Score the baseline on the validation split.
y_val_pred = rf_base.predict(X_val)
baseline_report = classification_report(y_val, y_val_pred)
print("\n🔹 Baseline Validation Performance:\n")
print(baseline_report)

# -----------------------------------------------------
# Step 8: Hyperparameter tuning using RandomizedSearchCV
# -----------------------------------------------------
# Candidate values for each Random Forest setting explored by the search.
param_dist = dict(
    n_estimators=[100, 200, 300, 500, 800],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.2, 0.5],
    class_weight=['balanced', 'balanced_subsample'],
)

# 5-fold stratified, shuffled cross-validation on the training split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Try 40 sampled combinations, ranked by ROC-AUC. The baseline forest is the
# template estimator; the settings listed in param_dist are varied on top of it.
rnd_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_dist,
    n_iter=40,
    scoring="roc_auc",
    cv=cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=2,
)

print("\n⏳ Running hyperparameter tuning (this may take a few minutes)...")
rnd_search.fit(X_train, y_train)

print("\n✅ Best Parameters Found:")
print(rnd_search.best_params_)
best_rf = rnd_search.best_estimator_

# Save model
model_path = f"../models/{MEMBER_NAME}/rf_best.joblib"
joblib.dump(best_rf, model_path)

# -----------------------------------------------------
# Step 9: Evaluate the model on the Test set
# -----------------------------------------------------
# Second probability column is used as the positive-class (Outcome = 1) score;
# hard labels come from a fixed 0.5 cut-off on that score.
test_proba_all = best_rf.predict_proba(X_test)
y_test_proba = test_proba_all[:, 1]
y_test_pred = (y_test_proba >= 0.5).astype(int)

# Threshold-free ranking metrics computed from the scores.
roc_auc = roc_auc_score(y_test, y_test_proba)
ap_score = average_precision_score(y_test, y_test_proba)

test_report = classification_report(y_test, y_test_pred)
print("\n🔹 Test Set Classification Report:\n")
print(test_report)

print(f"ROC-AUC Score: {roc_auc:.3f}")
print(f"Average Precision (PR AUC): {ap_score:.3f}")

# -----------------------------------------------------
# Step 10: Visualization - Confusion Matrix
# -----------------------------------------------------
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_test_pred)

# Draw the matrix as an annotated heatmap on its own 5x4 inch figure,
# save it to the member's figures folder, then display it.
fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax_cm)
ax_cm.set_title("Confusion Matrix (Test Set)")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
fig_cm.savefig(f"../reports/{MEMBER_NAME}/figures/confusion_matrix.png")
plt.show()

# -----------------------------------------------------
# Step 11: ROC Curve
# -----------------------------------------------------
# False/true positive rates over all score thresholds on the test split.
fpr, tpr, _ = roc_curve(y_test, y_test_proba)

# Start a fresh figure explicitly. Without it the curve is drawn on whatever
# figure is current, which is only empty when the previous plt.show() happened
# to close its figure (not the case for non-interactive/non-blocking backends).
plt.figure()
plt.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
plt.plot([0, 1], [0, 1], "--")  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest (Member 3)")
plt.legend()
plt.savefig(f"../reports/{MEMBER_NAME}/figures/roc_curve.png")
plt.show()

# -----------------------------------------------------
# Step 12: Precision-Recall Curve
# -----------------------------------------------------
# Precision/recall pairs over all score thresholds on the test split.
precision, recall, _ = precision_recall_curve(y_test, y_test_proba)

# Start a fresh figure explicitly so the curve never lands on axes left over
# from an earlier plot when plt.show() did not close the previous figure.
plt.figure()
plt.plot(recall, precision, label=f"AP={ap_score:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve - Random Forest (Member 3)")
plt.legend()
plt.savefig(f"../reports/{MEMBER_NAME}/figures/pr_curve.png")
plt.show()

# -----------------------------------------------------
# Step 13: Feature Importances
# -----------------------------------------------------
importances = best_rf.feature_importances_
# Positions of the (at most) 10 largest importances, most important first.
indices = np.argsort(importances)[::-1][:10]
# barh draws its first entry at the bottom, so reverse the order to put the
# most important feature at the top of the chart.
plot_order = indices[::-1]

# Start a fresh figure explicitly so the bars never land on axes left over
# from an earlier plot when plt.show() did not close the previous figure.
plt.figure()
plt.barh(np.array(feature_cols)[plot_order], importances[plot_order])
plt.title("Top 10 Feature Importances - Random Forest (Member 3)")
plt.xlabel("Importance Score")
# Fit the margins to the tick labels; with default margins long feature names
# on the y-axis can be cut off in the saved PNG.
plt.tight_layout()
plt.savefig(f"../reports/{MEMBER_NAME}/figures/feature_importances.png")
plt.show()

# -----------------------------------------------------
# Step 14: Save metrics summary
# -----------------------------------------------------
# One-row table: the two test-set metrics plus the winning hyperparameters
# (the parameter dict is stored in a single cell).
summary = {
    "roc_auc": float(roc_auc),
    "average_precision": float(ap_score),
    "best_params": rnd_search.best_params_
}
pd.DataFrame([summary]).to_csv(f"../reports/{MEMBER_NAME}/results_table.csv", index=False)

print("\n✅ Evaluation completed successfully!")
# Report the folder the figures and the table were really written to
# (one level above the working directory), not a cwd-relative "reports/".
print(f"All plots & results saved in: ../reports/{MEMBER_NAME}/")
