In [None]:
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (classification_report, roc_auc_score, RocCurveDisplay,
                             precision_recall_curve, confusion_matrix, PrecisionRecallDisplay)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier


In [None]:
# --------------- Data Cleaning ---------------
# Load the raw survey export, normalise its text, encode the target and carve
# out a stratified hold-out set.

df = pd.read_csv('data/heart_2020_uncleaned.csv')
# Column headers: drop surrounding whitespace and any embedded spaces.
df.columns = df.columns.str.strip().str.replace(' ', '')

target = 'HeartDisease'
num_feats = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
cat_feats = [
    'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex',
    'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity',
    'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'
]

# Trim and lower-case every text column. The check is made on the dtype (not
# the values) so both classic object columns and pandas' dedicated string
# dtypes are normalised.
df = df.apply(
    lambda col: col.str.strip().str.lower()
    if pd.api.types.is_string_dtype(col.dtype) else col
)

# Encode the target AFTER normalisation: mapping the raw values first would
# turn variants such as ' Yes' or 'YES' into NaN.
df[target] = df[target].map({'yes': 1, 'no': 0})

# Rows whose label is missing or unrecognised cannot be used for supervised
# training (and would break stratification), so drop them explicitly.
n_unlabelled = int(df[target].isna().sum())
if n_unlabelled:
    print(f"Dropping {n_unlabelled} rows with a missing or unrecognised {target} label")
df = df.dropna(subset=[target]).astype({target: int})

X_train, X_test, y_train, y_test = train_test_split(
    df[num_feats + cat_feats], df[target],
    test_size=0.2, stratify=df[target], random_state=42
)

# Negative:positive ratio, taken from the training labels only so that no
# information about the hold-out set reaches a model hyper-parameter.
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
if n_pos == 0:
    raise ValueError(f"No positive '{target}' examples in the training split")
imbalance_ratio = n_neg / n_pos
print(f"Imbalance Ratio (Neg:Pos) = {imbalance_ratio:.2f}")

# --------------- Preprocessing & Model ---------------

# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode; handle_unknown='ignore' keeps unseen levels from raising at
# transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group through its own sub-pipeline.
preproc = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_feats),
    ('cat', categorical_pipeline, cat_feats),
])

# Gradient-boosted trees; scale_pos_weight is set to the notebook's
# imbalance_ratio to compensate for class imbalance.
xgb_params = dict(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=imbalance_ratio,
)
model = XGBClassifier(**xgb_params)

# Full estimator: preprocessing followed by the classifier.
pipe = Pipeline(steps=[('pre', preproc), ('clf', model)])

# --------------- Cross-Validation, Fit & Hold-out Predictions ---------------

# 5-fold ROC-AUC on the training split only.
scores = cross_val_score(
    estimator=pipe, X=X_train, y=y_train, cv=5, scoring='roc_auc'
)
cv_mean, cv_std = scores.mean(), scores.std()
print(f'CV ROC-AUC: {cv_mean:.3f} ± {cv_std:.3f}')

# Refit on the whole training split before scoring the hold-out set.
pipe.fit(X_train, y_train)

# Hard labels (default threshold) and the probability of the positive class.
y_pred = pipe.predict(X_test)
test_probabilities = pipe.predict_proba(X_test)
y_proba = test_probabilities[:, 1]





In [None]:
# --------------- Auto Summary & Visuals ---------------
# Text report: data quality, class balance, CV / hold-out metrics at the
# default threshold, and metrics at a recall-driven custom threshold.

# Recall level the adjusted decision threshold must still reach.
# NOTE(review): 30% is reconstructed from the "~30% Recall" label; the cell
# that originally defined `custom_threshold` is not in this notebook — confirm.
TARGET_RECALL = 0.30

print("\n" + "="*40)
print("📌 MODEL SUMMARY & EVALUATION")
print("="*40)

print("\n🧼 Missing Data (Top 10 Features):")
missing_summary = df.isna().mean().sort_values(ascending=False).head(10) * 100
print(missing_summary.round(2).astype(str) + " %")

print("\n🧬 Feature Data Types:")
print(df.dtypes)

print("\n⚖️ Class Distribution:")
class_counts = df['HeartDisease'].value_counts()
print(class_counts)
# .get() keeps the report alive even if the positive class is absent.
print(f"Positive Rate (Heart Disease): {class_counts.get(1, 0) / class_counts.sum():.2%}")

print(f"\n🎯 Cross-Validated ROC-AUC: {scores.mean():.3f} ± {scores.std():.3f}")

print("\n📋 Classification Report (Default Threshold):")
print(classification_report(y_test, y_pred))

test_auc = roc_auc_score(y_test, y_proba)
print(f"\n📈 Test ROC-AUC: {test_auc:.4f}")

# Derive the custom threshold instead of relying on a variable left over from
# another kernel session. precision_recall_curve returns thresholds in
# increasing order with recall non-increasing, so the last index whose recall
# is still >= TARGET_RECALL is the strictest threshold that meets the target.
# NOTE(review): the threshold is tuned on the hold-out set, so the adjusted
# metrics below are optimistic; tune on validation data for an unbiased figure.
_, pr_recall, pr_thresholds = precision_recall_curve(y_test, y_proba)
eligible = np.flatnonzero(pr_recall[:-1] >= TARGET_RECALL)
# Fall back to the default 0.5 cut-off if no threshold qualifies
# (e.g. the hold-out set contains no positive examples).
custom_threshold = float(pr_thresholds[eligible[-1]]) if eligible.size else 0.5

print(f"\n🔧 Custom Threshold (~{TARGET_RECALL:.0%} Recall): {custom_threshold:.2f}")
y_pred_custom = (y_proba >= custom_threshold).astype(int)
print("\n📋 Classification Report (Adjusted Threshold):")
print(classification_report(y_test, y_pred_custom))

# labels=[0, 1] guarantees a 2x2 matrix, so the 4-way unpack below cannot fail
# when one class is missing from both y_test and the predictions.
cm = confusion_matrix(y_test, y_pred_custom, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
print(f"\nConfusion Matrix:\n{cm}")
print(f"True Negatives: {tn}, False Positives: {fp}")
print(f"False Negatives: {fn}, True Positives: {tp}")

accuracy = (tp + tn) / cm.sum()
# Guard both ratios against an empty denominator.
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
precision_val = tp / (tp + fp) if (tp + fp) > 0 else 0

print(f"\n✅ Adjusted Accuracy: {accuracy:.3f}")
print(f"✅ Adjusted Recall (Sensitivity): {recall:.3f}")
print(f"✅ Adjusted Precision: {precision_val:.3f}")
# Visuals
# ROC curve for the hold-out probabilities.
roc_display = RocCurveDisplay.from_predictions(y_test, y_proba)
roc_display.ax_.set_title('ROC Curve – Heart Disease Risk Model')
plt.show()

# Precision-recall curve for the same probabilities.
pr_display = PrecisionRecallDisplay.from_predictions(y_test, y_proba)
pr_display.ax_.set_title('Precision-Recall Curve')
plt.show()

# Confusion matrix at the adjusted threshold (`cm` comes from the summary above).
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Confusion Matrix (Adjusted Threshold)')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
plt.show()

# Feature Importance
print("\n🔑 Top Influential Features:")

# Rebuild the post-encoding column names: numeric features pass through under
# their own names, categorical ones expand into one column per level.
fitted_pre = pipe.named_steps['pre']
ohe = fitted_pre.named_transformers_['cat'].named_steps['encoder']
encoded_feats = ohe.get_feature_names_out(cat_feats)
final_feats = [*num_feats, *encoded_feats]

# Pair every transformed column with the classifier's importance score and
# rank from most to least important.
feat_importance = pipe.named_steps['clf'].feature_importances_
importance_df = (
    pd.DataFrame({"Feature": final_feats, "Importance": feat_importance})
    .sort_values(by="Importance", ascending=False)
)

top_features = importance_df.head(15)
print(top_features)

plt.figure(figsize=(8, 6))
sns.barplot(data=top_features, x='Importance', y='Feature', palette='viridis')
plt.title('Top 15 Most Important Features')
plt.show()



In [None]:
# Save outputs
# Persist the fitted pipeline and the feature lists needed to rebuild its input.

# Create the output directory up front; joblib.dump and open() both fail if it
# does not exist yet (e.g. on a fresh checkout).
model_dir = Path('model')
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(pipe, model_dir / 'best_heart_disease_model.joblib')
print("Model saved to 'model/best_heart_disease_model.joblib'")

meta = {
    "num_feats": num_feats,
    "cat_feats": cat_feats,
    "all_feats": final_feats,
    "target": target
}
# Context manager ensures the file is flushed and closed even if dumping fails.
with open(model_dir / 'feature_metadata.json', 'w', encoding='utf-8') as meta_file:
    json.dump(meta, meta_file)
print("Feature metadata saved to 'model/feature_metadata.json'")