In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, confusion_matrix
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
import seaborn as sns

# Load and preprocess dataset with feature engineering
stroke_df = pd.read_csv(r"C:\Users\devar\Downloads\heart_project\data\healthcare-dataset-stroke-data.csv")
stroke_df = stroke_df.dropna(subset=['stroke'])
imputer = SimpleImputer(strategy='median')
stroke_df['bmi'] = imputer.fit_transform(stroke_df[['bmi']])
# Add interaction term and categorize BMI
stroke_df['age_hypertension'] = stroke_df['age'] * stroke_df['hypertension']
stroke_df['bmi_category'] = pd.cut(stroke_df['bmi'], bins=[0, 18.5, 25, 30, 100], labels=['underweight', 'normal', 'overweight', 'obese'])
encoded_df = pd.get_dummies(
    stroke_df.drop(['id'], axis=1),
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'bmi_category'],
    drop_first=True
)
X = encoded_df.drop('stroke', axis=1)
y = encoded_df['stroke']
# Remove low-importance features from previous run
low_importance_features = ['gender_Other', 'work_type_Never_worked', 'bmi', 'avg_glucose_level']
X = X.drop(columns=[col for col in low_importance_features if col in X.columns])
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_scaled['stroke'] = y
X_scaled.to_csv("stroke_processed.csv", index=False)
joblib.dump(scaler, "scaler_stroke.pkl")
joblib.dump(imputer, "imputer_stroke.pkl")
print("✅ Stroke dataset encoded, normalized, and saved as stroke_processed.csv")

# Improved stroke model training function
def train_stroke_model():
    # Load processed data
    stroke_df = pd.read_csv("stroke_processed.csv")
    X = stroke_df.drop("stroke", axis=1)
    y = stroke_df["stroke"]

    # Define pipeline with LightGBM
    pipeline = ImbPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('adasyn', ADASYN(random_state=42, sampling_strategy=0.4)),  # Reduced sampling ratio
        ('model', LGBMClassifier(random_state=42, is_unbalance=True))
    ])

    # Hyperparameter grid for LightGBM
    param_grid = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [3, 6, 10],
        'model__learning_rate': [0.01, 0.1],
        'model__num_leaves': [31, 50],
        'model__min_child_samples': [20, 30],
        'model__subsample': [0.7, 1.0]
    }

    # Stratified K-Fold Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring=['f1_macro', 'roc_auc'], refit='f1_macro', n_jobs=-1, verbose=1, return_train_score=True
    )

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    # Fit model with hyperparameter tuning
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print("\n🧠 Best Parameters:", grid_search.best_params_)
    print("\n🧠 Cross-Validation F1-Macro Score: {:.3f} ± {:.3f}".format(
        grid_search.best_score_, grid_search.cv_results_['std_test_f1_macro'][grid_search.best_index_]
    ))
    print("\n🧠 Cross-Validation ROC AUC Score: {:.3f} ± {:.3f}".format(
        grid_search.cv_results_['mean_test_roc_auc'][grid_search.best_index_],
        grid_search.cv_results_['std_test_roc_auc'][grid_search.best_index_]
    ))

    # Feature importance
    feature_importances = best_model.named_steps['model'].feature_importances_
    feature_imp_df = pd.DataFrame({
        'feature': X.columns,
        'importance': feature_importances / feature_importances.sum()  # Normalize
    }).sort_values('importance', ascending=False)
    print("\n🧠 Feature Importances:\n", feature_imp_df)

    # Evaluate model (default threshold)
    y_pred = best_model.predict(X_test)
    print("\n🧠 Stroke Model Performance (Default Threshold):")
    print(classification_report(y_test, y_pred))
    print("\n🧠 Confusion Matrix (Default Threshold):")
    print(confusion_matrix(y_test, y_pred))

    # Improved threshold tuning
    y_probs = best_model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
    # Select threshold where recall >= 0.65 and maximize F1-score
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    valid_indices = np.where(recall >= 0.65)[0]
    if len(valid_indices) > 0:
        optimal_idx = valid_indices[np.argmax(f1_scores[valid_indices])]
        optimal_threshold = thresholds[optimal_idx]
    else:
        optimal_idx = np.argmax(f1_scores)
        optimal_threshold = thresholds[optimal_idx]
    y_pred_adjusted = (y_probs >= optimal_threshold).astype(int)
    print("\n🧠 Stroke Model with Adjusted Threshold (Recall >= 0.65):")
    print(classification_report(y_test, y_pred_adjusted))
    print("\n🧠 Confusion Matrix (Adjusted Threshold):")
    print(confusion_matrix(y_test, y_pred_adjusted))
    print(f"Optimal Threshold: {optimal_threshold:.3f}")
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_probs):.3f}")

    # Plot Precision-Recall Curve
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, label='Precision-Recall Curve')
    plt.scatter(recall[optimal_idx], precision[optimal_idx], color='red', label=f'Optimal Threshold ({optimal_threshold:.3f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.grid(True)
    plt.savefig('precision_recall_curve.png')
    plt.close()
    print("✅ Precision-Recall curve saved as precision_recall_curve.png")

    # Save the best model
    joblib.dump(best_model, "stroke_model_optimized.pkl")
    print("✅ Optimized stroke model saved as stroke_model_optimized.pkl")

if __name__ == "__main__":
    train_stroke_model()

✅ Stroke dataset encoded, normalized, and saved as stroke_processed.csv
Fitting 5 folds for each of 96 candidates, totalling 480 fits
[LightGBM] [Info] Number of positive: 1564, number of negative: 3889
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1586
[LightGBM] [Info] Number of data points in the train set: 5453, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.286815 -> initscore=-0.910905
[LightGBM] [Info] Start training from score -0.910905

🧠 Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_child_samples': 20, 'model__n_estimators': 100, 'model__num_leaves': 31, 'model__subsample': 0.7}

🧠 Cross-Validation F1-Macro Score: 0.566 ± 0.011

🧠 Cross-Validation ROC AUC Score: 0.803 ± 0.013

🧠 Feature Importances:
     

