<div style='background-color:#E74C3C; padding: 15px; border-radius: 10px; margin-bottom: 20px;'>
<h1 style='color:#FFFFFF; text-align:center; font-family: Arial, sans-serif; margin: 0;'>🤖 Machine Learning Model Development</h1>
<h2 style='color:#FADBD8; text-align:center; font-family: Arial, sans-serif; margin: 5px 0 0 0;'>Predictive Analytics for Stroke Risk</h2>
</div>

<div style='background-color:#FDEDEC; padding: 15px; border-radius: 8px; border-left: 4px solid #E74C3C;'>
<h3 style='color:#E74C3C; margin-top: 0;'>🎯 Modeling Objectives</h3>
<ul style='color:#333; line-height: 1.6;'>
<li><strong>Algorithm Selection:</strong> Evaluate multiple ML algorithms for stroke prediction</li>
<li><strong>Model Training:</strong> Train and optimize predictive models</li>
<li><strong>Hyperparameter Tuning:</strong> Fine-tune model parameters for optimal performance</li>
<li><strong>Cross-Validation:</strong> Ensure model robustness and prevent overfitting</li>
<li><strong>Clinical Validation:</strong> Validate models for healthcare application</li>
</ul>
</div>

# **04 - Machine Learning Model Development**

## Objectives

* Train and compare multiple machine learning models for stroke prediction
* Implement proper model selection and hyperparameter tuning
* Handle class imbalance in the stroke dataset
* Evaluate models using appropriate metrics for healthcare applications
* Select the best performing model for deployment

## Inputs

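* outputs/datasets/X_train.csv and outputs/datasets/X_test.csv
* outputs/datasets/y_train.csv and outputs/datasets/y_test.csv
* inputs/datasets/Stroke-data.csv (fallback, used only when the preprocessed files above are missing)
* outputs/ml_pipeline/feature_engineering_pipeline.pkl (produced by the Feature Engineering notebook; not loaded directly in this notebook)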

## Outputs

* Trained machine learning models saved as .pkl files
* Model performance comparison plots
* Feature importance analysis
* Best model selection and justification

---

# Change working directory

In [None]:
import os

# Project root: defaults to the Codespaces checkout path, but can be overridden
# with the STROKE_PROJECT_ROOT environment variable so the notebook also runs
# where the repository lives somewhere else.
PROJECT_ROOT = os.environ.get('STROKE_PROJECT_ROOT', '/workspaces/Stroke-prediction')
os.chdir(PROJECT_ROOT)
print("Current working directory:", os.getcwd())

# Later cells save figures and artefacts into these folders; create them up
# front so savefig / to_csv / joblib.dump cannot fail on a fresh checkout.
for output_dir in ('outputs/plots', 'outputs/ml_pipeline'):
    os.makedirs(output_dir, exist_ok=True)

---

# Load Required Libraries and Data

In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from scipy import stats
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    classification_report, confusion_matrix
)
from sklearn.model_selection import (
    cross_val_score, cross_validate, StratifiedKFold, GridSearchCV,
    RandomizedSearchCV
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight

# Notebook-wide display configuration.
# NOTE(review): the blanket 'ignore' filter silences every warning, including
# any raised while models are being fitted.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

## Load Preprocessed Data

In [None]:
# Load the train and test sets written by the Feature Engineering notebook.
# If any of the four files is missing, fall back to a quick preprocessing of
# the raw dataset so the rest of the notebook can still run.
try:
    X_train = pd.read_csv("outputs/datasets/X_train.csv")
    X_test = pd.read_csv("outputs/datasets/X_test.csv")
    y_train = pd.read_csv("outputs/datasets/y_train.csv").values.ravel()
    y_test = pd.read_csv("outputs/datasets/y_test.csv").values.ravel()

    print("Data loaded successfully!")

except FileNotFoundError:
    print("Preprocessed data not found. Please run the Feature Engineering notebook first.")

    # Only the fallback path needs these helpers.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # Load and process raw data as fallback (quick preprocessing for demo purposes)
    df = pd.read_csv("inputs/datasets/Stroke-data.csv")

    # Encode categorical variables as integer codes (one encoder fit per column)
    cat_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
    for col in cat_columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Prepare features and target
    feature_columns = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
                       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']

    X = df[feature_columns]
    y = df['stroke']

    # Split before imputing/scaling so no test-set statistics reach the training data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Impute missing bmi with the TRAINING median. Assigning the result back
    # (instead of `df['bmi'].fillna(..., inplace=True)`) avoids chained
    # assignment, which is a silent no-op under pandas Copy-on-Write.
    X_train = X_train.copy()
    X_test = X_test.copy()
    bmi_median = X_train['bmi'].median()
    X_train['bmi'] = X_train['bmi'].fillna(bmi_median)
    X_test['bmi'] = X_test['bmi'].fillna(bmi_median)

    # Scale the features (scaler is fitted on the training split only)
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_columns)

    # Match the primary path: targets are plain 1-D numpy arrays
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()

    print("Data processed successfully!")

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Class distribution in training set: {np.bincount(y_train)}")

---
# Model Development Strategy

## Class Imbalance Analysis

In [None]:
# Count training samples per class (index 0 -> "No Stroke", index 1 -> "Stroke").
class_counts = np.bincount(y_train)
n_negative, n_positive = class_counts[0], class_counts[1]
class_ratio = n_negative / n_positive

print(f"Class 0 (No Stroke): {n_negative:,} samples")
print(f"Class 1 (Stroke): {n_positive:,} samples")
print(f"Imbalance ratio: {class_ratio:.1f}:1")
print(f"Positive class percentage: {(n_positive/len(y_train))*100:.1f}%")

# Side-by-side view of the same counts: absolute (bar) and relative (pie).
class_labels = ['No Stroke', 'Stroke']
class_colors = ['lightblue', 'lightcoral']

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 6))

ax_bar.bar(class_labels, class_counts, color=class_colors)
ax_bar.set_title('Class Distribution (Training Set)')
ax_bar.set_ylabel('Number of Samples')

ax_pie.pie(class_counts, labels=class_labels, autopct='%1.1f%%',
           colors=class_colors)
ax_pie.set_title('Class Distribution Percentage')

fig.tight_layout()
fig.savefig('outputs/plots/class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## Handle Class Imbalance with SMOTE

In [None]:
# Oversample the minority class of the training split with SMOTE.
# NOTE(review): the resampled set contains synthetic rows, so scores obtained
# by cross-validating directly on it are optimistic.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

balanced_counts = np.bincount(y_train_balanced)
print("After SMOTE balancing:")
print(f"Class 0 (No Stroke): {balanced_counts[0]:,} samples")
print(f"Class 1 (Stroke): {balanced_counts[1]:,} samples")
print(f"New ratio: {balanced_counts[0]/balanced_counts[1]:.1f}:1")

# One bar chart per stage: original counts on the left, resampled on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
stages = [('Before SMOTE', class_counts), ('After SMOTE', balanced_counts)]

for ax, (stage_title, stage_counts) in zip(axes, stages):
    ax.bar(['No Stroke', 'Stroke'], stage_counts, color=['lightblue', 'lightcoral'])
    ax.set_title(stage_title)
    ax.set_ylabel('Number of Samples')

fig.tight_layout()
fig.savefig('outputs/plots/smote_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

---
# Model Selection and Training

## Define Models and Hyperparameters

In [None]:
# Candidate models and their hyperparameter search spaces.
#
# Each entry maps a display name to:
#   'model'  - an unfitted estimator with default (or fixed) settings
#   'params' - the search space: a dict, or a list of dicts when some options
#              are only valid in combination with others
#
# Logistic Regression uses a list of sub-grids because not every solver
# supports every penalty: 'liblinear' cannot fit 'elasticnet', and
# 'elasticnet' (saga only) additionally requires 'l1_ratio'. A single flat
# grid would spend search iterations on combinations that fail to fit.
logreg_C_values = [0.001, 0.01, 0.1, 1, 10, 100]

models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': [
            {
                'solver': ['liblinear', 'saga'],
                'penalty': ['l1', 'l2'],
                'C': logreg_C_values
            },
            {
                'solver': ['saga'],
                'penalty': ['elasticnet'],
                'l1_ratio': [0.25, 0.5, 0.75],
                'C': logreg_C_values
            }
        ]
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 10, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10]
        }
    },
    'SVM': {
        # probability=True so predict_proba is available for ROC/PR curves
        'model': SVC(random_state=42, probability=True),
        'params': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['rbf', 'linear', 'poly'],
            'gamma': ['scale', 'auto', 0.001, 0.01]
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski']
        }
    }
}

print(f"Defined {len(models)} models for comparison:")
for name in models.keys():
    print(f"- {name}")

## Cross-Validation and Model Comparison

In [None]:
# Initial model comparison with 5-fold stratified cross-validation.
#
# SMOTE runs INSIDE each fold via an imbalanced-learn pipeline, fitted on the
# original (imbalanced) training data. Cross-validating on data that was
# oversampled beforehand lets synthetic copies of validation rows leak into
# the training folds and inflates every score.
#
# All metrics come from a single cross_validate pass per model, so each model
# is fitted 5 times rather than once per metric per fold.
cv_results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

print("Performing 5-fold cross-validation...\n")

for name, model_info in models.items():
    print(f"Evaluating {name}...")

    # Use default parameters for initial comparison
    model = model_info['model']
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    fold_scores = cross_validate(
        cv_pipeline, X_train, y_train,
        cv=cv, scoring=scoring_metrics, n_jobs=-1
    )

    # Store one array of per-fold scores per metric
    # (cross_validate reports them under 'test_<metric>')
    cv_results[name] = {
        metric: fold_scores[f'test_{metric}'] for metric in scoring_metrics
    }

    accuracy_scores = cv_results[name]['accuracy']
    precision_scores = cv_results[name]['precision']
    recall_scores = cv_results[name]['recall']
    f1_scores = cv_results[name]['f1']
    roc_auc_scores = cv_results[name]['roc_auc']

    print(f"  Accuracy: {accuracy_scores.mean():.3f} (+/- {accuracy_scores.std() * 2:.3f})")
    print(f"  Precision: {precision_scores.mean():.3f} (+/- {precision_scores.std() * 2:.3f})")
    print(f"  Recall: {recall_scores.mean():.3f} (+/- {recall_scores.std() * 2:.3f})")
    print(f"  F1-Score: {f1_scores.mean():.3f} (+/- {f1_scores.std() * 2:.3f})")
    print(f"  ROC-AUC: {roc_auc_scores.mean():.3f} (+/- {roc_auc_scores.std() * 2:.3f})")
    print()

print("Cross-validation completed!")

## Visualize Cross-Validation Results

In [None]:
# Box plots of the per-fold cross-validation scores, one panel per metric.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
model_names = list(cv_results.keys())

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for i, metric in enumerate(metrics):
    data = [cv_results[model_name][metric] for model_name in model_names]

    # Tick labels are set separately instead of through boxplot's `labels`
    # keyword, which is deprecated in recent Matplotlib releases.
    axes[i].boxplot(data)
    axes[i].set_xticklabels(model_names)
    axes[i].set_title(f'{metric.upper()} Comparison')
    axes[i].set_ylabel(metric.capitalize())
    axes[i].tick_params(axis='x', rotation=45)
    axes[i].grid(True, alpha=0.3)

# Remove every panel that has no metric assigned (a 2x3 grid holds 5 metrics)
for unused_ax in axes[len(metrics):]:
    unused_ax.remove()

plt.tight_layout()
plt.savefig('outputs/plots/cv_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Summary Table of Cross-Validation Results

In [None]:
# Build a "mean ± std" table of the cross-validation scores, one row per model.
def _mean_std(scores):
    """Format an array of fold scores as 'mean ± std' with three decimals."""
    return f"{scores.mean():.3f} ± {scores.std():.3f}"


# Column header in the table -> metric key in cv_results
summary_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
}

summary_data = [
    {
        'Model': model_name,
        **{
            column: _mean_std(cv_results[model_name][metric_key])
            for column, metric_key in summary_columns.items()
        },
    }
    for model_name in model_names
]

summary_df = pd.DataFrame(summary_data)
print("Cross-Validation Results Summary:")
print("=" * 80)
print(summary_df.to_string(index=False))

# Persist the table next to the other pipeline artefacts
summary_df.to_csv('outputs/ml_pipeline/cv_results_summary.csv', index=False)
print("\nResults saved to outputs/ml_pipeline/cv_results_summary.csv")

---
# Hyperparameter Tuning

## Select Top 3 Models for Hyperparameter Tuning

In [None]:
# Rank models by mean cross-validated ROC-AUC and keep the three best.
roc_auc_means = {
    name: cv_results[name]['roc_auc'].mean() for name in model_names
}
ranked_names = sorted(roc_auc_means, key=roc_auc_means.get, reverse=True)
top_models = [(name, roc_auc_means[name]) for name in ranked_names[:3]]

print("Top 3 models based on ROC-AUC:")
for rank, (model_name, score) in enumerate(top_models, start=1):
    print(f"{rank}. {model_name}: {score:.3f}")

top_model_names = [entry[0] for entry in top_models]
print(f"\nProceeding with hyperparameter tuning for: {top_model_names}")

## Hyperparameter Tuning with RandomizedSearchCV

In [None]:
# Hyperparameter tuning for the top models.
#
# As in any cross-validated scoring, SMOTE must be fitted inside each fold,
# so every candidate is wrapped in an imbalanced-learn pipeline and searched
# on the original training data. Searching on pre-balanced data would put
# synthetic neighbours of validation rows into the training folds.
#
# Later cells expect bare estimators and un-prefixed parameter names, so the
# fitted model step and the stripped parameter names are what get stored.
tuned_models = {}
best_params = {}
STEP_PREFIX = 'model__'  # pipeline step name + '__', as required for nested params

for model_name in top_model_names:
    print(f"\nTuning hyperparameters for {model_name}...")

    model_info = models[model_name]
    base_model = model_info['model']
    param_grid = model_info['params']

    # The search space may be one dict or a list of dicts; prefix every key so
    # it addresses the 'model' step of the pipeline.
    grids = param_grid if isinstance(param_grid, list) else [param_grid]
    pipeline_grid = [
        {f'{STEP_PREFIX}{key}': values for key, values in grid.items()}
        for grid in grids
    ]

    tuning_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', base_model)
    ])

    # Use RandomizedSearchCV for faster tuning
    search = RandomizedSearchCV(
        tuning_pipeline,
        pipeline_grid,
        n_iter=50,  # Reduced for faster execution
        cv=cv,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    search.fit(X_train, y_train)

    # Store the fitted estimator (refit on the SMOTE-resampled training set by
    # the pipeline) and its parameters without the step prefix.
    tuned_models[model_name] = search.best_estimator_.named_steps['model']
    best_params[model_name] = {
        key[len(STEP_PREFIX):]: value
        for key, value in search.best_params_.items()
    }

    print(f"Best ROC-AUC for {model_name}: {search.best_score_:.3f}")
    print(f"Best parameters: {best_params[model_name]}")

print("\nHyperparameter tuning completed!")

---
# Final Model Training and Evaluation

## Train Final Models

In [None]:
# Fit each tuned model on the SMOTE-balanced training data, then score the
# untouched test set: hard labels plus positive-class probabilities.
final_models = {}
final_predictions = {}
final_probabilities = {}

for model_name, model in tuned_models.items():
    print(f"Training final {model_name} model...")

    model.fit(X_train_balanced, y_train_balanced)
    final_models[model_name] = model

    # Column 1 of predict_proba is kept as the score for the positive class
    final_predictions[model_name] = model.predict(X_test)
    final_probabilities[model_name] = model.predict_proba(X_test)[:, 1]

    print(f"  {model_name} training completed")

print("\nAll final models trained successfully!")

## Model Performance Evaluation

In [None]:
# Test-set metrics for every final model, collected in performance_results.
performance_results = {}

print("Final Model Performance on Test Set:")
print("=" * 60)

for model_name in final_models:
    predicted_labels = final_predictions[model_name]
    predicted_scores = final_probabilities[model_name]

    # Threshold metrics use the hard labels; ROC-AUC uses the probabilities
    scores = {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1': f1_score(y_test, predicted_labels),
        'roc_auc': roc_auc_score(y_test, predicted_scores),
    }
    performance_results[model_name] = scores

    print(f"\n{model_name}:")
    print(f"  Accuracy:  {scores['accuracy']:.3f}")
    print(f"  Precision: {scores['precision']:.3f}")
    print(f"  Recall:    {scores['recall']:.3f}")
    print(f"  F1-Score:  {scores['f1']:.3f}")
    print(f"  ROC-AUC:   {scores['roc_auc']:.3f}")

## ROC Curves Comparison

In [None]:
# Overlay the test-set ROC curve of every final model on a single axes.
fig, ax = plt.subplots(figsize=(10, 8))

for model_name, y_proba in final_probabilities.items():
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc_score = performance_results[model_name]['roc_auc']
    ax.plot(fpr, tpr, linewidth=2,
            label=f'{model_name} (AUC = {auc_score:.3f})')

# Chance-level reference line
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random (AUC = 0.500)')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - Final Models Comparison')
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)

fig.savefig('outputs/plots/roc_curves_final.png', dpi=300, bbox_inches='tight')
plt.show()

## Precision-Recall Curves

In [None]:
# Overlay the test-set precision-recall curve of every final model.
fig, ax = plt.subplots(figsize=(10, 8))

# Share of positives in the test set, drawn as the horizontal baseline
baseline_precision = np.sum(y_test) / len(y_test)

for model_name, y_proba in final_probabilities.items():
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)
    ax.plot(recall_curve, precision_curve, linewidth=2,
            label=f'{model_name}')

ax.axhline(y=baseline_precision, color='k', linestyle='--', linewidth=1,
           label=f'Baseline (Precision = {baseline_precision:.3f})')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curves - Final Models Comparison')
ax.legend(loc="lower left")
ax.grid(True, alpha=0.3)

fig.savefig('outputs/plots/precision_recall_curves.png', dpi=300, bbox_inches='tight')
plt.show()

---
# Model Selection and Feature Importance

## Select Best Model

In [None]:
# Pick the winner: highest ROC-AUC, with F1-score as the tie-breaker.
# NOTE(review): performance_results holds test-set scores, so this selection
# is made on the test set and the reported numbers are optimistic.
def _selection_key(candidate_name):
    """Return the (roc_auc, f1) pair used to rank a candidate model."""
    candidate_scores = performance_results[candidate_name]
    return candidate_scores['roc_auc'], candidate_scores['f1']


best_model_name = max(performance_results, key=_selection_key)
best_model = final_models[best_model_name]
best_performance = performance_results[best_model_name]

print(f"Best Model Selected: {best_model_name}")
print("=" * 40)
print(f"ROC-AUC:   {best_performance['roc_auc']:.3f}")
print(f"F1-Score:  {best_performance['f1']:.3f}")
print(f"Precision: {best_performance['precision']:.3f}")
print(f"Recall:    {best_performance['recall']:.3f}")
print(f"Accuracy:  {best_performance['accuracy']:.3f}")

# Save best model; the file name is the lower-cased, underscore-joined model name
best_model_path = f'outputs/ml_pipeline/best_model_{best_model_name.lower().replace(" ", "_")}.pkl'
joblib.dump(best_model, best_model_path)
print(f"\nBest model saved to {best_model_path}")

## Feature Importance Analysis

In [None]:
# Explain the best model: impurity-based importances when the estimator
# exposes them, signed coefficients otherwise, and a short message when the
# estimator offers neither.
feature_names = X_train.columns

if hasattr(best_model, 'feature_importances_'):
    # Most important feature first
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"Feature Importance - {best_model_name}:")
    print("=" * 40)
    for feature, importance in zip(feature_importance_df['feature'],
                                   feature_importance_df['importance']):
        print(f"{feature:20s}: {importance:.3f}")

    # Horizontal bars, largest on top (hence the inverted y axis)
    bar_positions = range(len(feature_importance_df))
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, feature_importance_df['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(feature_importance_df['feature'])
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Feature Importance - {best_model_name}')
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('outputs/plots/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    feature_importance_df.to_csv('outputs/ml_pipeline/feature_importance.csv', index=False)

elif hasattr(best_model, 'coef_'):
    # For linear models, show the first row of coefficients, ordered by magnitude
    coefficients = best_model.coef_[0]

    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients,
        'abs_coefficient': np.abs(coefficients)
    }).sort_values('abs_coefficient', ascending=False)

    print(f"Feature Coefficients - {best_model_name}:")
    print("=" * 50)
    for feature, coefficient in zip(coef_df['feature'], coef_df['coefficient']):
        print(f"{feature:20s}: {coefficient:+.3f}")

    # Negative coefficients in red, everything else in blue
    colors = np.where(coef_df['coefficient'] < 0, 'red', 'blue').tolist()
    bar_positions = range(len(coef_df))

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, coef_df['coefficient'], color=colors, alpha=0.7)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(coef_df['feature'])
    ax.set_xlabel('Coefficient Value')
    ax.set_title(f'Feature Coefficients - {best_model_name}')
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('outputs/plots/feature_coefficients.png', dpi=300, bbox_inches='tight')
    plt.show()

    coef_df.to_csv('outputs/ml_pipeline/feature_coefficients.csv', index=False)

else:
    print(f"Feature importance not available for {best_model_name}")

---
# Save All Models and Results

In [None]:
# Persist every final model, the test-set metrics and the tuned parameters.
import json


def _to_native(value):
    """Convert numpy integer/floating scalars to plain int/float for JSON."""
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    return value


# One pickle per model, named after the lower-cased, underscore-joined model name
for model_name, model in final_models.items():
    filename = f"outputs/ml_pipeline/{model_name.lower().replace(' ', '_')}_model.pkl"
    joblib.dump(model, filename)
    print(f"Saved {model_name} to {filename}")

# Metrics table: one row per model, one column per metric
performance_df = pd.DataFrame(performance_results).T
performance_df.to_csv('outputs/ml_pipeline/final_model_performance.csv')
print("\nSaved performance results to outputs/ml_pipeline/final_model_performance.csv")

# Best hyperparameters per model, with numpy scalars made JSON-serializable
serializable_params = {
    model_name: {key: _to_native(value) for key, value in params.items()}
    for model_name, params in best_params.items()
}
with open('outputs/ml_pipeline/best_model_params.json', 'w') as f:
    json.dump(serializable_params, f, indent=2)

print("Saved best model parameters to outputs/ml_pipeline/best_model_params.json")

banner = "=" * 60
print("\n" + banner)
print("MODEL DEVELOPMENT COMPLETED SUCCESSFULLY!")
print(banner)
print(f"Best Model: {best_model_name}")
print(f"Best ROC-AUC: {best_performance['roc_auc']:.3f}")
print("All models and results saved to outputs/ml_pipeline/")