In [None]:
# @ts-nocheck
# cspell:disable
# ruff: noqa

# **Library Imports & Initial Setup**
#### Importing Libraries

In [None]:
# Standard libraries
import os
import math
import warnings

# Data manipulation libraries
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning: preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

# Machine learning: metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, precision_recall_curve, auc
)

# Machine learning: models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier

# Imbalanced learning
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

# Custom modules
import import_ipynb
from Setup import ( # type: ignore
    Load_Dataset, visualization, display_metrics,plot_roc_curves, 
    plot_pr_curves, plot_metrics_comparison, plot_confusion_matrices
)


# NOTE(review): with no category or module filter this suppresses every warning for the
# rest of the session, including any raised by the modelling libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# General plot styling is applied by a helper imported from the Setup module (keeps this notebook less repetitive)
visualization(fig_size=(10, 6))

# Show full column contents when displaying DataFrames instead of truncating long values
pd.set_option('display.max_colwidth', None)


_____________________

## **Phase 3:** Model Development & Evaluation
##### Objective: Prepare the data for machine learning, then develop and evaluate the machine learning models.

**Data Preparation:**

In [None]:
# Load the processed dataset from its parquet file.
# The path is assembled with os.path.join: the previous literal relied on a backslash
# separator ('\P' is an invalid escape sequence) and only resolved on Windows.
processed_data = Load_Dataset(os.path.join('Datasets', 'Processed_Stroke_Dataset.parquet'))

In [None]:
# Preview the first three rows of the loaded dataset
processed_data.head(3)

In [None]:
def prepare_for_ml(processed_df, primary_target='Stroke_Binary', test_size=0.2, random_state=42):
    """Build a stratified, scaled train/test split from the processed frame.

    Feature columns are chosen by name: every column ending in ``_Binary``, every
    column containing one of the one-hot prefixes, and a fixed list of derived
    risk / interaction columns. The target is excluded from the features, the rows
    are split with stratification on the target, and a ``StandardScaler`` fitted on
    the training rows only is applied to the int64/float64 columns of both splits.

    Args:
        processed_df: DataFrame holding the target and the feature columns.
        primary_target: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``.

    Raises:
        ValueError: If ``primary_target`` is not a column of ``processed_df``.
        KeyError: If one of the derived feature columns is missing (raised by pandas).
    """
    # Fail early with a clear message instead of an opaque error further down
    if primary_target not in processed_df.columns:
        raise ValueError(f"Target column '{primary_target}' not found in the dataset")

    columns = list(processed_df.columns)

    # Identify features by name pattern
    binary_features = [col for col in columns if col.endswith('_Binary')]

    onehot_markers = ('Work_Type_', 'Smoking_Status_', 'Age_Group_',
                      'BMI_Category_', 'Glucose_Category_')
    onehot_features = [col for col in columns
                       if any(marker in col for marker in onehot_markers)]

    derived_features = ['Cardiovascular_Risk', 'Metabolic_Risk', 'Vascular_Risk',
                        'Lifestyle_Risk', 'Age_Hypertension', 'BMI_Glucose', 'Age_Heart']

    # Combine all groups. dict.fromkeys drops repeats (a column can match more than one
    # pattern) while keeping order, and the filter removes the target wherever it appears
    # so it can never leak into the feature matrix.
    candidates = binary_features + onehot_features + derived_features
    keep_features = [col for col in dict.fromkeys(candidates) if col != primary_target]

    # Extract target and features
    y = processed_df[primary_target].copy()
    X = processed_df[keep_features].copy()

    # Split data, preserving the class ratio in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Scale numerical features; the scaler is fitted on the training split only
    num_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    scaler = StandardScaler()

    # Nothing to scale when no int64/float64 column was selected
    if num_features:
        X_train_scaled[num_features] = scaler.fit_transform(X_train[num_features])
        X_test_scaled[num_features] = scaler.transform(X_test[num_features])

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

In [None]:
# Build the scaled train/test split once; X_train, X_test, y_train and y_test are reused by the cells below
X_train, X_test, y_train, y_test, scaler = prepare_for_ml(processed_data, primary_target='Stroke_Binary')

**Pre-Modeling Checks:**

In [None]:
# Quick feature-importance sanity check with a baseline Random Forest

# Fit a plain forest on the prepared training split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Importances, and the column positions ordered from most to least important
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Report the ten strongest features (or fewer when the frame has fewer columns)
print("Top 10 most important features:")
for rank, feature_pos in enumerate(indices[:10], start=1):
    print(f"{rank}. {X_train.columns[feature_pos]}: {importances[feature_pos]:.4f}")

In [None]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# Initialize metrics storage (one entry per fold)
cv_scores = {
    'accuracy': [],
    'roc_auc': []
}

# Simple model for testing CV
model = LogisticRegression(max_iter=1000)

# NOTE(review): X_train was already scaled by prepare_for_ml with a scaler fitted on the
# whole training split, so the validation folds are not fully independent of that scaler.
for train_idx, val_idx in cv.split(X_train, y_train):
    # Split data for this fold
    X_cv_train, X_cv_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Train model (fit() re-estimates the coefficients on every fold)
    model.fit(X_cv_train, y_cv_train)

    # Evaluate: hard labels for accuracy, positive-class probabilities for ROC-AUC
    y_pred = model.predict(X_cv_val)
    y_pred_proba = model.predict_proba(X_cv_val)[:, 1]

    # Store metrics
    cv_scores['accuracy'].append(accuracy_score(y_cv_val, y_pred))
    cv_scores['roc_auc'].append(roc_auc_score(y_cv_val, y_pred_proba))

# Print results; the fold count is read from the splitter so the message cannot drift
# away from n_splits
print(f"Cross-validation results ({cv.get_n_splits()}-fold):")
for label, key in (("Accuracy", 'accuracy'), ("ROC-AUC", 'roc_auc')):
    print(f"{label}: {np.mean(cv_scores[key]):.4f} ± {np.std(cv_scores[key]):.4f}")

In [None]:
# Pairwise correlation between the prepared training features, drawn as a heatmap
correlation_matrix = X_train.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

---

## **Model Development**

### **Initial Setup**

#### **Feature Selection Functions**

In [None]:
# Use all features for comprehensive analysis

def select_all_features(X_train, X_test):
    """Pass both splits through unchanged and report how many features are used.

    Returns:
        Tuple ``(X_train, X_test, feature_names)`` where ``feature_names`` is the
        list of column names of ``X_train``.
    """
    n_features = X_train.shape[1]
    print(f"Using all {n_features} features")

    feature_names = X_train.columns.tolist()
    return X_train, X_test, feature_names

In [None]:
# Select clinically relevant features for stroke prediction

def select_clinical_features(X_train, X_test):
    """Restrict both splits to the clinically motivated feature subset.

    Only the candidate columns actually present in ``X_train`` are kept, in the
    order they are listed below.

    Returns:
        Tuple ``(X_train_subset, X_test_subset, feature_names)``.
    """
    # Interaction terms
    interaction_columns = ['Age_Hypertension', 'Age_Heart', 'BMI_Glucose']
    # Binary flags
    flag_columns = ['Gender_Binary', 'Hypertension_Binary', 'Heart_Disease_Binary',
                    'Ever_Married_Binary']
    # Risk scores derived earlier in the project
    risk_columns = ['Cardiovascular_Risk', 'Metabolic_Risk', 'Vascular_Risk', 'Lifestyle_Risk']

    present_columns = set(X_train.columns)
    available_features = []
    for name in interaction_columns + flag_columns + risk_columns:
        if name in present_columns:
            available_features.append(name)

    print(f"Selected {len(available_features)} clinical features")

    train_subset = X_train[available_features]
    test_subset = X_test[available_features]
    return train_subset, test_subset, available_features

#### **Class Balancing Functions**
 There is severe class imbalance, so let's first define different approaches to handle it.

**SMOTE:**

In [None]:
def balance_with_smote(X_train, y_train, random_state=42):
    """Resample the training data with SMOTE and print the class counts before and after.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as produced by ``SMOTE.fit_resample``.
    """
    sampler = SMOTE(random_state=random_state)
    resampled = sampler.fit_resample(X_train, y_train)
    X_resampled, y_resampled = resampled

    print(f"Original class distribution: {np.bincount(y_train)}")
    print(f"SMOTE class distribution: {np.bincount(y_resampled)}")

    return X_resampled, y_resampled

**ADASYN:**

In [None]:
def balance_with_adasyn(X_train, y_train, random_state=42):
    """Resample the training data with ADASYN and print the class counts before and after.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as produced by ``ADASYN.fit_resample``.
    """
    sampler = ADASYN(random_state=random_state)
    resampled = sampler.fit_resample(X_train, y_train)
    X_resampled, y_resampled = resampled

    print(f"Original class distribution: {np.bincount(y_train)}")
    print(f"ADASYN class distribution: {np.bincount(y_resampled)}")

    return X_resampled, y_resampled

**SMOTETomek**

In [None]:
# This combines over and undersampling..

def balance_with_smotetomek(X_train, y_train, random_state=42):
    """Resample the training data with SMOTETomek and print the class counts before and after.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as produced by ``SMOTETomek.fit_resample``.
    """
    sampler = SMOTETomek(random_state=random_state)
    resampled = sampler.fit_resample(X_train, y_train)
    X_resampled, y_resampled = resampled

    print(f"Original class distribution: {np.bincount(y_train)}")
    print(f"SMOTETomek class distribution: {np.bincount(y_resampled)}")

    return X_resampled, y_resampled

### **Model Definitions**

**- Logistic Regression:**

In [None]:
def train_logistic_regression(X_train, y_train, C=0.1, class_weight='balanced', random_state=42):
    """Fit an L2-penalised logistic regression (liblinear solver) and return the fitted model.

    Args:
        X_train, y_train: Training features and labels.
        C: Inverse regularisation strength passed to ``LogisticRegression``.
        class_weight: Class weighting passed to ``LogisticRegression``.
        random_state: Seed passed to ``LogisticRegression``.
    """
    hyperparams = dict(
        penalty='l2',
        solver='liblinear',
        C=C,
        class_weight=class_weight,
        max_iter=2000,
        random_state=random_state,
    )
    classifier = LogisticRegression(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier

**- Random Forest:**

In [None]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=None, 
                        class_weight='balanced', random_state=42):
    """Fit a random forest using all available cores and return the fitted model.

    Args:
        X_train, y_train: Training features and labels.
        n_estimators: Number of trees.
        max_depth: Maximum tree depth (``None`` leaves it unbounded).
        class_weight: Class weighting passed to ``RandomForestClassifier``.
        random_state: Seed passed to ``RandomForestClassifier``.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        class_weight=class_weight,
        n_jobs=-1,
        random_state=random_state,
    )
    forest = RandomForestClassifier(**hyperparams)
    forest.fit(X_train, y_train)
    return forest

**- XGBoost:**

In [None]:
def train_xgboost(X_train, y_train, learning_rate=0.1, max_depth=6, 
                  scale_pos_weight=None, random_state=42):
    """Fit an XGBoost classifier and return the fitted model.

    Args:
        X_train, y_train: Training features and labels.
        learning_rate: Boosting learning rate.
        max_depth: Maximum tree depth.
        scale_pos_weight: Weight of the positive class; when ``None`` it is set to
            the ratio of label-0 rows to label-1 rows in ``y_train``.
        random_state: Seed passed to ``XGBClassifier``.
    """
    # Derive the imbalance ratio from the labels when the caller did not supply one
    if scale_pos_weight is None:
        negative_count = (y_train == 0).sum()
        positive_count = (y_train == 1).sum()
        scale_pos_weight = negative_count / positive_count

    hyperparams = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        scale_pos_weight=scale_pos_weight,
        random_state=random_state,
        use_label_encoder=False,
        eval_metric='logloss',
    )
    booster = XGBClassifier(**hyperparams)
    booster.fit(X_train, y_train)
    return booster

**- SVM**

In [None]:
def train_svm(X_train, y_train, C=1.0, kernel='rbf', class_weight='balanced', random_state=42):
    """Fit a support vector classifier with probability estimates enabled and return it.

    Args:
        X_train, y_train: Training features and labels.
        C: Regularisation parameter passed to ``SVC``.
        kernel: Kernel name passed to ``SVC``.
        class_weight: Class weighting passed to ``SVC``.
        random_state: Seed passed to ``SVC``.
    """
    hyperparams = dict(
        C=C,
        kernel=kernel,
        class_weight=class_weight,
        # probability=True so that predict_proba is available on the fitted model
        probability=True,
        random_state=random_state,
    )
    classifier = SVC(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier

### **Model Evaluation**

In [None]:
# This function just displays results of a model

def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Score a fitted classifier on the test split and display the metrics.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test, y_test: Test features and true labels.
        model_name: Label handed to ``display_metrics``.

    Returns:
        Tuple ``(metrics, y_pred, y_pred_proba, cm)``: the metric dict, the predicted
        labels, the predicted probabilities of class 1, and the confusion matrix.
    """
    # Hard labels plus the probability column of the positive class
    predicted_labels = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Core metrics; specificity is the recall of class 0
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_test, predicted_labels)
    metrics['Precision'] = precision_score(y_test, predicted_labels)
    metrics['Recall'] = recall_score(y_test, predicted_labels)
    metrics['Specificity'] = recall_score(y_test, predicted_labels, pos_label=0)
    metrics['F1 Score'] = f1_score(y_test, predicted_labels)
    metrics['ROC_AUC'] = roc_auc_score(y_test, positive_proba)

    # Area under the precision-recall curve
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_proba)
    metrics['PR_AUC'] = auc(pr_recall, pr_precision)

    conf_matrix = confusion_matrix(y_test, predicted_labels)

    # Rendering is delegated to the helper imported from the Setup module
    display_metrics(metrics, model_name)

    return metrics, predicted_labels, positive_proba, conf_matrix

### **Pipeline Handling**

In [None]:
# This is the main function, we'll be using this to easily train all models etc

def run_model_pipeline(X_train, y_train, X_test, y_test, 
                       feature_selection='clinical', 
                       balancing_method='smote', 
                       model_type='logistic', 
                       random_state=42):
    """Select features, balance the training data, train one model and evaluate it.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        feature_selection: ``'clinical'`` for the clinical subset; any other value
            uses all features.
        balancing_method: ``'smote'``, ``'adasyn'`` or ``'smotetomek'``; any other
            value (e.g. ``'none'``) leaves the training data unbalanced.
        model_type: ``'logistic'``, ``'random_forest'``, ``'xgboost'`` or ``'svm'``.
        random_state: Seed forwarded to the resampler and to the model.

    Returns:
        Dict with keys ``'metrics'``, ``'probabilities'`` and ``'confusion_matrix'``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Display name per supported model type
    display_names = {
        'logistic': 'Logistic Regression',
        'random_forest': 'Random Forest',
        'xgboost': 'XGBoost',
        'svm': 'SVM',
    }

    # Validate up front: previously an unknown model_type only failed after feature
    # selection and resampling had run, with an unbound-variable error.
    if model_type not in display_names:
        raise ValueError(
            f"Unknown model_type '{model_type}'; expected one of {sorted(display_names)}"
        )

    # Select features
    if feature_selection == 'clinical':
        X_train_selected, X_test_selected, selected_features = select_clinical_features(X_train, X_test)
    else:
        X_train_selected, X_test_selected, selected_features = select_all_features(X_train, X_test)

    # Balance classes (training data only; the test split is never resampled)
    balancers = {
        'smote': balance_with_smote,
        'adasyn': balance_with_adasyn,
        'smotetomek': balance_with_smotetomek,
    }
    balancer = balancers.get(balancing_method)
    if balancer is None:
        X_train_balanced, y_train_balanced = X_train_selected, y_train
        print("No balancing applied")
    else:
        X_train_balanced, y_train_balanced = balancer(X_train_selected, y_train, random_state)

    # Train model
    trainers = {
        'logistic': train_logistic_regression,
        'random_forest': train_random_forest,
        'xgboost': train_xgboost,
        'svm': train_svm,
    }
    model = trainers[model_type](X_train_balanced, y_train_balanced, random_state=random_state)
    model_name = display_names[model_type]

    # Evaluate model on the feature-selected test split
    metrics, _, y_pred_proba, cm = evaluate_model(model, X_test_selected, y_test, model_name)

    # Return results
    return {
        'metrics': metrics,
        'probabilities': y_pred_proba,
        'confusion_matrix': cm,
    }

In [None]:
# Kept for reference only; there is no need to run this, since it was already executed earlier in the notebook

# X_train, X_test, y_train, y_test, scaler = prepare_for_ml(processed_data, primary_target='Stroke_Binary')

### **Model Training**

**- Logistic Regression:**

In [None]:
# Logistic regression: all features, SMOTE-balanced training data
logistic_results = run_model_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    feature_selection='all',
    balancing_method='smote',
    model_type='logistic',
    random_state=74,
)

**- Random Forest:**

In [None]:
# Random forest: all features, ADASYN-balanced training data
random_forest_results = run_model_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    feature_selection='all',
    balancing_method='adasyn',
    model_type='random_forest',
    random_state=42,
)

**- XGBoost:**

In [None]:
# XGBoost: clinical feature subset, no resampling of the training data
xgboost_results = run_model_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    feature_selection='clinical',
    balancing_method='none',
    model_type='xgboost',
    random_state=42,
)

**- SVM:**

In [None]:
# SVM: all features, SMOTETomek-balanced training data
svm_results = run_model_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    feature_selection='all',
    balancing_method='smotetomek',
    model_type='svm',
    random_state=42,
)

In [None]:
# Collect the per-model result dicts (metrics, probabilities, confusion matrix) under
# display names; this mapping is the input to the Performance Analysis plots below

model_results = {
    'Logistic Regression': logistic_results,
    'Random Forest': random_forest_results,
    'XGBoost': xgboost_results,
    'SVM': svm_results
}

In [None]:
# Show the collected results for every model
model_results

---

## **Phase 4:**  **Performance Analysis**

**ROC Curves:**

In [None]:
# Helper imported from Setup; it receives the test labels and the per-model results
# (each entry carries the positive-class 'probabilities')
plot_roc_curves(y_test, model_results)

**Precision-Recall Curves:**

In [None]:
# Helper imported from Setup; it receives the test labels and the per-model results
# (each entry carries the positive-class 'probabilities')
plot_pr_curves(y_test, model_results)

**Metric Comparisons:**

In [None]:
# Helper imported from Setup; it receives the per-model results dict assembled above
plot_metrics_comparison(model_results)

**Confusion Matrices:**

In [None]:
# Helper imported from Setup; it receives the per-model results dict assembled above
# (each entry carries a 'confusion_matrix')
plot_confusion_matrices(model_results)