In [30]:
import pandas as pd
import numpy as np

In [31]:
df_stroke = pd.read_csv("stroke_full_data_cleaned.csv")
df_heart_disease = pd.read_csv("heart_disease_full_data_cleaned.csv")
df_hypertension = pd.read_csv("hypertension_full_data_cleaned.csv")

Encoding categorical features.

In [32]:
# One-Hot Encoding for categorical features.
# One-Hot Encoding using pd.get_dummies.
df = pd.get_dummies(df_stroke, columns=['work_type', 'smoking_status', 'gender','ever_married', 'Residence_type'], drop_first=True)

# Check the transformed data
print(df.head())

   age  hypertension  heart_disease  avg_glucose_level        bmi  stroke  \
0   67             0              1             228.69  36.600000       1   
1   61             0              0             202.21  30.007143       1   
2   80             0              1             105.92  32.500000       1   
3   49             0              0             171.23  34.400000       1   
4   79             1              0             174.12  24.000000       1   

   work_type_Never_worked  work_type_Private  work_type_Self-employed  \
0                   False               True                    False   
1                   False              False                     True   
2                   False               True                    False   
3                   False               True                    False   
4                   False              False                     True   

   smoking_status_formerly smoked  smoking_status_never smoked  \
0                            Tru

Standardize the numerical columns (age, avg_glucose_level, and bmi).

In [33]:
# Feature scaling on the numerical columns (age, avg_glucose_level, and bmi) using the StandardScaler from the sklearn.preprocessing module. 
# This is a preprocessing step to standardize the data so that each feature has a mean of 0 and a standard deviation of 1.

from sklearn.preprocessing import StandardScaler

# Create an instance of StandardScaler
scaler = StandardScaler()

# Scale numerical features: age, avg_glucose_level, bmi
df[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(df[['age', 'avg_glucose_level', 'bmi']])

# Check the scaled data
print(df[['age', 'avg_glucose_level', 'bmi']].head())

        age  avg_glucose_level       bmi
0  0.952055           2.532742  0.855424
1  0.620984           1.976001 -0.060110
2  1.669374          -0.048492  0.286067
3 -0.041157           1.324648  0.549916
4  1.614196           1.385410 -0.894307


In [34]:
df.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes', 'gender_Male',
       'ever_married_Yes', 'Residence_type_Urban'],
      dtype='object')

In [35]:
# split the data into a training and testing set. 
from sklearn.model_selection import train_test_split

# set a seed for reproducibility
np.random.seed(123)

X_train, X_test, y_train, y_test = train_test_split(
    df.drop('stroke', axis=1), df["stroke"], train_size=0.75, shuffle=True, stratify=df["stroke"]
)

In [36]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Create an instance of SMOTE
smote = SMOTE()

# Apply SMOTE to the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after resampling
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_smote))

Before SMOTE: Counter({0: 3049, 1: 185})
After SMOTE: Counter({0: 3049, 1: 3049})


LogisticRegression

In [46]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [45]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC:", roc_auc_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1017
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1079
   macro avg       0.47      0.50      0.49      1079
weighted avg       0.89      0.94      0.91      1079

Confusion Matrix:
[[1017    0]
 [  62    0]]
ROC AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LogisticRegression + SMOTE

In [48]:
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC:", roc_auc_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.72      0.83      1017
           1       0.13      0.68      0.22        62

    accuracy                           0.72      1079
   macro avg       0.55      0.70      0.52      1079
weighted avg       0.93      0.72      0.80      1079

Confusion Matrix:
[[736 281]
 [ 20  42]]
ROC AUC: 0.7005582516573097


LogisticRegression(class_weight="balanced")

In [49]:
model_new = LogisticRegression(class_weight="balanced")
model_new.fit(X_train, y_train)
y_pred = model_new.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC:", roc_auc_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.72      0.83      1017
           1       0.14      0.73      0.23        62

    accuracy                           0.72      1079
   macro avg       0.56      0.73      0.53      1079
weighted avg       0.93      0.72      0.80      1079

Confusion Matrix:
[[737 280]
 [ 17  45]]
ROC AUC: 0.7252434421289689


RandomForestClassifier

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
rf_classifier = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, class_weight='balanced')

# Train (fit) the model on the training data
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

In [63]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='recall',           
    cv=5
)

grid_search.fit(X_train, y_train)  
print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.016216216216216217


In [61]:
rf_classifier = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, class_weight='balanced')

# Train (fit) the model on the training data
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

In [62]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC:", roc_auc_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1017
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1079
   macro avg       0.47      0.50      0.48      1079
weighted avg       0.89      0.94      0.91      1079

Confusion Matrix:
[[1015    2]
 [  62    0]]
ROC AUC: 0.49901671583087515


In [53]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1-score:  {f1:.2f}")

Precision: 0.00
Recall:    0.00
F1-score:  0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


BalancedRandomForestClassifier

In [54]:
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
brf.fit(X_train, y_train)

y_pred = brf.predict(X_test)

  warn(
  warn(
  warn(


In [56]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC:", roc_auc_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.69      0.81      1017
           1       0.13      0.74      0.22        62

    accuracy                           0.70      1079
   macro avg       0.55      0.72      0.51      1079
weighted avg       0.93      0.70      0.78      1079

Confusion Matrix:
[[704 313]
 [ 16  46]]
ROC AUC: 0.7170837694674407
