##### Loading Imports & the Dataset

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler

from sklearn.calibration import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Location of the stroke CSV on disk, kept in one named constant so the path
# only has to be edited here when the notebook runs on another machine.
DATASET_PATH = '/Users/ibrahimharoon/Documents/Uni/Year 3/Final Project/Stroke Prediction Tool/strokeDataset.csv'

# Reading the csv data file into a Pandas DataFrame
dataset = pd.read_csv(DATASET_PATH)

##### Dataset Analysis

1. Finding the total rows and columns

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
dataset.shape

2. Previewing the first rows to identify data types

In [None]:
# Previewing the first five rows to inspect the columns and their values
dataset.head(n=5)

3. Checking for missing values

In [None]:
# Number of missing entries in each column
dataset.isna().sum()

4. Checking for outliers in numerical columns

In [None]:
# Box plot of average glucose level to look for outliers
glucose_ax = sns.boxplot(data=dataset['avg_glucose_level'])
glucose_ax.set(title="Avg Glucose Level")

In [None]:
# Box plot of BMI to look for outliers
bmi_ax = sns.boxplot(data=dataset['bmi'])
bmi_ax.set(title="BMI")

5. Checking if the distribution of stroke is balanced

In [None]:
# Class counts for the target column: 1 = had a stroke, 0 = did not have a stroke
dataset['stroke'].value_counts()

##### Preprocessing 1: Imputation & Oversampling

1. Creating a copy of the original dataset

In [9]:
# Work on a deep copy so the preprocessing steps leave the raw `dataset` untouched
datasetCopy = dataset.copy(deep=True)

2. Removing the id column

In [10]:
# Drop the id column from the working copy
datasetCopy = datasetCopy.drop(columns='id')

3. Binary encoding

In [None]:
# Two-valued categorical columns and the 0/1 code assigned to each label
binary_encodings = {
    'ever_married': {'No' : 0, 'Yes' : 1},
    'Residence_type': {'Rural' : 0, 'Urban' : 1},
}

# Replace the labels column by column, in the order listed above
for column, encoding in binary_encodings.items():
    datasetCopy[column] = datasetCopy[column].replace(encoding)

4. One Hot Encoding

In [12]:
# Multi-valued categorical columns that are expanded into indicator columns
one_hot_columns = ['gender', 'work_type', 'smoking_status']
datasetCopy = pd.get_dummies(datasetCopy, columns=one_hot_columns)

In [None]:
# Previewing the first five rows after encoding
datasetCopy.head(n=5)

5. Splitting the dataset into features and target

In [14]:
# Target: the stroke column; features: every other column
target_column = 'stroke'
y = datasetCopy[target_column]
x = datasetCopy.drop(columns=target_column)

6. Train - test split

In [15]:
# 70% of the rows go to training; shuffling uses a fixed seed so the split is repeatable.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# stroke classes are as imbalanced as the resampling steps in this notebook suggest.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    x,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

7. Replacing missing values with imputed values

In [16]:
# Fit the KNN imputer on the training features only, then apply it to both splits
imputer = KNNImputer().fit(X_Train)

# Rebuild DataFrames from the imputed values, keeping the original index and columns
train_imputed = imputer.transform(X_Train)
X_Train = pd.DataFrame(train_imputed, columns=X_Train.columns, index=X_Train.index)

test_imputed = imputer.transform(X_Test)
X_Test = pd.DataFrame(test_imputed, columns=X_Test.columns, index=X_Test.index)

8. Oversampling/Undersampling

In [17]:
# SMOTE oversampling of the training split only (the test split is left as is)
smote = SMOTE(random_state=1, sampling_strategy=0.5)  # Adjust the ratio as needed
oversampled_split = smote.fit_resample(X_Train, Y_Train)
X_Train_Oversampled, Y_Train_Oversampled = oversampled_split

In [18]:
# Random undersampling of the training split only (the test split is left as is)
undersampler = RandomUnderSampler(random_state=1, sampling_strategy=0.5)  # Adjust the ratio as needed
undersampled_split = undersampler.fit_resample(X_Train, Y_Train)
X_Train_Undersampled, Y_Train_Undersampled = undersampled_split

9. Scaling values

In [19]:
scaler = StandardScaler()

# Oversampled set: fit on the oversampled training data, then scale the test
# features with those same statistics
oversampled_scaled = scaler.fit_transform(X_Train_Oversampled)
X_Train_Oversampled = pd.DataFrame(
    oversampled_scaled,
    index=X_Train_Oversampled.index,
    columns=X_Train_Oversampled.columns,
)
X_Test_Oversampled = pd.DataFrame(
    scaler.transform(X_Test),
    index=X_Test.index,
    columns=X_Test.columns,
)

# Undersampled set: the same scaler object is re-fitted on the undersampled
# training data, so from here on `scaler` holds the undersampled statistics
undersampled_scaled = scaler.fit_transform(X_Train_Undersampled)
X_Train_Undersampled = pd.DataFrame(
    undersampled_scaled,
    index=X_Train_Undersampled.index,
    columns=X_Train_Undersampled.columns,
)
X_Test_Undersampled = pd.DataFrame(
    scaler.transform(X_Test),
    index=X_Test.index,
    columns=X_Test.columns,
)

In [None]:
# Checking the standard deviation of every scaled feature in both training sets
for scaled_train in (X_Train_Oversampled, X_Train_Undersampled):
    print(scaled_train.std())

##### Model Training & Evaluation

In [21]:
# Candidate classifiers, keyed by a display name (left-padded so the names
# line up when printed).
# Every estimator that draws random numbers gets random_state=1, the seed used
# by the split and resampling cells, so that re-running the notebook gives the
# same scores. Logistic regression and k-nearest neighbours are left at their
# defaults.
models = {
    "                   Logistic Regression": LogisticRegression(class_weight='balanced'),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=1),
    "                        Neural Network": MLPClassifier(random_state=1),
    "                         Random Forest": RandomForestClassifier(class_weight='balanced', random_state=1),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=1),
}

In [None]:
# Loop through the models
for name, model in models.items():
    
    # Train on oversampled data
    model.fit(X_Train_Oversampled, Y_Train_Oversampled)
    
    # Predictions for oversampled test set
    oversampled_test_preds = model.predict(X_Test_Oversampled)
    oversampled_test_probs = model.predict_proba(X_Test_Oversampled)[:, 1]  # Probability estimates
    
    # Train on undersampled data
    model.fit(X_Train_Undersampled, Y_Train_Undersampled)
    
    # Predictions for undersampled test set
    undersampled_test_preds = model.predict(X_Test_Undersampled)
    undersampled_test_probs = model.predict_proba(X_Test_Undersampled)[:, 1]  # Probability estimates
    
    # Calculate accuracy and F1 score for oversampled data
    oversampled_test_acc = accuracy_score(Y_Test, oversampled_test_preds)
    oversampled_test_f1 = f1_score(Y_Test, oversampled_test_preds)
    oversampled_test_auc = roc_auc_score(Y_Test, oversampled_test_probs)  # AUC Score
    
    # Calculate accuracy and F1 score for undersampled data
    undersampled_test_acc = accuracy_score(Y_Test, undersampled_test_preds)
    undersampled_test_f1 = f1_score(Y_Test, undersampled_test_preds)
    undersampled_test_auc = roc_auc_score(Y_Test, undersampled_test_probs)  # AUC Score
    

In [None]:
# Print results
for name, model in models.items():
    print(f"Model: {name}")
    print(f"--- Oversampled Data ---")
    print(f"Test Accuracy: {oversampled_test_acc:.4f}")
    print(f"Test F1 Score: {oversampled_test_f1:.4f}")
    print(f"Test AUC Score: {oversampled_test_auc:.4f}")
    
    print(f"--- Undersampled Data ---")
    print(f"Test Accuracy: {undersampled_test_acc:.4f}")
    print(f"Test F1 Score: {undersampled_test_f1:.4f}")
    print(f"Test AUC Score: {undersampled_test_auc:.4f}")
    
    print("\n" + "="*50 + "\n")