##### Loading Imports & the Dataset

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler

from sklearn.calibration import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Loading the csv data file to a Pandas DataFrame
dataset = pd.read_csv('/Users/ibrahimharoon/Documents/Uni/Year 3/Final Project/Stroke Prediction Tool/app/data/dataset2.csv')

##### Dataset Analysis

1. Finding the total rows and columns

In [4]:
# Finding the total rows and columns
dataset.shape

(15000, 22)

2. Identifying data types

In [5]:
# Printing the first 5 rows of the dataset
dataset.head()

Unnamed: 0,Patient ID,Patient Name,Age,Gender,Hypertension,Heart Disease,Marital Status,Work Type,Residence Type,Average Glucose Level,...,Alcohol Intake,Physical Activity,Stroke History,Family History of Stroke,Dietary Habits,Stress Levels,Blood Pressure Levels,Cholesterol Levels,Symptoms,Diagnosis
0,18153,Mamooty Khurana,56,Male,0,1,Married,Self-employed,Rural,130.91,...,Social Drinker,Moderate,0,Yes,Vegan,3.48,140/108,"HDL: 68, LDL: 133","Difficulty Speaking, Headache",Stroke
1,62749,Kaira Subramaniam,80,Male,0,0,Single,Self-employed,Urban,183.73,...,Never,Low,0,No,Paleo,1.73,146/91,"HDL: 63, LDL: 70","Loss of Balance, Headache, Dizziness, Confusion",Stroke
2,32145,Dhanush Balan,26,Male,1,1,Married,Never Worked,Rural,189.0,...,Rarely,High,0,Yes,Paleo,7.31,154/97,"HDL: 59, LDL: 95","Seizures, Dizziness",Stroke
3,6154,Ivana Baral,73,Male,0,0,Married,Never Worked,Urban,185.29,...,Frequent Drinker,Moderate,0,No,Paleo,5.35,174/81,"HDL: 70, LDL: 137","Seizures, Blurred Vision, Severe Fatigue, Head...",No Stroke
4,48973,Darshit Jayaraman,51,Male,1,1,Divorced,Self-employed,Urban,177.34,...,Rarely,Low,0,Yes,Pescatarian,6.84,121/95,"HDL: 65, LDL: 68",Difficulty Speaking,Stroke


3. Checking for missing values

In [6]:
# Checking if there are any missing values in each column
dataset.isnull().sum()

Patient ID                     0
Patient Name                   0
Age                            0
Gender                         0
Hypertension                   0
Heart Disease                  0
Marital Status                 0
Work Type                      0
Residence Type                 0
Average Glucose Level          0
Body Mass Index (BMI)          0
Smoking Status                 0
Alcohol Intake                 0
Physical Activity              0
Stroke History                 0
Family History of Stroke       0
Dietary Habits                 0
Stress Levels                  0
Blood Pressure Levels          0
Cholesterol Levels             0
Symptoms                    2500
Diagnosis                      0
dtype: int64

5. Checking if the distribution of stroke is balanced

In [7]:
dataset['Diagnosis'].value_counts()

Diagnosis
No Stroke    7532
Stroke       7468
Name: count, dtype: int64

##### Preprocessing 1: Imputation & Oversampling

1. Creating a copy of the original dataset

In [8]:
datasetCopy = dataset.copy()

2. Removing the id & patient name column

In [9]:
datasetCopy = datasetCopy.drop('Patient ID', axis=1)
datasetCopy = datasetCopy.drop('Patient Name', axis=1)

3. Replacing missing values

In [10]:
datasetCopy["Symptoms"].fillna("No Symptoms", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  datasetCopy["Symptoms"].fillna("No Symptoms", inplace=True)


4. One Hot Encoding 

In [11]:
datasetCopy = pd.get_dummies(datasetCopy, columns=['Marital Status', 'Work Type', 'Smoking Status', 'Alcohol Intake', 'Physical Activity', 'Dietary Habits'])

In [12]:
# Convert boolean columns to 0 and 1
datasetCopy[[
    'Marital Status_Divorced', 'Marital Status_Married', 'Marital Status_Single',
    'Work Type_Government Job', 'Work Type_Never Worked', 'Work Type_Private', 'Work Type_Self-employed',
    'Smoking Status_Currently Smokes', 'Smoking Status_Formerly Smoked', 'Smoking Status_Non-smoker',
    'Alcohol Intake_Frequent Drinker', 'Alcohol Intake_Never', 'Alcohol Intake_Rarely', 'Alcohol Intake_Social Drinker',
    'Physical Activity_High', 'Physical Activity_Low', 'Physical Activity_Moderate',
    'Dietary Habits_Gluten-Free', 'Dietary Habits_Keto', 'Dietary Habits_Non-Vegetarian', 'Dietary Habits_Paleo',
    'Dietary Habits_Pescatarian', 'Dietary Habits_Vegan', 'Dietary Habits_Vegetarian'
]] = datasetCopy[[
    'Marital Status_Divorced', 'Marital Status_Married', 'Marital Status_Single',
    'Work Type_Government Job', 'Work Type_Never Worked', 'Work Type_Private', 'Work Type_Self-employed',
    'Smoking Status_Currently Smokes', 'Smoking Status_Formerly Smoked', 'Smoking Status_Non-smoker',
    'Alcohol Intake_Frequent Drinker', 'Alcohol Intake_Never', 'Alcohol Intake_Rarely', 'Alcohol Intake_Social Drinker',
    'Physical Activity_High', 'Physical Activity_Low', 'Physical Activity_Moderate',
    'Dietary Habits_Gluten-Free', 'Dietary Habits_Keto', 'Dietary Habits_Non-Vegetarian', 'Dietary Habits_Paleo',
    'Dietary Habits_Pescatarian', 'Dietary Habits_Vegan', 'Dietary Habits_Vegetarian'
]].astype(int)

5. Binary encoding

In [13]:
datasetCopy['Gender'] = datasetCopy['Gender'].replace({'Male' : 0, 'Female' : 1})
datasetCopy['Residence Type'] = datasetCopy['Residence Type'].replace({'Rural' : 0, 'Urban' : 1})
datasetCopy['Family History of Stroke'] = datasetCopy['Family History of Stroke'].replace({'Yes' : 0, 'No' : 1})
datasetCopy['Diagnosis'] = datasetCopy['Diagnosis'].replace({'No Stroke' : 0, 'Stroke' : 1})

  datasetCopy['Gender'] = datasetCopy['Gender'].replace({'Male' : 0, 'Female' : 1})
  datasetCopy['Residence Type'] = datasetCopy['Residence Type'].replace({'Rural' : 0, 'Urban' : 1})
  datasetCopy['Family History of Stroke'] = datasetCopy['Family History of Stroke'].replace({'Yes' : 0, 'No' : 1})
  datasetCopy['Diagnosis'] = datasetCopy['Diagnosis'].replace({'No Stroke' : 0, 'Stroke' : 1})


6. MultiLabel Binarization

In [14]:
# Step 6: Convert 'Symptoms' column to a list of individual symptoms
datasetCopy["Symptoms"] = datasetCopy["Symptoms"].apply(lambda x: x.split(", "))

In [15]:
# Step 7: Apply MultiLabel Binarizer
mlb = MultiLabelBinarizer()
symptoms_encoded = pd.DataFrame(mlb.fit_transform(datasetCopy["Symptoms"]), columns=mlb.classes_)

In [16]:
# Step 8: Merge the encoded symptoms back into the dataset and drop the original 'Symptoms' column
datasetCopy = pd.concat([datasetCopy, symptoms_encoded], axis=1).drop(columns=["Symptoms"])

In [17]:
pd.set_option('display.max_columns', None)
datasetCopy.head()

Unnamed: 0,Age,Gender,Hypertension,Heart Disease,Residence Type,Average Glucose Level,Body Mass Index (BMI),Stroke History,Family History of Stroke,Stress Levels,Blood Pressure Levels,Cholesterol Levels,Diagnosis,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Work Type_Government Job,Work Type_Never Worked,Work Type_Private,Work Type_Self-employed,Smoking Status_Currently Smokes,Smoking Status_Formerly Smoked,Smoking Status_Non-smoker,Alcohol Intake_Frequent Drinker,Alcohol Intake_Never,Alcohol Intake_Rarely,Alcohol Intake_Social Drinker,Physical Activity_High,Physical Activity_Low,Physical Activity_Moderate,Dietary Habits_Gluten-Free,Dietary Habits_Keto,Dietary Habits_Non-Vegetarian,Dietary Habits_Paleo,Dietary Habits_Pescatarian,Dietary Habits_Vegan,Dietary Habits_Vegetarian,Blurred Vision,Confusion,Difficulty Speaking,Dizziness,Headache,Loss of Balance,No Symptoms,Numbness,Seizures,Severe Fatigue,Weakness
0,56,0,0,1,0,130.91,22.37,0,0,3.48,140/108,"HDL: 68, LDL: 133",1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
1,80,0,0,0,1,183.73,32.57,0,1,1.73,146/91,"HDL: 63, LDL: 70",1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,1,1,0,0,0,0,0
2,26,0,1,1,0,189.0,20.32,0,0,7.31,154/97,"HDL: 59, LDL: 95",1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3,73,0,0,0,1,185.29,27.5,0,1,5.35,174/81,"HDL: 70, LDL: 137",0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,1,1,0
4,51,0,1,1,1,177.34,29.06,0,0,6.84,121/95,"HDL: 65, LDL: 68",1,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0


7. Splitting the dataset into features and target

In [18]:
x = datasetCopy.drop(columns='Diagnosis', axis=1)
y = datasetCopy['Diagnosis']

8. Train - test split

In [19]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x, y, train_size=0.7, shuffle=True, random_state=1)

9. Scaling values

In [20]:
scaler = StandardScaler()

scaler.fit(X_Train)
X_Train_Oversampled = pd.DataFrame(scaler.transform(X_Train), index=X_Train.index, columns=X_Train.columns)
X_Test_Oversampled = pd.DataFrame(scaler.transform(X_Test), index=X_Test.index, columns=X_Test.columns)

ValueError: could not convert string to float: '107/102'

In [None]:
# Checking standard Deviation
print(X_Train.std())

##### Model Training & Evaluation

In [21]:
models = {
    "                   Logistic Regression": LogisticRegression(class_weight='balanced'),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(class_weight='balanced'),
    "                     Gradient Boosting": GradientBoostingClassifier(),
}

In [None]:
# Loop through the models
for name, model in models.items():
    
    # Train on oversampled data
    model.fit(X_Train_Oversampled, Y_Train_Oversampled)
    
    # Predictions for oversampled test set
    oversampled_test_preds = model.predict(X_Test_Oversampled)
    oversampled_test_probs = model.predict_proba(X_Test_Oversampled)[:, 1]  # Probability estimates
    
    # Train on undersampled data
    model.fit(X_Train_Undersampled, Y_Train_Undersampled)
    
    # Predictions for undersampled test set
    undersampled_test_preds = model.predict(X_Test_Undersampled)
    undersampled_test_probs = model.predict_proba(X_Test_Undersampled)[:, 1]  # Probability estimates
    
    # Calculate accuracy and F1 score for oversampled data
    oversampled_test_acc = accuracy_score(Y_Test, oversampled_test_preds)
    oversampled_test_f1 = f1_score(Y_Test, oversampled_test_preds)
    oversampled_test_auc = roc_auc_score(Y_Test, oversampled_test_probs)  # AUC Score
    
    # Calculate accuracy and F1 score for undersampled data
    undersampled_test_acc = accuracy_score(Y_Test, undersampled_test_preds)
    undersampled_test_f1 = f1_score(Y_Test, undersampled_test_preds)
    undersampled_test_auc = roc_auc_score(Y_Test, undersampled_test_probs)  # AUC Score
    

In [None]:
# Print results
for name, model in models.items():
    print(f"Model: {name}")
    print(f"Test Accuracy: {oversampled_test_acc:.4f}")
    print(f"Test F1 Score: {oversampled_test_f1:.4f}")

    print("\n" + "="*50 + "\n")