##### Loading Imports & the Dataset

In [1]:
# Imports
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Loading the csv data file to a Pandas DataFrame
dataset = pd.read_csv('../datasets/dataset3.csv')

##### Dataset Analysis

1. Finding the total rows and columns

In [3]:
# Finding the total rows and columns
dataset.shape

(5769190, 12)

2. Identifying data types

In [4]:
# Printing the first 5 rows of the dataset
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,10097.029899,Male,66.569509,0.006032,0.996458,Yes,Private,Urban,223.058819,36.006626,formerly smoked,1.0
1,51383.43754,Female,64.698781,0.007622,0.009006,Yes,Self-employed,Rural,199.614279,,never smoked,1.0
2,32482.486455,Male,79.224075,0.0,1.0,Yes,Private,Rural,102.718791,32.272498,never smoked,0.996074
3,63404.678287,Female,52.658122,0.014513,0.0,Yes,Private,Urban,174.433594,33.143869,smokes,0.996139
4,1169.53959,Female,79.133351,1.0,0.01645,Yes,Self-employed,Rural,170.494305,24.62444,never smoked,0.990965


3. Checking for missing values

In [5]:
# Checking if there are any missing values in each column
dataset.isnull().sum()

id                        0
gender                    0
age                       0
hypertension              0
heart_disease             0
ever_married              0
work_type                 0
Residence_type            0
avg_glucose_level         0
bmi                  226929
smoking_status            0
stroke                    0
dtype: int64

5. Checking if the distribution of stroke is balanced

In [6]:
#'1' who had a stroke and '0' who did not have a stroke.
dataset['stroke'].value_counts()

stroke
0.000000    2744009
1.000000     140339
0.011234          1
0.013017          1
0.009659          1
             ...   
0.016173          1
0.023703          1
0.017139          1
0.006802          1
0.001333          1
Name: count, Length: 2884844, dtype: int64

##### Preprocessing 1: Imputation & Oversampling

1. Creating a copy of the original dataset

In [7]:
datasetCopy = dataset.copy()

2. Removing the id column

In [8]:
datasetCopy = datasetCopy.drop('id', axis=1)

3. Undersampling

In [9]:
# Seperating those who had and did not have a stroke
stroke_1 = datasetCopy[datasetCopy['stroke'] == 1]
stroke_0 = datasetCopy[datasetCopy['stroke'] == 0]

In [10]:
# Undersample the stroke = 0 group to match the number of stroke = 1 rows
stroke_0_undersampled = stroke_0.sample(n=len(stroke_1), random_state=42)

In [11]:
# Combine the undersampled no-stroke rows with the stroke = 1 rows
datasetCopy = pd.concat([stroke_1, stroke_0_undersampled])

In [12]:
# Shuffle the final dataset to avoid any ordering issues
datasetCopy = datasetCopy.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
datasetCopy.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,60.607499,1.0,1.0,Yes,Private,Urban,121.27524,35.387754,smokes,1.0
1,Female,36.121928,0.00394,0.0,Yes,Private,Rural,99.21949,26.854055,smokes,0.0
2,Female,81.75136,0.032023,0.0,Yes,Private,Urban,95.742737,21.159583,smokes,1.0
3,Male,79.386651,0.0,0.0,Yes,Self-employed,Urban,123.614932,27.814862,never smoked,1.0
4,Female,24.917364,0.028887,0.0,No,Private,Urban,103.251516,23.697047,never smoked,0.0


3. Binary encoding

In [14]:
datasetCopy['ever_married'] = datasetCopy['ever_married'].replace({'No' : 0, 'Yes' : 1})
datasetCopy['Residence_type'] = datasetCopy['Residence_type'].replace({'Rural' : 0, 'Urban' : 1})

  datasetCopy['ever_married'] = datasetCopy['ever_married'].replace({'No' : 0, 'Yes' : 1})
  datasetCopy['Residence_type'] = datasetCopy['Residence_type'].replace({'Rural' : 0, 'Urban' : 1})


4. One Hot Encoding

In [15]:
datasetCopy = pd.get_dummies(datasetCopy, columns=['gender', 'work_type', 'smoking_status'])

In [16]:
datasetCopy.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,60.607499,1.0,1.0,1,1,121.27524,35.387754,1.0,False,True,False,False,False,True,False,False,False,False,False,True
1,36.121928,0.00394,0.0,1,0,99.21949,26.854055,0.0,True,False,False,False,False,True,False,False,False,False,False,True
2,81.75136,0.032023,0.0,1,1,95.742737,21.159583,1.0,True,False,False,False,False,True,False,False,False,False,False,True
3,79.386651,0.0,0.0,1,1,123.614932,27.814862,1.0,False,True,False,False,False,False,True,False,False,False,True,False
4,24.917364,0.028887,0.0,0,1,103.251516,23.697047,0.0,True,False,False,False,False,True,False,False,False,False,True,False


5. Splitting the dataset into features and target

In [17]:
x = datasetCopy.drop(columns='stroke', axis=1)
y = datasetCopy['stroke']

6. Train - test split

In [18]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x, y, train_size=0.7, shuffle=True, random_state=1)

7. Replacing missing values with imputed values

In [19]:
imputer = KNNImputer()
imputer.fit(X_Train)

# Saving the imputer
joblib.dump(imputer, "../trained_model/imputer.pkl")

X_Train = pd.DataFrame(imputer.transform(X_Train), index=X_Train.index, columns=X_Train.columns)
X_Test = pd.DataFrame(imputer.transform(X_Test), index=X_Test.index, columns=X_Test.columns)

9. Scaling values

In [20]:
scaler = StandardScaler()
scaler.fit(X_Train)

# Saving the scaler
joblib.dump(scaler, "../trained_model/scaler.pkl")

X_Train= pd.DataFrame(scaler.transform(X_Train), index=X_Train.index, columns=X_Train.columns)
X_Test= pd.DataFrame(scaler.transform(X_Test), index=X_Test.index, columns=X_Test.columns)

In [21]:
# Checking standard Deviation
print(X_Train.std())

age                               1.000003
hypertension                      1.000003
heart_disease                     1.000003
ever_married                      1.000003
Residence_type                    1.000003
avg_glucose_level                 1.000003
bmi                               1.000003
gender_Female                     1.000003
gender_Male                       1.000003
gender_Other                      1.000003
work_type_Govt_job                1.000003
work_type_Never_worked            1.000003
work_type_Private                 1.000003
work_type_Self-employed           1.000003
work_type_children                1.000003
smoking_status_Unknown            1.000003
smoking_status_formerly smoked    1.000003
smoking_status_never smoked       1.000003
smoking_status_smokes             1.000003
dtype: float64


##### Model Training & Evaluation

In [22]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Neural Network": MLPClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

In [23]:
# Dictionary to store results
results = {}

# Loop through the models
for name, model in models.items():
    # Train the model
    model.fit(X_Train, Y_Train)
    
    # Get binary predictions (not probabilities)
    Y_pred = model.predict(X_Test)
    
    # Calculate metrics
    test_acc = accuracy_score(Y_Test, Y_pred)
    test_f1 = f1_score(Y_Test, Y_pred)
    
    # Store results
    results[name] = {
        'accuracy': test_acc,
        'f1': test_f1
    }
    



In [24]:
# Print results
#for name, model in models.items():
 #   print(f"Model: {name}")
  #  print(f"Test Accuracy: {test_acc:.4f}")
   # print(f"Test F1 Score: {test_f1:.4f}")

    #print("\n" + "="*50 + "\n")

In [25]:
# Print results correctly for each model
for name in models.keys():
    print(f"Model: {name}")
    print(f"Test Accuracy: {results[name]['accuracy']:.4f}")
    print(f"Test F1 Score: {results[name]['f1']:.4f}")
    print("\n" + "="*50 + "\n")

Model: Logistic Regression
Test Accuracy: 0.7756
Test F1 Score: 0.7846


Model: K-Nearest Neighbors
Test Accuracy: 0.9841
Test F1 Score: 0.9843


Model: Decision Tree
Test Accuracy: 0.9640
Test F1 Score: 0.9641


Model: Neural Network
Test Accuracy: 0.9748
Test F1 Score: 0.9752


Model: Random Forest
Test Accuracy: 0.9829
Test F1 Score: 0.9830


Model: Gradient Boosting
Test Accuracy: 0.8093
Test F1 Score: 0.8211




In [26]:
# Choosing model to save
KNearest_Neighbours_Model = models['K-Nearest Neighbors']

# Saving the model
joblib.dump(KNearest_Neighbours_Model, "../trained_model/KNearest_Neighbours_Model.pkl")

['../trained_model/KNearest_Neighbours_Model.pkl']