##### Loading Imports & the Dataset

In [None]:
# Imports
import os

import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw CSV into a pandas DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
dataset = pd.read_csv('../datasets/dataset3.csv')

##### Dataset Analysis

1. Finding the total rows and columns

In [None]:
# Size of the raw dataset as a (rows, columns) tuple
dataset.shape

2. Previewing the data to identify data types

In [None]:
# Show the first 5 rows to see what kind of values each column holds
dataset.head()

3. Checking for missing values

In [None]:
# Number of missing values in each column
dataset.isnull().sum()

4. Checking if the distribution of stroke is balanced

In [None]:
# Row count per target class: 1 = had a stroke, 0 = did not have a stroke
dataset['stroke'].value_counts()

##### Preprocessing 1: Imputation & Undersampling

1. Creating a copy of the original dataset

In [None]:
# Work on a copy so the raw `dataset` frame is left untouched by the preprocessing steps
datasetCopy = dataset.copy()

2. Removing the id column

In [None]:
# Remove the 'id' column from the working copy.
# errors='ignore' keeps this cell re-runnable: once 'id' is gone, running the
# cell again is a no-op instead of raising KeyError.
datasetCopy = datasetCopy.drop(columns='id', errors='ignore')

3. Undersampling

In [None]:
# Separate the rows by outcome: stroke == 1 (had a stroke) and stroke == 0 (did not)
stroke_1 = datasetCopy.loc[datasetCopy['stroke'].eq(1)]
stroke_0 = datasetCopy.loc[datasetCopy['stroke'].eq(0)]

In [None]:
# Randomly keep only as many stroke = 0 rows as there are stroke = 1 rows
# (fixed seed so the same rows are drawn on every run)
stroke_0_undersampled = stroke_0.sample(n=stroke_1.shape[0], random_state=42)

In [None]:
# Stack the stroke = 1 rows on top of the undersampled stroke = 0 rows
datasetCopy = pd.concat(
    [stroke_1, stroke_0_undersampled],
    axis=0,
)

In [None]:
# Shuffle all rows (frac=1 draws every row once) so the two classes are
# interleaved, then discard the old index in favour of a fresh 0..n-1 one
datasetCopy = (
    datasetCopy
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the balanced, shuffled dataset
datasetCopy.head()

3. Binary encoding

In [None]:
# Encode the two two-valued text columns as 0/1.
# ever_married: 'No' -> 0, 'Yes' -> 1
datasetCopy['ever_married'] = datasetCopy['ever_married'].replace({'No' : 0, 'Yes' : 1})
# Residence_type: 'Rural' -> 0, 'Urban' -> 1
datasetCopy['Residence_type'] = datasetCopy['Residence_type'].replace({'Rural' : 0, 'Urban' : 1})

4. One Hot Encoding

In [None]:
# Replace each multi-category column with one indicator column per category
datasetCopy = pd.get_dummies(
    data=datasetCopy,
    columns=['gender', 'work_type', 'smoking_status'],
)

In [None]:
# Preview the dataset after binary and one-hot encoding
datasetCopy.head()

5. Splitting the dataset into features and target

In [None]:
# Target: the 'stroke' label; features: every remaining column
y = datasetCopy['stroke']
x = datasetCopy.drop(columns=['stroke'])

6. Train - test split

In [None]:
# 70/30 train-test split with a fixed seed.
# stratify=y keeps the stroke / no-stroke ratio identical in both partitions,
# so the balance created by undersampling is not lost to a lucky/unlucky draw.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    x, y, train_size=0.7, shuffle=True, random_state=1, stratify=y
)

7. Replacing missing values with imputed values

In [None]:
# Fit the KNN imputer on the training split only, so nothing from the test
# split influences the values used to fill the gaps
imputer = KNNImputer()
imputer.fit(X_Train)

# Saving the imputer (create the output folder first so the dump does not
# fail with FileNotFoundError when the folder is missing)
os.makedirs("../trained_model", exist_ok=True)
joblib.dump(imputer, "../trained_model/imputer.pkl")

# Apply the fitted imputer to both splits and rebuild DataFrames that carry
# the original row index and column labels
X_Train = pd.DataFrame(imputer.transform(X_Train), index=X_Train.index, columns=X_Train.columns)
X_Test = pd.DataFrame(imputer.transform(X_Test), index=X_Test.index, columns=X_Test.columns)

9. Scaling values

In [None]:
# Per-feature standard deviation of the training split before scaling
print(X_Train.std())

In [None]:
# Fit the scaler on the training split only; the test split is transformed
# with the statistics learned from the training data.
# NOTE: run this cell once per kernel session. X_Train is overwritten with its
# scaled version below, so a second run would refit on already-scaled data and
# overwrite scaler.pkl with that refit scaler.
scaler = StandardScaler()
scaler.fit(X_Train)

# Saving the scaler (create the output folder first so the dump does not
# fail with FileNotFoundError when the folder is missing)
os.makedirs("../trained_model", exist_ok=True)
joblib.dump(scaler, "../trained_model/scaler.pkl")

# Scale both splits and rebuild DataFrames that carry the original row index
# and column labels
X_Train= pd.DataFrame(scaler.transform(X_Train), index=X_Train.index, columns=X_Train.columns)
X_Test= pd.DataFrame(scaler.transform(X_Test), index=X_Test.index, columns=X_Test.columns)

In [None]:
# Checking the standard deviation of each feature after scaling.
# Expect values close to, but not exactly, 1: pandas' std() defaults to the
# sample estimate (ddof=1) while StandardScaler scales by the population
# standard deviation (ddof=0).
print(X_Train.std())

##### Model Training & Evaluation

In [None]:
# Candidate classifiers, keyed by the display name used in the results report.
# Every estimator that draws random numbers gets a fixed random_state so the
# reported metrics are reproducible across kernel restarts.
# KNeighborsClassifier takes no random_state, and LogisticRegression's default
# solver does not use one.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Neural Network": MLPClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

In [None]:
# Per-model scores, keyed by the model's display name
results = {}

# Fit each candidate on the training split and score it on the held-out split
for name, model in models.items():
    model.fit(X_Train, Y_Train)

    # Predicted class labels (not probabilities), as the metrics below expect
    Y_pred = model.predict(X_Test)

    test_acc = accuracy_score(Y_Test, Y_pred)
    test_f1 = f1_score(Y_Test, Y_pred)

    results[name] = {'accuracy': test_acc, 'f1': test_f1}
    

In [None]:
# Print results
# NOTE: disabled. test_acc and test_f1 are overwritten on every pass of the
# training loop, so this version would print the last model's scores under
# every model name. The per-model values are stored in `results`.
#for name, model in models.items():
 #   print(f"Model: {name}")
  #  print(f"Test Accuracy: {test_acc:.4f}")
   # print(f"Test F1 Score: {test_f1:.4f}")

    #print("\n" + "="*50 + "\n")

In [None]:
# Report the stored accuracy and F1 score of every model, in training order
for name, scores in results.items():
    print(f"Model: {name}")
    print(f"Test Accuracy: {scores['accuracy']:.4f}")
    print(f"Test F1 Score: {scores['f1']:.4f}")
    print("\n" + "="*50 + "\n")

In [None]:
# Choosing model to save: the fitted K-Nearest Neighbors instance from `models`
KNearest_Neighbours_Model = models['K-Nearest Neighbors']

# Saving the model (create the output folder first so the dump does not fail
# with FileNotFoundError when the folder is missing)
os.makedirs("../trained_model", exist_ok=True)
joblib.dump(KNearest_Neighbours_Model, "../trained_model/KNearest_Neighbours_Model.pkl")