##### Loading Imports & the Dataset

In [5]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Read the raw records from the CSV file into a Pandas DataFrame
csv_path = '../datasets/dataset2.csv'
dataset = pd.read_csv(csv_path)

##### Dataset Analysis

1. Finding the total rows and columns

In [None]:
# Finding the total rows and columns: `shape` is a (row_count, column_count) tuple
dataset.shape

2. Identifying data types

In [None]:
# Preview the first 5 rows of the raw dataset
dataset.head(n=5)

3. Checking for missing values

In [None]:
# Count the missing entries in every column
missing_mask = dataset.isnull()
missing_mask.sum()

4. Checking if the distribution of the stroke diagnosis (the target) is balanced

In [None]:
# Count how many rows fall into each 'Diagnosis' class
diagnosis_column = dataset['Diagnosis']
diagnosis_column.value_counts()

##### Preprocessing 1: Imputation & Oversampling

1. Creating a copy of the original dataset

In [7]:
# Work on a deep copy so later edits do not modify `dataset`
datasetCopy = dataset.copy(deep=True)

2. Removing the id & patient name column

In [8]:
# Remove both identifier columns in a single call
identifier_columns = ['Patient ID', 'Patient Name']
datasetCopy = datasetCopy.drop(columns=identifier_columns)

3. Replacing missing values

In [None]:
# Fill missing 'Symptoms' entries and assign the result back to the column.
# Calling fillna(..., inplace=True) on a column selection is chained assignment:
# pandas 2.x emits a FutureWarning for it, and with Copy-on-Write enabled
# (the default from pandas 3.0) it no longer updates datasetCopy at all.
datasetCopy["Symptoms"] = datasetCopy["Symptoms"].fillna("No Symptoms")

4. One Hot Encoding 

In [10]:
# One-hot encode every multi-category column in a single pass
categorical_columns = [
    'Marital Status',
    'Work Type',
    'Smoking Status',
    'Alcohol Intake',
    'Physical Activity',
    'Dietary Habits',
]
datasetCopy = pd.get_dummies(datasetCopy, columns=categorical_columns)

In [11]:
# Cast the one-hot indicator columns to integer 0/1 values.
# The column names are listed once and reused on both sides of the assignment.
indicator_columns = [
    'Marital Status_Divorced', 'Marital Status_Married', 'Marital Status_Single',
    'Work Type_Government Job', 'Work Type_Never Worked', 'Work Type_Private', 'Work Type_Self-employed',
    'Smoking Status_Currently Smokes', 'Smoking Status_Formerly Smoked', 'Smoking Status_Non-smoker',
    'Alcohol Intake_Frequent Drinker', 'Alcohol Intake_Never', 'Alcohol Intake_Rarely', 'Alcohol Intake_Social Drinker',
    'Physical Activity_High', 'Physical Activity_Low', 'Physical Activity_Moderate',
    'Dietary Habits_Gluten-Free', 'Dietary Habits_Keto', 'Dietary Habits_Non-Vegetarian', 'Dietary Habits_Paleo',
    'Dietary Habits_Pescatarian', 'Dietary Habits_Vegan', 'Dietary Habits_Vegetarian',
]
indicator_values = datasetCopy[indicator_columns]
datasetCopy[indicator_columns] = indicator_values.astype(int)

5. Binary encoding

In [None]:
# Encode each two-valued text column as integer 0/1.
# `.map(...).astype(int)` is used instead of `.replace(...)`: replace relies on implicit
# object -> int downcasting, which pandas 2.2 deprecates (FutureWarning). Without that
# downcast the columns can stay object dtype, which scikit-learn classifiers may reject
# for the 'Diagnosis' target.
binary_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Residence Type': {'Rural': 0, 'Urban': 1},
    # NOTE(review): 'Yes' -> 0 / 'No' -> 1 is kept exactly as before; it is the opposite
    # polarity to 'Diagnosis' below — confirm this is intended.
    'Family History of Stroke': {'Yes': 0, 'No': 1},
    'Diagnosis': {'No Stroke': 0, 'Stroke': 1},
}
for column, encoding in binary_encodings.items():
    # A value that is missing from `encoding` (or already NaN) maps to NaN, and astype(int)
    # then raises here instead of letting an unencoded value reach the models.
    datasetCopy[column] = datasetCopy[column].map(encoding).astype(int)

6. MultiLabel Binarization

In [13]:
# Step 6: Turn each 'Symptoms' string into a list by splitting on ", "
def _split_symptoms(symptom_text):
    """Return the individual symptom names contained in one 'Symptoms' string."""
    return symptom_text.split(", ")


datasetCopy["Symptoms"] = datasetCopy["Symptoms"].apply(_split_symptoms)

In [14]:
# Step 7: Apply MultiLabel Binarizer (one 0/1 column per distinct symptom)
mlb = MultiLabelBinarizer()
# Reuse datasetCopy's index so the encoded rows stay aligned with their source rows when the
# two frames are later concatenated column-wise; the default RangeIndex would misalign them
# whenever datasetCopy's index is not 0..n-1.
symptoms_encoded = pd.DataFrame(
    mlb.fit_transform(datasetCopy["Symptoms"]),
    columns=mlb.classes_,
    index=datasetCopy.index,
)

In [15]:
# Step 8: Append the encoded symptom columns, then remove the original 'Symptoms' column
merged_with_symptoms = pd.concat([datasetCopy, symptoms_encoded], axis=1)
datasetCopy = merged_with_symptoms.drop(columns=["Symptoms"])

In [None]:
# Lift the column display limit, then preview the first 5 rows
pd.options.display.max_columns = None
datasetCopy.head(n=5)

7. Splitting blood pressure and cholesterol columns

In [17]:
# Split each 'Blood Pressure Levels' string on '/' into two float columns,
# then drop the original text column
pressure_parts = datasetCopy['Blood Pressure Levels'].str.split('/', expand=True)
datasetCopy[['Systolic', 'Diastolic']] = pressure_parts.astype(float)
datasetCopy = datasetCopy.drop(columns=['Blood Pressure Levels'])

In [18]:
# Pull the digits that follow "HDL:" and "LDL:" out of 'Cholesterol Levels' as float
# columns, then drop the original text column
lipid_patterns = (
    ('HDL', r'HDL:\s*(\d+)'),
    ('LDL', r'LDL:\s*(\d+)'),
)
for lipid_column, lipid_pattern in lipid_patterns:
    extracted = datasetCopy['Cholesterol Levels'].str.extract(lipid_pattern)
    datasetCopy[lipid_column] = extracted.astype(float)
datasetCopy = datasetCopy.drop(columns=['Cholesterol Levels'])

7. Splitting the dataset into features and target

In [19]:
# Target is the 'Diagnosis' column; the features are every remaining column
y = datasetCopy['Diagnosis']
x = datasetCopy.drop(columns=['Diagnosis'])

8. Train - test split

In [20]:
# Shuffle and split: 70% of the rows for training, the rest for testing (seeded)
split_parts = train_test_split(x, y, train_size=0.7, shuffle=True, random_state=1)
X_Train, X_Test, Y_Train, Y_Test = split_parts

9. Scaling values

In [21]:
# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_Train)


def _scale_frame(frame):
    """Return `frame` transformed by `scaler`, keeping its index and column labels."""
    return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


X_Train = _scale_frame(X_Train)
X_Test = _scale_frame(X_Test)

In [None]:
# Per-column standard deviation of the scaled training features
feature_std = X_Train.std()
print(feature_std)

##### Model Training & Evaluation

In [23]:
# Candidate classifiers, keyed by the (left-padded) display name used when reporting results.
# The estimators that accept a seed get random_state=1, the same value used for the
# train/test split, so repeated runs produce the same metrics.
# K-Nearest Neighbors takes no random_state and is left as-is.
models = {
    "                   Logistic Regression": LogisticRegression(random_state=1),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=1),
    "                        Neural Network": MLPClassifier(random_state=1),
    "                         Random Forest": RandomForestClassifier(random_state=1),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=1),
}

In [None]:
# Collected test metrics, keyed by model name
results = dict()

# Fit each candidate on the training split and score it on the test split
for name, model in models.items():
    model.fit(X_Train, Y_Train)

    # Hard class predictions (not probabilities)
    Y_pred = model.predict(X_Test)

    test_acc = accuracy_score(Y_Test, Y_pred)
    test_f1 = f1_score(Y_Test, Y_pred)

    # Record both scores under this model's name
    results[name] = dict(accuracy=test_acc, f1=test_f1)
    
    

In [None]:
# Print results.
# Each model's metrics are read from `results`; the loop variables `test_acc` / `test_f1`
# left over from training only hold the scores of the last model that was fitted.
for name, metrics in results.items():
    print(f"Model: {name}")
    print(f"Test Accuracy: {metrics['accuracy']:.4f}")
    print(f"Test F1 Score: {metrics['f1']:.4f}")

    print("\n" + "="*50 + "\n")