In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the stroke dataset from the Kaggle input mount into a DataFrame.
STROKE_CSV_PATH = '/kaggle/input/stroke-prediction-data/heart_strokes.csv'
data = pd.read_csv(STROKE_CSV_PATH)

In [None]:
# Loading Libraries
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Preview the first rows of the loaded frame (head() shows 5 rows by default).
data.head()

In [None]:
# Data exploration: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns.
data.describe()

In [None]:
# Column names, non-null counts and dtypes; a quick way to spot columns with gaps.
data.info()

In [None]:
# Check for duplicates: report how many rows are exact copies of an earlier row.
# Printing data.duplicated() on its own only shows a truncated per-row boolean
# Series, which does not reveal whether any duplicate exists.
print(data.duplicated().sum())

In [None]:
# Count the missing entries in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Replace missing values: mode for smoking_status, median for bmi.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# data[col]: the chained in-place form operates on an intermediate Series, raises
# a FutureWarning on pandas 2.x and leaves `data` untouched under Copy-on-Write.
smoking_status_mode = data['smoking_status'].mode()[0]
data['smoking_status'] = data['smoking_status'].fillna(smoking_status_mode)
bmi_median = data['bmi'].median()
data['bmi'] = data['bmi'].fillna(bmi_median)

# Confirm nothing is left missing.
print(data.isna().sum())

In [None]:
# Data encoding: map each categorical column to small integer codes.
# The codes are plain labels; columns such as work_type have no natural order,
# so the numeric ordering carries no meaning.
#
# gender needs a SIGNED dtype: 'Other' is coded as -1, and casting -1 to uint8
# silently wraps around to 255.
column_encodings = {
    'ever_married': ({'Yes': 1, 'No': 0}, np.uint8),
    'gender': ({'Male': 0, 'Female': 1, 'Other': -1}, np.int8),
    'Residence_type': ({'Rural': 0, 'Urban': 1}, np.uint8),
    'work_type': (
        {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
        np.uint8,
    ),
    'smoking_status': (
        {'smokes': 3, 'formerly smoked': 2, 'never smoked': 1, 'Unknown': 0},
        np.uint8,
    ),
}

# replace() leaves already-encoded values alone, so re-running this cell is safe.
for column_name, (codes, code_dtype) in column_encodings.items():
    data[column_name] = data[column_name].replace(codes).astype(code_dtype)


In [None]:
# Class balance of the target: number of rows per stroke label.
stroke_counts = data['stroke'].value_counts()
print(stroke_counts)

In [None]:
# Re-inspect the first rows after the imputation and encoding cells above.
data.head()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
data_correlation_matrix = data.corr()

fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(
    data_correlation_matrix,
    annot=True,
    cmap='crest',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Dataset Correlation Heatmap')
plt.show()

In [None]:
# Separate the target column (stroke) from the feature columns.
y = data['stroke']
X = data.drop('stroke', axis=1)
X.head()

In [None]:
# First five rows of the full frame (features and target together).
data.head(5)

In [None]:
# Normalization of the columns/attributes age and avg_glucose_level

# age_list = X['age'].values
# avg_glucose_level = X['avg_glucose_level'].values

# age_result = [value / age_list.max() for value in age_list]
# glucose_level_result = [value / avg_glucose_level.max() for value in avg_glucose_level]
# X['age'] = age_result
# X['avg_glucose_level'] = glucose_level_result

In [None]:
# Feature matrix as it stands before the scaling cell below.
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the two continuous features to zero mean and unit variance.
# StandardScaler expects input shaped (n_samples, n_features), so the columns are
# passed as a two-column DataFrame slice. A list of two Series would be read as
# two samples with one feature per row, and with_mean=False / with_std=False
# turns the scaler into an identity transform.
# NOTE(review): the scaler is fitted on every row before the train/test split, so
# test rows contribute to the mean and variance; fit on the training rows only
# if a model sensitive to feature scale is used.
numeric_columns = ['age', 'avg_glucose_level']
scalar = StandardScaler()
scaled_data = scalar.fit_transform(X[numeric_columns])
X[numeric_columns] = scaled_data

In [None]:
# Feature matrix after the scaling cell above.
X.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
# Keep the training labels as a plain NumPy array.
y_train = y_train.to_numpy()

# Shapes of the full, training and test feature matrices, in that order.
for feature_frame in (X, X_train, X_test):
    print(feature_frame.shape)

print(y_train)

In [None]:
# Baseline: a default random forest trained on the original (not resampled) data.
rf_classifier = RandomForestClassifier(random_state=42)
pred = rf_classifier.fit(X_train, y_train).predict(X_test)

# Macro averaging weights both classes equally, regardless of their frequency.
macro_average = {'average': 'macro'}
first_acc = accuracy_score(y_test, pred)
first_precision = precision_score(y_test, pred, zero_division=0, **macro_average)
first_recall = recall_score(y_test, pred, **macro_average)
first_f1 = f1_score(y_test, pred, **macro_average)

baseline_report = (
    ('Accuracy: ', first_acc),
    ('Precision: ', first_precision),
    ('Recall: ', first_recall),
    ('F1: ', first_f1),
)
for metric_label, metric_value in baseline_report:
    print(metric_label, metric_value)

In [None]:
# Random oversampling of the minority class -- applied to the TRAINING rows only.
#
# The test rows are held out BEFORE resampling. RandomOverSampler works by
# duplicating minority-class rows, so resampling the whole dataset first would
# place copies of the same row on both sides of the split and inflate every
# metric measured on the test set.
# The split uses the same train_size and random_state as the baseline split.
X_r_fit, X_r_test, y_r_fit, y_r_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# After resampling, minority / majority = 0.75 inside the training data; the test
# set keeps its original class distribution.
oversampler = RandomOverSampler(sampling_strategy=0.75, random_state=42)
X_r, y_r = oversampler.fit_resample(X_r_fit, y_r_fit)

# Names consumed by the tuning and evaluation cells below.
X_r_train = X_r
y_r_train = y_r.values

# No rescaling is done here: random forests split on per-feature thresholds, so
# they are insensitive to the scale of individual features.

In [None]:
# Check how balanced the resampled labels are after random oversampling.
resampled_counts = y_r.value_counts()
print(resampled_counts)

In [None]:
# Pseudocode for Random Forest
# RandomForestClassifier:
#     Input:
#         - Training data (X_train, y_train)
#         - Number of trees in the forest (n_estimators)
#         - Number of features to consider at each split (max_features)
#         - Number of data points to sample for each tree (bootstrap_samples)

#     Output:
#         - Random forest ensemble

#     Ensemble = []  # Initialize an empty list to store decision trees

#     for i = 1 to n_estimators:
#         # Randomly sample data points with replacement (bootstrap)
#         X_bootstrap, y_bootstrap = BootstrapSample(X_train, y_train, bootstrap_samples)

#         # Feature subsetting happens inside the tree: at every split a fresh random
#         # subset of max_features features (out of total_features) is considered

#         # Train a decision tree on the bootstrapped dataset with per-split feature sampling
#         tree = BuildDecisionTree(X_bootstrap, y_bootstrap, max_features)

#         # Append the trained tree to the ensemble
#         Ensemble.append(tree)

#     return Ensemble

# Predict:
#     Input:
#         - Random forest ensemble
#         - Input data (X_test)

#     Output:
#         - Predicted class labels (or regression values)

#     Initialize an array to store the predictions for each tree: predictions = []

#     for tree in Ensemble:
#         # Make predictions using each tree in the ensemble
#         y_pred_tree = tree.predict(X_test)
        
#         # Append the predictions to the array
#         predictions.append(y_pred_tree)

#     # Aggregate the predictions (e.g., majority vote for classification)
#     final_predictions = Aggregate(predictions)

#     return final_predictions

In [None]:
## Hyperparameter tuning
# Candidate values for an exhaustive grid search (4 * 4 * 3 * 3 = 144 combinations).
param_grid = dict(
    n_estimators=[10, 50, 100, 200],   # number of trees in the forest
    max_depth=[None, 10, 20, 30],      # maximum depth of each tree (None = unlimited)
    min_samples_split=[2, 5, 10],      # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],        # minimum samples required at a leaf node
)

# 5-fold cross-validated search scored on accuracy; GridSearchCV clones the
# estimator it is given, so the already-fitted baseline model is not modified.
# NOTE(review): if X_r_train contains oversampled duplicates, the same row can land
# in both the fitting and validation folds, making the CV scores optimistic --
# consider resampling inside an imblearn Pipeline instead.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_r_train, y_r_train)

# Best-scoring parameter combination found by the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
# Final model: a random forest rebuilt with the tuned hyperparameters and the
# same seed as before, fitted on the resampled training data.
best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
best_rf_classifier.fit(X_r_train, y_r_train)

# Predict on the held-out split.
y_pred = best_rf_classifier.predict(X_r_test)

# Score the predictions; precision, recall and F1 are macro-averaged.
accuracy = accuracy_score(y_r_test, y_pred)
precision, recall, f1 = (
    precision_score(y_r_test, y_pred, average='macro', zero_division=0),
    recall_score(y_r_test, y_pred, average='macro'),
    f1_score(y_r_test, y_pred, average='macro'),
)

metric_labels = ('Accuracy: ', 'Precision: ', 'Recall: ', 'F1: ')
metric_values = (accuracy, precision, recall, f1)
for metric_label, metric_value in zip(metric_labels, metric_values):
    print(metric_label, metric_value)