In [3]:
# Start (or reuse) a Spark session named 'Random Forest'.
# NOTE(review): `spark` is not referenced in any of the cells visible here -- confirm it is
# still needed.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Random Forest').getOrCreate()

import numpy as np 
import pandas as pd


# Star-import every public name from the `Knn` notebook (through the `ipynb` package).
# NOTE(review): `df4`, used by the next cell, is not defined in the visible cells; presumably
# it is provided by this import -- confirm, and prefer importing the needed names explicitly.
from ipynb.fs.full.Knn import *

In [4]:
# Work on a fresh copy so that re-running this cell never mutates `df4`.
df = df4.copy()

# Replace the 'Other' gender with the most frequent gender.
# The result is assigned back to the column rather than calling
# `df['gender'].replace(..., inplace=True)`: that chained in-place form acts on a
# temporary Series, emits a FutureWarning on recent pandas 2.x and leaves `df`
# untouched once Copy-on-Write is enabled.
most_frequent_gender = df4['gender'].value_counts().idxmax()
df['gender'] = df['gender'].replace('Other', most_frequent_gender)

# Drop the 'id' column before modelling (non-mutating form, so the cell stays idempotent).
df = df.drop(columns='id')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,bmi,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,formerly smoked,30.8,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,never smoked,28.3,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,never smoked,30.9,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,smokes,28.5,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,never smoked,30.2,1


In [5]:
# Separate the target from the features, then put the categorical (object-dtype)
# columns first and the remaining columns after them, so the column layout is
# predictable for the ColumnTransformer built later.
Y = df['stroke']
X = df.drop(columns='stroke')

X_category, X_numeric = (
    X.select_dtypes(include='object'),
    X.select_dtypes(exclude='object'),
)

X = pd.concat([X_category, X_numeric], axis=1)

In [6]:
# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


# Evaluation
from sklearn.metrics import classification_report, confusion_matrix


In [7]:
# Build the preprocessing pipeline.
# Numerical columns: impute missing values with the median, then standardise.
imp_std = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]
)

# Address columns by name instead of hard-coded positions, so the transformer keeps
# working if the number of categorical / numerical columns changes.
categorical_columns = X_category.columns.tolist()
numeric_columns = X_numeric.columns.tolist()

# One-hot encode the categorical columns and run the numerical ones through `imp_std`.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising when a rare level only appears in the test split.
ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ("Encoding", OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ("Scaler", imp_std, numeric_columns),
    ]
)

# Stratified 75/25 split; the fixed seed makes the split (and every downstream
# metric) reproducible across kernel restarts.
X_train_idle, X_test_idle, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, stratify=Y, random_state=42
)

# Fit the transformers on the train split only (no leakage from the test split),
# then apply the fitted transformers to the test split.
X_train = ct.fit_transform(X_train_idle)
X_test = ct.transform(X_test_idle)

In [13]:
# Oversample the training split with SMOTE, in the hope that the models learn the
# under-represented class more effectively. Only the training data is resampled;
# the test split is left as-is.
from imblearn.over_sampling import SMOTE

# A fixed seed keeps the synthetic samples identical from run to run.
X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [14]:
# Registry of the estimators to train, keyed by the display name used in the reports.
# Class 1 is given twice the weight of class 0; the fixed seed makes the fitted
# forest reproducible.
models = {
    'Random Forest': RandomForestClassifier(class_weight={0: 1, 1: 2}, random_state=42),
}

In [15]:
# Train every registered estimator on the resampled training data and report
# each one as it finishes.
for model, estimator in models.items():
    estimator.fit(X_train_resampled, y_train_resampled)
    print(f'{model} : fit')

Random Forest : fit


In [16]:
# Report performance on the (resampled) training data.
# The scores here are (too) good. NOTE(review): the original note attributes this to
# SMOTE producing a perfectly balanced dataset; scoring a model on the very data it was
# fitted on is optimistic in any case, so rely on the test-set report for conclusions.
print("Train set prediction")
rule = '------------------------'
for x, model in models.items():
    print(rule + x + rule)
    y_train_pred = model.predict(X_train_resampled)
    arg_train = dict(y_true=y_train_resampled, y_pred=y_train_pred)
    print(confusion_matrix(**arg_train))
    print(classification_report(**arg_train))

Train set prediction
------------------------Random Forest------------------------
[[3645    0]
 [   0 3645]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3645
           1       1.00      1.00      1.00      3645

    accuracy                           1.00      7290
   macro avg       1.00      1.00      1.00      7290
weighted avg       1.00      1.00      1.00      7290



In [17]:
# Report performance on the held-out test split, which was transformed with the
# fitted preprocessing but never resampled.
print("Test set prediction")
rule = '------------------------'
for x, model in models.items():
    print(rule + x + rule)
    y_test_pred = model.predict(X_test)
    arg_test = dict(y_true=y_test, y_pred=y_test_pred)
    print(confusion_matrix(**arg_test))
    print(classification_report(**arg_test))

Test set prediction
------------------------Random Forest------------------------
[[1162   54]
 [  49   13]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1216
           1       0.19      0.21      0.20        62

    accuracy                           0.92      1278
   macro avg       0.58      0.58      0.58      1278
weighted avg       0.92      0.92      0.92      1278

