# Assignment 3
## JuPyter Notebook - Verschuur L. 1811053, Kolenbrander M. 1653415

In [1]:
from utility_functions import *

## Parameters

# Convert numeric values into ranged representations
convert_to_range = True
# Represent ranges as categorical values or rounded to nearest base value: 12, b=5 -> 10-14
range_categorical = False
# Convert categorical values (not from ranged values) into one hot representations:
# {'smoking': ['sometimes', 'regularly', 'sometimes', 'never']} -> 
# {'smoking_sometimes': [1, 0, 1, 0], 'smoking_regularly': [0, 1, 0, 0], 'smoking_never': [0, 0, 0, 1]}
convert_categorical_to_one_hot = True

## Data fetching & Data pre-processing

In [74]:
import pandas as pd
import numpy as np

file_path = "healthcare-dataset-stroke-data.csv"

# Fetching CSV and converting to data frame
data_file = pd.read_csv(file_path, header=0)

# Drop all entries with nan values
data_file = data_file.dropna()

# Convert interval and ratio variables into ranges
if convert_to_range and range_categorical:
    data_file.insert(data_file.columns.get_loc("age"), "ranged_age", [floor_range(age, 20) for age in data_file["age"]])
    data_file.insert(data_file.columns.get_loc("avg_glucose_level"), "ranged_avg_glucose_level", [floor_range(avg_glucose_level, 30) for avg_glucose_level in data_file["avg_glucose_level"]])
    data_file.insert(data_file.columns.get_loc("bmi"), "ranged_bmi", [floor_range(bmi, 2) for bmi in data_file["bmi"]])
elif convert_to_range and not range_categorical:
    data_file.insert(data_file.columns.get_loc("age"), "ranged_age", [round_to_base(age, 20) for age in data_file["age"]])
    data_file.insert(data_file.columns.get_loc("avg_glucose_level"), "ranged_avg_glucose_level", [round_to_base(avg_glucose_level, 30) for avg_glucose_level in data_file["avg_glucose_level"]])
    data_file.insert(data_file.columns.get_loc("bmi"), "ranged_bmi", [round_to_base(bmi, 2) for bmi in data_file["bmi"]])

data_file.insert(data_file.columns.get_loc("bmi"), "rounded_bmi", data_file["bmi"].round(0))
    
# Convert "boolean" variables into true boolean variables
data_file["hypertension"] = data_file["hypertension"].astype("bool")
data_file["heart_disease"] = data_file["heart_disease"].astype("bool")
data_file["stroke"] = data_file["stroke"].astype("bool")
data_file["ever_married"] = np.where(data_file["ever_married"] == "Yes", True, False).astype("bool")
    
# Convert categorical variables into a one-hot representation
if convert_categorical_to_one_hot:
    data_file = pd.concat([data_file, pd.get_dummies(data_file["gender"], prefix="gender")], axis=1)
    data_file = pd.concat([data_file, pd.get_dummies(data_file["work_type"], prefix="work_type")], axis=1)
    data_file = pd.concat([data_file, pd.get_dummies(data_file["Residence_type"], prefix="Residence_type")], axis=1)
    data_file = pd.concat([data_file, pd.get_dummies(data_file["smoking_status"], prefix="smoking_status")], axis=1)
    
data_file.head()

Unnamed: 0,id,gender,ranged_age,age,hypertension,heart_disease,ever_married,work_type,Residence_type,ranged_avg_glucose_level,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,Male,60,67.0,False,True,True,Private,Urban,240,...,0,1,0,0,0,1,0,1,0,0
2,31112,Male,80,80.0,False,True,True,Private,Rural,120,...,0,1,0,0,1,0,0,0,1,0
3,60182,Female,40,49.0,False,False,True,Private,Urban,180,...,0,1,0,0,0,1,0,0,0,1
4,1665,Female,80,79.0,True,False,True,Self-employed,Rural,180,...,0,0,1,0,1,0,0,0,1,0
5,56669,Male,80,81.0,False,False,True,Private,Urban,180,...,0,1,0,0,0,1,0,1,0,0


## Classification

### Test & Train set generation
**Applied algorithms for this test and training set**
- SKLearn Random Forest Classifier
- SKLearn Support Vector Classification Classifier
- XGBoost Classifier

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Decision Variables
X = data_file.loc[:,["ranged_age", "hypertension", "heart_disease", "ever_married", "ranged_avg_glucose_level", "rounded_bmi", *fetch_columns_on_name_list(data_file, ["work_type"])]].values
# Target Variable
y = data_file.loc[:,"stroke"].values

# Split datasets into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1)

sc = StandardScaler()

# Scale data for classifiers
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X_test)

### Random Forest Classification training & testing

In [103]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=150, n_jobs=-1)
rf_classifier.fit(X_train_scaled, Y_train)

RandomForestClassifier(n_estimators=150, n_jobs=-1)

#### Random Forest Classification report

In [104]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf_y_prediction = rf_classifier.predict(X_test_scaled)

print("# # # Classification report # # #")
print(classification_report(Y_test, rf_y_prediction))

print("# # # Confusion matrix # # #")
print(confusion_matrix(Y_test, rf_y_prediction), "\n")

print("# # # Accuracy score # # #")
print(accuracy_score(Y_test, rf_y_prediction), "\n")

# # # Classification report # # #
              precision    recall  f1-score   support

       False       0.97      0.99      0.98       473
        True       0.22      0.11      0.15        18

    accuracy                           0.95       491
   macro avg       0.59      0.55      0.56       491
weighted avg       0.94      0.95      0.95       491

# # # Confusion matrix # # #
[[466   7]
 [ 16   2]] 

# # # Accuracy score # # #
0.9531568228105907 



### Support Vector Classification training & testing

In [113]:
from sklearn.svm import SVC

SV_classifier = SVC()
SV_classifier.fit(X_train_scaled, Y_train)

SVC(cache_size=1000)

#### Support Vector Classification report

In [175]:
SV_y_prediction = SV_classifier.predict(X_test_scaled)

print("# # # Classification report # # #")
print(classification_report(Y_test, SV_y_prediction))

print("# # # Confusion matrix # # #")
print(confusion_matrix(Y_test, SV_y_prediction), "\n")

print("# # # Accuracy score # # #")
print(accuracy_score(Y_test, SV_y_prediction), "\n")

# # # Classification report # # #
              precision    recall  f1-score   support

       False       0.96      1.00      0.98       473
        True       0.00      0.00      0.00        18

    accuracy                           0.96       491
   macro avg       0.48      0.50      0.49       491
weighted avg       0.93      0.96      0.95       491

# # # Confusion matrix # # #
[[473   0]
 [ 18   0]] 

# # # Accuracy score # # #
0.9633401221995926 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### XGBoost Classification training & testing

In [173]:
import xgboost as xgb

XGB_classifier = xgb.XGBClassifier(base_score=0.6, n_estimators=130)
XGB_classifier.fit(X_train_scaled, Y_train)





XGBClassifier(base_score=0.6, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=130, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

#### XGBoost Classification report

In [174]:
XGB_y_prediction = XGB_classifier.predict(X_test_scaled)

print("# # # Classification report # # #")
print(classification_report(Y_test, XGB_y_prediction))

print("# # # Confusion matrix # # #")
print(confusion_matrix(Y_test, XGB_y_prediction), "\n")

print("# # # Accuracy score # # #")
print(accuracy_score(Y_test, XGB_y_prediction), "\n")

# # # Classification report # # #
              precision    recall  f1-score   support

       False       0.97      0.99      0.98       473
        True       0.44      0.22      0.30        18

    accuracy                           0.96       491
   macro avg       0.71      0.61      0.64       491
weighted avg       0.95      0.96      0.96       491

# # # Confusion matrix # # #
[[468   5]
 [ 14   4]] 

# # # Accuracy score # # #
0.9613034623217923 

