# Ensemble Learning

## Imports and Setting up the Kaggle API
### Create a `.env` file and set `KAGGLE_USERNAME` and `KAGGLE_KEY` to your Kaggle username and API key
### Example:
KAGGLE_KEY=API_KEY
KAGGLE_USERNAME=USERNAME

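`load_dotenv()` reads the `.env` file and loads its key-value pairs into the process environment variables, so it must run before `import kaggle`, which looks for the credentials there.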

In [226]:
import os
from dotenv import load_dotenv
load_dotenv()
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import xgboost as xgb



Setting the API Instance and downloading dataset

In [227]:
# Download the stroke-prediction dataset through the Kaggle API.
# Credentials are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables that load_dotenv() populated in the imports cell.
apiInstance=kaggle.KaggleApi()
# Authenticate this client explicitly (the documented usage for a fresh
# KaggleApi instance) instead of relying on whatever `import kaggle` set up.
apiInstance.authenticate()
# Only hit the network when the CSV is not already on disk, so that
# Restart & Run All stays fast and also works offline after the first run.
# Assumes the download lands in the current working directory, which is where
# the next cell reads the file from.
if not os.path.exists('healthcare-dataset-stroke-data.csv'):
    apiInstance.dataset_download_files('fedesoriano/stroke-prediction-dataset', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset


## Preprocessing


In [228]:
# Load the raw stroke dataset that the download cell placed in the working directory.
strokeData=pd.read_csv('healthcare-dataset-stroke-data.csv')

# Features: every column except `id` and `stroke`.
#  - `id` is dropped because it has no predictive power and a learner could
#    otherwise pick up spurious patterns on it.
#  - `stroke` is dropped because it is the label, not a feature.
# The columns are selected by name rather than by position so the split stays
# correct even if the CSV column order changes. drop() returns a new DataFrame,
# so later column assignments on strokeDataFeatures do not touch strokeData.
strokeDataFeatures=strokeData.drop(columns=['id','stroke'])

# Labels: the `stroke` column only.
strokeDataLabels=strokeData['stroke']

In [229]:
# Show which columns of the raw data contain missing values.
# In this dataset `bmi` is the only column with NaNs.
print(strokeData.isnull().any())
from sklearn.impute import SimpleImputer
# Replace missing BMI values with the column mean.
imputer=SimpleImputer(strategy='mean')
# fit_transform returns a 2-D (n_rows, 1) array; ravel() flattens it so it is
# written back positionally. assign() builds a new frame instead of mutating
# in place, which keeps the cell idempotent, avoids SettingWithCopyWarning and
# does not depend on the frame having a default RangeIndex.
# NOTE(review): the mean is fitted on the full dataset before the train/test
# split, so test rows influence the imputed value — consider fitting on the
# training split only.
strokeDataFeatures=strokeDataFeatures.assign(
    bmi=imputer.fit_transform(strokeDataFeatures[['bmi']]).ravel()
)
# Confirm that no feature column has missing values any more.
print(strokeDataFeatures.isnull().any())

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
dtype: bool


In [230]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
# Encoding plan:
#  - numerical columns are standardised with StandardScaler so that features on
#    different scales do not dominate scale-sensitive models (MinMaxScaler
#    would also work; StandardScaler is the choice here)
#  - categorical columns with more than 2 categories are one-hot encoded so a
#    learner cannot read an artificial ranking into integer codes
#  - categorical columns with exactly 2 categories are mapped to 0/1 with
#    OrdinalEncoder
#  - ColumnTransformer applies each encoder to its own list of columns

# Text (object-dtype) columns are the categorical candidates; list them once
# and report them grouped by how many distinct categories they have.
objectColumns=[name for name in strokeDataFeatures if strokeDataFeatures[name].dtype == 'object']

print('Columns to be OneHotEncoded')
for name in objectColumns:
    if strokeDataFeatures[name].nunique()>2:
        print(f'{name} has unique categories of {strokeDataFeatures[name].unique()}')
# -> gender, work_type and smoking_status get one-hot encoded

print('Columns to be converted to Binary')
for name in objectColumns:
    if strokeDataFeatures[name].nunique()==2:
        print(f'{name} has unique categories of {strokeDataFeatures[name].unique()}')
# -> ever_married and Residence_type have only 2 values, so they become 0/1

# ColumnTransformer takes a list of (name, transformer, columns) tuples:
#   name        - an arbitrary label, used as the prefix of the output feature names
#   transformer - the encoder/scaler instance to apply
#   columns     - the column names (or indices) that transformer is applied to
# Columns not listed in any tuple (hypertension, heart_disease) are appended
# unchanged at the end because remainder='passthrough'.
ct=ColumnTransformer(
    transformers=[
        ('ordinalEncoder', OrdinalEncoder(), ['ever_married','Residence_type']),
        ('oneHotEncoder', OneHotEncoder(), ['gender','work_type','smoking_status']),
        ('scaler', StandardScaler(), ['bmi','avg_glucose_level','age']),
    ],
    remainder='passthrough',
)
# NOTE(review): this rebinds strokeDataFeatures from a DataFrame to the
# transformed array, so re-running this cell on its own will fail (the column
# names no longer exist); the encoders are also fitted before the train/test
# split.
strokeDataFeatures=ct.fit_transform(strokeDataFeatures)
print(ct.get_feature_names_out())
# Show the first encoded row as a quick visual check.
strokeDataFeatures[0]





Columns to be OneHotEncoded
gender has unique categories of ['Male' 'Female' 'Other']
work_type has unique categories of ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
smoking_status has unique categories of ['formerly smoked' 'never smoked' 'smokes' 'Unknown']
Columns to be converted to Binary
ever_married has unique categories of ['Yes' 'No']
Residence_type has unique categories of ['Urban' 'Rural']
['ordinalEncoder__ever_married' 'ordinalEncoder__Residence_type'
 'oneHotEncoder__gender_Female' 'oneHotEncoder__gender_Male'
 'oneHotEncoder__gender_Other' 'oneHotEncoder__work_type_Govt_job'
 'oneHotEncoder__work_type_Never_worked'
 'oneHotEncoder__work_type_Private'
 'oneHotEncoder__work_type_Self-employed'
 'oneHotEncoder__work_type_children'
 'oneHotEncoder__smoking_status_Unknown'
 'oneHotEncoder__smoking_status_formerly smoked'
 'oneHotEncoder__smoking_status_never smoked'
 'oneHotEncoder__smoking_status_smokes' 'scaler__bmi'
 'scaler__avg_glucose_level' 'scaler__

array([1.        , 1.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 1.00123401,
       2.70637544, 1.05143428, 0.        , 1.        ])

## Test-Train Split

In [231]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The split is stratified on the label so
# both parts keep the same proportion of stroke cases, and the fixed
# random_state makes it reproducible.
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(
    strokeDataFeatures,
    strokeDataLabels,
    test_size=0.2,
    random_state=42,
    stratify=strokeDataLabels,
)

In [232]:
# Sanity check: show the first training sample together with its label.
# trainFeatures is a NumPy array, so [0] is positional. trainLabels is a pandas
# Series that keeps the original row labels after the shuffle, so trainLabels[0]
# would look up the row *labelled* 0 (a different sample, or a KeyError if that
# row went to the test split). .iloc[0] selects the first training label
# positionally, matching trainFeatures[0].
print(trainFeatures[0])
print(trainLabels.iloc[0])

[ 1.          1.          1.          0.          0.          0.
  0.          1.          0.          0.          0.          0.
  1.          0.          0.54652702 -0.81577711  0.21111428  0.
  0.        ]
1


# Model Creation and Setup
We want to use XGBoost, so we first need to convert our training and test data (features and labels) into XGBoost's `DMatrix` format.

In [233]:
# Wrap the train/test splits in XGBoost's DMatrix container (features + labels).
dTrain=xgb.DMatrix(data=trainFeatures, label=trainLabels)
dTest = xgb.DMatrix(data=testFeatures, label=testLabels)

# Booster parameters. The values below are the starting point that still needs
# tuning to improve the ensemble.
params = {
    # --- Learning task parameters ---
    'objective':'binary:logistic',   # objective function: binary classification, outputs a probability
    # NOTE(review): rmse is an unusual metric for an imbalanced binary task;
    # logloss / auc / aucpr may be more informative.
    'eval_metric':'rmse',            # metric reported for every set in `evals`

    # --- Tree booster parameters ---
    'eta':0.3,                       # learning rate (step-size shrinkage)
    'gamma':0,                       # minimum loss reduction needed to split a leaf further; higher = simpler model
    'max_depth':6,                   # maximum tree depth; higher = more complex model
    'min_child_weight':1,            # minimum sum of instance weight required in a child; higher = simpler model
    # max_delta_step caps how much a leaf value may change per update (0 = no cap).
    # A positive value makes updates more conservative, which can help with a
    # heavily imbalanced label such as the small number of strokes here.
    'max_delta_step':0,
    'subsample':1,                   # fraction of rows sampled before growing each tree; lower reduces overfitting
    # 'uniform' gives every row the same chance of being sampled; gradient-based
    # sampling instead favours rows with the largest gradients.
    'sampling_method':'uniform',
    # colsample_by* set the fraction of features available per level / tree /
    # node, e.g. colsample_bylevel=0.5 lets each level split on only 50% of the
    # features. Lower values force trees to use different features.
    'colsample_bylevel':1,
    'colsample_bytree':1,
    'colsample_bynode':1,
    'alpha':0,                       # L1 regularisation on weights; higher = more conservative
    'lambda':1,                      # L2 regularisation on weights; higher = more conservative
    'tree_method':'auto',            # tree construction algorithm (exact / approx / hist)
    'scale_pos_weight':1,            # balance between positive and negative class weights
    'refresh_leaf':1,                # option of the 'refresh' updater
    'process_type':'default',        # type of boosting process; no need to change it here
    'grow_policy':'depthwise',       # how new nodes are added to the tree
    'max_leaves':0,                  # maximum number of nodes to add
    'max_bin':256,                   # maximum number of bins used to bucket continuous features
}
# Parameters mentioned for tuning but left at their defaults: updater (a
# comma-separated sequence of tree updaters to run), base_score (initial
# prediction for all instances) and seed (random number seed).

# Data sets whose metric is printed after every boosting round.
# NOTE(review): the held-out test set is used as the 'eval' set here.
evallist = [(dTrain, 'train'), (dTest, 'eval')]

# xgb.train arguments:
#   params          (Dict[str, Any])                      - booster params
#   dtrain          (DMatrix)                             - data to be trained
#   num_boost_round (int)                                 - number of boosting iterations
#   evals           (Sequence[Tuple[DMatrix, str]] | None) - validation sets whose metrics are
#                   evaluated during training, to track the performance of the model
bst=xgb.train(
    params=params,
    dtrain=dTrain,
    num_boost_round=100,
    evals=evallist,
    verbose_eval=True,
)



[0]	train-rmse:0.20476	eval-rmse:0.20980
[1]	train-rmse:0.19778	eval-rmse:0.20914
[2]	train-rmse:0.19216	eval-rmse:0.20845
[3]	train-rmse:0.18922	eval-rmse:0.20837
[4]	train-rmse:0.18478	eval-rmse:0.20874
[5]	train-rmse:0.18272	eval-rmse:0.20891
[6]	train-rmse:0.18021	eval-rmse:0.20947
[7]	train-rmse:0.17924	eval-rmse:0.20994
[8]	train-rmse:0.17705	eval-rmse:0.20988
[9]	train-rmse:0.17611	eval-rmse:0.20981
[10]	train-rmse:0.17432	eval-rmse:0.21050
[11]	train-rmse:0.17367	eval-rmse:0.21055
[12]	train-rmse:0.17323	eval-rmse:0.21039
[13]	train-rmse:0.17142	eval-rmse:0.21092
[14]	train-rmse:0.16876	eval-rmse:0.21132
[15]	train-rmse:0.16845	eval-rmse:0.21162
[16]	train-rmse:0.16668	eval-rmse:0.21120
[17]	train-rmse:0.16504	eval-rmse:0.21119
[18]	train-rmse:0.16345	eval-rmse:0.21038
[19]	train-rmse:0.16156	eval-rmse:0.21035
[20]	train-rmse:0.16119	eval-rmse:0.21038
[21]	train-rmse:0.16018	eval-rmse:0.21078
[22]	train-rmse:0.15938	eval-rmse:0.21089
[23]	train-rmse:0.15837	eval-rmse:0.21101
[2