# Ensemble Learning

## Imports and Setting up the Kaggle API
### Create a `.env` file that sets `KAGGLE_USERNAME` and `KAGGLE_KEY` to your Kaggle username and API key
### Example:
KAGGLE_KEY=API_KEY
KAGGLE_USERNAME=USERNAME

`load_dotenv()` reads the `.env` file and sets each key/value pair as an environment variable for the Python process.

In [200]:
import os
from dotenv import load_dotenv
# Load KAGGLE_USERNAME / KAGGLE_KEY from .env into the process environment.
# NOTE(review): this call is deliberately placed before `import kaggle`; the kaggle
# package is expected to look for credentials when it is imported - confirm for the
# installed version before reordering these lines.
load_dotenv()
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt



Create the Kaggle API instance and download the dataset.

In [201]:
# Kaggle dataset identifier in "<owner>/<dataset-name>" form.
datasetSlug='fedesoriano/stroke-prediction-dataset'

# Client used for the download; credentials are expected to come from the
# environment variables populated by load_dotenv() in the imports cell.
apiInstance=kaggle.KaggleApi()

# Fetch the dataset archive and extract it (unzip=True) so that the next cell can
# read the CSV by its relative file name.
apiInstance.dataset_download_files(datasetSlug, unzip=True)

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset


## Preprocessing


In [202]:
# Load the raw stroke dataset extracted by the download cell.
strokeData=pd.read_csv('healthcare-dataset-stroke-data.csv')

# Feature matrix: iloc[rows, columns] with ':' for the rows keeps every row.
# The column slice 1:-1 starts at position 1 (skipping the first column, `id`) and
# stops *before* position -1 (the last column, `stroke`), because the stop of a slice
# is exclusive.
#  - `id` is dropped because it has no predictive power and a learner could pick up
#    spurious patterns from it.
#  - `stroke` is dropped because it is the label, not a feature.
# .copy() makes the features an independent frame: later cells write into it, and
# writing into a slice of `strokeData` triggers SettingWithCopyWarning on pandas
# versions without copy-on-write.
strokeDataFeatures=strokeData.iloc[:,1:-1].copy()

# Labels: only the last column (`stroke`).
strokeDataLabels=strokeData.iloc[:,-1]

In [203]:
# Report, per column of the raw frame, whether it contains any missing value.
print(strokeData.isnull().any())
# bmi is the only column with NaNs (see the printed report).
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy='mean')
# Replace missing bmi values with the column mean.
# bmi sits at positional index 8 of the feature frame (the ninth feature column), but
# it is selected by name so the code does not depend on column order.
# fit_transform returns a 2-D array of shape (n_rows, 1); ravel() flattens it so the
# values are assigned by position. Wrapping the array in a new DataFrame instead would
# align on index labels and silently reintroduce NaNs if the frame's index were not
# the default 0..n-1 range.
# assign() returns a new frame rather than mutating the existing one, so this cell is
# safe to re-run and never writes into a view of `strokeData`.
# NOTE(review): the mean is computed over every row. If a train/test split happens
# later, fit the imputer on the training rows only to avoid leaking test information.
strokeDataFeatures=strokeDataFeatures.assign(
    bmi=imputer.fit_transform(strokeDataFeatures[['bmi']]).ravel()
)
# Confirm that no feature column contains missing values any more.
print(strokeDataFeatures.isnull().any())

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
dtype: bool


In [204]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
# Encoding / scaling plan:
#  - numerical inputs are standardised so that features on very different scales do
#    not distort the weights of scale-sensitive models (MinMaxScaler would also work;
#    StandardScaler is the one used here);
#  - categorical columns with more than two possible answers are one-hot encoded;
#  - categorical columns with exactly two possible answers are mapped to 0/1 with an
#    ordinal encoder;
#  - a ColumnTransformer applies each of these to its own set of columns.

# Find the text columns that need one-hot encoding (more than two categories).
print('Columns to be OneHotEncoded')
for column in strokeDataFeatures:
    columnValues=strokeDataFeatures[column]
    if columnValues.dtype == 'object' and columnValues.nunique()>2:
        print(f'{column} has unique categories of {columnValues.unique()}')
# gender, work_type and smoking_status are one-hot encoded so that the learners do not
# accidentally treat the categories as ranked.

# Find the text columns that can be converted to a single binary column.
print('Columns to be converted to Binary')
for column in strokeDataFeatures:
    columnValues=strokeDataFeatures[column]
    if columnValues.dtype == 'object' and columnValues.nunique()==2:
        print(f'{column} has unique categories of {columnValues.unique()}')
# ever_married and Residence_type only have two values, so 0 and 1 are enough.

# ColumnTransformer takes a list of (name, transformer, columns) tuples:
#   name        - an arbitrary label such as 'ordinalEncoder'
#   transformer - the encoder/scaler instance, e.g. OrdinalEncoder()
#   columns     - the column indices (or column names, for a DataFrame) that this
#                 transformer is applied to
# so here the OrdinalEncoder only touches the two binary columns, the OneHotEncoder the
# three multi-category columns and the StandardScaler the three continuous columns.
# remainder='passthrough' keeps every column not listed and appends it unchanged
# after the transformed ones.
binaryColumns=['ever_married','Residence_type']
nominalColumns=['gender','work_type','smoking_status']
numericColumns=['bmi','avg_glucose_level','age']
ct=ColumnTransformer(
    transformers=[
        ('ordinalEncoder', OrdinalEncoder(), binaryColumns),
        ('oneHotEncoder', OneHotEncoder(), nominalColumns),
        ('scaler', StandardScaler(), numericColumns),
    ],
    remainder='passthrough',
)
# From here on strokeDataFeatures is the transformed array, no longer the DataFrame,
# so this cell cannot be re-run without re-running the preprocessing cells above it.
strokeDataFeatures=ct.fit_transform(strokeDataFeatures)
print(ct.get_feature_names_out())
# Show the first transformed row as a sanity check.
strokeDataFeatures[0]





Columns to be OneHotEncoded
gender has unique categories of ['Male' 'Female' 'Other']
work_type has unique categories of ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
smoking_status has unique categories of ['formerly smoked' 'never smoked' 'smokes' 'Unknown']
Columns to be converted to Binary
ever_married has unique categories of ['Yes' 'No']
Residence_type has unique categories of ['Urban' 'Rural']
['ordinalEncoder__ever_married' 'ordinalEncoder__Residence_type'
 'oneHotEncoder__gender_Female' 'oneHotEncoder__gender_Male'
 'oneHotEncoder__gender_Other' 'oneHotEncoder__work_type_Govt_job'
 'oneHotEncoder__work_type_Never_worked'
 'oneHotEncoder__work_type_Private'
 'oneHotEncoder__work_type_Self-employed'
 'oneHotEncoder__work_type_children'
 'oneHotEncoder__smoking_status_Unknown'
 'oneHotEncoder__smoking_status_formerly smoked'
 'oneHotEncoder__smoking_status_never smoked'
 'oneHotEncoder__smoking_status_smokes' 'scaler__bmi'
 'scaler__avg_glucose_level' 'scaler__

array([1.        , 1.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 1.00123401,
       2.70637544, 1.05143428, 0.        , 1.        ])