# Practice Activity - AdaBoost
## Nick Bias
### 4/22/22
## Goal: Predict Penguin Species
### Libraries 

In [1]:
# For data 
import numpy as np
import pandas as pd

# For Gradient Boost
from sklearn.ensemble import GradientBoostingClassifier

# XGBoost
from xgboost import XGBClassifier

# For AdaBoosting Models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# For Comparision Models
# compare standalone models for binary classification
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# For splitting data into training and testing 
from sklearn.model_selection import train_test_split
# example of calculate the mean absolute error
from sklearn.metrics import accuracy_score

# So results are same when re-run
import random
random.seed(10)

### Importing and Cleaning Data

In [2]:
size = pd.read_csv("Data/Week4/penguins_size.csv")

# Drop Rows with NA values 
clean = size.dropna()

# Drops row with a '.' for the Sex Variable
clean = clean[clean['sex'] != '.']
# only 11 rows were dropped 

# Creating Dummy Variables for Island and Sex
island = pd.get_dummies(clean['island'])
sex = pd.get_dummies(clean['sex'])

# Merging with Original Data
penguins = pd.merge(clean, island, left_index=True, right_index=True)
penguins = pd.merge(penguins, sex, left_index=True, right_index=True)

# Dropping Columns that the Dummies were made from 
penguins = penguins.drop(['sex', 'island', 'FEMALE'], axis = 1)
penguins

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,MALE
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...
338,Gentoo,47.2,13.7,214.0,4925.0,1,0,0,0
340,Gentoo,46.8,14.3,215.0,4850.0,1,0,0,0
341,Gentoo,50.4,15.7,222.0,5750.0,1,0,0,1
342,Gentoo,45.2,14.8,212.0,5200.0,1,0,0,0


### Splitting Prediction Variable from dataset
- X = Dataset with all Independent Variables 
- y = The Dependent Variable of Penguin Species

In [3]:
# Only Takes Dependent Variable we are tryng to predict 
y = penguins['species']

# Change levels to numeric for XGBoosting 
y = y.replace('Adelie', 0)
y = y.replace('Chinstrap', 1)
y = y.replace('Gentoo', 2)
# 'Adelie' 'Chinstrap' 'Gentoo'

# Takes independent variables that will be used to predict 
X = penguins.iloc[:,1:]

## Baseline Models to Compare with 

In [4]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', BaggingClassifier(base_estimator=LogisticRegression())))
    level0.append(('knn', BaggingClassifier(base_estimator=KNeighborsClassifier())))
    level0.append(('cart', BaggingClassifier(base_estimator=DecisionTreeClassifier())))
    level0.append(('forest', BaggingClassifier(base_estimator=RandomForestClassifier())))
    level0.append(('svm', BaggingClassifier(base_estimator=SVC())))
    level0.append(('bayes', BaggingClassifier(base_estimator=GaussianNB())))
    level0.append(('adaboost', BaggingClassifier(base_estimator=AdaBoostClassifier())))
    level0.append(('gradboost', BaggingClassifier(base_estimator=GradientBoostingClassifier())))
    level0.append(('xgboost', BaggingClassifier(base_estimator=XGBClassifier())))
    # define meta learner model
    level1 = DecisionTreeClassifier()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5, passthrough = True)
    return model
# passthrough is used for the stacking to take the og dataset instead of just the other model results

# get a list of models to evaluate
def get_models():
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['forest'] = RandomForestClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    models['adaboost'] = AdaBoostClassifier()
    models['gradboost'] = GradientBoostingClassifier()
    models['xgboost'] = XGBClassifier()
    models['stacking'] = get_stacking()
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

In [5]:
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>lr 0.988 (0.017)
>knn 0.792 (0.043)
>cart 0.980 (0.024)
>forest 0.994 (0.014)
>svm 0.724 (0.038)
>bayes 0.841 (0.059)
>adaboost 0.813 (0.067)
>gradboost 0.986 (0.017)
>xgboost 0.988 (0.017)
>stacking 0.989 (0.017)


The base models of Gradient Boosting and XGBoosting performed much better than a base AdaBoost model. In fact they achieved accuracies of about 17% higher than AdaBoost. Gradient Boosting and XGBoosting perfromed the best, just behind the random forest and logistic regression models. 

## Gradient Boost

In [6]:
# Splitting the Dataset into a Training and Testing set 
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [7]:
# Default Model
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Base Model Accuracy: %.2f%%" % (accuracy * 100.0))

# Model 2 - Low parameters 
model2 = GradientBoostingClassifier(n_estimators=10, 
                                    learning_rate=0.01, 
                                    subsample=0.01, 
                                    max_depth=0.01, 
                                    min_impurity_decrease=0.5, 
                                    random_state=0)
model2.fit(x_train, y_train)
y_pred2 = model2.predict(x_test)
predictions2 = [round(value) for value in y_pred2]
accuracy2 = accuracy_score(y_test, predictions2)
print("Model 2 Accuracy: %.2f%%" % (accuracy2 * 100.0))

# Model 3 - High parameters 
model3 = GradientBoostingClassifier(n_estimators=1000, 
                                    learning_rate=1, 
                                    subsample=1, 
                                    max_depth=10, 
                                    min_impurity_decrease=1, 
                                    random_state=0)
model3.fit(x_train, y_train)
y_pred3 = model3.predict(x_test)
predictions3 = [round(value) for value in y_pred3]
accuracy3 = accuracy_score(y_test, predictions3)
print("Model 3 Accuracy: %.2f%%" % (accuracy3 * 100.0))

# Model 4 - Low subsample 
model4 = GradientBoostingClassifier(n_estimators=1000, 
                                    learning_rate=1, 
                                    subsample=0.01, 
                                    max_depth=10, 
                                    min_impurity_decrease=1, 
                                    random_state=0)
model4.fit(x_train, y_train)
y_pred4 = model4.predict(x_test)
predictions4 = [round(value) for value in y_pred4]
accuracy4 = accuracy_score(y_test, predictions4)
print("Model 4 Accuracy: %.2f%%" % (accuracy4 * 100.0))

# Model 5 - Low Max_depth
model5 = GradientBoostingClassifier(n_estimators=1000, 
                                    learning_rate=1, 
                                    subsample=1, 
                                    max_depth=0.5, 
                                    min_impurity_decrease=1, 
                                    random_state=0)
model5.fit(x_train, y_train)
y_pred5 = model5.predict(x_test)
predictions5 = [round(value) for value in y_pred5]
accuracy5 = accuracy_score(y_test, predictions5)
print("Model 5 Accuracy: %.2f%%" % (accuracy5 * 100.0))

# Model 6 - high min_impurity_decrease
model6 = GradientBoostingClassifier(n_estimators=1000, 
                                    learning_rate=1, 
                                    subsample=1, 
                                    max_depth=5, 
                                    min_impurity_decrease=1, 
                                    random_state=0)
model6.fit(x_train, y_train)
y_pred6 = model6.predict(x_test)
predictions6 = [round(value) for value in y_pred6]
accuracy6 = accuracy_score(y_test, predictions6)
print("Model 6 Accuracy: %.2f%%" % (accuracy6 * 100.0))

Base Model Accuracy: 98.51%
Model 2 Accuracy: 49.25%
Model 3 Accuracy: 100.00%
Model 4 Accuracy: 34.33%
Model 5 Accuracy: 49.25%
Model 6 Accuracy: 100.00%


- The Base Gradient Boost Model achieved an accuracy of 95.5%. This will be what the other models are compared to. 
- Model 2 was given low parameters. It achieved almost the lowest accuracy score because of it. The low learning rate means that there is decreased contribution of each classifier. Low n-estimators means that less weak learners are used in training.  
- Model 3 was given high parameters. It acchieved the highest accuracy out of all the models. It is basically the opposite of Model 2 
- Model 4 was given just a low subsample. A subsample smaller than 1.0 this results in Stochastic Gradient Boosting. This achieved the lowest accuracy score. 
- Model 5 was given a low maximum depth. A small maximum depth limits the number of nodes in the tree. It achieved almost the lowest accuracy score.
- Model 6 was given a low minimum impurity decrease. This means a node will be split if this split induces a decrease of the impurity greater than or equal to this value. This model achieved one of the best accuracy scores. 

## XGBoost

In [8]:
# Default Model 
model = XGBClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Base Model Accuracy: %.2f%%" % (accuracy * 100.0))

# Model 2 - linear booster
model2 = XGBClassifier(booster = 'gblinear')
model2.fit(x_train, y_train)
y_pred2 = model2.predict(x_test)
predictions2 = [round(value) for value in y_pred2]
accuracy2 = accuracy_score(y_test, predictions2)
print("Model 2 Accuracy: %.2f%%" % (accuracy2 * 100.0))

# Model 3 - High Parameters 
model3 = XGBClassifier(n_estimators=1000, 
                       learning_rate = 1, 
                       gamma = 100, 
                       max_depth = 10, 
                       min_child_weight = 10, 
                       max_delta_step = 10, 
                       subsample = 1,
                       reg_lambda = 10,
                       alpha = 10)
model3.fit(x_train, y_train)
y_pred3 = model3.predict(x_test)
predictions3 = [round(value) for value in y_pred3]
accuracy3 = accuracy_score(y_test, predictions3)
print("Model 3 Accuracy: %.2f%%" % (accuracy3 * 100.0))

# Model 4 - Low Parameters 
model4 = XGBClassifier(n_estimators=10, 
                       learning_rate = 0.01, 
                       gamma = 1, 
                       max_depth = 1, 
                       min_child_weight = 0.1, 
                       max_delta_step = 0.1, 
                       subsample = 0.1,
                       reg_lambda = 1,
                       alpha = 1)
model4.fit(x_train, y_train)
y_pred4 = model4.predict(x_test)
predictions4 = [round(value) for value in y_pred4]
accuracy4 = accuracy_score(y_test, predictions4)
print("Model 4 Accuracy: %.2f%%" % (accuracy4 * 100.0))

# Model 5 - High Gamma
model5 = XGBClassifier(gamma = 100)
model5.fit(x_train, y_train)
y_pred5 = model5.predict(x_test)
predictions5 = [round(value) for value in y_pred5]
accuracy5 = accuracy_score(y_test, predictions5)
print("Model 5  Accuracy: %.2f%%" % (accuracy5 * 100.0))

# Model 5 - High alpha
model6 = XGBClassifier(alpha = 100)
model6.fit(x_train, y_train)
y_pred6 = model6.predict(x_test)
predictions6 = [round(value) for value in y_pred6]
accuracy6 = accuracy_score(y_test, predictions6)
print("Model 6  Accuracy: %.2f%%" % (accuracy6 * 100.0))

Base Model Accuracy: 98.51%
Model 2 Accuracy: 98.51%
Model 3 Accuracy: 82.09%
Model 4 Accuracy: 98.51%
Model 5  Accuracy: 83.58%
Model 6  Accuracy: 49.25%


- The XGBoosted base model with default parameters achieved an accuracy of 97%. This will be what the other models are compared to.
- Model 2 had a linear function booster. This achieved the highest accuracy out of all the models. 
- Model 3 was given high parameters. It had a low accuracy. The high min_child  wieght is the Minimum sum of instance weight (hessian) needed in a child node. Max_delta_step is Maximum delta step we allow each leaf output to be. If the value is set to 0, it means there is no constraint.
- Model 4 was given low parameters and achieved a higher accuracy score than Model 3
- Model 5 was given a high Gamma. This resulted in a bad accuracy score. Gamma is the minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
- Model 6 was given a high alpha. This had the worst accuracy. Alpha is the L1 regularization term on weights. Increasing this value will make model more conservative.

### Compared with AdaBoost

Compared to the AdaBoost PA, these accuracy scores varied much more. It seems that small changes in any variable can have drastic effects on the performance of the models. Unlike AdaBoosting, where you had to change many variables to get a different accuracy. 