# Practice Activity - AdaBoost
## Nick Bias
### 4/22/22
## Goal: Predict Penguin Species
### Libraries 

In [1]:
# For data 
import numpy as np
import pandas as pd

# For AdaBoosting Models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# For Comparision Models
# compare standalone models for binary classification
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# For splitting data into training and testing 
from sklearn.model_selection import train_test_split
# example of calculate the mean absolute error
#from sklearn.metrics import accur

# So results are same when re-run
import random
random.seed(10)

### Importing Data

In [2]:
#lter = pd.read_csv("Data/Week4/penguins_lter.csv")
size = pd.read_csv("Data/Week4/penguins_size.csv")
#penguin = pd.merge(lter, size, left_index=True, right_index=True)
#penguin.head
size

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


As we can see, there are quite a few NaN values in the cells. Since most of the variables in these rows have NaNs, we should probably just drop these rows from the dataset, since they will not be useful for analysis. 

### Data Cleaning

In [3]:
# Drop Rows with NA values 
clean = size.dropna()

# Drops row with a '.' for the Sex Variable
clean = clean[clean['sex'] != '.']

clean
# only 11 rows were dropped 

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [4]:
# Creating Dummy Variables for Island and Sex
island = pd.get_dummies(clean['island'])
sex = pd.get_dummies(clean['sex'])

# Merging with Original Data
penguins = pd.merge(clean, island, left_index=True, right_index=True)
penguins = pd.merge(penguins, sex, left_index=True, right_index=True)

# Dropping Columns that the Dummies were made from 
penguins = penguins.drop(['sex', 'island', 'FEMALE'], axis = 1)
penguins

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,MALE
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...
338,Gentoo,47.2,13.7,214.0,4925.0,1,0,0,0
340,Gentoo,46.8,14.3,215.0,4850.0,1,0,0,0
341,Gentoo,50.4,15.7,222.0,5750.0,1,0,0,1
342,Gentoo,45.2,14.8,212.0,5200.0,1,0,0,0


This is the Final Dataset that will be used for Analysis. 

### Splitting Prediction Variable from dataset
- X = Dataset with all Independent Variables 
- y = The Dependent Variable of Penguin Species

In [5]:
# Only Takes Dependent Variable we are tryng to predict 
y = penguins['species']

# Takes independent variables that will be used to predict 
X = penguins.iloc[:,1:]

## Baseline Models to Compare with 

In [6]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', BaggingClassifier(base_estimator=LogisticRegression())))
    level0.append(('knn', BaggingClassifier(base_estimator=KNeighborsClassifier())))
    level0.append(('cart', BaggingClassifier(base_estimator=DecisionTreeClassifier())))
    level0.append(('forest', BaggingClassifier(base_estimator=RandomForestClassifier())))
    level0.append(('svm', BaggingClassifier(base_estimator=SVC())))
    level0.append(('bayes', BaggingClassifier(base_estimator=GaussianNB())))
    level0.append(('boost', BaggingClassifier(base_estimator=AdaBoostClassifier())))
    # define meta learner model
    level1 = DecisionTreeClassifier()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5, passthrough = True)
    return model
# passthrough is used for the stacking to take the og dataset instead of just the other model results

# get a list of models to evaluate
def get_models():
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['forest'] = RandomForestClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    models['boost'] = AdaBoostClassifier()
    models['stacking'] = get_stacking()
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

In [7]:
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>lr 0.988 (0.017)
>knn 0.792 (0.043)
>cart 0.977 (0.023)
>forest 0.992 (0.015)
>svm 0.724 (0.038)
>bayes 0.841 (0.059)
>boost 0.813 (0.067)
>stacking 0.981 (0.025)


The best Models were the Decision Tree and Random Forest Models. AdaBoosting was the 6th best model out of 8 models. This may have been because the parameters have not been tuned. 

## AdaBoosting

In [8]:
# Splitting the Dataset into a Training and Testing set 
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [9]:
AdaBoost = AdaBoostClassifier(n_estimators=100,learning_rate=0.2, algorithm='SAMME')
    
AdaBoost.fit(x_train, y_train)

prediction = AdaBoost.score(x_test,y_test)

print('The accuracy is: ',prediction*100,'%')

The accuracy is:  97.01492537313433 %


### Tuning Parameters

In [10]:
# Model 0 - default parameters
AdaBoost0 = AdaBoostClassifier()
AdaBoost0.fit(x_train, y_train)
prediction0 = AdaBoost0.score(x_test,y_test)

# Model 1 - High parameters 
AdaBoost1 = AdaBoostClassifier(n_estimators=400,
                               learning_rate=1, algorithm='SAMME')
AdaBoost1.fit(x_train, y_train)
prediction1 = AdaBoost1.score(x_test,y_test)

# Model 2 - Low estimators and learning rate 
AdaBoost2 = AdaBoostClassifier(n_estimators=5,
                               learning_rate=0.1, algorithm='SAMME')
AdaBoost2.fit(x_train, y_train)
prediction2 = AdaBoost2.score(x_test,y_test)

# Model 3 - Low learning rate 
AdaBoost3 = AdaBoostClassifier(n_estimators=100,
                               learning_rate=0.1, algorithm='SAMME')
AdaBoost3.fit(x_train, y_train)
prediction3 = AdaBoost3.score(x_test,y_test)

# Model 4 - New SVC model, high estimators and learning rate
AdaBoost4 = AdaBoostClassifier(base_estimator= SVC(), 
                               n_estimators=400,
                               learning_rate=1, algorithm='SAMME')
AdaBoost4.fit(x_train, y_train)
prediction4 = AdaBoost4.score(x_test,y_test)

base_estimator=RandomForestClassifier()

# Model 5 - New Random Forest model 
AdaBoost5 = AdaBoostClassifier(base_estimator=RandomForestClassifier())
AdaBoost5.fit(x_train, y_train)
prediction5 = AdaBoost5.score(x_test,y_test)

# Model 6 - Decision Tree 
AdaBoost6 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                               n_estimators=100,
                               learning_rate=0.5, algorithm='SAMME')
AdaBoost6.fit(x_train, y_train)
prediction6 = AdaBoost6.score(x_test,y_test)

print('Model 0 accuracy is: ',prediction0*100,'%')
print('Model 1 accuracy is: ',prediction1*100,'%')
print('Model 2 accuracy is: ',prediction2*100,'%')
print('Model 3 accuracy is: ',prediction3*100,'%')
print('Model 4 accuracy is: ',prediction4*100,'%')
print('Model 5 accuracy is: ',prediction5*100,'%')
print('Model 6 accuracy is: ',prediction6*100,'%')

Model 0 accuracy is:  73.13432835820896 %
Model 1 accuracy is:  98.50746268656717 %
Model 2 accuracy is:  65.67164179104478 %
Model 3 accuracy is:  95.52238805970148 %
Model 4 accuracy is:  37.3134328358209 %
Model 5 accuracy is:  98.50746268656717 %
Model 6 accuracy is:  95.52238805970148 %


- Model 0 is a Baseline model with default perameters. This will be what the other models are compared with. 
- Model 1 has High n-estimators and Learning Rate. A higher learning rate increases the contribution of each classifier. High n-estimators means much more weak learners are used in the next training iteration. This achieved one of the best accuracy scores. 
- Model 2 has low n-estimators and Learning Rate. This is basically the opposite of Model 2. Classifiers have less contribution and not many weak learners are trained in the next iteration. Its accuracy is decent.
- Model 3 has a low learning rate and still achieved one of the best accuracy scores. 
- Model 4 uses a Support Vector Classifier model and high estimators and learning rate. It performed the worst out of all the models.
- Model 5 uses a Random Forest Model with default perameters. It is one of the best performing models.
- Model 6 uses a Decision Tree Model with high estimators and less learning rate. 