# Practice Activity - Neural Networks
## Nick Bias
### 5/6/22
## Goal: Predict Penguin Species
### Libraries 

In [1]:
import pandas as pd
import numpy as np

# For Neural Networks 
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# For Boosting 
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# For Comparision Models
# compare standalone models for binary classification
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB 
from sklearn.naive_bayes import BernoulliNB 
from sklearn.ensemble import RandomForestClassifier

# For splitting data into training and testing 
from sklearn.model_selection import train_test_split
# example of calculate the mean absolute error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# For Evaluations 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

# for viewing PNG files 
from IPython.display import Image
from IPython.core.display import HTML 

import warnings
warnings.filterwarnings('ignore')

# So results are same when re-run
import random
random.seed(10)

In [2]:
size = pd.read_csv("Data/Week4/penguins_size.csv")

# Drop Rows with NA values 
clean = size.dropna()

# Drops row with a '.' for the Sex Variable
clean = clean[clean['sex'] != '.']
# only 11 rows were dropped 

# Creating Dummy Variables for Island and Sex
island = pd.get_dummies(clean['island'])
sex = pd.get_dummies(clean['sex'])

# Merging with Original Data
penguins = pd.merge(clean, island, left_index=True, right_index=True)
penguins = pd.merge(penguins, sex, left_index=True, right_index=True)

# Dropping Columns that the Dummies were made from 
penguins = penguins.drop(['sex', 'island', 'FEMALE'], axis = 1)
penguins

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,MALE
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...
338,Gentoo,47.2,13.7,214.0,4925.0,1,0,0,0
340,Gentoo,46.8,14.3,215.0,4850.0,1,0,0,0
341,Gentoo,50.4,15.7,222.0,5750.0,1,0,0,1
342,Gentoo,45.2,14.8,212.0,5200.0,1,0,0,0


### Splitting Prediction Variable from dataset
- X = Dataset with all Independent Variables 
- y = The Dependent Variable of Penguin Species

In [3]:
# Only Takes Dependent Variable we are tryng to predict 
y = penguins['species']

# Change levels to numeric for XGBoosting 
y = y.replace('Adelie', 0)
y = y.replace('Chinstrap', 1)
y = y.replace('Gentoo', 2)
# 'Adelie' 'Chinstrap' 'Gentoo'

# Takes independent variables that will be used to predict 
X = penguins.iloc[:,1:]

In [4]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', BaggingClassifier(base_estimator=LogisticRegression())))
    level0.append(('knn', BaggingClassifier(base_estimator=KNeighborsClassifier())))
    level0.append(('tree', BaggingClassifier(base_estimator=DecisionTreeClassifier())))
    level0.append(('forest', BaggingClassifier(base_estimator=RandomForestClassifier())))
    level0.append(('svm', BaggingClassifier(base_estimator=SVC())))
    level0.append(('bayes', BaggingClassifier(base_estimator=GaussianNB())))
    level0.append(('adaboost', BaggingClassifier(base_estimator=AdaBoostClassifier())))
    level0.append(('gradboost', BaggingClassifier(base_estimator=GradientBoostingClassifier())))
    level0.append(('xgboost', BaggingClassifier(base_estimator=XGBClassifier())))
    level0.append(('nueralNet', BaggingClassifier(base_estimator=MLPClassifier(random_state=1))))
    # define meta learner model
    level1 = DecisionTreeClassifier()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5, passthrough = True)
    return model
# passthrough is used for the stacking to take the og dataset instead of just the other model results

# get a list of models to evaluate
def get_models():
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['tree'] = DecisionTreeClassifier()
    models['forest'] = RandomForestClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    models['adaboost'] = AdaBoostClassifier()
    models['gradboost'] = GradientBoostingClassifier()
    models['xgboost'] = XGBClassifier()
    models['nueralNet'] = MLPClassifier(random_state=1)
    models['stacking'] = get_stacking()
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

In [None]:
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>lr 0.988 (0.017)
>knn 0.792 (0.043)
>tree 0.980 (0.020)
>forest 0.992 (0.015)
>svm 0.724 (0.038)
>bayes 0.841 (0.059)
>adaboost 0.813 (0.067)
>gradboost 0.986 (0.017)
>xgboost 0.988 (0.017)
>nueralNet 0.398 (0.078)


We can see that out of the 10 different default Models, 5 were able to achieve accuracy scores above 97%. These models were Logistic Regression, Decision Tree, Random Forest, Gradient Boost, and XG Boost. Naive Bayes and AdaBoost had Accuracies around 80%, while KNN and SVM had accuracies around 75%. These all did much better compared to a default Nueral Network. It has an Accuracy about 40%, which is the worst out of all the models. This is horrible accuracy compared to every other model. Even SVM accuracy, which had the second worst accuracy, was 32.6% higher than the Nueral Network. Further tunning will need to be done to make a decent Nueral Network model.

In [5]:
# Splitting the Dataset into a Training and Testing set 
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

To have consistent resutls, I will have each model have "random_state=1".

In [8]:
clf = MLPClassifier(random_state=1)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Base Model Accuracy: %.2f%%" % (accuracy * 100.0))

Base Model Accuracy: 43.28%


We can see that a base Neural Metwork Model only has an accuracy of 43.28%. This is very bad. We would at least like this to be above 75%. I will use the SciKit Learn documentation website to find parameters to tune. I will mess around with these until I notice a difference in the accuracy score.

In [10]:
clf = MLPClassifier(random_state=1, solver='sgd', learning_rate='invscaling', power_t=1, max_iter=500)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 1 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 1 Accuracy: 41.79%


For this second Model, I changed the solver to ‘sgd’ which refers to stochastic gradient descent. From there, I changed the learning rate to 'invscaling'. This means the model will gradually decrease the learning rate at each time step ‘t’ using an inverse scaling exponent of ‘power_t’. Because of this I added power t. It is the exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’ and can only used when solver=’sgd’. I then made max iterations 500. OVerall, this made the Model 1 perform slightly worse then the base model, with an accuracy of 41.79%.

In [12]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 2 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 2 Accuracy: 32.84%


Model 2 had a 'lbfgs' solver. This is an optimizer in the family of quasi-Newton methods.The alpha was then set to 0.00001, which is the L2 penalty (regularization term) parameter. It has an architecture of 2 hideen layers with the first layer having 5 nodes, while the second layer has 2. In the end, this made Model 2 perform worse than the previous 2 models with an accuracy of 32.84%, which is more than 10% less than the base model.

In [27]:
clf = MLPClassifier(random_state=1, hidden_layer_sizes=(400,100,50,5), alpha=0.01)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 3 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 3 Accuracy: 25.37%


For Model 3, I messed with the Hidden Layer sizes. I gave it 4 hidden layers. The first layer had 400 nodes, the second had 100, the third had 50, and the four and final hidden layer had 5 nodes. The alpha was then set to 0.01. This made the worst performing model out of all the Neural Networks tested, with an accuracy of 25.37%.

In [17]:
bag = BaggingClassifier(base_estimator=MLPClassifier(random_state=1), random_state=1)
bag.fit(x_train, y_train)

y_pred = bag.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 4 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 4 Accuracy: 58.21%


For Model 4, I decided to Bag a base Neural Network model to see if this would imporve anything. To my surprise, it increase accuracy by almost 15%. This is the best performing model so far. For my Final Model I will make sure to use bagging to increase performance. 

In [25]:
clf = MLPClassifier(random_state=1, max_iter=200, activation='logistic', learning_rate_init=0.0005)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 5 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 5 Accuracy: 56.72%


For Model 5, I increased the base max iterations from 100 to 200. This solver iterates until convergence (determined by ‘tol’) or this number of iterations. I then changed activation to 'logistic', which uses the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). I then decreased the inital learning rate, which controls the step-size in updating the weights. This Model had increrased performance from the base model, with an accuracy of 56.72%. 

In [19]:
clf = MLPClassifier(random_state=1, 
                    max_iter=300, 
                    activation='logistic', 
                    learning_rate_init=0.002, 
                    hidden_layer_sizes=(400,), 
                    alpha=0.001)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 6 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 6 Accuracy: 73.13%


Model 6 achieved an accuracy of 73.13%. Its maximum iterations were set to 300. It used a logistic activation, with an inital learning rate of 0.002. Its architecture has only 1 hidden layer with 400 nodes in it. The alpha was then set to 0.001. Overall, this is performing alright, but we can still try to make a better model. 

In [33]:
bag = BaggingClassifier(base_estimator=MLPClassifier(random_state=1, 
                                                     solver='lbfgs',
                                                     max_iter=300, 
                                                     activation='logistic', 
                                                     learning_rate_init=0.00065, 
                                                     hidden_layer_sizes=(400,400), 
                                                     alpha=0.01), random_state=1)
bag.fit(x_train, y_train)

y_pred = bag.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Model 7 Accuracy: %.2f%%" % (accuracy * 100.0))

Model 7 Accuracy: 92.54%


Model 6 uses many features of the old models and adds to them. I first bagged the model, as I noticed this helped performance earlier. Its architecture is 2 hidden layers with 400 nodes in each. Its solver was set to 'lbfgs', with 300 maximum iterations, a logistic activation function, an inital learning rate of 0.00065 and an alpha of 0.01. All these parameters helped this model achieve an accuracy above 90%. 

In [28]:
bag = BaggingClassifier(base_estimator=MLPClassifier(random_state=1, 
                                                     solver='lbfgs',
                                                     max_iter=300, 
                                                     activation='logistic', 
                                                     learning_rate_init=0.00065, 
                                                     hidden_layer_sizes=(400,), 
                                                     alpha=0.01), random_state=1)
bag.fit(x_train, y_train)

y_pred = bag.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Final Model Accuracy: %.2f%%" % (accuracy * 100.0))

Final Model Accuracy: 100.00%


For my Final Model, I used everything that I learned in the previous models. I put in all the parameters that I noticed would increase the accuracy of a model. Because of this, the Final Model had an amazing accuracy of 100%. This is almost a 57% increase in accuracy compared to the base model. This may indicate that the Model is overfitting the data, however, this dataset is rather small, with only 67 observations in the testing set. 

In [29]:
print(metrics.confusion_matrix(y_test, y_pred))

[[22  0  0]
 [ 0 17  0]
 [ 0  0 28]]


Here we can see the Confusion Matrix of the final model and see how it classified each spieces.

In [30]:
print(metrics.classification_report(y_test, predictions, digits=3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        22
           1      1.000     1.000     1.000        17
           2      1.000     1.000     1.000        28

    accuracy                          1.000        67
   macro avg      1.000     1.000     1.000        67
weighted avg      1.000     1.000     1.000        67



Here we can see that the Precision and Recall of each class is at 100%. This means 100% of positive identifications that are actually correct and 100% of actual positives that were identified correctly. This Model seems to have no misclassification. If the dataset was larger we should investigate overfitting, but for now, this is a perfect model. 

In [31]:
forest = RandomForestClassifier()
forest.fit(x_train, y_train)

y_pred = forest.predict(x_test)
predictions = [round(value) for value in y_pred]

print(metrics.classification_report(y_test, predictions, digits=3))

              precision    recall  f1-score   support

           0      0.955     0.955     0.955        22
           1      0.941     0.941     0.941        17
           2      1.000     1.000     1.000        28

    accuracy                          0.970        67
   macro avg      0.965     0.965     0.965        67
weighted avg      0.970     0.970     0.970        67



The Final Neural Network Model is now outperforming a base Random Forest Model, which has an accuracy of 97%. Its precision and recall scores range from 94% to 100%. This is a good model, however, the Neural Network model had to go throguh a great deal of tuning to outperform this base level Random Forest Model. I am sure with some tuning in the Random Forest Model, it will be able to achieve an accuracy of 100%. 

## Model Archecture

I do not know why but the images will not appear in the HTML file.

### Base Model and Models 1, 4, & 5 

In [24]:
Image(url= "Data/Week6/NN_Arch1.png")

### Model 2

In [12]:
Image(url= "Data/Week6/NN_Arch2.png")

### Model 3

In [13]:
Image(url= "Data/Week6/NN_Arch3.png")

### Model 7

In [14]:
Image(url= "Data/Week6/NN_Arch5.png")

### Model 6 and Final Model

In [16]:
Image(url= "Data/Week6/NN_Arch4.png")