# Penguin Classification Model

In [181]:
import pandas as pd
import numpy as np
import seaborn as sns

###  Importing penguin  dataset from  seaborn

In [182]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [183]:
#rows, columns
df.shape

(344, 7)

In [184]:
#finding out the types of our columns
df.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

#### 3 object data types, we have to convert those into numbers

#### 3 Ways to convert objects
1. Mapping with pandas (will use on Species)
2. Label Encoding (will use on Island)
3. One Hot Encoding (will use on Sex)

## Check and drop null values

In [185]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [186]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [187]:
#Creating a copy of the dataframe so we can work on
df1 = df.copy()
df1.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


## 1.  Mapping Function

In [188]:
#getting unique values for species column
df1.species.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [189]:
#seperate dataframe mapping the species to 0-1-2
Y = df1.species.map({'Adelie':0,'Chinstrap':1,'Gentoo':2})
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

### Dropping the species column for model purposes later

In [190]:
df1.drop('species', inplace=True, axis=1)
df1.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Torgersen,39.3,20.6,190.0,3650.0,Male


## 2. Label Encoding

In [191]:
#getting unique values for island column
df1.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [192]:
#import label encoding from libraries
from sklearn.preprocessing import LabelEncoder

In [193]:
le = LabelEncoder()

In [194]:
#making the label encoding - It follows alphabetical order Biscoe = 0, Dream = 1, Torgersen = 2
df1['island'] = le.fit_transform(df1['island'])
df1['island']

0      2
1      2
2      2
4      2
5      2
      ..
338    0
340    0
341    0
342    0
343    0
Name: island, Length: 333, dtype: int32

### Convert array into series

In [195]:
#We need to convert the array into a series so we can use it for data manipulation with pandas since 
#arrays are not possible to be used
series = pd.Series(df1['island'])
print("Pandas Series :")
display(series)

Pandas Series :


0      2
1      2
2      2
4      2
5      2
      ..
338    0
340    0
341    0
342    0
343    0
Name: island, Length: 333, dtype: int32

## 3. One Hot Encoding

In [196]:
#Create dummies for the sex column - It creates seperate column for each value putting 0s and 1s on what the entry belongs to
pd.get_dummies(df1['sex']).head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [197]:
#Create a seperatee dataframe for the dummies but this time we drop one column since we only have 2 sex variables and
#we can determine the other from the 0s
se = pd.get_dummies(df1['sex'],drop_first=True)
se.head()

Unnamed: 0,Male
0,1
1,0
2,0
4,0
5,1


### Now we concatenate all created dataframes into a single one
We also ddrop sex column since we have replaced it with the newly created OneHotCoded(OHC) one

In [198]:
X = pd.concat([df1, se], axis=1)
X.drop('sex', axis=1, inplace=True)
X.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Male
0,2,39.1,18.7,181.0,3750.0,1
1,2,39.5,17.4,186.0,3800.0,0
2,2,40.3,18.0,195.0,3250.0,0
4,2,36.7,19.3,193.0,3450.0,0
5,2,39.3,20.6,190.0,3650.0,1


In [199]:
#checking new object types of the dataframe to make sure we got number types before we 
#proceed to creating predictive model
X.dtypes

island                 int32
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
Male                   uint8
dtype: object

In [200]:
#checking our species dataframe as well to make sure we got number type
#this will be what we will be predicting
Y.dtypes

dtype('int64')

## Predictive Model Creation

In [201]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=40)

### Random Forest Classifier

In [202]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

model1 = rfc.fit(X_train, y_train)
prediction1 = model1.predict(X_test)

print("Accuracy on training data: {:,.3f}".format(rfc.score(X_train, y_train)))
print("Accuracy on test data: {:,.3f}".format(rfc.score(X_test, y_test)))

Accuracy on training data: 1.000
Accuracy on test data: 1.000


In [203]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [204]:
#All cases classify as one of the 3 species, no negative or false cases
print(confusion_matrix(y_test,prediction1))

[[41  0  0]
 [ 0 23  0]
 [ 0  0 36]]


In [205]:
#Analytics for the model
print(classification_report(y_test,prediction1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        36

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



### Different Model Evaluation

In [206]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [207]:
models = []
models.append(('K-Neighbors Classifier',KNeighborsClassifier()))
models.append(('Naive Bayes classifier',GaussianNB()))
models.append(('Support Vector Machine',SVC()))
models.append(('Random Forest Classifier',RandomForestClassifier()))

In [208]:
#Evaluation for each model

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = '%s:, %f, (%f)' % (name,cv_results.mean(), cv_results.std())
    print(msg)

K-Neighbors Classifier:, 0.755616, (0.060228)
Naive Bayes classifier:, 0.961413, (0.048394)
Support Vector Machine:, 0.747283, (0.076540)
Random Forest Classifier:, 0.982609, (0.039848)


## Predictions

### Random Forest Classifier

In [209]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
predictions = rfc.predict(X_test)
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        36

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

[[41  0  0]
 [ 0 23  0]
 [ 0  0 36]]


### Gaussian NB

In [210]:
NB = GaussianNB()
NB.fit(X_train, y_train)
predictions = NB.predict(X_test)
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

0.97
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        41
           1       0.88      1.00      0.94        23
           2       1.00      1.00      1.00        36

    accuracy                           0.97       100
   macro avg       0.96      0.98      0.97       100
weighted avg       0.97      0.97      0.97       100

[[38  3  0]
 [ 0 23  0]
 [ 0  0 36]]


### KNN Classifier

In [211]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
predictions = KNN.predict(X_test)
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

0.78
              precision    recall  f1-score   support

           0       0.68      0.95      0.80        41
           1       0.91      0.43      0.59        23
           2       0.91      0.81      0.85        36

    accuracy                           0.78       100
   macro avg       0.83      0.73      0.75       100
weighted avg       0.82      0.78      0.77       100

[[39  0  2]
 [12 10  1]
 [ 6  1 29]]
