In [96]:
import pandas as pd 
import numpy as np 
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,accuracy_score

In [97]:
#Loading Dataset
df = sns.load_dataset('penguins')
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [98]:
#Calculating the null values
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [99]:
#dropping null values
df.dropna(inplace = True)

df.isnull().sum()

df


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [100]:
#Tranforming Island data from categorical to numeric 

df.island.unique()

island = pd.get_dummies(df['island'],drop_first=True)
island.head()


Unnamed: 0,Dream,Torgersen
0,False,True
1,False,True
2,False,True
4,False,True
5,False,True


In [101]:
#Transforming Sex df into numeric
df.sex.unique()

sex = pd.get_dummies(df['sex'], drop_first=True)
sex.head()

Unnamed: 0,Male
0,True
1,False
2,False
4,False
5,True


In [102]:
#Concatinating new dfs to original Df

new_data = pd.concat([df,island,sex],axis=1)
new_data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,Male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,False,True,True
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,False,True,False
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,False,True,False
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,False,True,False
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,False,True,True


In [103]:
#Dropping repeted columns

new_data.drop(['sex', 'island'], axis=1, inplace=True)
new_data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,Adelie,39.1,18.7,181.0,3750.0,False,True,True
1,Adelie,39.5,17.4,186.0,3800.0,False,True,False
2,Adelie,40.3,18.0,195.0,3250.0,False,True,False
4,Adelie,36.7,19.3,193.0,3450.0,False,True,False
5,Adelie,39.3,20.6,190.0,3650.0,False,True,True


In [104]:
#separating data 

Y =  new_data.species

#transforming  y into numeric data

Y = Y.map({'Adelie':0, 'Chinstrap':1, 'Gentoo':2})
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [105]:
#Dropping y column

new_data.drop('species', inplace=True, axis= 1)
new_data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,39.1,18.7,181.0,3750.0,False,True,True
1,39.5,17.4,186.0,3800.0,False,True,False
2,40.3,18.0,195.0,3250.0,False,True,False
4,36.7,19.3,193.0,3450.0,False,True,False
5,39.3,20.6,190.0,3650.0,False,True,True


In [106]:
#Splitting of data into Train and Test data
X= new_data
X_train, X_test,y_train,y_test = train_test_split(X,Y, test_size=0.3,random_state =0 )


In [107]:
#Train Random Forest Classification on training set

classifier = RandomForestClassifier(n_estimators = 5, criterion = 'entropy', random_state = 0)
classifier.fit(X_train,y_train)

In [108]:
#Predicting the test result

y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 2, 0, 0, 0, 1, 2, 2, 1, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1, 0, 2, 2, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 0,
       2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 0, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1, 0,
       0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2], dtype=int64)

In [109]:
#Accuracy of the model
cm = confusion_matrix(y_test,y_pred)
cm

array([[48,  0,  0],
       [ 2, 14,  0],
       [ 0,  0, 36]], dtype=int64)