# Random Forest

## Classifier

In [161]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score

In [142]:
df = sns.load_dataset('penguins')

In [145]:
df.dropna(inplace=True)
df.info()
df.isna().sum()
# df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [146]:
categorical_feature = ['island']
ordinal_feature1 = ['sex']
ordinal_feature2 = ['species']
ordinal_category1 = [['Male', 'Female']]
ordinal_category2 = [['Adelie', 'Chinstrap', 'Gentoo']]

pipe_categorical = Pipeline(steps=[
    ('categorical', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

pipe_ordinal1 = Pipeline(steps=[
    ('Ordinal1', OrdinalEncoder(categories=ordinal_category1))
])

pipe_ordinal2 = Pipeline(steps=[
    ('Ordinal2', OrdinalEncoder(categories=ordinal_category2))
])

# ColumnTransformer definition
transformer = ColumnTransformer(transformers=[
    ('category', pipe_categorical, categorical_feature),
    ('Ordinal1', pipe_ordinal1, ordinal_feature1),
    ('Ordinal2', pipe_ordinal2, ordinal_feature2)
])

transformer.set_output(transform='pandas')
transformed_data = transformer.fit_transform(df)

new_df = pd.concat([df, transformed_data],axis=1)

new_df.drop(['island', 'sex'], axis=1, inplace=True)
new_df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,category__island_Biscoe,category__island_Dream,category__island_Torgersen,Ordinal1__sex,Ordinal2__species
0,Adelie,39.1,18.7,181.0,3750.0,0.0,0.0,1.0,0.0,0.0
1,Adelie,39.5,17.4,186.0,3800.0,0.0,0.0,1.0,1.0,0.0
2,Adelie,40.3,18.0,195.0,3250.0,0.0,0.0,1.0,1.0,0.0
4,Adelie,36.7,19.3,193.0,3450.0,0.0,0.0,1.0,1.0,0.0
5,Adelie,39.3,20.6,190.0,3650.0,0.0,0.0,1.0,0.0,0.0


In [167]:
x = new_df.iloc[:, 1:9]
y = new_df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# with entropy

model1 = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=0)
model1.fit(x_train, y_train)

y_pred1 = model1.predict(x_test)
accuracy_score1 = accuracy_score(y_pred1,y_test)
classification_report1 = classification_report(y_pred1, y_test)
confusion_matrix1 = confusion_matrix(y_pred1, y_test)

print(f''' ## Classification with entropy:
Accuracy Score: {round(accuracy_score1,2)*100}\n
Confusion Matrix: \n{confusion_matrix1} \n
Classification Report: \n{classification_report1}
''')

# with gini

model2 = RandomForestClassifier(n_estimators=5, criterion='gini', random_state=0)
model2.fit(x_train, y_train)

y_pred2 = model2.predict(x_test)
accuracy_score2 = accuracy_score(y_pred2,y_test)
classification_report2 = classification_report(y_pred2, y_test)
confusion_matrix2 = confusion_matrix(y_pred2, y_test)

# cross_validation:

cross_validation_entropy = cross_val_score(model1,x, y, cv=5)
cross_validation_gini = cross_val_score(model2, x, y, cv=5)

print(f''' ## Classification with gini:
Accuracy Score: {round(accuracy_score2,2)*100}\n
Confusion Matrix: \n{confusion_matrix2} \n
Classification Report: \n{classification_report2}

cross-validation:
cross_validation_entropy: \n {cross_validation_entropy} \n
cross_validation_gini: \n {cross_validation_gini} \n
''')


 ## Classification with entropy:
Accuracy Score: 99.0

Confusion Matrix: 
[[39  1  0]
 [ 0  9  0]
 [ 0  0 18]] 

Classification Report: 
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        40
         1.0       0.90      1.00      0.95         9
         2.0       1.00      1.00      1.00        18

    accuracy                           0.99        67
   macro avg       0.97      0.99      0.98        67
weighted avg       0.99      0.99      0.99        67


 ## Classification with gini:
Accuracy Score: 97.0

Confusion Matrix: 
[[39  2  0]
 [ 0  8  0]
 [ 0  0 18]] 

Classification Report: 
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97        41
         1.0       0.80      1.00      0.89         8
         2.0       1.00      1.00      1.00        18

    accuracy                           0.97        67
   macro avg       0.93      0.98      0.95        67
weighted avg       0.