### Libraries Used

In [169]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier 
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, f1_score

#### Importing Data

In [197]:
df = pd.read_csv(f'/home/kaykebk/Downloads/penguins_size.csv')

#### Transforming Sex class into 0 an 1

In [198]:
df['sex'] = df['sex'].map({'MALE': 0, 'FEMALE': 1})

#### Showing data characteristics

In [199]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    float64
dtypes: float64(5), object(2)
memory usage: 18.9+ KB


In [201]:
df.describe(include='all')

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
count,344,344,342.0,342.0,342.0,342.0,333.0
unique,3,3,,,,,
top,Adelie,Biscoe,,,,,
freq,152,168,,,,,
mean,,,43.92193,17.15117,200.915205,4201.754386,0.495495
std,,,5.459584,1.974793,14.061714,801.954536,0.500732
min,,,32.1,13.1,172.0,2700.0,0.0
25%,,,39.225,15.6,190.0,3550.0,0.0
50%,,,44.45,17.3,197.0,4050.0,0.0
75%,,,48.5,18.7,213.0,4750.0,1.0


In [203]:
df.corr(numeric_only=True)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
culmen_length_mm,1.0,-0.235053,0.656181,0.59511,-0.344078
culmen_depth_mm,-0.235053,1.0,-0.583851,-0.471916,-0.372673
flipper_length_mm,0.656181,-0.583851,1.0,0.871202,-0.255169
body_mass_g,0.59511,-0.471916,0.871202,1.0,-0.424987
sex,-0.344078,-0.372673,-0.255169,-0.424987,1.0


Since there's high correlation between flipper_length_mm and body_mass_g, they're going to be used
to train the models and sex will be the target of this classification.

#### Checking if there are null values

In [206]:
df.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

##### Since there are null values, we'll be using Simple Imputer to fill null data

In [207]:
imputer = SimpleImputer(strategy='most_frequent')

In [208]:
df.iloc[:,:] = imputer.fit_transform(df)

In [209]:
df.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

#### Testing Multiple Classifiers

Splitting data into train and test dataset

In [182]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [183]:
train_X = train[['flipper_length_mm', 'body_mass_g']]
train_y = train['sex']

test_X = test[['flipper_length_mm', 'body_mass_g']]
test_y = test['sex']

#### SVM

In [184]:
model = SVC(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of SVC is {accuracy_score(test_y, prediction)}")
print(f"Precision of SVC is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of SVC is {f1_score(test_y, prediction)}")

Accuracy of SVC is 0.6923076923076923
Precision of SVC is 0.6932752781809385
F1 Score of SVC is 0.6799999999999999


#### MLP

In [185]:
model = MLPClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of MLP is {accuracy_score(test_y, prediction)}")
print(f"Precision of MLP is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of MLP is {f1_score(test_y, prediction)}")

Accuracy of MLP is 0.3173076923076923
Precision of MLP is 0.212828330206379
F1 Score of MLP is 0.0


#### Random Forest

In [186]:
model = RandomForestClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Random Forest is {accuracy_score(test_y, prediction)}")
print(f"Precision of Random Forest is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Random Forest is {f1_score(test_y, prediction)}")

Accuracy of Random Forest is 0.7884615384615384
Precision of Random Forest is 0.7884184684296666
F1 Score of Random Forest is 0.7708333333333333


#### Ada Boost

In [187]:
model = AdaBoostClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Ada Boost is {accuracy_score(test_y, prediction)}")
print(f"Precision of Ada Boost is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Ada Boost is {f1_score(test_y, prediction)}")

Accuracy of Ada Boost is 0.8076923076923077
Precision of Ada Boost is 0.8086773866434882
F1 Score of Ada Boost is 0.7872340425531914


#### Bagging

In [188]:
model = BaggingClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Bagging is {accuracy_score(test_y, prediction)}")
print(f"Precision of Bagging is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Bagging is {f1_score(test_y, prediction)}")

Accuracy of Bagging is 0.7692307692307693
Precision of Bagging is 0.7691159158124443
F1 Score of Bagging is 0.7499999999999999


#### Gradient Boosting

In [189]:
model = GradientBoostingClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Gradient Boosting is {accuracy_score(test_y, prediction)}")
print(f"Precision of Gradient Boosting is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Gradient Boosting is {f1_score(test_y, prediction)}")

Accuracy of Gradient Boosting is 0.8365384615384616
Precision of Gradient Boosting is 0.8368447293447294
F1 Score of Gradient Boosting is 0.8282828282828283


#### Hist Gradient Boosting

In [190]:
model = HistGradientBoostingClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Hist Gradient Boosting is {accuracy_score(test_y, prediction)}")
print(f"Precision of Hist Gradient Boosting is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Hist Gradient Boosting is {f1_score(test_y, prediction)}")

Accuracy of Hist Gradient Boosting is 0.8076923076923077
Precision of Hist Gradient Boosting is 0.8086773866434882
F1 Score of Hist Gradient Boosting is 0.7872340425531914


#### Decision Tree

In [191]:
model = DecisionTreeClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Decision Tree is {accuracy_score(test_y, prediction)}")
print(f"Precision of Decision Tree is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Decision Tree is {f1_score(test_y, prediction)}")

Accuracy of Decision Tree is 0.7403846153846154
Precision of Decision Tree is 0.7407621082621083
F1 Score of Decision Tree is 0.7272727272727272


#### Extra Tree

In [192]:
model = ExtraTreeClassifier(random_state=42)
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of Extra Tree is {accuracy_score(test_y, prediction)}")
print(f"Precision of Extra Tree is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of Extra Tree is {f1_score(test_y, prediction)}")

Accuracy of Extra Tree is 0.7019230769230769
Precision of Extra Tree is 0.7021416083916084
F1 Score of Extra Tree is 0.6666666666666666


#### KNN

In [193]:
model = KNeighborsClassifier()
model.fit(train_X, train_y)
prediction = model.predict(test_X)

print(f"Accuracy of KNN is {accuracy_score(test_y, prediction)}")
print(f"Precision of KNN is {precision_score(test_y, prediction, labels=df['sex'].unique(), average='weighted')}")
print(f"F1 Score of KNN is {f1_score(test_y, prediction)}")

Accuracy of KNN is 0.7211538461538461
Precision of KNN is 0.7209592319225003
F1 Score of KNN is 0.6947368421052631


### Results

As we can see, Gradient Boost reached the higher metrics results on this subject. However, most of the models performed almost equally, except MLP and SVM.

I want to thank you for your time seeing this simple study, it's my first public study and I know there is some errors or inconsistencies, I would appreciate any advice or correction. 