### Classification on the Penguin Dataset

#### Libraries

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score

#### Read data

In [2]:
penguins = pd.read_csv("data/penguins.txt")

In [3]:
penguins.head(3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,


#### Preprocess data

We will only use this four columns for analysis : 
- Culmen length (mm)
- Culmen Depth (mm)
- Flipper Length (mm)
- Body Mass (g)

In [4]:
attributes = ["Culmen Length (mm)","Culmen Depth (mm)","Flipper Length (mm)","Body Mass (g)"]
target = ["Species"]
penguins = penguins[attributes+target]
penguins.dropna(inplace=True)

In [5]:
X = penguins[attributes]
y = penguins[target]

#### Train-Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42, stratify=y)

#### Classification with a Decision Tree

In [7]:
# Create the classifier
clf = DecisionTreeClassifier(random_state=42)

# Learn
clf.fit(X_train,y_train)

# Predict
y_pred = clf.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test,y_pred)
recall = recall_score(y_test,y_pred,average=None)

print(f"Accuracy: {accuracy:.3f}")
print(f"Recall: {sum(recall)/3:.3f}")

Accuracy: 0.957
Recall: 0.971


In [8]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_pred,y_test)

array([[32,  0,  0],
       [ 3, 12,  0],
       [ 0,  0, 22]], dtype=int64)