# Use scikit-learn for decision trees

We'll implement a decision tree in scikit-learn with the penguins data from the previous objective. We want to classify each penguin as male or female based on the physical characteristics and the species

In [1]:
# Imports!
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Use the decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Set-up the one-hot encoder method
categorical_features = ['species']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

# Set up our preprocessor/column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Add the classifier to the preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

In [2]:
# Load in the data!

import pandas as pd
import seaborn as sns

penguins = sns.load_dataset("penguins")
penguins.dropna(inplace=True)

# Select features
features = ['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = penguins[features]

# Encode the 'sex' column
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
penguins['sex_encode'] = le.fit_transform(penguins['sex'])

# Set target array
y = penguins['sex_encode']

# Apply the pipeline

# Separate into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the model with our logistic regression classifier
pipeline.fit(X_train, y_train)
print("model score: %.3f" % pipeline.score(X_test, y_test))

model score: 0.476


It looks like we have a model that performs slightly better than the logistic regression model from earlier!