# PALMER PENGUINS NOTEBOOK

## 1. Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from graphviz import Source
from IPython.display import display
# Dataset loaders
from palmerpenguins import load_penguins 

  import pkg_resources


## 2. Load Dataset + Basic Preprocessing

In [None]:
# Load the Palmer Penguins data set.
penguins = load_penguins()
# Keep only the rows whose 'species' label is present (not NaN); copy so the
# cleaned frame is independent of the raw one.
penguins_cleaned = penguins[penguins['species'].notna()].copy()
# Target labels (y) are the 'species' column; features (X) are everything else.
y = penguins_cleaned['species']
X = penguins_cleaned.drop(columns=['species'])
print("Shape after removing missing species:")
print("X shape:", X.shape)
print("y shape:", y.shape)

Shape after removing missing species:
X shape: (344, 7)
y shape: (344,)


## 3. Full Preprocessing with Imputer + OneHotEncoder

In [None]:
def preprocess_features(X):
    """Impute numeric columns and one-hot encode categorical columns of ``X``.

    Numeric columns have their missing values replaced by the column mean.
    Columns of dtype ``object`` or ``category`` are one-hot encoded into dense
    indicator columns. Columns of any other dtype are passed through unchanged.

    Args:
        X: Feature DataFrame.

    Returns:
        A DataFrame holding the transformed features. It keeps the row index
        of ``X`` so it stays aligned with the label Series, and its column
        names come from the fitted transformer.
    """
    # Identify the names of categorical and numerical columns in the dataset
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_columns = X.select_dtypes(include=['number']).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), numerical_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ],
        remainder='passthrough',
        # Keep plain names ("sex_male") instead of prefixed ones ("cat__sex_male").
        verbose_feature_names_out=False,
    )
    X_processed = preprocessor.fit_transform(X)

    # Ask the fitted transformer for its output names rather than rebuilding
    # them by hand: this also covers passthrough columns and any numeric
    # column the imputer dropped, so the names always match the column count.
    feature_names = preprocessor.get_feature_names_out()
    return pd.DataFrame(X_processed, columns=feature_names, index=X.index)

## 4. Data Splitting


In [None]:
def split_dataset(X, y, test_size):
    """Split features and labels into stratified train and test partitions.

    Rows are shuffled with a fixed seed (42) and stratified on ``y`` so both
    partitions keep the class distribution.

    Returns:
        The result of ``train_test_split``: X_train, X_test, y_train, y_test.
    """
    split_options = {
        'test_size': test_size,
        'stratify': y,
        'shuffle': True,
        'random_state': 42,
    }
    return train_test_split(X, y, **split_options)

## 5. Train Decision Tree

In [None]:
def train_decision_tree(X_train, y_train, max_depth=None):
    """Fit and return a decision tree classifier that splits on entropy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_depth: Depth limit handed to the classifier; ``None`` (the
            default) is passed through unchanged.

    Returns:
        The fitted ``DecisionTreeClassifier`` (seeded with random_state=42).
    """
    model = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        random_state=42,
    )
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return model.fit(X_train, y_train)

## 6. Visualize Label Distribution

In [None]:
def plot_distribution(y, title):
    """Show a count plot of the class labels in ``y``.

    Args:
        y: Labels as a pandas Series, NumPy array, or other array-like.
            Multi-dimensional input is flattened to 1D.
        title: Title drawn above the plot.
    """
    # Local import: numpy is not among the file-level imports visible here.
    import numpy as np

    # Flatten through NumPy instead of calling y.ravel(): Series.ravel is
    # deprecated in pandas 2.2, and plain lists have no ravel() at all.
    labels = pd.Series(np.asarray(y).ravel())
    sns.countplot(x=labels)
    plt.title(title)
    plt.xlabel("Species")
    plt.ylabel("Count")
    plt.grid(True)
    plt.show()

## 7. Visualize Decision Trees with Graphviz

In [None]:
def draw_decision_tree(tree_model, feature_names, class_names):
    """Render a fitted decision tree inline through Graphviz.

    Args:
        tree_model: Fitted tree estimator to export.
        feature_names: Names used to label the split features.
        class_names: Names used to label the predicted classes.
    """
    export_options = {
        'out_file': None,  # None makes export_graphviz return the DOT text
        'feature_names': feature_names,
        'class_names': class_names,
        'filled': True,
        'rounded': True,
        'special_characters': True,
    }
    dot_source = tree.export_graphviz(tree_model, **export_options)
    display(Source(dot_source))
    

## 8. Classification Report

In [8]:
def print_classification_report(y_test, y_pred, target_names):
    """Print scikit-learn's classification report for the given predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        target_names: Display names forwarded to ``classification_report``.
    """
    report_text = classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
    print(report_text)


## 9. Confusion Matrix

In [9]:
def plot_confusion_matrix(y_test, y_pred, target_names, labels, depth, test_size):
    """Plot the confusion matrix for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        target_names: Tick labels shown on the matrix axes.
        labels: Class labels, in order, used to index the matrix.
        depth: Tree depth shown in the title (printed as given).
        test_size: Test fraction (e.g. 0.2) used to build the "train/test"
            split text in the title.
    """
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    # Named cm_display so it does not shadow the imported IPython `display`.
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
    _, axes = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=axes, cmap=plt.cm.Blues, values_format='d')

    # round() rather than int(): 0.29 * 100 is 28.999999999999996 in floating
    # point, which int() would truncate to 28 and label the split "72/28".
    test_percent = round(test_size * 100)
    plt.title(f"Confusion Matrix (Depth={depth}, {100 - test_percent}/{test_percent} Split)")
    plt.grid(False)
    plt.show()

## 10. Max Depth - Accuracy

In [None]:
def print_max_depth_accuracy(max_depths, accuracies):
    """Print a one-row table mapping each max_depth to its accuracy.

    Args:
        max_depths: Depth settings; ``None`` is shown as the text 'None'.
        accuracies: Accuracy values, one per entry of ``max_depths``.
    """
    column_labels = ['None' if depth is None else str(depth) for depth in max_depths]
    table = pd.DataFrame([accuracies], index=['Accuracy'], columns=column_labels)
    # Name the column axis so the printed header reads "max_depth".
    table = table.rename_axis('max_depth', axis='columns')
    print(table)

def max_depth_accuracy_chart(max_depths, accuracies):
    """Draw a line chart of accuracy against max_depth.

    A ``None`` depth is plotted at x = -1 and labelled 'None' on the axis.

    Args:
        max_depths: Depth settings (ints or ``None``).
        accuracies: Accuracy values, one per entry of ``max_depths``.
    """
    x_positions = []
    tick_labels = []
    for depth in max_depths:
        unlimited = depth is None
        x_positions.append(-1 if unlimited else depth)
        tick_labels.append('None' if unlimited else str(depth))

    plt.figure(figsize=(10, 6))
    plt.plot(x_positions, accuracies, 'o-', color='blue')
    plt.xticks(x_positions, tick_labels)
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.title('Decision Tree Accuracy by Maximum Depth')
    plt.grid(True)
    plt.show()


## 11. Main

In [None]:
def _read_menu_choice(prompt, valid_choices):
    """Return the user's menu selection as an int, or None if it is not a valid option."""
    try:
        choice = int(input(prompt))
    except ValueError:
        return None
    return choice if choice in valid_choices else None


def main():
    """Run the interactive penguin decision-tree analysis.

    Reads the module-level ``X`` and ``y``, preprocesses the features, then
    asks the user to pick a menu option:

    * 0-3: train an unrestricted tree on the chosen train/test split and show
      the label distributions, the tree, the classification report, and the
      confusion matrix.
    * 4: on an 80/20 split, compare test accuracy across several ``max_depth``
      values as a table and a chart.

    Any other input prints a hint and returns without doing any work.
    """
    split_ratios = [0.6, 0.4, 0.2, 0.1]
    depth_analysis_option = len(split_ratios)  # menu entry 4
    depth_analysis_test_size = 0.2

    X_processed = preprocess_features(X)
    # Extract feature names from the processed dataset for visualization
    feature_names = X_processed.columns.tolist()

    target_names = ["Adelie", "Chinstrap", "Gentoo"]
    # Keep only the class names that actually occur in the dataset; the set is
    # built once instead of calling y.unique() for every candidate.
    present_labels = set(y.unique())
    class_names = [label for label in target_names if label in present_labels]

    # Menu for the user to choose a split ratio or the depth analysis
    for option, ratio in enumerate(split_ratios):
        test_percent = round(ratio * 100)
        print(f"{option}: Train/Test = {100 - test_percent}/{test_percent}")
    print(f"{depth_analysis_option}: Max Depth Accuracy Analysis")

    # Validate BEFORE indexing split_ratios: an out-of-range number used to
    # raise IndexError, and a negative one silently indexed from the end.
    choose = _read_menu_choice(
        "Choose a split ratio (0-4): ", range(depth_analysis_option + 1)
    )
    if choose is None:
        print("Please choose a valid option (0-4).")
        return

    run_depth_analysis = choose == depth_analysis_option
    test_size = depth_analysis_test_size if run_depth_analysis else split_ratios[choose]
    X_train, X_test, y_train, y_test = split_dataset(X_processed, y, test_size)

    if run_depth_analysis:
        # Max depth analysis - compare different tree depths
        max_depths = [None, 2, 3, 4, 5, 6, 7]
        accuracies = []
        for max_depth in max_depths:
            clf = train_decision_tree(X_train, y_train, max_depth)
            accuracies.append(accuracy_score(y_test, clf.predict(X_test)))
        print("\n\n")
        print_max_depth_accuracy(max_depths, accuracies)
        max_depth_accuracy_chart(max_depths, accuracies)
    else:
        # Standard decision tree analysis with visualization
        clf = train_decision_tree(X_train, y_train, max_depth=None)
        y_pred = clf.predict(X_test)
        plot_distribution(y, "Original Dataset Distribution")
        plot_distribution(y_train, "Training Set Distribution")
        plot_distribution(y_test, "Test Set Distribution")
        draw_decision_tree(clf, feature_names, class_names)
        # Use class_names for both reports so the display names always match
        # the labels actually present in the data.
        print_classification_report(y_test, y_pred, class_names)
        plot_confusion_matrix(
            y_test,
            y_pred,
            class_names,
            labels=class_names,
            depth=None,
            test_size=test_size,
        )


# Guarded so importing this file does not block on input(); a notebook or
# script run still has __name__ == "__main__" and calls main() as before.
if __name__ == "__main__":
    main()