In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Palmer Penguins CSV directly from the palmerpenguins GitHub repository
dataset_url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/inst/extdata/penguins.csv"
df = pd.read_csv(filepath_or_buffer=dataset_url)

# Preview the first five rows
df.head(n=5)

In [None]:
# Summarise the dataset: dimensions, column names, then dtypes / non-null counts.
# print() and DataFrame.info() both return None, so their results are not
# bound to variables (the previous names only ever held None).
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}, Number of columns: {n_cols}\n")
print(f"columns in dataset: {df.columns.tolist()}\n")

# info() writes its report to stdout itself; wrapping it in print() would
# append a stray "None" line after the report.
df.info()

In [None]:
# Missing values per column: absolute count plus the share of all rows (in percent)
missing_counts = df.isna().sum()
missing_summary = missing_counts.to_frame(name='missing')
missing_summary.assign(percent=lambda counts: 100 * counts['missing'] / len(df))

## Drop Missing Values
For simplicity in this demo, we’ll drop rows with missing values.

In [None]:
# Keep only complete rows: a row is removed if any of its columns is missing
df = df.dropna(axis='index', how='any')

# (rows, columns) remaining after the drop
df.shape

## Class Distribution
We'll predict whether a penguin is an **Adelie** or not. Let's check class balance.

In [None]:
# Count the penguins of each species and draw the counts as a bar chart
species_counts = df['species'].value_counts()
species_counts.plot.bar()

In [None]:
# Binary target: 1 for Adelie penguins, 0 for every other species.
# assign() returns a new frame instead of writing a column into `df` in place;
# `df` is the result of an earlier dropna(), and in-place column assignment on
# such a filtered frame can raise SettingWithCopyWarning.
df = df.assign(is_adelie=(df['species'] == 'Adelie').astype(int))

# Bar chart of the target's class counts (the x axis shows the 0/1 values of
# is_adelie); the trailing semicolon suppresses the Axes repr in the cell output.
df['is_adelie'].value_counts().plot(kind='bar', xlabel='Adelie', ylabel='Count', title='Distribution of Adelie');

## Exploratory Data Analysis
We examine how the features vary by species using pairwise scatterplots.

In [None]:
# Pairwise scatterplots of the four body measurements, coloured by species
pairplot_features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
sns.pairplot(data=df, vars=pairplot_features, hue='species')
plt.show()

In [None]:
# Line plot of every numeric column against the row index, one subplot per
# column, arranged on a 3 x 2 grid
numeric_columns = df.select_dtypes(include=['number']).columns
numeric_df = df[numeric_columns]
numeric_df.plot(subplots=True, layout=(3, 2), figsize=(10, 10));

In [None]:
# Bar chart of the value frequencies of every object-dtype (text) column
categorical_columns = df.select_dtypes(include=["object"]).columns

# One small figure per column. Each chart is titled with its column name and
# given a y-axis label; without them the figures cannot be told apart.
for column in categorical_columns:
    df[column].value_counts().plot(kind='bar', figsize=(3, 3), title=column, ylabel='Count')
    plt.show()

## Modeling

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
test_size, random_state = 0.2, 42
train_df, test_df = train_test_split(
    df,
    random_state=random_state,
    test_size=test_size,
)

In [None]:
# Model inputs are the four body measurements; the label is the binary is_adelie column
predictors = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X, y = train_df[predictors], train_df['is_adelie']

In [None]:
# Standardise the features, then fit a small neural network (one hidden layer
# of 4 units). The predictors are measured in different units (mm vs. g) and
# MLPs are sensitive to feature scale, so scaling is a pipeline step; inside
# the pipeline the scaler is fitted on each fold's training part only.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('nn', MLPClassifier(
        hidden_layer_sizes=(4,),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42)),
])

# Stratified 5-fold cross-validation: shuffled, seeded for reproducibility
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Run the cross-validation; return_estimator=True keeps each fold's fitted
# pipeline under results['estimator']
results = cross_validate(pipe, X, y, cv=cv, scoring=scoring, return_estimator=True)

## Evaluation Metrics

In [None]:
# Report each metric as mean ± standard deviation across the cross-validation folds
for metric in scoring:
    fold_scores = results[f'test_{metric}']
    mean_score = np.mean(fold_scores)
    score_spread = np.std(fold_scores)
    print(f"{metric.capitalize()}: {mean_score:.3f} ± {score_spread:.3f}")

## Classification Report

In [None]:
# Held-out test data, using the same predictor columns as training
X_test = test_df[predictors]
y_test = test_df['is_adelie']

# The estimators returned by cross_validate were each fitted on only part of
# the training data, and picking one of them is arbitrary. For the final
# evaluation, refit the pipeline on the full training set instead.
final_model = pipe.fit(X, y)
last_model = final_model  # alias kept for any later cell that still refers to `last_model`

# Predict on the test set and print per-class precision / recall / F1
y_pred = final_model.predict(X_test)
print(classification_report(y_test, y_pred))