In [1]:
# penguins_EDA.ipynb

#1. Import Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import os

# 2. Load Dataset
df = sns.load_dataset("penguins")
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [2]:
# 📌 3. Data Cleaning
df = df.dropna()  # Remove rows with missing values
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.3+ KB


In [3]:
# 📌 4. Visualizations – Save to /visuals/
os.makedirs("visuals", exist_ok=True)

# Species count
plt.figure(figsize=(6,4))
sns.countplot(data=df, x='species', palette="Set2")
plt.title("Penguin Species Count")
plt.tight_layout()
plt.savefig("visuals/species_count.png")
plt.close()

# Pairplot
sns.pairplot(df, hue="species", palette="Set1")
plt.savefig("visuals/pairplot.png")
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='species', palette="Set2")


In [4]:
# 📌 5. Encoding
df_encoded = df.copy()
df_encoded['sex'] = df_encoded['sex'].map({'Male': 0, 'Female': 1})
df_encoded = pd.get_dummies(df_encoded, columns=['island'], drop_first=True)

# Features and target
X = df_encoded.drop('species', axis=1)
y = df_encoded['species']

In [5]:
# 📌 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# 📌 7. Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        31
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



In [7]:
# 📌 8. Feature Importance
feat_imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

plt.figure(figsize=(8,5))
sns.barplot(x=feat_imp, y=feat_imp.index, palette='coolwarm')
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.tight_layout()
plt.savefig("visuals/feature_importance.png")
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=feat_imp, y=feat_imp.index, palette='coolwarm')


In [8]:
# 📌 9. Save Metrics to CSV
os.makedirs("results", exist_ok=True)
metrics_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T
metrics_df.to_csv("results/metrics_summary.csv")