# CleanFE: Data Cleaning and Feature Engineering

Clean the data based on the EDA findings and encode the categorical features.

In [1]:
# Core data-handling and plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Blanket filter: hides every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# enable_iterative_imputer is imported for its side effect only; scikit-learn
# requires it to come before the IterativeImputer import on the next line.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import IsolationForest
# NOTE(review): IterativeImputer, KNNImputer, IsolationForest and plt are not
# referenced in the cells visible here — confirm whether they are still needed.

# Use seaborn's "whitegrid" style for any plots.
sns.set_style("whitegrid")

## Load Data

In [2]:
# Load the raw data from data.csv.
df = pd.read_csv('data.csv')

# Shorten the verbose species labels to their common names.
species_names = {
    'Adelie Penguin (Pygoscelis adeliae)': 'Adelie',
    'Gentoo penguin (Pygoscelis papua)': 'Gentoo',
    'Chinstrap penguin (Pygoscelis antarctica)': 'Chinstrap'
}

# Series.map turns any label that is not a key of the dict into NaN without
# complaint, which would silently wipe the Species value for those rows.
# Fail loudly instead if the file contains a label we do not know how to shorten.
unknown_species = set(df['Species'].dropna().unique()) - set(species_names)
if unknown_species:
    raise ValueError(f"unmapped Species labels in data.csv: {sorted(unknown_species)}")

df['Species'] = df['Species'].map(species_names)

# Column groups defined here as named lists of measurement columns.
morph_features = ['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)']
isotope_features = ['Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   studyName            344 non-null    object 
 1   Sample Number        344 non-null    int64  
 2   Species              344 non-null    object 
 3   Region               344 non-null    object 
 4   Island               344 non-null    object 
 5   Stage                344 non-null    object 
 6   Individual ID        344 non-null    object 
 7   Clutch Completion    344 non-null    object 
 8   Date Egg             344 non-null    object 
 9   Culmen Length (mm)   342 non-null    float64
 10  Culmen Depth (mm)    342 non-null    float64
 11  Flipper Length (mm)  342 non-null    float64
 12  Body Mass (g)        342 non-null    float64
 13  Sex                  334 non-null    object 
 14  Delta 15 N (o/oo)    330 non-null    float64
 15  Delta 13 C (o/oo)    331 non-null    flo

## Data Cleaning

from EDA:
- sex has '.' data entry error → replace with NaN
- drop useless columns: region, stage, studyName, date egg, sample number, comments, clutch completion, Individual ID

In [3]:
# Columns listed in the EDA notes as useless for modelling.
columns_to_drop = [
    'Region',
    'Stage',
    'studyName',
    'Date Egg',
    'Sample Number',
    'Comments',
    'Clutch Completion',
    'Individual ID',
]

# '.' in Sex is a data-entry error: turn it into NaN, then drop the
# unwanted columns in the same chain.
df = (
    df
    .assign(Sex=df['Sex'].replace('.', np.nan))
    .drop(columns=columns_to_drop)
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              344 non-null    object 
 1   Island               344 non-null    object 
 2   Culmen Length (mm)   342 non-null    float64
 3   Culmen Depth (mm)    342 non-null    float64
 4   Flipper Length (mm)  342 non-null    float64
 5   Body Mass (g)        342 non-null    float64
 6   Sex                  333 non-null    object 
 7   Delta 15 N (o/oo)    330 non-null    float64
 8   Delta 13 C (o/oo)    331 non-null    float64
dtypes: float64(6), object(3)
memory usage: 24.3+ KB


## Feature Encoding

encode categorical features before saving.
no imputation here - that will happen in TrainTune AFTER train/test split to prevent data leakage.

In [4]:
# One-hot encode Island, keeping a column for every category (drop=None).
island_encoder = OneHotEncoder(sparse_output=False, drop=None)
island_encoded = island_encoder.fit_transform(df[['Island']])
island_columns = [f'Island_{island}' for island in island_encoder.categories_[0]]
island_df = pd.DataFrame(island_encoded, columns=island_columns, index=df.index)

# Swap the original Island column for its indicator columns, which are
# appended on the right and aligned on the shared row index.
df = df.drop(columns=['Island']).join(island_df)

# Binary-encode Sex: MALE -> 1, FEMALE -> 0; anything else (including NaN) stays NaN.
sex_codes = {'MALE': 1, 'FEMALE': 0}
df['Sex'] = df['Sex'].map(sex_codes)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              344 non-null    object 
 1   Culmen Length (mm)   342 non-null    float64
 2   Culmen Depth (mm)    342 non-null    float64
 3   Flipper Length (mm)  342 non-null    float64
 4   Body Mass (g)        342 non-null    float64
 5   Sex                  333 non-null    float64
 6   Delta 15 N (o/oo)    330 non-null    float64
 7   Delta 13 C (o/oo)    331 non-null    float64
 8   Island_Biscoe        344 non-null    float64
 9   Island_Dream         344 non-null    float64
 10  Island_Torgersen     344 non-null    float64
dtypes: float64(10), object(1)
memory usage: 29.7+ KB


In [5]:
# Features a sample must have at least one of to be kept.
key_features = [
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
    'Body Mass (g)',
    'Sex',
]

# Count the NaNs per row across the key features; a row is flagged only
# when every one of them is missing.
missing_counts = df[key_features].isna().sum(axis=1)
all_missing = missing_counts.eq(len(key_features))
n_all_missing = all_missing.sum()

print(f"samples with ALL key features missing: {n_all_missing}")

if n_all_missing > 0:
    print(f"\nremoving {n_all_missing} samples with all key features missing")
    # Show what is about to be removed before dropping it.
    print(df.loc[all_missing, ['Species'] + key_features])
    df = df.loc[~all_missing].reset_index(drop=True)

print(f"\nfinal dataset: {len(df)} samples")
df.info()

samples with ALL key features missing: 2

removing 2 samples with all key features missing
    Species  Culmen Length (mm)  Culmen Depth (mm)  Flipper Length (mm)  \
3    Adelie                 NaN                NaN                  NaN   
339  Gentoo                 NaN                NaN                  NaN   

     Body Mass (g)  Sex  
3              NaN  NaN  
339            NaN  NaN  

final dataset: 342 samples
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              342 non-null    object 
 1   Culmen Length (mm)   342 non-null    float64
 2   Culmen Depth (mm)    342 non-null    float64
 3   Flipper Length (mm)  342 non-null    float64
 4   Body Mass (g)        342 non-null    float64
 5   Sex                  333 non-null    float64
 6   Delta 15 N (o/oo)    330 non-null    float64
 7   Delta 13 C (o/oo)

## Save Cleaned Data

ready for TrainTune:
- 342 penguins (2 of the original 344 rows were removed because all key features were missing)
- categorical features encoded
- missing values NOT imputed yet (will happen after split to prevent leakage)

## Remove samples with too many missing values

samples with all key features missing (the four morphological measurements and Sex) cannot be used

In [6]:
# Write the cleaned frame to disk without the index column, then print a
# final summary of what was saved.
cleaned_path = 'data_cleaned.csv'
df.to_csv(cleaned_path, index=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              342 non-null    object 
 1   Culmen Length (mm)   342 non-null    float64
 2   Culmen Depth (mm)    342 non-null    float64
 3   Flipper Length (mm)  342 non-null    float64
 4   Body Mass (g)        342 non-null    float64
 5   Sex                  333 non-null    float64
 6   Delta 15 N (o/oo)    330 non-null    float64
 7   Delta 13 C (o/oo)    331 non-null    float64
 8   Island_Biscoe        342 non-null    float64
 9   Island_Dream         342 non-null    float64
 10  Island_Torgersen     342 non-null    float64
dtypes: float64(10), object(1)
memory usage: 29.5+ KB
