In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from IPython.display import Image

In [144]:
data = pd.read_csv('data/penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [145]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [146]:
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

# Imputation

## Drop missing

In [147]:
data.dropna(inplace=True)

In [148]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [149]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    object 
 1   island             334 non-null    object 
 2   culmen_length_mm   334 non-null    float64
 3   culmen_depth_mm    334 non-null    float64
 4   flipper_length_mm  334 non-null    float64
 5   body_mass_g        334 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.9+ KB


In [150]:
data = pd.read_csv('data/penguins_size.csv')
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [151]:
data["culmen_length_mm"].fillna(data["culmen_length_mm"].mean(), inplace=True)
data["culmen_depth_mm"].fillna(data["culmen_depth_mm"].median(), inplace=True)
data["flipper_length_mm"].fillna(data["flipper_length_mm"].mean(), inplace=True)
data["body_mass_g"].fillna(data["body_mass_g"].mean(), inplace=True)
data['sex'].fillna(data["sex"].value_counts().index[0], inplace=True)

In [152]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [153]:
data.loc[(data["sex"] != "MALE") & (data["sex"] != "FEMALE")]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [154]:
data["sex"].unique()

array(['MALE', 'FEMALE', '.'], dtype=object)

In [155]:
data = data.query("sex != '.'")

In [156]:
data["sex"].unique()

array(['MALE', 'FEMALE'], dtype=object)

In [157]:
data.dtypes

species               object
island                object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

# Encoding

In [158]:
data["species"] = data["species"].astype("category")
data["island"] = data["island"].astype("category")
data["sex"] = data["sex"].astype("category")

In [159]:
data.dtypes

species              category
island               category
culmen_length_mm      float64
culmen_depth_mm       float64
flipper_length_mm     float64
body_mass_g           float64
sex                  category
dtype: object

In [160]:
cat_data = data.loc[:, ["species", "island", "sex"]]
cat_data.rename(columns={"sex": "gender"}, inplace=True)
cat_data.head()

Unnamed: 0,species,island,gender
0,Adelie,Torgersen,MALE
1,Adelie,Torgersen,FEMALE
2,Adelie,Torgersen,FEMALE
3,Adelie,Torgersen,MALE
4,Adelie,Torgersen,FEMALE


In [161]:
cat_data.dtypes

species    category
island     category
gender     category
dtype: object

In [162]:
cat_data["species_cat"] = cat_data["species"].cat.codes
cat_data["island_cat"] = cat_data["island"].cat.codes
cat_data["gender_cat"] = cat_data["gender"].cat.codes
cat_data.head()

Unnamed: 0,species,island,gender,species_cat,island_cat,gender_cat
0,Adelie,Torgersen,MALE,0,2,1
1,Adelie,Torgersen,FEMALE,0,2,0
2,Adelie,Torgersen,FEMALE,0,2,0
3,Adelie,Torgersen,MALE,0,2,1
4,Adelie,Torgersen,FEMALE,0,2,0


# One Hot Encoding

In [163]:
encoded_species = pd.get_dummies(cat_data["species"])
encoded_island = pd.get_dummies(cat_data["island"])
encoded_gender = pd.get_dummies(cat_data["gender"])
cat_data = cat_data.join([encoded_species, encoded_island, encoded_gender])
cat_data.head()

Unnamed: 0,species,island,gender,species_cat,island_cat,gender_cat,Adelie,Chinstrap,Gentoo,Biscoe,Dream,Torgersen,FEMALE,MALE
0,Adelie,Torgersen,MALE,0,2,1,1,0,0,0,0,1,0,1
1,Adelie,Torgersen,FEMALE,0,2,0,1,0,0,0,0,1,1,0
2,Adelie,Torgersen,FEMALE,0,2,0,1,0,0,0,0,1,1,0
3,Adelie,Torgersen,MALE,0,2,1,1,0,0,0,0,1,0,1
4,Adelie,Torgersen,FEMALE,0,2,0,1,0,0,0,0,1,1,0


## Count Encoding

In [164]:
cat_data["gender"].value_counts()

MALE      178
FEMALE    165
Name: gender, dtype: int64

## Target Encoding

In [170]:
factor = 2
upper_lim = data['culmen_length_mm'].mean() + data['culmen_length_mm'].std() * factor
lower_lim = data['culmen_length_mm'].mean() - data['culmen_length_mm'].std() * factor

outliers = data[(data['culmen_length_mm'] < upper_lim) & (data['culmen_length_mm'] > lower_lim)]
outliers

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.7,181.000000,3750.000000,MALE
1,Adelie,Torgersen,39.50000,17.4,186.000000,3800.000000,FEMALE
2,Adelie,Torgersen,40.30000,18.0,195.000000,3250.000000,FEMALE
3,Adelie,Torgersen,43.92193,17.3,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.70000,19.3,193.000000,3450.000000,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.3,200.915205,4201.754386,MALE
340,Gentoo,Biscoe,46.80000,14.3,215.000000,4850.000000,FEMALE
341,Gentoo,Biscoe,50.40000,15.7,222.000000,5750.000000,MALE
342,Gentoo,Biscoe,45.20000,14.8,212.000000,5200.000000,FEMALE


In [171]:
upper_lim

54.82325678080954

In [172]:
lower_lim

33.017232196736394

In [175]:
outliers = data[(data['culmen_length_mm'] < upper_lim) & (data['culmen_length_mm'] > lower_lim)]
outliers

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.7,181.000000,3750.000000,MALE
1,Adelie,Torgersen,39.50000,17.4,186.000000,3800.000000,FEMALE
2,Adelie,Torgersen,40.30000,18.0,195.000000,3250.000000,FEMALE
3,Adelie,Torgersen,43.92193,17.3,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.70000,19.3,193.000000,3450.000000,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.3,200.915205,4201.754386,MALE
340,Gentoo,Biscoe,46.80000,14.3,215.000000,4850.000000,FEMALE
341,Gentoo,Biscoe,50.40000,15.7,222.000000,5750.000000,MALE
342,Gentoo,Biscoe,45.20000,14.8,212.000000,5200.000000,FEMALE


In [176]:
outliers.shape

(337, 7)