In [1]:
import pandas as pd

# Load Dataset
We begin by loading the dataset to get an overview of the data structure.

In [2]:
# Read the penguin measurements from CSV; the relative path is resolved
# against the kernel's current working directory.
penguins = pd.read_csv("penguins_size.csv")
# Preview the first rows to check column names and value formats.
penguins.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# Find Out Shape of Data
Next, we determine the shape of the dataset to understand the number of rows and columns.

In [3]:
# (number of rows, number of columns) of the loaded frame.
penguins.shape

(344, 7)

# Dataset Info
We then retrieve detailed information about the dataset, including data types and non-null counts.

In [4]:
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


# Unique Species
Identifying the unique species present in the dataset helps us understand the diversity of the data.

In [5]:
# Distinct values of the `species` column, returned as a NumPy array.
unique_species = penguins['species'].unique()
unique_species

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

# Find NaN Values per Column
Counting NaN values per column allows us to assess data completeness and plan for data cleaning if necessary.

In [6]:
# Number of missing entries in each column (True counts as 1 when summed).
nan_values = penguins.isna().sum()
nan_values

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

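# Handle Missing Values
Missing values are imputed with simple statistics: the mode for the categorical `sex` column (after treating the `.` placeholder as missing) and the median for each numerical column.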

In [7]:
# Impute missing values for categorical variables (mode).
# The "." placeholder in `sex` is treated as missing before computing the mode.
# Results are assigned back to the column rather than using `inplace=True` on
# `penguins['sex']`: an in-place call on a column selection is chained
# assignment, which does not update the parent frame under pandas
# Copy-on-Write.
sex_clean = penguins['sex'].replace('.', pd.NA)
penguins['sex'] = sex_clean.fillna(sex_clean.mode()[0])

# Impute missing values for numerical variables (median).
# `DataFrame.fillna` accepts a Series keyed by column name, so each column is
# filled with its own median in one call.
# NOTE(review): the medians are taken over all species together, not per
# species; confirm that this is the intended imputation.
numeric_cols = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
penguins[numeric_cols] = penguins[numeric_cols].fillna(penguins[numeric_cols].median())

# Every column should now report zero missing values.
print(penguins.isnull().sum())

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [8]:
# Re-check the non-null counts after imputation; each column should now
# match the total number of rows.
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   344 non-null    float64
 3   culmen_depth_mm    344 non-null    float64
 4   flipper_length_mm  344 non-null    float64
 5   body_mass_g        344 non-null    float64
 6   sex                344 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [9]:
# Preview the first rows again to see the imputed values in place of the
# former gaps.
penguins.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,MALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# Find Culmen Depth in Each Island
Analyzing culmen depth by island can reveal geographic variations in this feature.

In [10]:
# Mean culmen depth (mm) for each island.
culmen_depth_island = penguins['culmen_depth_mm'].groupby(penguins['island']).mean()
culmen_depth_island

island
Biscoe       15.883333
Dream        18.344355
Torgersen    18.407692
Name: culmen_depth_mm, dtype: float64

# Body Mass in Each Sex
Examining the body mass of penguins by sex can highlight potential sexual dimorphism in the species.

In [11]:
# Mean body mass (g) for each sex.
body_mass_sex = (
    penguins
    .groupby('sex')['body_mass_g']
    .agg('mean')
)
body_mass_sex

sex
FEMALE    3862.272727
MALE      4512.988827
Name: body_mass_g, dtype: float64

# Species in Each Island
Counting the penguins of each species on each island shows how the species are distributed geographically.

In [12]:
# Contingency table of penguin counts: one row per island, one column per
# species, with 0 where a species does not occur on an island.
species_island = pd.crosstab(penguins['island'], penguins['species'])
species_island

species,Adelie,Chinstrap,Gentoo
island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biscoe,44,0,124
Dream,56,68,0
Torgersen,52,0,0


# Share of Female Penguins in Each Species
Calculating the share of female penguins in each species provides insights into the sex ratio and potential reproductive dynamics.

In [13]:
# Share of FEMALE rows within each species.
# The female counts are reindexed against the full species list with
# fill_value=0, so a species without any FEMALE rows gets a share of 0.0
# instead of NaN from index misalignment in the division.
# NOTE: `sex` was imputed with the mode earlier in the notebook, so these
# shares include rows whose sex was filled in rather than observed.
species_totals = penguins.groupby('species').size()
female_counts = penguins.loc[penguins['sex'] == 'FEMALE'].groupby('species').size()
female_share_species = female_counts.reindex(species_totals.index, fill_value=0) / species_totals
female_share_species

species
Adelie       0.480263
Chinstrap    0.500000
Gentoo       0.467742
dtype: float64

# Flipper Length Comparisons Between Species
Comparing flipper lengths among species (min, max, mean) helps in understanding physical differences and adaptations.

In [14]:
# Minimum, maximum and mean flipper length (mm) for each species, one
# column per statistic.
flipper_stats_species = (
    penguins
    .groupby('species')['flipper_length_mm']
    .agg(min='min', max='max', mean='mean')
)
flipper_stats_species

Unnamed: 0_level_0,min,max,mean
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,172.0,210.0,190.0
Chinstrap,178.0,212.0,195.823529
Gentoo,197.0,231.0,217.024194
