# Generating synthetic data
***

In [72]:
# Dataframes.
import pandas as pd

import sklearn as sk

# Fills missing values.
from sklearn.impute import SimpleImputer

# Plots.
import matplotlib.pyplot as plt

## Palmer Archipelago (Antarctica) penguin data
***

### Load data
***

In [20]:
# Read in csv data.
data = pd.read_csv('data/penguins_lter.csv')

# Remove columns that will not be used.
data = data.drop(labels=['studyName', 'Sample Number', 'Region', 'Stage', 'Individual ID', 'Clutch Completion', 'Date Egg', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Comments'], axis=1)


In [19]:
# Get basic info about dataset.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              344 non-null    object 
 1   Island               344 non-null    object 
 2   Culmen Length (mm)   342 non-null    float64
 3   Culmen Depth (mm)    342 non-null    float64
 4   Flipper Length (mm)  342 non-null    float64
 5   Body Mass (g)        342 non-null    float64
 6   Sex                  334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [30]:
# Statistical summary.
data.describe()

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


### Clean data
***

In [27]:
# Check for null values.
data.isnull().sum()

Species                 0
Island                  0
Culmen Length (mm)      2
Culmen Depth (mm)       2
Flipper Length (mm)     2
Body Mass (g)           2
Sex                    10
dtype: int64

In [38]:
# Eyeball data to see where Null values are. 
pd.set_option('display.max_rows', None)

#### Fill missing values

In [81]:
# Code adapted from https://www.kaggle.com/parulpandey/penguin-dataset-the-new-iris/notebook
# Fill in missing values with the most freuquent occurance in the column.
imputer = SimpleImputer(strategy='most_frequent') 
data.iloc[:,:] = imputer.fit_transform(data)

In [53]:
# Check data again. 
data.isnull().sum()

Species                0
Island                 0
Culmen Length (mm)     0
Culmen Depth (mm)      0
Flipper Length (mm)    0
Body Mass (g)          0
Sex                    0
dtype: int64

#### Convert values in Sex column from strings to integers

In [77]:
# Convert sex type from boolean string value to boolean integer value.
lb = sk.preprocessing.LabelEncoder()
data["Sex"] = lb.fit_transform(data["Sex"])

In [80]:
# Check the Sex column.
data.Sex.head()

0    2
1    1
2    1
3    2
4    1
Name: Sex, dtype: int64

#### Check species count

In [92]:
data['Species'].value_counts()

Adelie Penguin (Pygoscelis adeliae)          152
Gentoo penguin (Pygoscelis papua)            124
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64