In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('default')
import pandas as pd
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower
plt.tight_layout()
import matplotlib 
font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : 20}
matplotlib.rc('font', **font)

<Figure size 1200x800 with 0 Axes>

In [2]:
# Read the origin file
original_df = pd.read_csv("../data/penguins_lter.csv")
original_df

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11-11-07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11-11-07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,PAL0910,120,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",N38A2,No,12-01-09,,,,,,,,
340,PAL0910,121,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",N39A1,Yes,11/22/09,46.8,14.3,215.0,4850.0,FEMALE,8.41151,-26.13832,
341,PAL0910,122,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",N39A2,Yes,11/22/09,50.4,15.7,222.0,5750.0,MALE,8.30166,-26.04117,
342,PAL0910,123,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",N43A1,Yes,11/22/09,45.2,14.8,212.0,5200.0,FEMALE,8.24246,-26.11969,


In [3]:
# Number of unique values in each columns
print(original_df.nunique())

studyName                3
Sample Number          152
Species                  3
Region                   1
Island                   3
Stage                    1
Individual ID          190
Clutch Completion        2
Date Egg                50
Culmen Length (mm)     164
Culmen Depth (mm)       80
Flipper Length (mm)     55
Body Mass (g)           94
Sex                      3
Delta 15 N (o/oo)      330
Delta 13 C (o/oo)      331
Comments                 7
dtype: int64


In [4]:
# Each study researches only one kind of penguin. We want to predict types of penguins based on natural characteristics
df_remove_spare_columns = original_df.drop('studyName', inplace=False, axis=1)
# studyName is removed, its sample order should be too
df_remove_spare_columns = df_remove_spare_columns.drop('Sample Number', inplace=False, axis=1)
# all stages are the same
df_remove_spare_columns = df_remove_spare_columns.drop('Stage', inplace=False, axis=1)
# all regions are the same
df_remove_spare_columns = df_remove_spare_columns.drop('Region', inplace=False, axis=1)
# Date Egg may reveals the species of the corresponding research
df_remove_spare_columns = df_remove_spare_columns.drop('Date Egg', inplace=False, axis=1)
# Individual ID is subjective and distinct with each species
df_remove_spare_columns = df_remove_spare_columns.drop('Individual ID', inplace=False, axis=1)
# Comments are personal and has too many NaN values
df_remove_spare_columns = df_remove_spare_columns.drop('Comments', inplace=False, axis=1)
# Take long time to observe
df_remove_spare_columns = df_remove_spare_columns.drop('Clutch Completion', inplace=False, axis=1)
# Researches Island is bias
df_remove_spare_columns = df_remove_spare_columns.drop('Island', inplace=False, axis=1)

In [5]:
df_remove_spare_columns

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),,,,,,,
4,Adelie Penguin (Pygoscelis adeliae),36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426
...,...,...,...,...,...,...,...,...
339,Gentoo penguin (Pygoscelis papua),,,,,,,
340,Gentoo penguin (Pygoscelis papua),46.8,14.3,215.0,4850.0,FEMALE,8.41151,-26.13832
341,Gentoo penguin (Pygoscelis papua),50.4,15.7,222.0,5750.0,MALE,8.30166,-26.04117
342,Gentoo penguin (Pygoscelis papua),45.2,14.8,212.0,5200.0,FEMALE,8.24246,-26.11969


## Warning: The code below can change outside files

In [6]:
# Only use below code when needing to save the data
df_remove_spare_columns.to_csv('../data/data_remove_spare_columns.csv', index = False)
df_remove_spare_columns

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),,,,,,,
4,Adelie Penguin (Pygoscelis adeliae),36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426
...,...,...,...,...,...,...,...,...
339,Gentoo penguin (Pygoscelis papua),,,,,,,
340,Gentoo penguin (Pygoscelis papua),46.8,14.3,215.0,4850.0,FEMALE,8.41151,-26.13832
341,Gentoo penguin (Pygoscelis papua),50.4,15.7,222.0,5750.0,MALE,8.30166,-26.04117
342,Gentoo penguin (Pygoscelis papua),45.2,14.8,212.0,5200.0,FEMALE,8.24246,-26.11969


In [7]:
df_remove_spare_columns = pd.read_csv("../data/data_remove_spare_columns.csv")
df_remove_spare_columns

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),,,,,,,
4,Adelie Penguin (Pygoscelis adeliae),36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426
...,...,...,...,...,...,...,...,...
339,Gentoo penguin (Pygoscelis papua),,,,,,,
340,Gentoo penguin (Pygoscelis papua),46.8,14.3,215.0,4850.0,FEMALE,8.41151,-26.13832
341,Gentoo penguin (Pygoscelis papua),50.4,15.7,222.0,5750.0,MALE,8.30166,-26.04117
342,Gentoo penguin (Pygoscelis papua),45.2,14.8,212.0,5200.0,FEMALE,8.24246,-26.11969


In [8]:
# Outlier is defined as a sample with no valid important values
df_remove_outliers = df_remove_spare_columns[df_remove_spare_columns['Culmen Length (mm)'].notna() |
                                            df_remove_spare_columns['Culmen Depth (mm)'].notna() |
                                            df_remove_spare_columns['Flipper Length (mm)'].notna() |
                                            df_remove_spare_columns['Body Mass (g)'].notna() |
                                            df_remove_spare_columns['Delta 15 N (o/oo)'].notna() |
                                            df_remove_spare_columns['Delta 13 C (o/oo)'].notna()]
df_remove_outliers.to_csv('../data/data_remove_outliers.csv', index = False)

In [9]:
df_remove_outliers = pd.read_csv("../data/data_remove_outliers.csv")
df_remove_outliers

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426
4,Adelie Penguin (Pygoscelis adeliae),39.3,20.6,190.0,3650.0,MALE,8.66496,-25.29805
...,...,...,...,...,...,...,...,...
337,Gentoo penguin (Pygoscelis papua),47.2,13.7,214.0,4925.0,FEMALE,7.99184,-26.20538
338,Gentoo penguin (Pygoscelis papua),46.8,14.3,215.0,4850.0,FEMALE,8.41151,-26.13832
339,Gentoo penguin (Pygoscelis papua),50.4,15.7,222.0,5750.0,MALE,8.30166,-26.04117
340,Gentoo penguin (Pygoscelis papua),45.2,14.8,212.0,5200.0,FEMALE,8.24246,-26.11969


In [10]:
print(df_remove_spare_columns.nunique())
print(df_remove_spare_columns.shape)

Species                  3
Culmen Length (mm)     164
Culmen Depth (mm)       80
Flipper Length (mm)     55
Body Mass (g)           94
Sex                      3
Delta 15 N (o/oo)      330
Delta 13 C (o/oo)      331
dtype: int64
(344, 8)


In [11]:
df_Adelie = df_remove_spare_columns[df_remove_spare_columns['Species'] == 'Adelie Penguin (Pygoscelis adeliae)']
df_Chinstrap = df_remove_spare_columns[df_remove_spare_columns['Species'] == 'Chinstrap penguin (Pygoscelis antarctica)']
df_Gentoo = df_remove_spare_columns[df_remove_spare_columns['Species'] == 'Gentoo penguin (Pygoscelis papua)']

# Suffle
df_Adelie = df_Adelie.sample(frac=1)
df_Chinstrap = df_Chinstrap.sample(frac=1)
df_Gentoo = df_Gentoo.sample(frac=1)

In [12]:
def dfGetTrainData(df, percentage):
    df_len = len(df)
    return df.head(int(df_len * percentage))
    
def dfGetTestData(df, percentage):
    df_len = len(df)    
    return df.tail(df_len - int(df_len * percentage))

In [13]:
percentage = 0.8

df_Adelie_train = dfGetTrainData(df_Adelie, percentage)
df_Chinstrap_train = dfGetTrainData(df_Chinstrap, percentage)
df_Gentoo_train = dfGetTrainData(df_Gentoo, percentage)

df_Adelie_test = dfGetTestData(df_Adelie, percentage)
df_Chinstrap_test = dfGetTestData(df_Chinstrap, percentage)
df_Gentoo_test = dfGetTestData(df_Gentoo, percentage)

In [14]:
frames = [df_Adelie_train, df_Chinstrap_train, df_Gentoo_train]
df_train = pd.concat(frames).sample(frac=1)
frames = [df_Adelie_test, df_Chinstrap_test, df_Gentoo_test]
df_test = pd.concat(frames).sample(frac=1)

In [15]:
try:
    os.makedirs('../data/data1')
except OSError as e:
    pass
df_train.to_csv('../data/data1/train_data.csv', index = False)

## Warning: The function below can change outside files

In [16]:
def splitTrainTest(df, percentage, data_name):
    df_Adelie = df_remove_spare_columns[df_remove_spare_columns['Species'] == 'Adelie Penguin (Pygoscelis adeliae)']
    df_Chinstrap = df_remove_spare_columns[df_remove_spare_columns['Species'] == 'Chinstrap penguin (Pygoscelis antarctica)']
    df_Gentoo = df_remove_spare_columns[df_remove_spare_columns['Species'] == 'Gentoo penguin (Pygoscelis papua)']

    # Suffle
    df_Adelie = df_Adelie.sample(frac=1)
    df_Chinstrap = df_Chinstrap.sample(frac=1)
    df_Gentoo = df_Gentoo.sample(frac=1)
    
    def dfGetTrainData(df, percentage):
        df_len = len(df)
        return df.head(int(df_len * percentage))

    def dfGetTestData(df, percentage):
        df_len = len(df)    
        return df.tail(df_len - int(df_len * percentage))
    
    df_Adelie_train = dfGetTrainData(df_Adelie, percentage)
    df_Chinstrap_train = dfGetTrainData(df_Chinstrap, percentage)
    df_Gentoo_train = dfGetTrainData(df_Gentoo, percentage)

    df_Adelie_test = dfGetTestData(df_Adelie, percentage)
    df_Chinstrap_test = dfGetTestData(df_Chinstrap, percentage)
    df_Gentoo_test = dfGetTestData(df_Gentoo, percentage)
    
    frames = [df_Adelie_train, df_Chinstrap_train, df_Gentoo_train]
    df_train = pd.concat(frames).sample(frac=1)
    frames = [df_Adelie_test, df_Chinstrap_test, df_Gentoo_test]
    df_test = pd.concat(frames).sample(frac=1)
    
    try:
        os.makedirs('../data/' + data_name)
    except OSError as e:
        pass
    
    df_train.to_csv('../data/' + data_name + '/train_data.csv', index = False)
    df_test.to_csv('../data/' + data_name + '/test_data.csv', index = False)

In [17]:
splitTrainTest(df_remove_outliers, percentage = 0.8, data_name = 'data1')

In [18]:
train_data = pd.read_csv("../data/data1/train_data.csv")
test_data = pd.read_csv("../data/data1/test_data.csv")

In [19]:
df_remove_outliers.nunique()

Species                  3
Culmen Length (mm)     164
Culmen Depth (mm)       80
Flipper Length (mm)     55
Body Mass (g)           94
Sex                      3
Delta 15 N (o/oo)      330
Delta 13 C (o/oo)      331
dtype: int64

In [20]:
X_train = train_data.drop(['Species'], axis = 1)
y_train = train_data[['Species']].copy()
X_train.to_csv('../data/' + 'data1' + '/X_train.csv', index = False)
y_train.to_csv('../data/' + 'data1' + '/y_train.csv', index = False)

X_test = test_data.drop(['Species'], axis = 1)
y_test = test_data[['Species']].copy()
X_test.to_csv('../data/' + 'data1' + '/X_test.csv', index = False)
y_test.to_csv('../data/' + 'data1' + '/y_test.csv', index = False)

In [21]:
pd.read_csv("../data/data1/X_train.csv").values

array([[43.5, 15.2, 213.0, ..., 'FEMALE', 8.21634, -26.11046],
       [45.4, 18.7, 188.0, ..., 'FEMALE', 8.64701, -24.62717],
       [44.5, 14.7, 214.0, ..., 'FEMALE', 8.20106, -26.16524],
       ...,
       [34.4, 18.4, 184.0, ..., 'FEMALE', 8.47827, -25.23319],
       [49.5, 19.0, 200.0, ..., 'MALE', 9.63074, -24.34684],
       [52.0, 19.0, 197.0, ..., 'MALE', 9.36799, -24.47142]], dtype=object)

In [22]:
print(X_train)

     Culmen Length (mm)  Culmen Depth (mm)  Flipper Length (mm)  \
0                  43.5               15.2                213.0   
1                  45.4               18.7                188.0   
2                  44.5               14.7                214.0   
3                  41.1               19.0                182.0   
4                  40.2               17.1                193.0   
..                  ...                ...                  ...   
269                50.5               15.9                222.0   
270                51.9               19.5                206.0   
271                34.4               18.4                184.0   
272                49.5               19.0                200.0   
273                52.0               19.0                197.0   

     Body Mass (g)     Sex  Delta 15 N (o/oo)  Delta 13 C (o/oo)  
0           4650.0  FEMALE            8.21634          -26.11046  
1           3525.0  FEMALE            8.64701          -24.62

In [23]:
# Remove Island
df_remove_Island = df_remove_outliers.drop('Island', inplace=False, axis=1)
df_remove_Island.to_csv('../data/data_remove_Island.csv', index = False)

KeyError: "['Island'] not found in axis"