In [1]:
%matplotlib inline
import pandas as pd

# Loading up the data with pandas
The [Palmer Penguins dataset](https://github.com/allisonhorst/palmerpenguins) consists of two CSV files: one with "raw" data and one that is, presumably, less "raw". Here is a data dictionary I found on [Kaggle](https://www.kaggle.com/datasets/parulpandey/palmer-archipelago-antarctica-penguin-data). Note that it names the bill measurements `culmen_*`, whereas `penguins.csv` calls the same columns `bill_*`.

* species: penguin species (Chinstrap, Adélie, or Gentoo)
* culmen_length_mm: culmen length (mm)
* culmen_depth_mm: culmen depth (mm)
* flipper_length_mm: flipper length (mm)
* body_mass_g: body mass (g)
* island: island name (Dream, Torgersen, or Biscoe) in the Palmer Archipelago (Antarctica)
* sex: penguin sex
* year: the year the observation was made

pandas lets me load each CSV file into a DataFrame object, which captures the file's rows and columns.

In [31]:
# Read the unprocessed table, then list every column it ships with.
raw_df = pd.read_csv('data/palmerpenguins/inst/extdata/penguins_raw.csv')
list(raw_df.columns)

['studyName',
 'Sample Number',
 'Species',
 'Region',
 'Island',
 'Stage',
 'Individual ID',
 'Clutch Completion',
 'Date Egg',
 'Culmen Length (mm)',
 'Culmen Depth (mm)',
 'Flipper Length (mm)',
 'Body Mass (g)',
 'Sex',
 'Delta 15 N (o/oo)',
 'Delta 13 C (o/oo)',
 'Comments']

In [36]:
# Read the second, tidier table and list its columns for comparison with the raw file.
df = pd.read_csv('data/palmerpenguins/inst/extdata/penguins.csv')
df.columns.tolist()

['species',
 'island',
 'bill_length_mm',
 'bill_depth_mm',
 'flipper_length_mm',
 'body_mass_g',
 'sex',
 'year']

In [25]:
# Disabled exploration: would print each distinct value of df['island'], one per line.
# for i in df['island'].unique().tolist():
#     print(i)

In [32]:
# Preview the first five rows of the curated table.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [34]:
# Narrow the raw table to the eight fields that are renamed and compared
# against the curated table further down.
raw_df = raw_df.loc[:, [
    'Species', 'Island', 'Date Egg',
    'Culmen Length (mm)', 'Culmen Depth (mm)',
    'Flipper Length (mm)', 'Body Mass (g)',
    'Sex',
]]

In [37]:
# Rename through an explicit old -> new mapping instead of assigning a list of
# names by position. A positional assignment relabels whichever eight columns
# are present, so re-running the cell once 'date' has been replaced by 'year'
# would silently shift every label; with a mapping a re-run is a no-op.
raw_df = raw_df.rename(columns={
    'Species': 'species',
    'Island': 'island',
    'Date Egg': 'date',
    'Culmen Length (mm)': 'bill_length_mm',
    'Culmen Depth (mm)': 'bill_depth_mm',
    'Flipper Length (mm)': 'flipper_length_mm',
    'Body Mass (g)': 'body_mass_g',
    'Sex': 'sex',
})

In [41]:
# read_csv was called without date parsing, so 'date' holds plain strings and
# has no `.year`; parse it on the fly and take the year with the vectorized
# datetime accessor. The 'date' column itself is left untouched.
raw_df['year'] = pd.to_datetime(raw_df['date']).dt.year

In [43]:
# 'year' has been derived from it, so the 'date' column can be dropped.
raw_df = raw_df.drop(columns=['date'])

In [45]:
# First row of the reshaped raw table, to compare with the curated one.
raw_df.head(n=1)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie Penguin (Pygoscelis adeliae),Torgersen,39.1,18.7,181.0,3750.0,MALE,2007


In [46]:
# First row of the curated table, as the reference for the comparison.
df.head(n=1)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007


In [47]:
# Which distinct species labels does the raw table use?
raw_df.loc[:, 'species'].unique()

array(['Adelie Penguin (Pygoscelis adeliae)',
       'Gentoo penguin (Pygoscelis papua)',
       'Chinstrap penguin (Pygoscelis antarctica)'], dtype=object)

In [48]:
def clean_species(x):
    """Reduce a verbose species label to its short common name.

    Parameters
    ----------
    x : str or missing value
        A label such as 'Adelie Penguin (Pygoscelis adeliae)'.

    Returns
    -------
    str or None
        'Adelie', 'Gentoo' or 'Chinstrap' (checked in that order) when the
        name occurs in ``x``; None when ``x`` is not a string (for example a
        NaN from a missing cell) or contains none of the three names.
    """
    # A substring test on a non-string (float NaN, None) raises TypeError,
    # so treat such values as "no match" instead of crashing.
    if not isinstance(x, str):
        return None
    for name in ('Adelie', 'Gentoo', 'Chinstrap'):
        if name in x:
            return name
    # Unrecognized label: return None explicitly rather than by falling through.
    return None

In [49]:
# Spot-check the helper on one of the verbose labels from the raw table.
clean_species(x='Adelie Penguin (Pygoscelis adeliae)')

'Adelie'

In [52]:
# Replace every verbose species label with its short name.
raw_df['species'] = raw_df['species'].map(clean_species)

In [62]:

# Turn missing 'sex' entries into empty strings so every value is a str.
raw_df['sex'] = raw_df['sex'].fillna(value='')

In [63]:
# Lower-case with the vectorized string accessor instead of a per-element
# lambda; unlike `x.lower()`, it passes missing entries through rather than
# raising on them.
raw_df['sex'] = raw_df['sex'].str.lower()


In [64]:
# First raw row again, now that 'species' and 'sex' have been cleaned.
raw_df.head(n=1)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007


In [65]:
# Curated first row, for a side-by-side check against the cleaned raw row.
df.head(n=1)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007


### Functions
1. `nunique`
2. `unique`
3. column selection
4. row selection (`df.loc[0]`)
5. `df.columns`
6. `pd.to_datetime(raw_df['Date Egg']).apply(lambda x: x.year)`
7. `df.dtypes`
8. `groupby`
9. `value_counts`
10. filtering / greater than / inclusion