# Data preparation

Download the large CSV file and split it into smaller CSV files, one per decade.

Source (including data column types): https://data.gov.au/data/dataset/tasmanian-births-1803-1933

In [None]:
import os
import pandas as pd

In [None]:
# Direct-download link for the births CSV hosted on data.gov.au
data_url = (
    'https://data.gov.au/dataset/c8a4d43f-7ab5-40c4-817e-7babdc43e0f2'
    '/resource/8d710758-26ce-46ca-972d-5067db3a75d2'
    '/download/births.csv'
)

In [None]:
# Columns to load from the source CSV; this list is handed to
# `pd.read_csv(..., usecols=columns)` so only these fields are read.
columns = [
    'NI_BIRTH_DATE',
    'NI_GENDER',
    'NI_NAME_FACET',
    'NI_MOTHER',
    'NI_FATHER',
    'NI_REG_YEAR',
    'NI_REG_PLACE',
]

In [None]:
# Load the CSV straight from the remote URL, keeping only the listed columns.
# NOTE: large file, be patient
df0 = pd.read_csv(filepath_or_buffer=data_url, usecols=columns)

In [None]:
# Use the parsed birth date as the frame's index.  `errors='coerce'` turns any
# value that cannot be parsed into `NaT` instead of raising.
df0.index = pd.to_datetime(df0['NI_BIRTH_DATE'], errors='coerce')

In [None]:
# Peek at the first two rows to sanity-check the load
df0.iloc[:2]

In [None]:
# Years of the earliest and latest records in the index
first_year = df0.index.min().year
last_year = df0.index.max().year

# Round the earliest year down to the start of its decade
start_decade = first_year - first_year % 10

# Round the latest year down to its decade, then step one decade past it
end_decade = last_year - last_year % 10 + 10

In [None]:
# Build the decade boundaries as a date range: one timestamp on 1 January of
# every tenth year, from the first decade start to the decade after the last
# record (inclusive).
# 'YS' (year start) is the supported spelling of this frequency; the older
# 'AS' alias is deprecated in recent pandas releases.
decades = pd.date_range(f'{start_decade}-01-01', f'{end_decade}-01-01', freq='10YS')

In [None]:
# Write one CSV per decade into ../data.

# Make sure the output folder exists before the first write
os.makedirs('../data', exist_ok=True)

# Loop the decades (note the zip trick): pairing each boundary with the one
# after it yields (start of this decade, start of the next decade).
for start_date, next_start in zip(decades, decades[1:]):
    # Subtract one day from the next decade to get the last day of this one;
    # it is only used to label the output file.
    end_date = next_start - pd.Timedelta('1 day')

    # Select the records in the half-open interval [start_date, next_start).
    # A boolean mask is used instead of a label slice because the index holds
    # `NaT` for unparseable dates and is not guaranteed to be sorted, and label
    # slicing on such an index is unreliable (it can raise KeyError).
    # Comparisons against `NaT` are False, so those rows are left out.
    in_decade = (df0.index >= start_date) & (df0.index < next_start)
    df1 = df0[in_decade]

    # Create a filename to save the new csv
    save_fn = f'../data/tasmania-births-{start_date.year}-{end_date.year}.csv'

    print(f"Saving {save_fn}....", end='')

    # Save the records
    df1.to_csv(save_fn)

    print('done')

In [None]:
# Show a directory listing of the output folder (long format, human-readable
# sizes, sorted by modification time with the newest entries last)
!ls -lrsth ../data

In [None]:
# Those are big, let's compress every CSV in the output folder with bzip2
# (each `name.csv` is replaced by `name.csv.bz2`)
!bzip2 ../data/*.csv

In [None]:
# Show the directory listing again to check the compressed files and their sizes
!ls -lrsth ../data

## Gapminder data

In [None]:
# Raw GitHub link to the Gapminder Africa GDP CSV
gapminder_url = (
    'https://raw.githubusercontent.com/wtgee/python-novice-gapminder'
    '/gh-pages/data/gapminder_gdp_africa.csv'
)

In [None]:
# Download the Gapminder CSV into the data folder; `$gapminder_url` is filled
# in by IPython from the Python variable of the same name.
!wget $gapminder_url -O '../data/gapminder_gdp_africa.csv'