# Data preparation

Download the large CSV file and split it into smaller CSV files, one per decade.

Source (including data column types): https://data.gov.au/data/dataset/tasmanian-births-1803-1933

In [32]:
import os
import pandas as pd

In [2]:
# Direct CSV download URL on data.gov.au (split across lines for readability;
# adjacent string literals are concatenated into a single URL)
data_url = (
    'https://data.gov.au/dataset/c8a4d43f-7ab5-40c4-817e-7babdc43e0f2'
    '/resource/8d710758-26ce-46ca-972d-5067db3a75d2'
    '/download/births.csv'
)

In [3]:
# Subset of source columns to load (passed to `usecols` when reading the CSV).
# Example values are taken from the preview of the loaded frame.
columns = [
    'NI_BIRTH_DATE',  # birth date as text, e.g. "19 Nov 1870"
    'NI_GENDER',  # e.g. "Female"
    'NI_NAME_FACET',  # child's name, e.g. "Dennis, Mary"
    'NI_MOTHER',  # mother's name, e.g. "Sutcliffe, Maria"
    'NI_FATHER',  # father's name, e.g. "Dennis, Christopher"
    'NI_REG_YEAR',  # presumably the registration year, e.g. 1870 - confirm against source docs
    'NI_REG_PLACE',  # presumably the registration place, e.g. "Launceston" - confirm
]

In [4]:
# Load the births table directly from the remote URL, keeping only the
# columns named in `columns` - NOTE: large file, be patient
df0 = pd.read_csv(filepath_or_buffer=data_url, usecols=columns)

In [5]:
# Parse the textual birth dates into datetimes; anything that cannot be
# parsed becomes `NaT` rather than raising
birth_dates = pd.to_datetime(df0['NI_BIRTH_DATE'], errors='coerce')

# Use the parsed dates as the row index (the original text column is kept)
df0.index = birth_dates

In [6]:
# Preview the first two rows to check the parsed index against the raw text
df0.head(n=2)

Unnamed: 0_level_0,NI_BIRTH_DATE,NI_GENDER,NI_NAME_FACET,NI_MOTHER,NI_FATHER,NI_REG_YEAR,NI_REG_PLACE
NI_BIRTH_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1870-11-19,19 Nov 1870,Female,"Dennis, Mary","Sutcliffe, Maria","Dennis, Christopher",1870,Launceston
1870-11-20,20 Nov 1870,Female,"West, Edith Jane","Laing, Margaret","West, William",1870,Launceston


In [26]:
# Years of the earliest and latest records in the datetime index
first_year = df0.index.min().year
last_year = df0.index.max().year

# Round the earliest year down to the start of its decade
start_decade = first_year - (first_year % 10)

# Round the latest year down to its decade, then step to the decade after it
end_decade = last_year - (last_year % 10) + 10

In [27]:
# Get the decade boundaries (1 January of every tenth year) as a date range.
# 'YS' is the year-start alias; the older 'AS' spelling is deprecated in
# recent pandas releases, so '10YS' is used instead of '10AS'.
decades = pd.date_range(f'{start_decade}-01-01', f'{end_decade}-01-01', freq='10YS')

In [28]:
# Make sure the output folder exists so this cell works on a fresh checkout
os.makedirs('../data', exist_ok=True)

# Loop over consecutive decade boundaries: zipping the list with itself
# shifted by one pairs each decade start with the start of the next decade
for start_date, next_start in zip(decades, decades[1:]):
    # Last day of this decade; only used to label the output file
    end_date = next_start - pd.Timedelta('1 day')

    # Select the records in the half-open interval [start_date, next_start).
    # A boolean mask is used instead of a label slice because the index is
    # not sorted and the boundary dates need not be present in it; label
    # slicing in that situation raises KeyError on recent pandas. Rows whose
    # date is NaT compare False on both sides and are left out.
    in_decade = (df0.index >= start_date) & (df0.index < next_start)
    df1 = df0[in_decade]

    # Create a filename to save the new csv, e.g. tasmania-births-1770-1779.csv
    save_fn = f'../data/tasmania-births-{start_date.year}-{end_date.year}.csv'

    print(f"Saving {save_fn}....", end='')

    # Save the records (the datetime index is written as the first column)
    df1.to_csv(save_fn)

    print('done')

Saving ../data/tasmania-births-1770-1779.csv....done
Saving ../data/tasmania-births-1780-1789.csv....done
Saving ../data/tasmania-births-1790-1799.csv....done
Saving ../data/tasmania-births-1800-1809.csv....done
Saving ../data/tasmania-births-1810-1819.csv....done
Saving ../data/tasmania-births-1820-1829.csv....done
Saving ../data/tasmania-births-1830-1839.csv....done
Saving ../data/tasmania-births-1840-1849.csv....done
Saving ../data/tasmania-births-1850-1859.csv....done
Saving ../data/tasmania-births-1860-1869.csv....done
Saving ../data/tasmania-births-1870-1879.csv....done
Saving ../data/tasmania-births-1880-1889.csv....done
Saving ../data/tasmania-births-1890-1899.csv....done
Saving ../data/tasmania-births-1900-1909.csv....done
Saving ../data/tasmania-births-1910-1919.csv....done
Saving ../data/tasmania-births-1920-1929.csv....done
Saving ../data/tasmania-births-1930-1939.csv....done


In [29]:
# Show a directory listing of the output folder
# (-l long format, -s allocated size, -h human-readable sizes,
#  -t sort by modification time, -r reverse the sort order)
!ls -lrsth ../data

total 21M
4.0K -rw-r--r-- 1 wtgee wtgee  205 Feb 19 23:24 tasmania-births-1770-1779.csv
4.0K -rw-r--r-- 1 wtgee wtgee  423 Feb 19 23:24 tasmania-births-1780-1789.csv
4.0K -rw-r--r-- 1 wtgee wtgee  717 Feb 19 23:24 tasmania-births-1790-1799.csv
4.0K -rw-r--r-- 1 wtgee wtgee 2.8K Feb 19 23:24 tasmania-births-1800-1809.csv
 56K -rw-r--r-- 1 wtgee wtgee  54K Feb 19 23:24 tasmania-births-1810-1819.csv
248K -rw-r--r-- 1 wtgee wtgee 246K Feb 19 23:24 tasmania-births-1820-1829.csv
644K -rw-r--r-- 1 wtgee wtgee 641K Feb 19 23:24 tasmania-births-1830-1839.csv
1.6M -rw-r--r-- 1 wtgee wtgee 1.6M Feb 19 23:24 tasmania-births-1840-1849.csv
2.8M -rw-r--r-- 1 wtgee wtgee 2.8M Feb 19 23:24 tasmania-births-1850-1859.csv
3.1M -rw-r--r-- 1 wtgee wtgee 3.1M Feb 19 23:24 tasmania-births-1860-1869.csv
3.3M -rw-r--r-- 1 wtgee wtgee 3.3M Feb 19 23:24 tasmania-births-1870-1879.csv
4.5M -rw-r--r-- 1 wtgee wtgee 4.5M Feb 19 23:24 tasmania-births-1880-1889.csv
5.0M -rw-r--r-- 1 wtgee wtgee 4.9M Feb 19 23:24 tasman

In [30]:
# Those are big, let's compress
!bzip2 ../data/*.csv

In [31]:
# Show the directory listing again to compare file sizes after compression
# (-l long format, -s allocated size, -h human-readable sizes,
#  -t sort by modification time, -r reverse the sort order)
!ls -lrsth ../data

total 3.0M
368K -rw-r--r-- 1 wtgee wtgee 366K Feb 19 23:24 tasmania-births-1850-1859.csv.bz2
200K -rw-r--r-- 1 wtgee wtgee 199K Feb 19 23:24 tasmania-births-1840-1849.csv.bz2
 84K -rw-r--r-- 1 wtgee wtgee  84K Feb 19 23:24 tasmania-births-1830-1839.csv.bz2
 36K -rw-r--r-- 1 wtgee wtgee  34K Feb 19 23:24 tasmania-births-1820-1829.csv.bz2
 12K -rw-r--r-- 1 wtgee wtgee 8.3K Feb 19 23:24 tasmania-births-1810-1819.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  870 Feb 19 23:24 tasmania-births-1800-1809.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  345 Feb 19 23:24 tasmania-births-1790-1799.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  260 Feb 19 23:24 tasmania-births-1780-1789.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  185 Feb 19 23:24 tasmania-births-1770-1779.csv.bz2
668K -rw-r--r-- 1 wtgee wtgee 665K Feb 19 23:24 tasmania-births-1880-1889.csv.bz2
460K -rw-r--r-- 1 wtgee wtgee 460K Feb 19 23:24 tasmania-births-1870-1879.csv.bz2
416K -rw-r--r-- 1 wtgee wtgee 413K Feb 19 23:24 tasmania-births-1860-1869.csv.bz2
4.0K 

## Gapminder data

In [46]:
# Raw GitHub URL of the gapminder GDP (Africa) CSV; adjacent string literals
# are concatenated into a single URL
gapminder_url = (
    'https://raw.githubusercontent.com/wtgee/python-novice-gapminder'
    '/gh-pages/data/gapminder_gdp_africa.csv'
)

In [47]:
!wget $gapminder_url -O '../data/gapminder_gdp_africa.csv'

--2019-02-20 02:33:36--  https://raw.githubusercontent.com/wtgee/python-novice-gapminder/gh-pages/data/gapminder_gdp_africa.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.192.133, 151.101.128.133, 151.101.64.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.192.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8095 (7.9K) [text/plain]
Saving to: ‘../data/gapminder_gdp_africa.csv’


2019-02-20 02:33:36 (957 KB/s) - ‘../data/gapminder_gdp_africa.csv’ saved [8095/8095]

