# Data preparation

Download the large csv file and split into smaller csv files based off decade.

Source (including data column types): https://data.gov.au/data/dataset/tasmanian-births-1803-1933

In [1]:
import os
import pandas as pd

In [2]:
data_url = 'https://data.gov.au/dataset/c8a4d43f-7ab5-40c4-817e-7babdc43e0f2/resource/8d710758-26ce-46ca-972d-5067db3a75d2/download/births.csv'

In [3]:
# Read the data in from the url - NOTE: large file, be patient
df0 = pd.read_csv(data_url)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Convert the birth date to a datetime and save as the index; convert errors to `NaT`
df0.index = pd.to_datetime(df0.NI_BIRTH_DATE, errors='coerce')

In [5]:
df0.head(2)

Unnamed: 0_level_0,NI_URL,NI_BIRTH_DATE,TASMANIAN,NI_GENDER,NI_NAME_FACET,NI_MOTHER,NI_FATHER,NI_INDEX,NI_REG_YEAR,PUBDATE_RANGE,...,RELEVANCE_SORT,PUBDATE,FORMAT,ACCESS_LEVEL,NI_REG_PLACE,NI_YEAR,NI_CATKEY,FORMAT_LINCTAS,TITLE,NI_BAPTISM_DATE
NI_BIRTH_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1870-11-19,&lt;subfield_u&gt;https://stors.tas.gov.au/RGD...,19 Nov 1870,Published in Tasmania|About Tasmania|By a Tasm...,Female,"Dennis, Mary","Sutcliffe, Maria","Dennis, Christopher",Births,1870,1870,...,0,1870,VIEW,0|1|2|3|4,Launceston,1870,(Sirsi) 927365,Names Index,"Dennis, Mary",
1870-11-20,&lt;subfield_u&gt;https://stors.tas.gov.au/RGD...,20 Nov 1870,Published in Tasmania|About Tasmania|By a Tasm...,Female,"West, Edith Jane","Laing, Margaret","West, William",Births,1870,1870,...,0,1870,VIEW,0|1|2|3|4,Launceston,1870,(Sirsi) 927366,Names Index,"West, Edith Jane",


In [6]:
# Get the start of the decade for the earliest record
start_decade = (df0.index.min().year // 10) * 10

# Get the decade past the last record
end_decade = (((df0.index.max().year // 10) + 1) * 10)

In [7]:
# Get a list of the decades as a date range
decades = pd.date_range(f'{start_decade}-01-01', f'{end_decade}-01-01', freq='10AS')

In [8]:
# Loop the decades (note the zip trick)
for start_date, end_date in zip(decades, decades[1:]):
    # Subtract one day from decade to take us back to end of previous
    end_date = end_date - pd.Timedelta('1 day')
    
    # Get a view of the data with just the records in this decade
    df1 = df0[start_date.isoformat():end_date.isoformat()]
    
    # Create a filename to save the new csv
    save_fn = f'../data/tasmania-births-{start_date.year}-{end_date.year:}.csv'
    
    print(f"Saving {save_fn}....", end='')
    
    # Save the records
    df1.to_csv(save_fn)
    
    print('done')

Saving ../data/births-1770-1779.csv....done
Saving ../data/births-1780-1789.csv....done
Saving ../data/births-1790-1799.csv....done
Saving ../data/births-1800-1809.csv....done
Saving ../data/births-1810-1819.csv....done
Saving ../data/births-1820-1829.csv....done
Saving ../data/births-1830-1839.csv....done
Saving ../data/births-1840-1849.csv....done
Saving ../data/births-1850-1859.csv....done
Saving ../data/births-1860-1869.csv....done
Saving ../data/births-1870-1879.csv....done
Saving ../data/births-1880-1889.csv....done
Saving ../data/births-1890-1899.csv....done
Saving ../data/births-1900-1909.csv....done
Saving ../data/births-1910-1919.csv....done
Saving ../data/births-1920-1929.csv....done
Saving ../data/births-1930-1939.csv....done


In [13]:
# Show a directory listing
!ls -lrsth ../data

total 105M
4.0K -rw-r--r-- 1 wtgee wtgee  819 Feb 19 16:51 births-1770-1779.csv
4.0K -rw-r--r-- 1 wtgee wtgee 1.9K Feb 19 16:51 births-1780-1789.csv
4.0K -rw-r--r-- 1 wtgee wtgee 3.4K Feb 19 16:51 births-1790-1799.csv
 16K -rw-r--r-- 1 wtgee wtgee  14K Feb 19 16:51 births-1800-1809.csv
276K -rw-r--r-- 1 wtgee wtgee 275K Feb 19 16:51 births-1810-1819.csv
1.3M -rw-r--r-- 1 wtgee wtgee 1.3M Feb 19 16:51 births-1820-1829.csv
3.3M -rw-r--r-- 1 wtgee wtgee 3.3M Feb 19 16:51 births-1830-1839.csv
7.9M -rw-r--r-- 1 wtgee wtgee 7.9M Feb 19 16:51 births-1840-1849.csv
 14M -rw-r--r-- 1 wtgee wtgee  14M Feb 19 16:51 births-1850-1859.csv
 16M -rw-r--r-- 1 wtgee wtgee  16M Feb 19 16:52 births-1860-1869.csv
 17M -rw-r--r-- 1 wtgee wtgee  17M Feb 19 16:52 births-1870-1879.csv
 23M -rw-r--r-- 1 wtgee wtgee  23M Feb 19 16:52 births-1880-1889.csv
 24M -rw-r--r-- 1 wtgee wtgee  24M Feb 19 16:52 births-1890-1899.csv
 64K -rw-r--r-- 1 wtgee wtgee  63K Feb 19 16:52 births-1900-1909.csv
 44K -rw-r--r-- 1 wtgee

In [15]:
# Those are big, let's compress
!bzip2 ../data/*.csv

In [16]:
# Show a directory listing
!ls -lrsth ../data

total 8.0M
572K -rw-r--r-- 1 wtgee wtgee 570K Feb 19 16:51 births-1840-1849.csv.bz2
240K -rw-r--r-- 1 wtgee wtgee 237K Feb 19 16:51 births-1830-1839.csv.bz2
 92K -rw-r--r-- 1 wtgee wtgee  91K Feb 19 16:51 births-1820-1829.csv.bz2
 24K -rw-r--r-- 1 wtgee wtgee  21K Feb 19 16:51 births-1810-1819.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee 2.1K Feb 19 16:51 births-1800-1809.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  949 Feb 19 16:51 births-1790-1799.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  743 Feb 19 16:51 births-1780-1789.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee  533 Feb 19 16:51 births-1770-1779.csv.bz2
1.1M -rw-r--r-- 1 wtgee wtgee 1.1M Feb 19 16:51 births-1850-1859.csv.bz2
1.3M -rw-r--r-- 1 wtgee wtgee 1.3M Feb 19 16:52 births-1870-1879.csv.bz2
1.2M -rw-r--r-- 1 wtgee wtgee 1.2M Feb 19 16:52 births-1860-1869.csv.bz2
1.8M -rw-r--r-- 1 wtgee wtgee 1.8M Feb 19 16:52 births-1880-1889.csv.bz2
4.0K -rw-r--r-- 1 wtgee wtgee 2.1K Feb 19 16:52 births-1930-1939.csv.bz2
8.0K -rw-r--r-- 1 wtgee wtgee 6.4K Feb 1