## Import libraries and datasets

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [24]:
df = pd.read_excel('Austrlian Labour Market data.xls', sheet_name='Data1', index_col=0, parse_dates=True)

## Preparing Dataset

In [25]:
df.head(10)

Unnamed: 0,Employed total ; Persons ;,Employed total ; Persons ;.1,Employed total ; Persons ;.2,Employed total ; > Males ;,Employed total ; > Males ;.1,Employed total ; > Males ;.2,Employed total ; > Females ;,Employed total ; > Females ;.1,Employed total ; > Females ;.2,> Employed full-time ; Persons ;,...,Participation rate ; > Males ;.2,Participation rate ; > Females ;,Participation rate ; > Females ;.1,Participation rate ; > Females ;.2,Not in the labour force (NILF) ; Persons ;,Not in the labour force (NILF) ; > Males ;,Not in the labour force (NILF) ; > Females ;,Civilian population aged 15 years and over ; Persons ;,Civilian population aged 15 years and over ; > Males ;,Civilian population aged 15 years and over ; > Females ;
Unit,000,000,000,000,000,000,000,000,000,000,...,Percent,Percent,Percent,Percent,000,000,000,000,000,000
Series Type,Trend,Seasonally Adjusted,Original,Trend,Seasonally Adjusted,Original,Trend,Seasonally Adjusted,Original,Trend,...,Original,Trend,Seasonally Adjusted,Original,Original,Original,Original,Original,Original,Original
Data Type,STOCK,STOCK,STOCK,STOCK,STOCK,STOCK,STOCK,STOCK,STOCK,STOCK,...,PERCENT,PERCENT,PERCENT,PERCENT,STOCK,STOCK,STOCK,STOCK,STOCK,STOCK
Frequency,Month,Month,Month,Month,Month,Month,Month,Month,Month,Month,...,Month,Month,Month,Month,Month,Month,Month,Month,Month,Month
Collection Month,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Series Start,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,...,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00,1978-02-01 00:00:00
Series End,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,...,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00,2020-04-01 00:00:00
No. Obs,507,507,507,507,507,507,507,507,507,507,...,507,507,507,507,507,507,507,507,507,507
Series ID,A84423127L,A84423043C,A84423085A,A84423113X,A84423029J,A84423071L,A84423141J,A84423057T,A84423099R,A84423125J,...,A84423079F,A84423149A,A84423065T,A84423107C,A84423090V,A84423076X,A84423104W,A84423091W,A84423077A,A84423105X
1978-02-01 00:00:00,,5997.66,5985.66,,3881.13,3887.39,,2116.53,2098.27,,...,80.0953,,43.495,43.7318,4020.85,1029.79,2991.06,10489.3,5173.6,5315.73


The original dataset contain both seasonally adjusted data and original data as well as the trend, we will use Seasonally Adjusted data here and drop the other two.

In [26]:
# Given all the Seasonally Adjusted data contain ';.1' in the column name, we will use this feature to filter these columns
df = df.filter(like=';.1', axis = 1)
# Alternatively, we can use df.loc[:, df.columns.str.contains('1')]

In [27]:
# After filtering the selected columns, the index 'Unit', 'Series Type' and etc are not needed
df.drop(index=['Unit', 'Series Type', 'Data Type', 'Frequency', 'Collection Month', 'Series Start', 
               'Series End', 'No. Obs', 'Series ID'], axis=0, inplace=True)

In [28]:
df.shape

(507, 36)

In [29]:
df.columns

Index(['Employed total ;  Persons ;.1', 'Employed total ;  > Males ;.1',
       'Employed total ;  > Females ;.1',
       '> Employed full-time ;  Persons ;.1',
       '> Employed full-time ;  > Males ;.1',
       '> Employed full-time ;  > Females ;.1',
       '> Employed part-time ;  Persons ;.1',
       '> Employed part-time ;  > Males ;.1',
       '> Employed part-time ;  > Females ;.1',
       'Employment to population ratio ;  Persons ;.1',
       'Employment to population ratio ;  > Males ;.1',
       'Employment to population ratio ;  > Females ;.1',
       'Unemployed total ;  Persons ;.1', 'Unemployed total ;  > Males ;.1',
       'Unemployed total ;  > Females ;.1',
       '> Unemployed looked for full-time work ;  Persons ;.1',
       '> Unemployed looked for full-time work ;  > Males ;.1',
       '> Unemployed looked for full-time work ;  > Females ;.1',
       '> Unemployed looked for only part-time work ;  Persons ;.1',
       '> Unemployed looked for only part-time work

In [30]:
# Fixing messy column names
# Remove unwanted ';' and '>' in the column name
df.columns = df.columns.str.strip().str.replace('; ', '').str.replace('> ', '').str.replace('Persons', '')
# Remove extra space in the middle and at the end
df.columns = df.columns.str.rstrip().str.replace('  ', '')

In [31]:
# add (%) for all the columns contain ratio values
ratio = df.loc[:, df.columns.str.contains('rate | ratio')]
ratio.columns = ratio.columns.str.replace(';.1', '(%)')

In [32]:
# add (000) for all the columns contain number values
number = df.loc[:, ~df.columns.str.contains('rate | ratio')]
number.columns = number.columns.str.replace(';.1', '(000)')

In [33]:
number.head()

Unnamed: 0,Employed total (000),Employed totalMales (000),Employed totalFemales (000),Employed full-time (000),Employed full-timeMales (000),Employed full-timeFemales (000),Employed part-time (000),Employed part-timeMales (000),Employed part-timeFemales (000),Unemployed total (000),...,Unemployed looked for only part-time work (000),Unemployed looked for only part-time workMales (000),Unemployed looked for only part-time workFemales (000),Unemployment rateMales (000),Unemployment rateFemales (000),Labour force total (000),Labour force totalMales (000),Labour force totalFemales (000),Participation rateMales (000),Participation rateFemales (000)
1978-02-01 00:00:00,5997.66,3881.13,2116.53,5079.56,3679.93,1399.63,918.094,201.201,716.893,426.936,...,72.1559,19.4069,52.749,5.62638,8.4578,6424.59,4112.52,2312.08,79.4904,43.495
1978-03-01 00:00:00,6003.22,3884.74,2118.48,5095.41,3691.39,1404.02,907.804,193.345,714.459,403.867,...,65.1666,16.126,49.0407,5.30495,8.08075,6407.08,4102.37,2304.71,79.1528,43.2746
1978-04-01 00:00:00,6030.85,3888.62,2142.23,5119.62,3696.86,1422.77,911.227,191.768,719.459,403.3,...,64.6427,15.0625,49.5802,5.36345,7.86683,6434.15,4109.01,2325.14,79.2099,43.6141
1978-05-01 00:00:00,6033.38,3885.81,2147.57,5119.05,3692.27,1426.78,914.329,193.546,720.784,399.492,...,62.2155,13.7406,48.4749,5.24844,7.90161,6432.87,4101.05,2331.82,78.9696,43.6878
1978-06-01 00:00:00,6033.77,3882.76,2151.01,5104.09,3680.45,1423.64,929.675,202.304,727.371,405.973,...,61.8066,12.6264,49.1802,5.49105,7.73707,6439.74,4108.35,2331.39,79.0358,43.6158


In [34]:
new_df = pd.concat([ratio, number], axis=1)

In [35]:
new_df.columns

Index(['Employment to population ratio (%)',
       'Employment to population ratioMales (%)',
       'Employment to population ratioFemales (%)', 'Unemployment rate (%)',
       'Unemployment rate looked for full-time work (%)',
       'Unemployment rate looked for full-time workMales (%)',
       'Unemployment rate looked for full-time workFemales (%)',
       'Unemployment rate looked for only part-time work (%)',
       'Unemployment rate looked for only part-time workMales (%)',
       'Unemployment rate looked for only part-time workFemales (%)',
       'Participation rate (%)', 'Employed total (000)',
       'Employed totalMales (000)', 'Employed totalFemales (000)',
       'Employed full-time (000)', 'Employed full-timeMales (000)',
       'Employed full-timeFemales (000)', 'Employed part-time (000)',
       'Employed part-timeMales (000)', 'Employed part-timeFemales (000)',
       'Unemployed total (000)', 'Unemployed totalMales (000)',
       'Unemployed totalFemales (000)',


In [36]:
new_df.shape

(507, 36)

In [37]:
# Generate new columns for Year and Month
new_df.index = pd.to_datetime(new_df.index)
year = new_df.index.year
month = new_df.index.month 
new_df.insert(0, 'Year', year)
new_df.insert(1, 'Month', month)

In [38]:
# Check if column names are more organised
new_df.head()

Unnamed: 0,Year,Month,Employment to population ratio (%),Employment to population ratioMales (%),Employment to population ratioFemales (%),Unemployment rate (%),Unemployment rate looked for full-time work (%),Unemployment rate looked for full-time workMales (%),Unemployment rate looked for full-time workFemales (%),Unemployment rate looked for only part-time work (%),...,Unemployed looked for only part-time work (000),Unemployed looked for only part-time workMales (000),Unemployed looked for only part-time workFemales (000),Unemployment rateMales (000),Unemployment rateFemales (000),Labour force total (000),Labour force totalMales (000),Labour force totalFemales (000),Participation rateMales (000),Participation rateFemales (000)
1978-02-01,1978,2,57.1786,75.018,39.8163,6.64535,6.52849,5.44665,9.25822,7.28663,...,72.1559,19.4069,52.749,5.62638,8.4578,6424.59,4112.52,2312.08,79.4904,43.495
1978-03-01,1978,3,57.1265,74.9538,39.7777,6.30344,6.23285,5.17616,8.90191,6.69769,...,65.1666,16.126,49.0407,5.30495,8.08075,6407.08,4102.37,2304.71,79.1528,43.2746
1978-04-01,1978,4,57.3347,74.9615,40.183,6.26811,6.20446,5.26173,8.56851,6.62411,...,64.6427,15.0625,49.5802,5.36345,7.86683,6434.15,4109.01,2325.14,79.2099,43.6141
1978-05-01,1978,5,57.2934,74.8249,40.2357,6.21017,6.18139,5.17495,8.68934,6.37098,...,62.2155,13.7406,48.4749,5.24844,7.90161,6432.87,4101.05,2331.82,78.9696,43.6878
1978-06-01,1978,6,57.228,74.6959,40.2412,6.30418,6.317,5.46988,8.43825,6.23376,...,61.8066,12.6264,49.1802,5.49105,7.73707,6439.74,4108.35,2331.39,79.0358,43.6158


In [39]:
new_df.dtypes

Year                                                            int64
Month                                                           int64
Employment to population ratio (%)                             object
Employment to population ratioMales (%)                        object
Employment to population ratioFemales (%)                      object
Unemployment rate (%)                                          object
Unemployment rate looked for full-time work (%)                object
Unemployment rate looked for full-time workMales (%)           object
Unemployment rate looked for full-time workFemales (%)         object
Unemployment rate looked for only part-time work (%)           object
Unemployment rate looked for only part-time workMales (%)      object
Unemployment rate looked for only part-time workFemales (%)    object
Participation rate (%)                                         object
Employed total (000)                                           object
Employed totalMales 

In [40]:
# Convert all the columns from object to float or int with one touch
new_df = new_df.infer_objects()

In [41]:
# Calculate the value for population not in the labour force
new_df['Not in the labour force (000)'] = new_df['Labour force total (000)']/new_df['Participation rate (%)']*100-new_df['Labour force total (000)']

In [42]:
new_df['Employed total growth (000)'] = new_df['Employed total (000)'].diff()
new_df['Full-time employed growth (000)'] = new_df['Employed full-time (000)'].diff()
new_df['Part-time employed growth (000)'] = new_df['Employed part-time (000)'].diff()

In [43]:
recent_df = new_df['2000-01-01': '2020-04-01']
april_df = recent_df[recent_df['Month']==4]

In [44]:
# Save cleaned dataframe as csv for further exploration
new_df.to_csv('Cleaned National Labour Market Data.csv')