## Labour Force Characteristics
### Importing Data

In [360]:
import pandas as pd
labour_df = pd.read_csv('data/labour_force_characteristics.csv')
labour_df[:10]

Unnamed: 0,REF_DATE,GEO,DGUID,Labour force characteristics,Educational degree,Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,15 years and over,Persons,249,thousands,3,v55009060,1.1.1.1.1,21214.7,,,,1
1,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,15 to 24 years,Persons,249,thousands,3,v55009061,1.1.1.1.2,3924.9,,,,1
2,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,25 years and over,Persons,249,thousands,3,v55009062,1.1.1.1.3,17289.8,,,,1
3,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,25 to 54 years,Persons,249,thousands,3,v55009063,1.1.1.1.4,12029.9,,,,1
4,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,55 years and over,Persons,249,thousands,3,v55009064,1.1.1.1.5,5259.9,,,,1
5,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,55 to 64 years,Persons,249,thousands,3,v55009065,1.1.1.1.6,2379.7,,,,1
6,1990,Canada,2016A000011124,Population,"Total, all education levels",Both sexes,65 years and over,Persons,249,thousands,3,v55009066,1.1.1.1.7,2880.2,,,,1
7,1990,Canada,2016A000011124,Population,"Total, all education levels",Males,15 years and over,Persons,249,thousands,3,v55009067,1.1.1.2.1,10410.2,,,,1
8,1990,Canada,2016A000011124,Population,"Total, all education levels",Males,15 to 24 years,Persons,249,thousands,3,v55009068,1.1.1.2.2,1995.9,,,,1
9,1990,Canada,2016A000011124,Population,"Total, all education levels",Males,25 years and over,Persons,249,thousands,3,v55009069,1.1.1.2.3,8414.3,,,,1


In [361]:
labour_df.dtypes

REF_DATE                          int64
GEO                              object
DGUID                            object
Labour force characteristics     object
Educational degree               object
Sex                              object
Age group                        object
UOM                              object
UOM_ID                            int64
SCALAR_FACTOR                    object
SCALAR_ID                         int64
VECTOR                           object
COORDINATE                       object
VALUE                           float64
STATUS                           object
SYMBOL                          float64
TERMINATED                      float64
DECIMALS                          int64
dtype: object

In [362]:
'number of rows=%d' % labour_df.shape[0]

'number of rows=785400'

### Dropping Unnecessary Columns

In [363]:
labour_df[labour_df.columns.values].apply(lambda x: x.unique())

REF_DATE                        [1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...
GEO                             [Canada, Newfoundland and Labrador, Prince Edw...
DGUID                           [2016A000011124, 2016A000210, 2016A000211, 201...
Labour force characteristics    [Population, Labour force, Employment, Full-ti...
Educational degree              [Total, all education levels, No degree, certi...
Sex                                                  [Both sexes, Males, Females]
Age group                       [15 years and over, 15 to 24 years, 25 years a...
UOM                                                         [Persons, Percentage]
UOM_ID                                                                 [249, 242]
SCALAR_FACTOR                                                  [thousands, units]
SCALAR_ID                                                                  [3, 0]
VECTOR                          [v55009060, v55009061, v55009062, v55009063, v...
COORDINATE      

In [364]:
labour_df = labour_df.drop(columns=['DGUID', 'UOM_ID', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'])
labour_df

Unnamed: 0,REF_DATE,GEO,Labour force characteristics,Educational degree,Sex,Age group,UOM,SCALAR_FACTOR,VALUE
0,1990,Canada,Population,"Total, all education levels",Both sexes,15 years and over,Persons,thousands,21214.7
1,1990,Canada,Population,"Total, all education levels",Both sexes,15 to 24 years,Persons,thousands,3924.9
2,1990,Canada,Population,"Total, all education levels",Both sexes,25 years and over,Persons,thousands,17289.8
3,1990,Canada,Population,"Total, all education levels",Both sexes,25 to 54 years,Persons,thousands,12029.9
4,1990,Canada,Population,"Total, all education levels",Both sexes,55 years and over,Persons,thousands,5259.9
...,...,...,...,...,...,...,...,...,...
785395,2023,British Columbia,Employment rate,Above bachelor's degree,Females,25 years and over,Percentage,units,74.7
785396,2023,British Columbia,Employment rate,Above bachelor's degree,Females,25 to 54 years,Percentage,units,88.6
785397,2023,British Columbia,Employment rate,Above bachelor's degree,Females,55 years and over,Percentage,units,43.9
785398,2023,British Columbia,Employment rate,Above bachelor's degree,Females,55 to 64 years,Percentage,units,70.9


### Column Filtering
The `UOM` column has two unique values: `Persons` and `Percentage`. We only care about the percentage of graduates per age group, degree, and labour force characteristic. Thus, we filter for rows where `UOM == "Persons"`.

In [365]:
labour_df = labour_df.query('UOM == "Persons"')
if len(labour_df['UOM'].unique()) == 1:
  labour_df = labour_df.drop(columns=['UOM'])
labour_df

Unnamed: 0,REF_DATE,GEO,Labour force characteristics,Educational degree,Sex,Age group,SCALAR_FACTOR,VALUE
0,1990,Canada,Population,"Total, all education levels",Both sexes,15 years and over,thousands,21214.7
1,1990,Canada,Population,"Total, all education levels",Both sexes,15 to 24 years,thousands,3924.9
2,1990,Canada,Population,"Total, all education levels",Both sexes,25 years and over,thousands,17289.8
3,1990,Canada,Population,"Total, all education levels",Both sexes,25 to 54 years,thousands,12029.9
4,1990,Canada,Population,"Total, all education levels",Both sexes,55 years and over,thousands,5259.9
...,...,...,...,...,...,...,...,...
784765,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,25 years and over,thousands,50.5
784766,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,25 to 54 years,thousands,12.5
784767,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,55 years and over,thousands,37.9
784768,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,55 to 64 years,thousands,7.8


Additionally, since the `SCALAR_FACTOR` column is dependent on the `UOM` column, it now has a cardinality of one (df above: `thousands`). We can drop the `SCALAR_FACTOR` column.

In [366]:
if len(labour_df['SCALAR_FACTOR'].unique()) == 1:
  labour_df = labour_df.drop(columns=['SCALAR_FACTOR'])
labour_df

Unnamed: 0,REF_DATE,GEO,Labour force characteristics,Educational degree,Sex,Age group,VALUE
0,1990,Canada,Population,"Total, all education levels",Both sexes,15 years and over,21214.7
1,1990,Canada,Population,"Total, all education levels",Both sexes,15 to 24 years,3924.9
2,1990,Canada,Population,"Total, all education levels",Both sexes,25 years and over,17289.8
3,1990,Canada,Population,"Total, all education levels",Both sexes,25 to 54 years,12029.9
4,1990,Canada,Population,"Total, all education levels",Both sexes,55 years and over,5259.9
...,...,...,...,...,...,...,...
784765,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,25 years and over,50.5
784766,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,25 to 54 years,12.5
784767,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,55 years and over,37.9
784768,2023,British Columbia,Not in labour force,Above bachelor's degree,Females,55 to 64 years,7.8


The value is now in thousands as explained above. We will now multiply the value to make sure the data is consistent among the other dataframes.

In [367]:
labour_df['VALUE'] = labour_df['VALUE'].apply(lambda x: x * 1000)
labour_df = labour_df.fillna(0)
labour_df['VALUE'] = labour_df['VALUE'].astype('Int64')
labour_df

TypeError: cannot safely cast non-equivalent float64 to int64

The dataframe includes information for each province in Canada, but in order to satisfy the requirements of the other dataframes, we will drop the column.

In [None]:
labour_df = labour_df.query('GEO == "Canada"')
if len(labour_df['GEO'].unique()) == 1:
  labour_df = labour_df.drop(columns=['GEO'])
labour_df

Unnamed: 0,REF_DATE,Labour force characteristics,Educational degree,Sex,Age group,VALUE
0,1990,Population,"Total, all education levels",Both sexes,15 years and over,21214700
1,1990,Population,"Total, all education levels",Both sexes,15 to 24 years,3924900
2,1990,Population,"Total, all education levels",Both sexes,25 years and over,17289800
3,1990,Population,"Total, all education levels",Both sexes,25 to 54 years,12029900
4,1990,Population,"Total, all education levels",Both sexes,55 years and over,5259900
...,...,...,...,...,...,...
763765,2023,Not in labour force,Above bachelor's degree,Females,25 years and over,355500
763766,2023,Not in labour force,Above bachelor's degree,Females,25 to 54 years,106200
763767,2023,Not in labour force,Above bachelor's degree,Females,55 years and over,249300
763768,2023,Not in labour force,Above bachelor's degree,Females,55 to 64 years,55900


We can include only the age groups that are not aggregates. In other words, they don't have `x years and over` in their name.

In [None]:
labour_df = labour_df.query('`Age group` == "15 to 24 years" or `Age group` == "25 to 54 years" or `Age group` == "55 to 64 years"')
labour_df

Unnamed: 0,REF_DATE,Labour force characteristics,Educational degree,Sex,Age group,VALUE
1,1990,Population,"Total, all education levels",Both sexes,15 to 24 years,3924900
3,1990,Population,"Total, all education levels",Both sexes,25 to 54 years,12029900
5,1990,Population,"Total, all education levels",Both sexes,55 to 64 years,2379700
8,1990,Population,"Total, all education levels",Males,15 to 24 years,1995900
10,1990,Population,"Total, all education levels",Males,25 to 54 years,6013900
...,...,...,...,...,...,...
763759,2023,Not in labour force,Above bachelor's degree,Males,25 to 54 years,42300
763761,2023,Not in labour force,Above bachelor's degree,Males,55 to 64 years,42500
763764,2023,Not in labour force,Above bachelor's degree,Females,15 to 24 years,3500
763766,2023,Not in labour force,Above bachelor's degree,Females,25 to 54 years,106200


We don't care about any of the labour force characteristics except for `Population`, `Employment`, and `Unemployment`.

In [None]:
labour_df = labour_df.query('`Labour force characteristics` == "Population" or `Labour force characteristics` == "Employment" or `Labour force characteristics` == "Unemployment"')
labour_df

Unnamed: 0,REF_DATE,Labour force characteristics,Educational degree,Sex,Age group,VALUE
1,1990,Population,"Total, all education levels",Both sexes,15 to 24 years,3924900
3,1990,Population,"Total, all education levels",Both sexes,25 to 54 years,12029900
5,1990,Population,"Total, all education levels",Both sexes,55 to 64 years,2379700
8,1990,Population,"Total, all education levels",Males,15 to 24 years,1995900
10,1990,Population,"Total, all education levels",Males,25 to 54 years,6013900
...,...,...,...,...,...,...
763549,2023,Unemployment,Above bachelor's degree,Males,25 to 54 years,32000
763551,2023,Unemployment,Above bachelor's degree,Males,55 to 64 years,6500
763554,2023,Unemployment,Above bachelor's degree,Females,15 to 24 years,2500
763556,2023,Unemployment,Above bachelor's degree,Females,25 to 54 years,41900


We want to have only information after `2000`.

In [None]:
labour_df = labour_df[labour_df['REF_DATE'] >= 2000]
labour_df

Unnamed: 0,REF_DATE,Labour force characteristics,Educational degree,Sex,Age group,VALUE
231001,2000,Population,"Total, all education levels",Both sexes,15 to 24 years,4067900
231003,2000,Population,"Total, all education levels",Both sexes,25 to 54 years,13687300
231005,2000,Population,"Total, all education levels",Both sexes,55 to 64 years,2765500
231008,2000,Population,"Total, all education levels",Males,15 to 24 years,2081100
231010,2000,Population,"Total, all education levels",Males,25 to 54 years,6831700
...,...,...,...,...,...,...
763549,2023,Unemployment,Above bachelor's degree,Males,25 to 54 years,32000
763551,2023,Unemployment,Above bachelor's degree,Males,55 to 64 years,6500
763554,2023,Unemployment,Above bachelor's degree,Females,15 to 24 years,2500
763556,2023,Unemployment,Above bachelor's degree,Females,25 to 54 years,41900


A lot of the aggregates included in this dataframe are not needed and will mess up the fact table. Thus, we can remove the aggregates for the `Sex` column, and `Educational degree` column.

In [None]:
labour_df = labour_df.query('`Educational degree` != "Total, all education levels"')
labour_df = labour_df.query('Sex != "Both sexes"')
labour_df

Unnamed: 0,REF_DATE,Labour force characteristics,Educational degree,Sex,Age group,VALUE
231029,2000,Population,"No degree, certificate or diploma",Males,15 to 24 years,943600
231031,2000,Population,"No degree, certificate or diploma",Males,25 to 54 years,1232300
231033,2000,Population,"No degree, certificate or diploma",Males,55 to 64 years,486000
231036,2000,Population,"No degree, certificate or diploma",Females,15 to 24 years,817100
231038,2000,Population,"No degree, certificate or diploma",Females,25 to 54 years,1093700
...,...,...,...,...,...,...
763549,2023,Unemployment,Above bachelor's degree,Males,25 to 54 years,32000
763551,2023,Unemployment,Above bachelor's degree,Males,55 to 64 years,6500
763554,2023,Unemployment,Above bachelor's degree,Females,15 to 24 years,2500
763556,2023,Unemployment,Above bachelor's degree,Females,25 to 54 years,41900


Finally, we can rename our dataframe and we will be done cleaning the dataframe.

In [None]:
labour_df.rename(columns={'REF_DATE': 'date', 'Labour force characteristics': 'labour_force', 'Educational degree': 'education', 'Sex': 'sex', 'Age group': 'age_group', 'VALUE': 'value'}, inplace=True)
labour_df

Unnamed: 0,date,labour_force,education,sex,age_group,value
231029,2000,Population,"No degree, certificate or diploma",Males,15 to 24 years,943600
231031,2000,Population,"No degree, certificate or diploma",Males,25 to 54 years,1232300
231033,2000,Population,"No degree, certificate or diploma",Males,55 to 64 years,486000
231036,2000,Population,"No degree, certificate or diploma",Females,15 to 24 years,817100
231038,2000,Population,"No degree, certificate or diploma",Females,25 to 54 years,1093700
...,...,...,...,...,...,...
763549,2023,Unemployment,Above bachelor's degree,Males,25 to 54 years,32000
763551,2023,Unemployment,Above bachelor's degree,Males,55 to 64 years,6500
763554,2023,Unemployment,Above bachelor's degree,Females,15 to 24 years,2500
763556,2023,Unemployment,Above bachelor's degree,Females,25 to 54 years,41900
