# Employment/Unemployment By Industry

### Importing Data

In [242]:
import pandas as pd
# Load the raw labour-force table from disk into a DataFrame.
raw_csv_path = 'data/employment-by-industry.csv'
df = pd.read_csv(raw_csv_path)
# Preview the first five rows.
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Labour force characteristics,North American Industry Classification System (NAICS),Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,15 years and over,Persons,249,thousands,3,v2363353,1.1.1.1.1,10491.3,,,,1
1,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,15 to 24 years,Persons,249,thousands,3,v19668072,1.1.1.1.2,2891.9,,,,1
2,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,25 to 54 years,Persons,249,thousands,3,v19668073,1.1.1.1.3,6410.7,,,,1
3,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,55 years and over,Persons,249,thousands,3,v19668074,1.1.1.1.4,1188.7,,,,1
4,1976,Canada,2016A000011124,Labour force,"Total, all industries",Males,15 years and over,Persons,249,thousands,3,v2363523,1.1.1.2.1,6549.6,,,,1


### Filter Data

In [243]:
# Keep Canada-wide head counts for 2000-2020, restricted to the
# Employment / Unemployment measures broken down by sex (no "Both sexes").
in_period = (df['REF_DATE'] >= 2000) & (df['REF_DATE'] < 2021)
canada_persons = (df["GEO"] == "Canada") & (df["UOM"] == "Persons")
wanted_measure = df["Labour force characteristics"].isin(["Employment", "Unemployment"])
wanted_sex = df["Sex"].isin(["Males", "Females"])

df = df[in_period & canada_persons & wanted_measure & wanted_sex]
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Labour force characteristics,North American Industry Classification System (NAICS),Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
460504,2000,Canada,2016A000011124,Employment,"Total, all industries",Males,15 years and over,Persons,249,thousands,3,v2363552,1.2.1.2.1,7971.0,,,,1
460505,2000,Canada,2016A000011124,Employment,"Total, all industries",Males,15 to 24 years,Persons,249,thousands,3,v19668336,1.2.1.2.2,1180.5,,,,1
460506,2000,Canada,2016A000011124,Employment,"Total, all industries",Males,25 to 54 years,Persons,249,thousands,3,v19668337,1.2.1.2.3,5863.6,,,,1
460507,2000,Canada,2016A000011124,Employment,"Total, all industries",Males,55 years and over,Persons,249,thousands,3,v19668338,1.2.1.2.4,926.9,,,,1
460508,2000,Canada,2016A000011124,Employment,"Total, all industries",Females,15 years and over,Persons,249,thousands,3,v2363722,1.2.1.3.1,6789.1,,,,1


### Select Required Columns

In [244]:
# Narrow the frame to the six columns used by the rest of the notebook.
required_columns = [
    "REF_DATE",
    "Labour force characteristics",
    "North American Industry Classification System (NAICS)",
    "Sex",
    "Age group",
    "VALUE",
]
df = df[required_columns]

df.head()


Unnamed: 0,REF_DATE,Labour force characteristics,North American Industry Classification System (NAICS),Sex,Age group,VALUE
460504,2000,Employment,"Total, all industries",Males,15 years and over,7971.0
460505,2000,Employment,"Total, all industries",Males,15 to 24 years,1180.5
460506,2000,Employment,"Total, all industries",Males,25 to 54 years,5863.6
460507,2000,Employment,"Total, all industries",Males,55 years and over,926.9
460508,2000,Employment,"Total, all industries",Females,15 years and over,6789.1


### Multiply Value By Scalar_Factor (deferred until after zero/missing values are replaced)

### Rename Columns

In [245]:
# Map the source headers to the shorter, title-cased names used from here on.
column_names = {
    'REF_DATE': 'Date',
    'North American Industry Classification System (NAICS)': 'NAICS',
    'VALUE': 'Value',
    'Sex': 'Gender',
    "Labour force characteristics": "Labour Force Characteristics",
    "Age group": "Age Group",
}
df = df.rename(columns=column_names)
df

Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value
460504,2000,Employment,"Total, all industries",Males,15 years and over,7971.0
460505,2000,Employment,"Total, all industries",Males,15 to 24 years,1180.5
460506,2000,Employment,"Total, all industries",Males,25 to 54 years,5863.6
460507,2000,Employment,"Total, all industries",Males,55 years and over,926.9
460508,2000,Employment,"Total, all industries",Females,15 years and over,6789.1
...,...,...,...,...,...,...
910651,2020,Unemployment,Unclassified industries,Males,55 years and over,44.0
910652,2020,Unemployment,Unclassified industries,Females,15 years and over,230.3
910653,2020,Unemployment,Unclassified industries,Females,15 to 24 years,85.1
910654,2020,Unemployment,Unclassified industries,Females,25 to 54 years,117.2


### Change "Employment" and "Unemployment" to "Employed" and "Unemployed"

In [246]:
# Relabel the two labour-force measures as states ("Employed"/"Unemployed").
# A single column-scoped replace returns a new frame instead of writing through
# `.loc` into a frame that was itself produced by slicing, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
labour_force_labels = {"Employment": "Employed", "Unemployment": "Unemployed"}
df = df.replace({"Labour Force Characteristics": labour_force_labels})
df


Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value
460504,2000,Employed,"Total, all industries",Males,15 years and over,7971.0
460505,2000,Employed,"Total, all industries",Males,15 to 24 years,1180.5
460506,2000,Employed,"Total, all industries",Males,25 to 54 years,5863.6
460507,2000,Employed,"Total, all industries",Males,55 years and over,926.9
460508,2000,Employed,"Total, all industries",Females,15 years and over,6789.1
...,...,...,...,...,...,...
910651,2020,Unemployed,Unclassified industries,Males,55 years and over,44.0
910652,2020,Unemployed,Unclassified industries,Females,15 years and over,230.3
910653,2020,Unemployed,Unclassified industries,Females,15 to 24 years,85.1
910654,2020,Unemployed,Unclassified industries,Females,25 to 54 years,117.2


### Reset Index

In [247]:
# Discard the row labels inherited from the raw file and renumber from 0.
df = df.reset_index(drop=True)
df

Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value
0,2000,Employed,"Total, all industries",Males,15 years and over,7971.0
1,2000,Employed,"Total, all industries",Males,15 to 24 years,1180.5
2,2000,Employed,"Total, all industries",Males,25 to 54 years,5863.6
3,2000,Employed,"Total, all industries",Males,55 years and over,926.9
4,2000,Employed,"Total, all industries",Females,15 years and over,6789.1
...,...,...,...,...,...,...
9571,2020,Unemployed,Unclassified industries,Males,55 years and over,44.0
9572,2020,Unemployed,Unclassified industries,Females,15 years and over,230.3
9573,2020,Unemployed,Unclassified industries,Females,15 to 24 years,85.1
9574,2020,Unemployed,Unclassified industries,Females,25 to 54 years,117.2


### Filter NAICS To Remove Aggregated Industries

In [248]:
# Keep only industry labels that contain a "[" (i.e. carry bracketed NAICS
# codes), then drop the three combined sectors listed below.
has_bracketed_codes = df['NAICS'].str.contains(r'\[')
combined_sectors = [
    "Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]",
    "Finance, insurance, real estate, rental and leasing [52, 53]",
    "Wholesale and retail trade [41, 44-45]",
]
df = df[has_bracketed_codes & ~df["NAICS"].isin(combined_sectors)].reset_index(drop=True)

# List the industries that remain, one per line.
for industry in df["NAICS"].unique():
    print(industry)

Agriculture [111-112, 1100, 1151-1152]
Forestry and logging and support activities for forestry [113, 1153]
Fishing, hunting and trapping [114]
Mining, quarrying, and oil and gas extraction [21, 2100]
Utilities [22]
Construction [23]
Manufacturing [31-33]
Durables [321, 327, 331-339]
Non-durables [311-316, 322-326]
Wholesale trade [41]
Retail trade [44-45]
Transportation and warehousing [48-49]
Finance and insurance [52]
Real estate and rental and leasing [53]
Professional, scientific and technical services [54]
Business, building and other support services [55, 56]
Educational services [61]
Health care and social assistance [62]
Information, culture and recreation [51, 71]
Accommodation and food services [72]
Other services (except public administration) [81]
Public administration [91]


In [249]:
# Inspect the frame after the aggregated NAICS categories were filtered out.
df

Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value
0,2000,Employed,"Agriculture [111-112, 1100, 1151-1152]",Males,15 years and over,258.9
1,2000,Employed,"Agriculture [111-112, 1100, 1151-1152]",Males,15 to 24 years,44.0
2,2000,Employed,"Agriculture [111-112, 1100, 1151-1152]",Males,25 to 54 years,142.6
3,2000,Employed,"Agriculture [111-112, 1100, 1151-1152]",Males,55 years and over,72.3
4,2000,Employed,"Agriculture [111-112, 1100, 1151-1152]",Females,15 years and over,112.4
...,...,...,...,...,...,...
7387,2020,Unemployed,Public administration [91],Males,55 years and over,2.8
7388,2020,Unemployed,Public administration [91],Females,15 years and over,17.4
7389,2020,Unemployed,Public administration [91],Females,15 to 24 years,5.0
7390,2020,Unemployed,Public administration [91],Females,25 to 54 years,6.8


In [250]:
# Fold the three primary-sector labels into a single sector [11] and shorten
# the mining label to [21].
replace_with_agriculture = "Agriculture, forestry, fishing and hunting [11]"
replace_with_mining = "Mining, quarrying, and oil and gas extraction [21]"
naics_relabel = {
    'Agriculture [111-112, 1100, 1151-1152]': replace_with_agriculture,
    'Forestry and logging and support activities for forestry [113, 1153]': replace_with_agriculture,
    'Fishing, hunting and trapping [114]': replace_with_agriculture,
    "Mining, quarrying, and oil and gas extraction [21, 2100]": replace_with_mining,
}
# Assign the result back rather than calling replace(..., inplace=True) on
# df['NAICS']: that chained in-place call acts on a temporary Series and does
# not update `df` when pandas Copy-on-Write is active.
df = df.assign(NAICS=df["NAICS"].replace(naics_relabel))

# Rows that now share a label are summed into one row per
# (Date, characteristic, industry, gender, age group).
group_keys = ["Date", "Labour Force Characteristics", "NAICS", "Gender", "Age Group"]
df = df.groupby(group_keys, as_index=False)['Value'].sum()

# Drop the Durables / Non-durables rows (presumably already counted inside
# "Manufacturing [31-33]" - confirm) and "Wholesale trade [41]".
# NOTE(review): the combined "Wholesale and retail trade [41, 44-45]" row was
# removed in the previous filtering step, so wholesale counts do not appear in
# the output at all - confirm this is intended.
dropped_industries = [
    "Non-durables [311-316, 322-326]",
    "Durables [321, 327, 331-339]",
    "Wholesale trade [41]",
]
df = df[~df["NAICS"].isin(dropped_industries)]

# List the final set of industries, one per line.
for val in df["NAICS"].unique():
    print(val)

Accommodation and food services [72]
Agriculture, forestry, fishing and hunting [11]
Business, building and other support services [55, 56]
Construction [23]
Educational services [61]
Finance and insurance [52]
Health care and social assistance [62]
Information, culture and recreation [51, 71]
Manufacturing [31-33]
Mining, quarrying, and oil and gas extraction [21]
Other services (except public administration) [81]
Professional, scientific and technical services [54]
Public administration [91]
Real estate and rental and leasing [53]
Retail trade [44-45]
Transportation and warehousing [48-49]
Utilities [22]


In [251]:
# Inspect the frame after the NAICS labels were merged and re-aggregated.
df

Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value
0,2000,Employed,Accommodation and food services [72],Females,15 to 24 years,235.0
1,2000,Employed,Accommodation and food services [72],Females,15 years and over,558.2
2,2000,Employed,Accommodation and food services [72],Females,25 to 54 years,285.4
3,2000,Employed,Accommodation and food services [72],Females,55 years and over,37.7
4,2000,Employed,Accommodation and food services [72],Males,15 to 24 years,160.9
...,...,...,...,...,...,...
6707,2020,Unemployed,Utilities [22],Females,55 years and over,0.0
6708,2020,Unemployed,Utilities [22],Males,15 to 24 years,0.0
6709,2020,Unemployed,Utilities [22],Males,15 years and over,2.1
6710,2020,Unemployed,Utilities [22],Males,25 to 54 years,0.0


In [252]:
# Check the dtype of each column before the values are cleaned.
df.dtypes

Date                              int64
Labour Force Characteristics     object
NAICS                            object
Gender                           object
Age Group                        object
Value                           float64
dtype: object

### Find Empty Values

### Find Averages For Different Groups and Replace 0.0/Empty/Missing Values

In [253]:
# Zero / missing counts are replaced by the mean of the non-zero values that
# share the same characteristic, industry, gender and age group (the average
# is taken across all years, since Date is not a grouping key). Groups with no
# non-zero value at all are left at 0.0.
group_cols = ["Labour Force Characteristics", "NAICS", "Gender", "Age Group"]

df = df.fillna(0.0)

# Mask zeros as NaN so they are excluded from the group averages.
observed = df["Value"].where(df["Value"] != 0.0)
# One vectorised pass gives every row the mean of its group; this replaces a
# per-row loop that re-filtered the whole frame and combined masks with `*`.
group_mean = (
    df.assign(Value=observed)
    .groupby(group_cols, sort=False)["Value"]
    .transform("mean")
)
# Observed values are kept, gaps take their group mean, and gaps in groups
# without any observation (NaN mean) fall back to 0.0.
df = df.assign(Value=observed.fillna(group_mean).fillna(0.0))

# Sanity check: this should display no rows.
df[df.isna().any(axis=1)]

Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value


### Multiply By The Scalar Factor (thousands → persons)

In [254]:
# The source reports Value in thousands (SCALAR_FACTOR), so scale to persons,
# round to the nearest whole number and store as integers.
persons = (df["Value"] * 1000).round(0)
df = df.assign(Value=persons.astype('int64'))

In [259]:
# Renumber the rows from 0 after the earlier filtering left gaps in the index.
df = df.reset_index(drop=True)
df

Unnamed: 0,Date,Labour Force Characteristics,NAICS,Gender,Age Group,Value
0,2000,Employed,Accommodation and food services [72],Females,15 to 24 years,235000
1,2000,Employed,Accommodation and food services [72],Females,15 years and over,558200
2,2000,Employed,Accommodation and food services [72],Females,25 to 54 years,285400
3,2000,Employed,Accommodation and food services [72],Females,55 years and over,37700
4,2000,Employed,Accommodation and food services [72],Males,15 to 24 years,160900
...,...,...,...,...,...,...
5707,2020,Unemployed,Utilities [22],Females,55 years and over,0
5708,2020,Unemployed,Utilities [22],Males,15 to 24 years,0
5709,2020,Unemployed,Utilities [22],Males,15 years and over,2100
5710,2020,Unemployed,Utilities [22],Males,25 to 54 years,1743


In [256]:
# Check the dtype of each column after Value was cast to int64.
df.dtypes

Date                             int64
Labour Force Characteristics    object
NAICS                           object
Gender                          object
Age Group                       object
Value                            int64
dtype: object

### Export To CSV

In [257]:
# Write the cleaned table to disk; index=False leaves the row labels out.
cleaned_csv_path = 'data/cleaned_employment_by_industry.csv'
df.to_csv(cleaned_csv_path, index=False)