# Employment/Unemployment By Industry

### Importing Data

In [9]:
import pandas as pd
# Load the raw labour-force CSV into a DataFrame and preview its first rows.
raw_csv_path = 'data/employment-by-industry.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Labour force characteristics,North American Industry Classification System (NAICS),Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,15 years and over,Persons,249,thousands,3,v2363353,1.1.1.1.1,10491.3,,,,1
1,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,15 to 24 years,Persons,249,thousands,3,v19668072,1.1.1.1.2,2891.9,,,,1
2,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,25 to 54 years,Persons,249,thousands,3,v19668073,1.1.1.1.3,6410.7,,,,1
3,1976,Canada,2016A000011124,Labour force,"Total, all industries",Both sexes,55 years and over,Persons,249,thousands,3,v19668074,1.1.1.1.4,1188.7,,,,1
4,1976,Canada,2016A000011124,Labour force,"Total, all industries",Males,15 years and over,Persons,249,thousands,3,v2363523,1.1.1.2.1,6549.6,,,,1


### Filter Data

In [10]:
# Row filter, part 1: REF_DATE from 2000 up to (but not including) 2021,
# geography "Canada", and unit of measure "Persons".
in_year_range = (df['REF_DATE'] >= 2000) & (df['REF_DATE'] < 2021)
is_canada = df["GEO"] == "Canada"
is_person_count = df["UOM"] == "Persons"
df = df[in_year_range & is_canada & is_person_count]

# Row filter, part 2: of the labour-force characteristics, keep only the
# "Employment" and "Unemployment" rows.
wanted_characteristics = ["Employment", "Unemployment"]
df = df[df["Labour force characteristics"].isin(wanted_characteristics)]
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Labour force characteristics,North American Industry Classification System (NAICS),Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
460500,2000,Canada,2016A000011124,Employment,"Total, all industries",Both sexes,15 years and over,Persons,249,thousands,3,v2363382,1.2.1.1.1,14760.1,,,,1
460501,2000,Canada,2016A000011124,Employment,"Total, all industries",Both sexes,15 to 24 years,Persons,249,thousands,3,v19668333,1.2.1.1.2,2287.4,,,,1
460502,2000,Canada,2016A000011124,Employment,"Total, all industries",Both sexes,25 to 54 years,Persons,249,thousands,3,v19668334,1.2.1.1.3,10933.1,,,,1
460503,2000,Canada,2016A000011124,Employment,"Total, all industries",Both sexes,55 years and over,Persons,249,thousands,3,v19668335,1.2.1.1.4,1539.6,,,,1
460504,2000,Canada,2016A000011124,Employment,"Total, all industries",Males,15 years and over,Persons,249,thousands,3,v2363552,1.2.1.2.1,7971.0,,,,1


### Select Required Columns

In [11]:
# Keep only the six columns listed below, in this order; every other column of
# the source table is discarded.
required_columns = [
    "REF_DATE",
    "Labour force characteristics",
    "North American Industry Classification System (NAICS)",
    "Sex",
    "Age group",
    "VALUE",
]
df = df[required_columns]
df.head()

Unnamed: 0,REF_DATE,Labour force characteristics,North American Industry Classification System (NAICS),Sex,Age group,VALUE
460500,2000,Employment,"Total, all industries",Both sexes,15 years and over,14760.1
460501,2000,Employment,"Total, all industries",Both sexes,15 to 24 years,2287.4
460502,2000,Employment,"Total, all industries",Both sexes,25 to 54 years,10933.1
460503,2000,Employment,"Total, all industries",Both sexes,55 years and over,1539.6
460504,2000,Employment,"Total, all industries",Males,15 years and over,7971.0


### Rename Columns

In [12]:
# Shorten the verbose source headers to the labels used from here on.
# rename() returns a new frame, which is bound back to `df`; this replaces the
# previous inplace=True call so the cell no longer mutates an existing object.
column_labels = {
    'REF_DATE': 'Date',
    'North American Industry Classification System (NAICS)': 'NAICS',
    'VALUE': 'Number of People',
    'Sex': 'Gender',
}
df = df.rename(columns=column_labels)
df

Unnamed: 0,Date,Labour force characteristics,NAICS,Gender,Age group,Number of People
460500,2000,Employment,"Total, all industries",Both sexes,15 years and over,14760.1
460501,2000,Employment,"Total, all industries",Both sexes,15 to 24 years,2287.4
460502,2000,Employment,"Total, all industries",Both sexes,25 to 54 years,10933.1
460503,2000,Employment,"Total, all industries",Both sexes,55 years and over,1539.6
460504,2000,Employment,"Total, all industries",Males,15 years and over,7971.0
...,...,...,...,...,...,...
910651,2020,Unemployment,Unclassified industries,Males,55 years and over,44.0
910652,2020,Unemployment,Unclassified industries,Females,15 years and over,230.3
910653,2020,Unemployment,Unclassified industries,Females,15 to 24 years,85.1
910654,2020,Unemployment,Unclassified industries,Females,25 to 54 years,117.2


### Change "Employment" and "Unemployment" to "Employed" and "Unemployed"

In [13]:
# Swap each source label for its replacement with one masked .loc assignment per
# label ("Employment" first, then "Unemployment").
characteristic_column = "Labour force characteristics"
relabelled = {"Employment": 'Employed', "Unemployment": 'Unemployed'}
for old_label, new_label in relabelled.items():
    df.loc[df[characteristic_column] == old_label, characteristic_column] = new_label
df

Unnamed: 0,Date,Labour force characteristics,NAICS,Gender,Age group,Number of People
460500,2000,Employed,"Total, all industries",Both sexes,15 years and over,14760.1
460501,2000,Employed,"Total, all industries",Both sexes,15 to 24 years,2287.4
460502,2000,Employed,"Total, all industries",Both sexes,25 to 54 years,10933.1
460503,2000,Employed,"Total, all industries",Both sexes,55 years and over,1539.6
460504,2000,Employed,"Total, all industries",Males,15 years and over,7971.0
...,...,...,...,...,...,...
910651,2020,Unemployed,Unclassified industries,Males,55 years and over,44.0
910652,2020,Unemployed,Unclassified industries,Females,15 years and over,230.3
910653,2020,Unemployed,Unclassified industries,Females,15 to 24 years,85.1
910654,2020,Unemployed,Unclassified industries,Females,25 to 54 years,117.2


### Reset Index

In [14]:
# The row filters above left the original row labels in place, so the index no
# longer starts at 0. Replace it with a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a column. The returned frame is
# bound back to `df` rather than mutating the existing object with inplace=True.
df = df.reset_index(drop=True)
df

Unnamed: 0,Date,Labour force characteristics,NAICS,Gender,Age group,Number of People
0,2000,Employed,"Total, all industries",Both sexes,15 years and over,14760.1
1,2000,Employed,"Total, all industries",Both sexes,15 to 24 years,2287.4
2,2000,Employed,"Total, all industries",Both sexes,25 to 54 years,10933.1
3,2000,Employed,"Total, all industries",Both sexes,55 years and over,1539.6
4,2000,Employed,"Total, all industries",Males,15 years and over,7971.0
...,...,...,...,...,...,...
14359,2020,Unemployed,Unclassified industries,Males,55 years and over,44.0
14360,2020,Unemployed,Unclassified industries,Females,15 years and over,230.3
14361,2020,Unemployed,Unclassified industries,Females,15 to 24 years,85.1
14362,2020,Unemployed,Unclassified industries,Females,25 to 54 years,117.2


### Filter NAICS To Remove Aggregated Industries

In [15]:
# Keep only NAICS labels that carry a bracketed code list (e.g. "Utilities [22]");
# labels without one, such as "Total, all industries", are dropped here.
# na=False treats a missing label as "no bracket" so the mask never contains NaN,
# which boolean indexing would reject. regex=False matches the literal "[" with
# no escaping needed.
df = df[df["NAICS"].str.contains("[", regex=False, na=False)]

# These bracketed labels are roll-ups of other rows that stay in the data
# (e.g. "Wholesale and retail trade [41, 44-45]" versus "Wholesale trade [41]"
# and "Retail trade [44-45]"), so they are removed explicitly.
# NOTE(review): "Manufacturing [31-33]" is still kept although "Durables" and
# "Non-durables" carry codes inside 31-33 — confirm whether it belongs in this
# list too, otherwise sums across NAICS may count manufacturing twice.
aggregated_industries = [
    "Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]",
    "Finance, insurance, real estate, rental and leasing [52, 53]",
    "Wholesale and retail trade [41, 44-45]",
]
df = df[~df["NAICS"].isin(aggregated_industries)]

# Print the remaining industry labels, one per line, as a visual check.
for val in df["NAICS"].unique():
    print(val)



Agriculture [111-112, 1100, 1151-1152]
Forestry and logging and support activities for forestry [113, 1153]
Fishing, hunting and trapping [114]
Mining, quarrying, and oil and gas extraction [21, 2100]
Utilities [22]
Construction [23]
Manufacturing [31-33]
Durables [321, 327, 331-339]
Non-durables [311-316, 322-326]
Wholesale trade [41]
Retail trade [44-45]
Transportation and warehousing [48-49]
Finance and insurance [52]
Real estate and rental and leasing [53]
Professional, scientific and technical services [54]
Business, building and other support services [55, 56]
Educational services [61]
Health care and social assistance [62]
Information, culture and recreation [51, 71]
Accommodation and food services [72]
Other services (except public administration) [81]
Public administration [91]


### Export To CSV

In [16]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_csv_path = 'data/cleaned_employment_by_industry.csv'
df.to_csv(cleaned_csv_path, index=False)