# Data Input and Processing
This Jupyter Notebook contains code that takes the raw data files from Statistics Sweden and produces the files that are used in the analysis.

## The data from Statistics Sweden:
Occupational Transitions between 2016 and 2017
Quarterly Seasonally adjusted vacancy and employment data between 2004 and 2019
Quarterly Seasonally and Calendar adjusted unemployment data between 2004 and 2019
Yearly Employment data distributed by SSYK occupational code between 2014 and 2018
Yearly averages of total weekly hours worked between 2014 and 2018

## Computerisation Probabilities from Frey & Osborne
This data was developed for the American SOC (Standard Occupational Classification) system and has to be translated to match the Swedish data
First the data is translated to ISCO using a key from https://ibs.org.pl/en/resources/occupation-classifications-crosswalks-from-onet-soc-to-isco/ which is based on work of David Autor and Daron Acemoglu: http://economics.mit.edu/faculty/dautor/data/acemoglu
Then the data is translated from ISCO to the Swedish SSYK classification using a key found on Statistics Sweden's website

In [1]:
# Required packages (check which are required)
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import networkx as nx
import pandas as pd
import numpy as np
import scipy as sp
import datetime as dt
from shapely.geometry import Polygon

import random
import math

import cmocean as cmo

# Write files

%matplotlib inline

# Occupational transitions and the Adjacency matrix

In [2]:
# Import the occupational transition counts (and their occupation-code labels) and turn them
# into a row-normalised adjacency matrix A and a directed graph G.
# The first column of the file holds the row labels; rows and columns are both labelled by
# SSYK occupation code.

data = pd.read_csv('../Data_Labour/swedish_occupation_transitions.csv', sep = ';', index_col = 0)
data.index.name = None

# Remove the grand-total ('Totalsumma') column and row
data = data.drop(columns = 'Totalsumma')
data = data.drop(index = 'Totalsumma')

# Relabel the last remaining row as 'NULL'
# NOTE(review): presumably this is the row for missing occupation codes - confirm against the raw file
last = data.index[-1]
data = data.rename(index = {last: 'NULL'})

# Drop Null and '***' rows/columns by keeping only the first 148 rows and columns
data = data.iloc[0:148, 0:148]

# ['31', '21', '11'] are actually meant to be '031', '021' and '011'.
# The 0 denotes that these are military occupations.
# The data from Frey and Osborne do not cover military occupations, which means that we cannot
# include them in our analysis
military_codes = ['31', '21', '11']
data = data.drop(index = military_codes, columns = military_codes)

# Missing cells mean "no observed transitions"
data = data.fillna(0)

# Adjacency matrix A: every row of counts divided by its row total, so each row sums to 1.
# A row whose total is 0 is kept as a row of zeros; dividing by 0 would fill it with NaN,
# and NaN entries are treated as edges when the graph is built below.
row_totals = data.sum(axis = 1)
A = data.div(row_totals.where(row_totals != 0), axis = 0).fillna(0.0)

A.index = A.index.map(str)
A.columns = A.columns.map(str)

# axis=0 counts the non-zero entries down each column (weights pointing into the node),
# axis=1 counts them along each row (weights pointing out of the node).
# The variable names are unchanged from the original notebook.
row_nonzeros = np.count_nonzero(A, axis=0)
col_nonzeros = np.count_nonzero(A, axis=1)

# Print every occupation that has exactly one non-zero entry in its column and in its row.
# The test is written with `and`: `a == b & a == 1` is parsed as `a == (b & a) == 1`
# because `&` binds tighter than `==`, which is not the intended condition.
for label, n_in_column, n_in_row in zip(A.columns, row_nonzeros, col_nonzeros):
    if n_in_column == 1 and n_in_row == 1:
        print(label)

# SSYK 323 and 622 only have self-loops and are not connected to the main component of the graph.
# As discussed in the thesis these are meant to be removed.
# NOTE(review): the removal below is commented out, so both codes are still present in the
# files written at the end of this cell.
# data.drop(labels = ['323', '622'], axis = 0, inplace = True)
# data.drop(labels = ['323', '622'], axis = 1, inplace = True)
# A.drop(labels = ['323', '622'], axis = 0, inplace = True)
# A.drop(labels = ['323', '622'], axis = 1, inplace = True)

G = nx.from_pandas_adjacency(A, create_using = nx.DiGraph)
print('The code outputs node labels which have no edges')

# Export the cleaned transition counts and the weighted directed graph
data.to_csv('../Data_Labour/Occupation_transitions.csv', sep = ',')
nx.write_graphml(G, '../Data_Labour/Occ_mob_sweden.graphml')

323
622


# Frey & Osborne Computerisation Probabilities

In [30]:
# Import the automation shock data from Frey and Osborne (keyed by SOC code) and carry it over
# to 3-digit SSYK codes through two crosswalks: soc10 -> isco08 -> ssyk12.

frey_osborne = pd.read_csv('../Data_Labour/osborne_frey_data.csv', sep = ';', index_col = 0)

# Explicit copy: the columns are edited below, and editing a selection of frey_osborne directly
# triggers pandas' SettingWithCopyWarning
SOC_shock = frey_osborne[['Probability', 'SOC code']].copy()
SOC_shock.columns = ['Computerisation Probability', 'soc10']

# Keep characters 0-1 and 3-6 of every SOC code, i.e. drop the character at position 2
# (the separator in codes written like '29-1111')
SOC_shock['soc10'] = SOC_shock['soc10'].str[0:2] + SOC_shock['soc10'].str[3:7]

# SOC -> ISCO key. The first two columns are converted to text so that the codes can be
# matched as strings in the merges below.
SOC_ISCO = pd.read_csv('../Data_Labour/soc10_isco08.csv', sep = ',')
code_columns = SOC_ISCO.columns[:2]
SOC_ISCO[code_columns] = SOC_ISCO[code_columns].astype(str)

# ISCO -> SSYK key, also as text
ISCO_SSYK = pd.read_csv('../Data_Labour/nyckel_ssyk2012_isco-08.csv', sep = ';')
ISCO_SSYK = ISCO_SSYK[['SSYK 2012 kod','ISCO-08 ']].astype(str)
ISCO_SSYK.columns = ['ssyk12', 'isco08']

# The file above contains many duplicates
ISCO_SSYK = ISCO_SSYK.drop_duplicates()

# Below transfers SOC_shock to SSYK_shock
ISCO_shock = pd.merge(SOC_shock, SOC_ISCO, on = 'soc10')
SSYK_shock = pd.merge(ISCO_shock, ISCO_SSYK, on = 'isco08')

# The codes are 4-digit but need to be 3-digit. Only the final table (SSYK_shock) is changed.
SSYK_shock['ssyk3'] = SSYK_shock['ssyk12'].str[0:3]

# All 3-digit codes that received a probability (one entry per matched row, with repeats)
SSYK3 = list(SSYK_shock['ssyk3'])

# Average the probabilities of all rows that map to the same 3-digit SSYK code.
# The result is indexed by 'ssyk3'.
SSYK3_shock = SSYK_shock[['Computerisation Probability', 'ssyk3']].groupby(['ssyk3']).mean()

# Node labels of the occupational mobility network, for comparison with SSYK3
G = nx.from_pandas_adjacency(A, create_using = nx.DiGraph)
SSYK3_fromnw = [str(node) for node in G.nodes]

SSYK3_shock.to_csv('../Data_Labour/occupation_shock.csv', sep = ',')

# Known gaps in the matching
# --------------------------
# Certain SOC codes in the Frey & Osborne file have been abbreviated with 0s,
# which makes the matching miss a few rows. This problem can be fixed.

# SOC codes that are not found in the SOC-ISCO translation file
# print(set(SOC_shock['soc10'])-set(SOC_ISCO['soc10']))
# {'292037', '292055', '499799', '291060', '394831', '319799', '292799', '251000', '253999',
# '151179', '474799', '131078', '452090', '299799', '151150', '151799', '519399', '291111'}

# Focus on '251000', '151150', '291060'
# 291060 solves 221 because 291060 does not exist in soc_isco
# 291141, 291151, 291171, 291161 <- 222
# 29-1111 is not used anymore, 29-1141 should be used instead: https://www.onetonline.org/find/quick?s=29-1111

# SSYK 231: soc_isco translates to ISCO 2310, which does not exist in the isco_ssyk key,
# where it is 231X instead.

# 251000: Post-secondary teachers is translated as 2310 SSYK

# SSYK codes not found in the ISCO-SSYK translation file
# print(list(set(SSYK3_fromnw) - set(SSYK3)))
# ['221', '21', '11', '222', '231', '31']
# ['21', '11', '31'] are military occupations and we do not have computerisation probabilities for these

# focus on ['221', '222', '231']


# Occupational employment distribution

In [33]:
# Combine the computerisation probabilities with the occupation labels and the yearly
# employment figures, and export the result.

# Occupation labels keyed by 'ssyk3'; every value is read as text
SSYK_labels = pd.read_csv('../Data_Labour/Ssyk-2012-koder.csv', sep = ';')
SSYK_labels = SSYK_labels.astype(str)

# Left merge: every row of SSYK3_shock is kept, with or without a matching label
SSYK3_shock = pd.merge(SSYK3_shock, SSYK_labels, how = 'left', on = 'ssyk3')

# Employment per occupation; the occupation column is renamed to the common key 'ssyk3'.
# NOTE(review): astype(str) also turns the employment figures themselves into text.
employment_SSYK = pd.read_csv('../Data_Labour/employment_SSYK.csv', sep = ',').astype(str)
employment_SSYK = employment_SSYK.rename(columns = {'Yrke (SSYK 2012)':'ssyk3'})
employment_SSYK.to_csv('../Data_Labour/occupational_employment.csv', sep = ',')

# Inner merge: only occupations present in both tables end up in the final file
occupational_data = pd.merge(SSYK3_shock, employment_SSYK, on = 'ssyk3')
occupational_data.to_csv('../Data_Labour/occupational_data.csv', sep = ',')

# SSYK code and years as columns

In [41]:
# Use the 3-digit SSYK code as the row index (the 'ssyk3' column itself is kept as well),
# then display the row of one occupation as a spot check
occupational_data.index = occupational_data['ssyk3']
occupational_data.loc['622']

ssyk3                                                  622
Computerisation Probability                           0.72
swedish                             Fiskodlare och fiskare
english                        Fishery workers and fishers
2014                                                   474
2015                                                   489
2016                                                   553
2017                                                   586
2018                                                   604
Name: 622, dtype: object

In [None]:
    # Data used to calculate the post shock demand
    # NOTE(review): this cell is indented like the body of a function and has never been run
    # (In [None]); it relies on `avg_hours_0` and on the node attributes 'comp_prob',
    # 'vacancies' and 'employed', none of which are set in the visible part of this notebook.
    comp_prob = nx.get_node_attributes(G, 'comp_prob')
    average_hours_worked_0 = avg_hours_0

    vacancies = nx.get_node_attributes(G, 'vacancies')
    employed = nx.get_node_attributes(G, 'employed')

    # Pre-shock demand per occupation: number of entries in its vacancy collection plus its
    # 'employed' value (so vacancies[occ] must support len() and employed[occ] must be a number)
    # e_0 = {key:val for key, val in employed.items()}
    demand_0 = {occ:len(vacancies[occ]) + employed[occ] for occ in vacancies.keys()} 

    # Calculate the post shock demand for each occupation
    # L is the total pre-shock demand summed over all occupations
    L = sum(demand_0.values())
    # Hours remaining per occupation: initial average hours * employed * (1 - comp_prob)
    final_hours_worked = {occ : average_hours_worked_0*employed[occ]*(1-prob) for occ, prob in comp_prob.items()}
    # Remaining hours spread over the total pre-shock demand (division by zero if L is 0)
    final_average_hours_worked = sum(final_hours_worked.values())/L

    # Post shock demand: remaining hours of each occupation expressed in units of the new
    # average hours, rounded to a whole number
    final_demand = {occupation: round(hours/final_average_hours_worked) for occupation, hours in final_hours_worked.items()}

# Deviations from potential GDP from Konjunktur Institutet

In [31]:
# Load the GDP gap series and derive a quarter label, a recession flag and the
# relative change of the gap from one observation to the next.
gdp_gap = pd.read_csv('../Data_Labour/bnp-gap.csv', sep = ';')

# Quarter number from the parsed date; the label is the first four characters of the
# original date string followed by 'Q' and the quarter, e.g. '2004Q1'
gdp_gap['Qtr'] = pd.to_datetime(gdp_gap['date']).dt.quarter
gdp_gap['date'] = gdp_gap['date'].str[0:4] + 'Q' + gdp_gap['Qtr'].astype(str)

gap_series = gdp_gap['BNP-gap']

# 1 when the gap is zero or negative, otherwise 0
gdp_gap['recession'] = (gap_series <= 0).astype('int64')

# (gap_t - gap_{t-1}) / gap_{t-1}; the first observation has no predecessor and gets NaN
change_ls = (gap_series.diff() / gap_series.shift()).tolist()
gdp_gap['gap_change'] = change_ls

gdp_gap = gdp_gap.rename(columns = {'BNP-gap': 'gdp_gap'})

# Data from Statistics Sweden

In [102]:
# New calibration data - seasonally adjusted
# Builds one quarterly table with unemployment, vacancies, employment and the GDP-gap
# indicators, keyed by a 'YYYYQn' date label, and writes it to calibration_data.csv.

def _quarter_labels(frame):
    """Return 'YYYYQn' labels from the 'year' (counted from 2000) and 'quarter' columns of *frame*."""
    return (2000 + frame['year']).astype(str) + 'Q' + frame['quarter'].astype(str)


def _decimal_comma_to_float(column):
    """Parse a column of decimal-comma strings (e.g. '6,34') into floats."""
    return column.str.replace(',', '.', regex = False).astype(float)


employment_sa = pd.read_csv('../Data_Labour/employment_quarterly.csv', sep = ';')
employment_sa['date'] = _quarter_labels(employment_sa)
# .copy() so the column assignments below act on an independent frame rather than on a
# selection of the raw table (avoids pandas' SettingWithCopyWarning)
employment_sa = employment_sa[['date', 'e_sa', 'e_trend']].copy()
# Scale by 1000 and store as whole numbers.
# NOTE(review): presumably the source reports employment in thousands - confirm.
# Round before casting: a float product such as 1.005*1000 is 1004.9999999999999, and a bare
# int() would truncate it to 1004.
for employment_column in ['e_sa', 'e_trend']:
    scaled = _decimal_comma_to_float(employment_sa[employment_column]) * 1000
    employment_sa[employment_column] = scaled.round().astype('int64')


unemployment_all = pd.read_csv('../Data_Labour/unemployment_quarterly.csv', sep = ';')
unemployment_all['date'] = _quarter_labels(unemployment_all)
unemployment_sa = unemployment_all[['date', 'u_sa', 'u_trend']].copy()

unemployment_sa['u_sa'] = _decimal_comma_to_float(unemployment_sa['u_sa'])
unemployment_sa['u_trend'] = _decimal_comma_to_float(unemployment_sa['u_trend'])

vac_rate_all = pd.read_csv('../Data_Labour/Vacancy Data/sa_2004-2019.csv', sep = ';')

# Inner merges on 'date': only quarters present in every source are kept
sa_calibration_data = pd.merge(unemployment_sa, vac_rate_all[['date', 'sa_vac', 'na_vac']], on ='date')
sa_calibration_data = pd.merge(sa_calibration_data, employment_sa, on = 'date')
sa_calibration_data = pd.merge(sa_calibration_data, gdp_gap[['date', 'recession', 'gap_change']], on = 'date')

# Vacancy rate in percent: vacancies as a share of trend employment plus vacancies
sa_calibration_data['sa_vac_rate'] = sa_calibration_data['sa_vac']*100/(sa_calibration_data['e_trend']+sa_calibration_data['sa_vac'])
sa_calibration_data['year'] = sa_calibration_data['date'].str[:4]

# Workforce implied by trend employment when u_trend is read as a percentage:
# e_trend = workforce * (1 - u_trend/100)
sa_calibration_data['workforce'] = sa_calibration_data['e_trend']/(1-sa_calibration_data['u_trend']/100)

sa_calibration_data.to_csv('../Data_Labour/calibration_data.csv', sep = ',')


u_trend    6.34
Name: 2018, dtype: float64

# Hours worked data (from Statistics Sweden)

In [69]:
# Combine total hours worked with trend employment to get average hours per employed
# person, per week and per year, for every calendar year.

# The raw file is wide (one column per period); drop the descriptive column and flip it
# so that every period becomes a row
hours_worked = (
    pd.read_csv('../Data_Labour/hours_worked_sa.csv', sep =',')
    .drop(columns = ['ekonomisk indikator'])
    .T
)
hours_worked.columns = ['hours/week']
hours_worked['hours/week'] = hours_worked['hours/week'].mul(1000)

# The first four characters of each period label are the year
hours_worked['year'] = hours_worked.index.str[:4]
yearly_hours_worked = hours_worked.groupby('year').mean()

# Yearly mean of trend employment from the calibration table
yearly_employment = sa_calibration_data.groupby('year')[['e_trend']].mean()

yearly_hours_employment = yearly_employment.merge(yearly_hours_worked, on = 'year')
yearly_hours_employment['average_hours/week'] = yearly_hours_employment['hours/week'].div(yearly_hours_employment['e_trend'])
yearly_hours_employment['average_hours/year'] = 52*yearly_hours_employment['average_hours/week']

yearly_hours_employment.to_csv('../Data_Labour/hours_data.csv', sep = ',')

# Processing simulation output

In [96]:
# Load the three simulation output tables (vacancies, employment, unemployment)
vac_data, emp_data, unemp_data = (
    pd.read_csv(csv_path, sep = ',')
    for csv_path in (
        '../Data_Labour/vac_simulation.csv',
        '../Data_Labour/emp_simulation.csv',
        '../Data_Labour/unemp_simulation.csv',
    )
)