In [63]:
import pandas as pd

# --- Strike series (Statistics Canada formatted data) ---
strikeCountsDf, strikeAverageDurationsDf, strikeDaysNotWorkedDf = [
    pd.read_csv(csvPath)
    for csvPath in (
        'data/strikes/strike-count.csv',
        'data/strikes/strike-average-duration.csv',
        'data/strikes/strike-days-not-worked.csv',
    )
]

# --- Macro series (FRED data) ---
unemploymentDf, realGdpDf, inflationDf, populationDf = [
    pd.read_csv(csvPath)
    for csvPath in (
        'data/unemployment.csv',
        'data/real-gdp.csv',
        'data/inflation.csv',
        'data/population.csv',
    )
]

# --- Labour share inputs ---
# Household compensation comes from Statistics Canada, but the file was edited
# in Excel so that its dates are plain years.
householdCompensationDf = pd.read_csv('data/labour-share/household-compensation.csv')

# Nominal GDP comes from FRED.
nominalGdpDf = pd.read_csv('data/labour-share/nominal-gdp.csv')

In [64]:
# Cleaning data
# Convert all FRED dates to just a year
def convertToYears(date):
    """Return the calendar year of a date value as an int.

    Parameters
    ----------
    date : str, int-like, or date-like
        - A string whose first four (non-whitespace) characters are the year,
          e.g. '1990-01-01'.
        - A value that is already a year (e.g. 1990). Accepting this makes the
          conversion safe to apply again to a column that was already
          converted, such as when the cleaning cell is re-run.
        - An object exposing a ``year`` attribute (e.g. ``datetime.date``).

    Returns
    -------
    int
        The four-digit year.

    Raises
    ------
    ValueError
        If a string does not start with a parseable integer year.
    """
    if isinstance(date, str):
        # Strip first so stray leading whitespace cannot shift the slice
        # (' 1990-01-01'[:4] would otherwise parse as 199).
        return int(date.strip()[:4])

    # Date-like objects carry the year directly.
    year = getattr(date, 'year', None)
    if year is not None:
        return int(year)

    # Already numeric (Python int, numpy integer, ...): treat as the year itself.
    return int(date)

# The frames whose 'DATE' column is collapsed to a year below.
fredDfs = [
    unemploymentDf,
    realGdpDf,
    inflationDf,
    populationDf,
    nominalGdpDf,
]

# Overwrites each frame's 'DATE' column in place with convertToYears applied
# element by element.
for df in fredDfs:
    df['DATE'] = df['DATE'].map(convertToYears)

# Give every dataset's value column a descriptive name.
# NOTE: this only renames columns; no rows or columns are dropped here.

# Strike series: the generic 'VALUE' column gets a per-dataset name
strikeCountsDf = strikeCountsDf.rename({'VALUE': 'strikeCounts'}, axis='columns')
strikeAverageDurationsDf = strikeAverageDurationsDf.rename({'VALUE': 'strikeAverageDurations'}, axis='columns')
strikeDaysNotWorkedDf = strikeDaysNotWorkedDf.rename({'VALUE': 'strikesDaysNotWorked'}, axis='columns')

# Labour share inputs
householdCompensationDf = householdCompensationDf.rename({'VALUE': 'householdCompensation'}, axis='columns')
nominalGdpDf = nominalGdpDf.rename({'NGDPXDCCAA': 'nominalGdp'}, axis='columns')

# Macro series: replace the series-ID column headers with readable names
realGdpDf = realGdpDf.rename({'NGDPRXDCCAA': 'realGdp'}, axis='columns')
unemploymentDf = unemploymentDf.rename({'LRUNTTTTCAA156S': 'unemploymentRate'}, axis='columns')
inflationDf = inflationDf.rename({'FPCPITOTLZGCAN': 'inflationRate'}, axis='columns')
populationDf = populationDf.rename({'POPTOTCAA647NWDB': 'population'}, axis='columns')


In [65]:
# Combine every dataset into a single frame keyed on 'DATE'
allDfs = [
    strikeCountsDf,
    strikeAverageDurationsDf,
    strikeDaysNotWorkedDf,
    unemploymentDf,
    realGdpDf,
    inflationDf,
    populationDf,
    householdCompensationDf,
    nominalGdpDf,
]

# Start from the first frame and fold the rest in one at a time.
# merge() defaults to an inner join, so only 'DATE' values present in every
# frame survive.
fullDf = allDfs[0]
for df in allDfs[1:]:
    fullDf = fullDf.merge(df, on='DATE')

# From here on the key column is called 'year'
fullDf = fullDf.rename(columns={'DATE': 'year'})

In [66]:
import numpy as np

# Create all variables

# Create labour share variable: household compensation divided by nominal GDP
fullDf['labourShare'] = fullDf['householdCompensation'] / fullDf['nominalGdp']

# Make strike counts and days not worked due to strikes relative to 1000 people, to adjust for population size
# Strike duration does not need to be adjusted for this, as it is measuring time
peopleRatioForStrikes = 1000

# The scaled values overwrite the original columns. To keep this cell safe to
# re-run, the unscaled totals are stashed in '<column>Raw' the first time it
# executes, and the scaling is always computed from those stashed totals.
# Without this, a second run would divide by population a second time.
for strikeColumn in ('strikeCounts', 'strikesDaysNotWorked'):
    rawColumn = strikeColumn + 'Raw'
    if rawColumn not in fullDf.columns:
        fullDf[rawColumn] = fullDf[strikeColumn]
    fullDf[strikeColumn] = (fullDf[rawColumn] / fullDf['population']) * peopleRatioForStrikes

# Create *approximated* growth rate in real GDP
# Uses log differences: log(GDP_t) - log(GDP_{t-1}) = log(1 + g) ~= g for small g
# (first-order Taylor approximation). The first row has no predecessor, so it is NaN.
# NOTE(review): diff() works on row order, so this assumes the rows are in
# ascending, consecutive-year order - TODO confirm after the merge.
fullDf['realGdpGrowthRate'] = np.log(fullDf['realGdp']).diff()