In [18]:
import pandas as pd

# Sampling frequency of the series; it selects the data/<frequency>/ sub-directory
frequency = 'annual'

# Strike series (Statistics Canada formatted): count, average duration, days not worked
strikeCountsDf, strikeAverageDurationsDf, strikeDaysNotWorkedDf = [
    pd.read_csv(csvPath)
    for csvPath in (
        f"data/{frequency}/strikes/strike-count.csv",
        f"data/{frequency}/strikes/strike-average-duration.csv",
        f"data/{frequency}/strikes/strike-days-not-worked.csv",
    )
]

# Macro series (FRED): unemployment, real GDP, inflation, population
unemploymentDf, realGdpDf, inflationDf, populationDf = [
    pd.read_csv(csvPath)
    for csvPath in (
        f"data/{frequency}/unemployment.csv",
        f"data/{frequency}/real-gdp.csv",
        f"data/{frequency}/inflation.csv",
        f"data/{frequency}/population.csv",
    )
]

# Labour share inputs:
#   - household compensation is Statistics Canada data that was edited in Excel
#     so that its dates are plain years
#   - nominal GDP is FRED data
householdCompensationDf, nominalGdpDf = [
    pd.read_csv(csvPath)
    for csvPath in (
        f"data/{frequency}/labour-share/household-compensation.csv",
        f"data/{frequency}/labour-share/nominal-gdp.csv",
    )
]

In [19]:
# Cleaning data
# Convert all FRED dates to just the year
def convertToYears(date):
    """Return the leading four-digit year of a date value as an int.

    Parameters
    ----------
    date : str or int
        A date whose first four characters are the year (e.g. '1962-01-01'),
        or a year that has already been converted (e.g. 1962).

    Returns
    -------
    int
        The year taken from the first four characters.

    Raises
    ------
    ValueError
        If the first four characters cannot be parsed as an integer.
    """
    # Go through str() so the function also accepts years that were already
    # converted: re-running the cleaning cell passes ints here, and slicing an
    # int directly would raise TypeError.
    return int(str(date)[:4])

# Frames whose DATE column is reduced to a year, in place, via convertToYears
fredDfs = [unemploymentDf, realGdpDf, inflationDf, populationDf, nominalGdpDf]
for df in fredDfs:
    df['DATE'] = df['DATE'].apply(convertToYears)

# Give every data column a descriptive name. rename() returns a new frame,
# so each variable is rebound to its renamed copy.

# Frames whose data column is called VALUE
(
    strikeCountsDf,
    strikeAverageDurationsDf,
    strikeDaysNotWorkedDf,
    householdCompensationDf,
) = [
    frame.rename(columns={'VALUE': newName})
    for frame, newName in (
        (strikeCountsDf, 'strikeCounts'),
        (strikeAverageDurationsDf, 'strikeAverageDurations'),
        (strikeDaysNotWorkedDf, 'strikesDaysNotWorked'),
        (householdCompensationDf, 'householdCompensation'),
    )
]

# Frames whose data column is named after its series identifier
nominalGdpDf, realGdpDf, unemploymentDf, inflationDf, populationDf = [
    frame.rename(columns=columnNames)
    for frame, columnNames in (
        (nominalGdpDf, {'NGDPXDCCAA': 'nominalGdp'}),
        (realGdpDf, {'NGDPRXDCCAA': 'realGdp'}),
        (unemploymentDf, {'LRUNTTTTCAA156S': 'unemploymentRate'}),
        (inflationDf, {'FPCPITOTLZGCAN': 'inflationRate'}),
        (populationDf, {'POPTOTCAA647NWDB': 'population'}),
    )
]


In [20]:
# Combine every dataset into a single frame, joined on the shared DATE column
allDfs = [
    strikeCountsDf,
    strikeAverageDurationsDf,
    strikeDaysNotWorkedDf,
    unemploymentDf,
    realGdpDf,
    inflationDf,
    populationDf,
    householdCompensationDf,
    nominalGdpDf,
]

# Start from the first dataset and join the remaining ones onto it in list order.
# merge() is left at its default join type, so only DATE values present in
# every dataset remain in the result.
fullDf = allDfs[0]
for df in allDfs[1:]:
    fullDf = fullDf.merge(df, on='DATE')

# Rename the join key to 'year'
fullDf = fullDf.rename(columns={'DATE': 'year'})

In [21]:
import numpy as np

# Create all derived variables

# Strike counts and days not worked are expressed per 1 000 000 people to adjust
# for population size. Strike duration is left as is because it measures time.
peopleRatioForStrikes = 1000000

# Year used as the threshold for the inflation targeting dummy
targetingStartYear = 1991

fullDf = (
    fullDf
    .assign(
        # Labour share: household compensation divided by nominal GDP
        labourShare=lambda frame: frame['householdCompensation'] / frame['nominalGdp'],
        # Population-adjusted strike measures (these overwrite the raw columns)
        strikeCounts=lambda frame: (frame['strikeCounts'] / frame['population']) * peopleRatioForStrikes,
        strikesDaysNotWorked=lambda frame: (frame['strikesDaysNotWorked'] / frame['population']) * peopleRatioForStrikes,
        # *Approximated* growth rate in real GDP: the first difference of log real GDP
        # (log differences approximate the growth rate via the Taylor series)
        realGdpGrowthRate=lambda frame: np.log(frame['realGdp']).diff(),
    )
    # The first row has no previous year to difference against, so its growth
    # rate is missing; drop it (1962 becomes the first year)
    .drop(index=[0])
    # Dummy variable for the inflation targeting period.
    # NOTE(review): the comparison is strict, so targetingStartYear itself is
    # flagged False - confirm that excluding the start year is intended.
    .assign(inflationTargetingPeriod=lambda frame: frame['year'] > targetingStartYear)
)

print(fullDf)

    year  strikeCounts  strikeAverageDurations  strikesDaysNotWorked  \
1   1962     16.707854                    18.3          76144.837219   
2   1963     17.506855                    19.7          48354.250158   
3   1964     17.749030                    21.9          81764.036223   
4   1965     25.510723                    20.0         116937.188739   
5   1966     30.776137                    18.5         258381.384677   
..   ...           ...                     ...                   ...   
58  2019      3.402582                    33.2          32243.129344   
59  2020      1.735534                    44.5          38170.075931   
60  2021      4.864034                    23.4          34614.061389   
61  2022      4.519884                    57.6          48698.201621   
62  2023     19.402580                    24.5         164214.106618   

    unemploymentRate       realGdp  inflationRate  population  \
1           5.958333  4.046531e+05       1.061571    18614000   
2    