In [136]:
import pandas as pd

# Sampling frequency; it selects the sub-directory of data/ that every file is read from
frequency = 'annual'

dataDir = f"data/{frequency}"
strikesDir = f"{dataDir}/strikes"
labourShareDir = f"{dataDir}/labour-share"

# Statistics Canada formatted strike series
strikeCountsDf = pd.read_csv(f"{strikesDir}/strike-count.csv")
strikeAverageDurationsDf = pd.read_csv(f"{strikesDir}/strike-average-duration.csv")
strikeDaysNotWorkedDf = pd.read_csv(f"{strikesDir}/strike-days-not-worked.csv")

# FRED macro series
unemploymentDf = pd.read_csv(f"{dataDir}/unemployment.csv")
realGdpDf = pd.read_csv(f"{dataDir}/real-gdp.csv")
inflationDf = pd.read_csv(f"{dataDir}/inflation.csv")
populationDf = pd.read_csv(f"{dataDir}/population.csv")

# Labour share inputs.
# Household compensation is Statistics Canada data that was edited in Excel so
# that its dates are plain years.
householdCompensationDf = pd.read_csv(f"{labourShareDir}/household-compensation.csv")

# Nominal GDP comes from FRED
nominalGdpDf = pd.read_csv(f"{labourShareDir}/nominal-gdp.csv")

In [137]:
# Cleaning data
# Convert all FRED dates to just the year
def convertToYears(date):
    """Return the year of ``date`` as an int.

    Parameters
    ----------
    date : str, date-like, or number
        Either a string whose first four non-blank characters are the year
        (e.g. ``'1961-01-01'``), an object exposing a ``year`` attribute
        (``datetime.date``, ``pandas.Timestamp``, ...), or a value that is
        already a numeric year.

    Returns
    -------
    int
        The year.

    Raises
    ------
    ValueError
        If a string's first four characters cannot be parsed as an integer.
    """
    if isinstance(date, str):
        # Strip first so stray leading whitespace cannot shift the 4-char slice.
        return int(date.strip()[:4])

    # Date-like objects carry the year directly.
    year = getattr(date, 'year', None)
    if year is not None:
        return int(year)

    # Already a numeric year. This happens when the cleaning cell is re-run
    # after DATE has been converted; slicing an int would raise TypeError.
    return int(date)

# Reduce every FRED DATE value to its year, writing the result back into each
# frame's own DATE column.
fredDfs = [
    unemploymentDf,
    realGdpDf,
    inflationDf,
    populationDf,
    nominalGdpDf,
]
for df in fredDfs:
    df['DATE'] = df['DATE'].apply(convertToYears)

# Give each table's value column a descriptive name (rename returns a new frame).

# Strike tables: the generic VALUE column becomes the measure's name.
# 'strikesDaysNotWorked' is spelled with the extra "s" on purpose: later cells
# look the column up under exactly this name.
strikeCountsDf = strikeCountsDf.rename({'VALUE': 'strikeCounts'}, axis='columns')
strikeAverageDurationsDf = strikeAverageDurationsDf.rename({'VALUE': 'strikeAverageDurations'}, axis='columns')
strikeDaysNotWorkedDf = strikeDaysNotWorkedDf.rename({'VALUE': 'strikesDaysNotWorked'}, axis='columns')

# Labour share inputs
householdCompensationDf = householdCompensationDf.rename({'VALUE': 'householdCompensation'}, axis='columns')
nominalGdpDf = nominalGdpDf.rename({'NGDPXDCCAA': 'nominalGdp'}, axis='columns')

# FRED macro series: replace the series ID with a readable name
realGdpDf = realGdpDf.rename({'NGDPRXDCCAA': 'realGdp'}, axis='columns')
unemploymentDf = unemploymentDf.rename({'LRUNTTTTCAA156S': 'unemploymentRate'}, axis='columns')
inflationDf = inflationDf.rename({'FPCPITOTLZGCAN': 'inflationRate'}, axis='columns')
populationDf = populationDf.rename({'POPTOTCAA647NWDB': 'population'}, axis='columns')


In [138]:
# Combine every table into a single frame keyed on DATE
allDfs = [
    strikeCountsDf,
    strikeAverageDurationsDf,
    strikeDaysNotWorkedDf,
    unemploymentDf,
    realGdpDf,
    inflationDf,
    populationDf,
    householdCompensationDf,
    nominalGdpDf,
]

# Start from the strike counts and join the remaining tables one at a time.
# No `how` is passed, so each step uses merge's default join type.
fullDf = allDfs[0]
for df in allDfs[1:]:
    fullDf = fullDf.merge(df, on='DATE')

# From here on the key column is called 'year'
fullDf = fullDf.rename({'DATE': 'year'}, axis='columns')

In [139]:
import numpy as np

# Derived variables, all added as columns of fullDf

# Labour share: household compensation as a fraction of nominal GDP
fullDf['labourShare'] = fullDf['householdCompensation'].div(fullDf['nominalGdp'])

# Express strike counts and days not worked per 100 000 people
# (peopleRatioForStrikes) to adjust for population size.
# Average strike duration is left as-is because it measures time.
peopleRatioForStrikes = 100000

fullDf['strikeCounts'] = fullDf['strikeCounts'].div(fullDf['population']).mul(peopleRatioForStrikes)
fullDf['strikesDaysNotWorked'] = fullDf['strikesDaysNotWorked'].div(fullDf['population']).mul(peopleRatioForStrikes)

# *Approximate* real GDP growth rate: the row-to-row difference of log real GDP
# (log differences approximate the growth rate via the Taylor series).
fullDf['realGdpGrowthRate'] = np.log(fullDf['realGdp']).diff()

# Row-to-row change in the unemployment rate
fullDf['unemploymentRateChange'] = fullDf['unemploymentRate'].diff()

# Dummy for the inflation targeting period.
# NOTE(review): the comparison is strict, so targetingStartYear itself (1991)
# is flagged False -- confirm that excluding the start year is intended.
targetingStartYear = 1991
fullDf['inflationTargetingPeriod'] = fullDf['year'].gt(targetingStartYear)

# The row labelled 0 has no preceding row, so both .diff() columns are NaN
# there; drop it (per the original note, 1962 is then the first year).
fullDf = fullDf.drop(index=0)

In [140]:
# Scale the fractional variables by 100 so they read as percentages,
# which makes the regression output easier to interpret
fullDf['labourShare'] = fullDf['labourShare'].mul(100)
fullDf['realGdpGrowthRate'] = fullDf['realGdpGrowthRate'].mul(100)

In [141]:
import statsmodels.formula.api as smf

# One OLS model per strike measure; every model uses the same right-hand side
strikeVariableNames = ['strikeCounts', 'strikesDaysNotWorked', 'strikeAverageDurations']

# Regressors shared by all models. The last term interacts the inflation rate
# with the inflation targeting dummy.
strikeRegressors = (
    "inflationRate + unemploymentRate + unemploymentRateChange + labourShare"
    " + realGdpGrowthRate + inflationTargetingPeriod"
    " + inflationRate: inflationTargetingPeriod"
)

for variableName in strikeVariableNames:
    strikesFormula = f"{variableName} ~ {strikeRegressors}"
    strikesModel = smf.ols(formula=strikesFormula, data=fullDf).fit()

    # Header line, then the full summary table, each followed by a blank line
    print(f"{variableName} Model\n")
    print(f"{strikesModel.summary()}\n")

strikeCounts Model

                            OLS Regression Results                            
Dep. Variable:           strikeCounts   R-squared:                       0.899
Model:                            OLS   Adj. R-squared:                  0.886
Method:                 Least Squares   F-statistic:                     68.64
Date:                Fri, 02 Aug 2024   Prob (F-statistic):           1.36e-24
Time:                        21:54:26   Log-Likelihood:                -34.497
No. Observations:                  62   AIC:                             84.99
Df Residuals:                      54   BIC:                             102.0
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

In [142]:
# Graphs of data