In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from linearmodels.panel import PanelOLS
import statsmodels.api as sm

In [2]:
df = pd.read_excel('Data Finale.xlsx')
df.head()

Unnamed: 0,Country,Year,Real Minimum Wage,Monthly Minimum Wage,CollectiveBargain_Coverage,TradeUnions_Density,Real Average Annual Wage Growth,Annual Inflation CPI,Gini Index,Employment/population ratio Women age 15 to 64,...,Current health expenditure (% of GDP),Exports of goods and services (annual % growth),"Foreign direct investment, net inflows (% of GDP)","Foreign direct investment, net outflows (% of GDP)",Imports of goods and services (% of GDP),"Labor force, total",Labor tax and contributions (% of commercial profits),"Part time employment, total (% of total employment)",GDP (current US$),Treatment
0,Austria,1990,0.0,0.0,98.0,46.799999,,3.261872,,,...,,8.57222,0.392278,1.021846,35.790955,,,,166463400000.0,0
1,Austria,1991,0.0,0.0,98.0,45.5,3.125859,3.337426,,,...,,2.937296,0.207142,0.743983,35.457917,3569407.0,,,173794200000.0,0
2,Austria,1992,0.0,0.0,98.0,44.5,2.014635,4.020847,,,...,,1.319386,0.761499,0.902222,34.4749,3652067.0,,,195078100000.0,0
3,Austria,1993,0.0,0.0,98.0,44.0,0.815329,3.631786,,,...,,-2.373161,0.59098,0.619051,31.801153,3684501.0,,,190379700000.0,0
4,Austria,1994,0.0,0.0,98.0,41.5,1.007529,2.953407,30.8,,...,,5.688089,1.019884,0.60973,33.828559,3851283.0,,,203535200000.0,0


In [4]:
data = df.copy()

In [None]:
plt.figure(figsize=(10, 6))
sns.lineplot(x='Year', y='CollectiveBargain_Coverage', hue='Country', data=data)
plt.title('Collective Bargaining over Years for Different Countries')
plt.grid()
plt.show()

## Handling missing data

In [5]:
df.sort_values(by=['Country', 'Year'], inplace=True)


In [6]:
variables = [
    "CollectiveBargain_Coverage",
    "TradeUnions_Density",
    "Real Average Annual Wage Growth",
    "Gini Index",
    "Labour force participation rate (Women age 15 to 64)",
    "Unemployment rate (Women age 15 to 64)",
    "Central government debt, total (% of GDP)",
    "Children out of school, primary",
    "Current health expenditure (% of GDP)",
    "Foreign direct investment, net inflows (% of GDP)",
    "Foreign direct investment, net outflows (% of GDP)",
    "Labor force, total",
    "Labor tax and contributions (% of commercial profits)",
    "Part time employment, total (% of total employment)",
]

for variable in variables:
    # Forward Fill within each country
    df[variable] = df.groupby('Country')[variable].ffill()
    # Backward Fill within each country
    df[variable] = df.groupby('Country')[variable].bfill()
    # Interpolation within each country
    df[variable] = df.groupby('Country')[variable].transform(lambda group: group.interpolate())
    # Fill with mean within each country (only for variables where this makes sense)
    df[variable] = df.groupby('Country')[variable].transform(lambda group: group.fillna(group.mean()))

In [7]:
# round all the values to 2 decimal places except treatment and control
cols = [col for col in df.columns if col not in ['treatment', 'control']]
df[cols] = df[cols].round(2)

In [9]:
df.to_excel('Data Finale_balanced.xlsx', index=False)


# Confounder handling

In [24]:
economic_confounders = [
    'Real Average Annual Wage Growth',
    'Annual Inflation CPI',
    'Exports of goods and services (annual % growth)',
    'Foreign direct investment, net inflows (% of GDP)',
    'Foreign direct investment, net outflows (% of GDP)',
    'Imports of goods and services (% of GDP)',
]
labor_confounders = [
    'Employment/population ratio Women age 15 to 64',
    'Labour force participation rate (Women age 15 to 64)',
    'Unemployment rate (Women age 15 to 64)',
    'Labor force, total',
    'Labor tax and contributions (% of commercial profits)',
    'Part time employment, total (% of total employment)'
]
density_confounders = [
        'Trade Unions Density'
]        


# Regression

In [33]:
df.reset_index(inplace=True)
df = df.set_index(['Country', 'Year'])
y = df['Gini Index']

In [35]:
X = df[['CollectiveBargain_Coverage']]
X = sm.add_constant(X)
model = PanelOLS(y, X, entity_effects=True)
fe_res = model.fit()

print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:             Gini Index   R-squared:                        0.0185
Estimator:                   PanelOLS   R-squared (Between):             -0.0524
No. Observations:                 429   R-squared (Within):               0.0185
Date:                Sun, Mar 03 2024   R-squared (Overall):             -0.0396
Time:                        12:58:43   Log-likelihood                   -808.79
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      7.8244
Entities:                          13   P-value                           0.0054
Avg Obs:                       33.000   Distribution:                   F(1,415)
Min Obs:                       33.000                                           
Max Obs:                       33.000   F-statistic (robust):             7.8244
                            

In [36]:
# Second regression with confounders1 and confounders2
X = df[['CollectiveBargain_Coverage'] + economic_confounders + labor_confounders]
X = sm.add_constant(X)

model = PanelOLS(y, X, entity_effects=True)
fe_res = model.fit()

print(fe_res)

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


ValueError: exog does not have full column rank. If you wish to proceed with model estimation irrespective of the numerical accuracy of coefficient estimates, you can set check_rank=False.