In [66]:
import pandas as pd 

import numpy as np 
import matplotlib.pyplot as plt

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from pandas.api import types

from linearmodels import PanelOLS
import causallib as cl
from causallib.estimation import IPW

from sklearn.linear_model import LogisticRegression, LinearRegression

from econml.dml import DML
from econml.dr import LinearDRLearner # Example of a direct DR learner

In [67]:
# Read the training/productivity panel and index it by (worker_id, period),
# the entity/time MultiIndex that the PanelOLS fits below operate on.
data = pd.read_csv("src/did_training_productivity.csv").set_index(
    ["worker_id", "period"]
)

Estimate treatment effects separately by:

• Technical vs non-technical workers

• Gender

• Mother’s education level

In [68]:
# Split the panel on the binary `technical` flag; .copy() makes each subset
# an independent frame rather than a view of `data`.
technical_workers = data.loc[data["technical"].eq(1)].copy()
non_technical_workers = data.loc[data["technical"].eq(0)].copy()

In [69]:
# Technical workers: outcome and the single DiD regressor (post x treat_group).
y_technical = technical_workers["productivity"]
X_technical = technical_workers["post"].mul(technical_workers["treat_group"])

# Non-technical workers: same construction.
y_nontechnical = non_technical_workers["productivity"]
X_nontechnical = non_technical_workers["post"].mul(non_technical_workers["treat_group"])


In [70]:
# Two-way fixed effects (entity + time) on the technical subsample, with
# standard errors clustered by entity.
model_technical = PanelOLS(
    y_technical,
    X_technical,
    entity_effects=True,
    time_effects=True,
).fit(cov_type="clustered", cluster_entity=True)

In [71]:
# Show the fitted PanelOLS results for the technical-worker subsample.
model_technical

0,1,2,3
Dep. Variable:,productivity,R-squared:,0.1969
Estimator:,PanelOLS,R-squared (Between):,0.0528
No. Observations:,14664,R-squared (Within):,0.3358
Date:,"Thu, Apr 17 2025",R-squared (Overall):,0.0564
Time:,19:47:03,Log-likelihood,-5.532e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,3440.6
Entities:,611,P-value,0.0000
Avg Obs:,24.000,Distribution:,"F(1,14029)"
Min Obs:,24.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
0.0,22.080,0.3928,56.212,0.0000,21.310,22.849


In [72]:
# Two-way fixed effects (entity + time) on the non-technical subsample, with
# standard errors clustered by entity.
model_nontechnical = PanelOLS(
    y_nontechnical,
    X_nontechnical,
    entity_effects=True,
    time_effects=True,
).fit(cov_type="clustered", cluster_entity=True)

In [73]:
# Show the fitted PanelOLS results for the non-technical-worker subsample.
model_nontechnical

0,1,2,3
Dep. Variable:,productivity,R-squared:,0.1881
Estimator:,PanelOLS,R-squared (Between):,0.0479
No. Observations:,33336,R-squared (Within):,0.3055
Date:,"Thu, Apr 17 2025",R-squared (Overall):,0.0514
Time:,19:47:05,Log-likelihood,-1.259e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,7395.5
Entities:,1389,P-value,0.0000
Avg Obs:,24.000,Distribution:,"F(1,31923)"
Min Obs:,24.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
0.0,22.561,0.2661,84.792,0.0000,22.039,23.083


In [74]:
# Split the panel on the binary `female` flag, then build the outcome and the
# single DiD regressor (post x treat_group) for each subsample.
female = data.loc[data["female"].eq(1)].copy()
y_female = female["productivity"]
X_female = female["post"].mul(female["treat_group"])

male = data.loc[data["female"].eq(0)].copy()
y_male = male["productivity"]
X_male = male["post"].mul(male["treat_group"])


In [75]:
# Two-way fixed effects (entity + time) on the female subsample, with
# standard errors clustered by entity.
model_female = PanelOLS(
    y_female,
    X_female,
    entity_effects=True,
    time_effects=True,
).fit(cov_type="clustered", cluster_entity=True)

In [76]:
# Show the fitted PanelOLS results for the female subsample.
model_female

0,1,2,3
Dep. Variable:,productivity,R-squared:,0.1986
Estimator:,PanelOLS,R-squared (Between):,0.0537
No. Observations:,21096,R-squared (Within):,0.3305
Date:,"Thu, Apr 17 2025",R-squared (Overall):,0.0576
Time:,19:47:09,Log-likelihood,-7.959e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,5004.7
Entities:,879,P-value,0.0000
Avg Obs:,24.000,Distribution:,"F(1,20193)"
Min Obs:,24.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
0.0,22.493,0.3204,70.200,0.0000,21.865,23.121


In [77]:
# Two-way fixed effects (entity + time) on the male subsample, with
# standard errors clustered by entity.
model_male = PanelOLS(
    y_male,
    X_male,
    entity_effects=True,
    time_effects=True,
).fit(cov_type="clustered", cluster_entity=True)

In [78]:
# Show the fitted PanelOLS results for the male subsample.
model_male

0,1,2,3
Dep. Variable:,productivity,R-squared:,0.1849
Estimator:,PanelOLS,R-squared (Between):,0.0465
No. Observations:,26904,R-squared (Within):,0.3026
Date:,"Thu, Apr 17 2025",R-squared (Overall):,0.0498
Time:,19:47:12,Log-likelihood,-1.016e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,5844.1
Entities:,1121,P-value,0.0000
Avg Obs:,24.000,Distribution:,"F(1,25759)"
Min Obs:,24.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
0.0,22.325,0.3049,73.210,0.0000,21.728,22.923


In [79]:
# List the distinct mother's-education categories present in the panel.
data["mothers_educ"].unique()

array(['Graduate', 'College', '<HS', 'HS'], dtype=object)

In [80]:
# One independent sub-frame per mother's-education category.
# NOTE(review): the last two names are swapped relative to their contents —
# `HS` is filtered on "<HS" and `lessHS` is filtered on "HS". Downstream code
# that pairs these frames with labels by position depends on this exact
# assignment, so rename only together with every use site.
Graduate = data[data["mothers_educ"] == "Graduate"].copy()
College = data[data["mothers_educ"] == "College"].copy()
HS = data[data["mothers_educ"] == "<HS"].copy()
lessHS = data[data["mothers_educ"] == "HS"].copy()

In [82]:
# Frames and labels are paired by position. The frame bound to `HS` was built
# from the "<HS" rows and `lessHS` from the "HS" rows, so the third entry is
# "<HS" and the fourth is "HS" — the labels below reflect the actual contents.
mother_educ_dfs = [Graduate, College, HS, lessHS]
labels = ['Graduate', 'College', '<HS', 'HS']

# Maps each education label to the estimated post x treat_group coefficient.
coefficients = {}

for label, df in zip(labels, mother_educ_dfs):
    y = df["productivity"]

    # Single DiD regressor for this subsample.
    X = df["post"] * df["treat_group"]

    model = PanelOLS(y, X, entity_effects=True, time_effects=True).fit(
        cov_type="clustered", cluster_entity=True
    )

    # The model has exactly one regressor; select it explicitly by position.
    # `params[0]` would rely on how the unnamed regressor happens to be
    # labelled (integer `[]` on a non-integer index is a deprecated positional
    # fallback in pandas), whereas `.iloc[0]` is unambiguous.
    coefficients[label] = model.params.iloc[0]


In [83]:
coefficients

{'Graduate': 22.9042929415321,
 'College': 21.92490282031732,
 '<HS': 22.701607334959373,
 'HS': 22.509884336551867}

### Comparison Regressions

Test whether these differences are statistically significant using interaction terms:

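$$Y_{it} = \alpha_i + \delta_t + \theta_1(W_i \times Post_t) + \theta_2(W_i \times Post_t \times Covariate_i) + \epsilon_{it}$$

Here $W_i$ is the treatment-group indicator (`treat_group`), $Post_t$ is the post-period indicator (`post`), $\alpha_i$ and $\delta_t$ are worker and period fixed effects, and $\theta_2$ measures how the treatment effect differs with the covariate.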

In [90]:
def estimate_differences(df, covariate):
    """Estimate how the post x treat_group effect on productivity varies with a covariate.

    Fits a PanelOLS with entity and time effects of ``productivity`` on two
    regressors, with standard errors clustered by entity:

    * ``simple`` = ``post * treat_group``
    * ``long``   = ``post * treat_group * covariate``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``productivity``, ``post``, ``treat_group``
        and ``covariate``. Presumably indexed by (entity, time), as PanelOLS
        is applied to it directly — confirm against the caller.
    covariate : str
        Name of the column interacted with ``post * treat_group``.

    Returns
    -------
    tuple
        ``(coeff, p_value)`` of the ``long`` (triple-interaction) term.
    """
    outcome = df["productivity"]

    # Build both regressors from the shared post x treat_group product.
    did_term = df["post"] * df["treat_group"]
    regressors = pd.DataFrame(
        {
            "simple": did_term,
            "long": did_term * df[covariate],
        }
    )

    fitted = PanelOLS(
        outcome,
        regressors,
        entity_effects=True,
        time_effects=True,
    ).fit(cov_type="clustered", cluster_entity=True)

    return (fitted.params["long"], fitted.pvalues["long"])

In [88]:
# Inspect the available column names before choosing covariates.
data.columns

Index(['female', 'mothers_educ', 'technical', 'immigrant', 'base_productivity',
       'treat_group', 'tenure', 'prev_performance', 'team_size', 'manager_exp',
       'post', 'treated', 'treat_hours', 'productivity', 'log_productivity'],
      dtype='object')

In [89]:
# Encode mother's education as an ordinal score (0 = lowest, 3 = highest) so
# it can be multiplied into the interaction term.
ordinal_map = {'<HS': 0, 'HS': 1, 'College': 2, 'Graduate': 3}

# Guard makes the cell idempotent: `.map` on an already-encoded column would
# find no matching keys and silently turn every value into NaN on a re-run.
if not pd.api.types.is_numeric_dtype(data["mothers_educ"]):
    # Fail loudly on categories missing from the map instead of letting
    # `.map` convert them to NaN.
    unmapped = set(data["mothers_educ"].dropna().unique()) - set(ordinal_map)
    if unmapped:
        raise ValueError(f"Unmapped mothers_educ categories: {sorted(unmapped)}")
    data["mothers_educ"] = data["mothers_educ"].map(ordinal_map)

In [91]:
# Run the interaction regression once per covariate; each result is a
# (coefficient, p-value) tuple for the triple-interaction term.
technical, gender, mothers_edu = (
    estimate_differences(data, column)
    for column in ("technical", "female", "mothers_educ")
)

In [92]:
# Report the triple-interaction coefficient and p-value for each covariate.
# The list is deliberately not called `vars`, which would shadow the builtin
# `vars()` for the rest of the notebook session.
interaction_results = [technical, gender, mothers_edu]
var_names = ["technical", "gender", "mothers_edu"]
for name, (coeff, p_value) in zip(var_names, interaction_results):
    print(f"The coefficient for the {name} regression is {coeff}, with a p-value of {p_value}")

The coefficient for the technical regression is -0.22940760829183868, with a p-value of 0.5713396345302115
The coefficient for the gender regression is 0.3178480784836144, with a p-value of 0.39780173064113056
The coefficient for the mothers_edu regression is -0.0716040567630537, with a p-value of 0.7288846103535906
