In [1]:
from ValidMLInference import ols, ols_bcm, one_step, load_dataset, one_step_gaussian_mixture
import pandas as pd
import numpy as np
from scipy import stats
import math

### The Dataset

The package contains a toy slice of a larger dataset regarding work from home, with the following entries:
1. `city_name` 
2. `naics_2022_2` - an industry code 
3. `id` 
4. `salary` 
5. `wfh_wham` 
6. `soc_2021_2` 
7. `employment_type_name` 

In [2]:
# Load the toy work-from-home dataset bundled with the package and
# preview its first five rows (the default for `head`).
df = load_dataset()
df.head()

Unnamed: 0.1,Unnamed: 0,city_name,naics_2022_2,id,salary,wfh_wham,soc_2021_2,employment_type_name
0,1,"San Diego, CA",72,002e22ebe1b837ac6b0cebcbb720613138765f51,57500.0,0,11-0000,Full-time (> 32 hours)
1,2,"San Diego, CA",72,00442454060b60c1c0ad4ed78bc29111935f400b,31200.0,0,35-0000,Full-time (> 32 hours)
2,3,"San Diego, CA",72,007a1c1a527ed15006705379cec780aaae4930af,33280.0,0,35-0000,Part-time / full-time
3,4,"San Diego, CA",72,00991b69215b1cc14c08c4cdfa1b10bbbdf6ceba,40560.0,0,35-0000,Full-time (> 32 hours)
4,5,"San Diego, CA",72,00edf6dc0abb731a0befa73f6748ff3f5ce842f4,45760.0,0,11-0000,Full-time (> 32 hours)


We take the log of salary and filter the dataset to keep:
* industry 72 for San Diego
* industry 51 for Austin and San Francisco 

In [3]:
def subset_with_log_salary(frame, city, industry):
    """Return the rows of `frame` for one city and industry, with log salary.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns `city_name`, `naics_2022_2` and `salary`.
    city : str
        Value of `city_name` to keep.
    industry : int
        Value of `naics_2022_2` to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) whose `salary` column
        holds the natural log of the original salary.
    """
    mask = (frame['city_name'] == city) & (frame['naics_2022_2'] == industry)
    rows = frame.loc[mask]
    # `.assign` builds a fresh frame instead of writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning and keeps the cell idempotent.
    return rows.assign(salary=np.log(rows['salary']))


# Industry 72 for San Diego; industry 51 for San Francisco and Austin.
SD_data = subset_with_log_salary(df, 'San Diego, CA', 72)
SF_data = subset_with_log_salary(df, 'San Francisco, CA', 51)
Austin_data = subset_with_log_salary(df, 'Austin, TX', 51)

## Error Correction
The variable `wfh_wham` describes whether the job posting implies that the job can be performed from home. This variable is not manually collected; it is imputed via ML methods. **Based on a sample of size 1000, we know the false positive rate (fpr) to be 0.009**

# Coefficient estimates for San Diego

We will estimate 6 coefficients in total, reproducing Table 1 from [Battaglia et al.](https://arxiv.org/abs/2402.15585) -- we will show the results for the usual two-step strategy, BCM correction, and the one-step strategy twice: once controlling for occupation fixed effects and once not. 

### Estimates without accounting for fixed effects
This section presents simple regression coefficients for models estimated without accounting for the `soc_2021_2` and `employment_type_name` variables.

In [4]:
# Uncorrected benchmark: plain OLS of log salary on the ML-imputed
# `wfh_wham` indicator, with an intercept and no fixed effects.
res = ols(
    formula="salary ~ wfh_wham",
    data=SD_data,
    intercept=True,
)
display(res.summary())

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
Intercept,10.655967,0.002589,4115.094402,0.0,10.650891,10.661042
wfh_wham,0.648514,0.024911,26.033376,0.0,0.59969,0.697339


In [5]:
# BCM-corrected OLS via the formula interface, using the false positive
# rate (0.009) measured on the labelled sample of size m = 1000.
res = ols_bcm(
    formula="salary ~ wfh_wham",
    data=SD_data,
    fpr=0.009,
    m=1000,
    intercept=True,
)
res.summary()

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
wfh_wham,1.052442,0.140035,7.515553,5.662137e-14,0.777978,1.326906
Intercept,10.646261,0.004174,2550.611816,0.0,10.63808,10.654442


In [6]:
# Same BCM correction, but passing the outcome and the imputed regressor
# as columns rather than through a formula; the regressor is then
# reported under the generic label `x1`.
res = ols_bcm(
    Y=SD_data['salary'],
    Xhat=SD_data['wfh_wham'],
    data=SD_data,
    fpr=0.009,
    m=1000,
    intercept=True,
)
res.summary()

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
Intercept,10.646261,0.004174,2550.611816,0.0,10.63808,10.654442
x1,1.052442,0.140035,7.515553,5.662137e-14,0.777978,1.326906


In [7]:
# One-step estimator, called with the outcome and imputed regressor
# passed directly as columns (no formula).
res = one_step(
    Y=SD_data['salary'],
    Xhat=SD_data['wfh_wham'],
    intercept=True,
)
display(res.summary())

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
Intercept,10.526436,0.001539,6840.255371,0.0,10.52342,10.529452
x1,0.486144,0.008413,57.781845,0.0,0.469654,0.502634


In [8]:
# One-step estimator again, this time through the formula interface so
# the coefficient is labelled with the column name.
res = one_step(
    formula='salary ~ wfh_wham',
    data=SD_data,
    intercept=True,
)
display(res.summary())

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
wfh_wham,0.486144,0.008413,57.781769,0.0,0.469654,0.502634
Intercept,10.526436,0.001539,6840.253906,0.0,10.52342,10.529452


### Estimates accounting for fixed effects

The formula interface of the package's functions allows us to easily account for the categorical variables `soc_2021_2` and `employment_type_name`.

In [9]:
# Uncorrected OLS with occupation (`soc_2021_2`) and employment-type
# categorical controls entered through `C(...)` terms.
res = ols(
    formula="salary ~ wfh_wham + C(soc_2021_2) + C(employment_type_name)",
    data=SD_data,
    intercept=True,
)
display(res.summary())

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
Intercept,11.08826,0.009039,1226.680057,0.0,11.070543,11.105977
C(soc_2021_2)[T.13-0000],-0.143451,0.021168,-6.776768,1.228928e-11,-0.184939,-0.101962
C(soc_2021_2)[T.15-0000],0.196962,0.036147,5.448865,5.069241e-08,0.126114,0.267809
C(soc_2021_2)[T.17-0000],-0.046056,0.044523,-1.034415,0.3009421,-0.13332,0.041209
C(soc_2021_2)[T.19-0000],-0.053094,0.071982,-0.737597,0.4607595,-0.194176,0.087988
C(soc_2021_2)[T.21-0000],-0.30435,0.040091,-7.591548,3.153033e-14,-0.382926,-0.225774
C(soc_2021_2)[T.23-0000],-0.061614,0.297594,-0.20704,0.8359784,-0.644888,0.52166
C(soc_2021_2)[T.25-0000],-0.224073,0.035299,-6.347885,2.182945e-10,-0.293258,-0.154889
C(soc_2021_2)[T.27-0000],-0.251113,0.040512,-6.198417,5.703396e-10,-0.330515,-0.17171
C(soc_2021_2)[T.29-0000],0.117454,0.063808,1.840732,0.06566085,-0.007608,0.242516


In [10]:
# BCM-corrected OLS with the same categorical controls; `target_variable`
# names the ML-imputed regressor in the formula.
res = ols_bcm(
    formula="salary ~ wfh_wham + C(soc_2021_2) + C(employment_type_name)",
    target_variable="wfh_wham",
    data=SD_data,
    fpr=0.009,
    m=1000,
)
display(res.summary())

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
wfh_wham,0.641276,0.099605,6.43821,1.208906e-10,0.446054,0.836497
Intercept,11.074074,0.010335,1071.556274,0.0,11.053818,11.094329
C(soc_2021_2)[T.13-0000],-0.183179,0.025246,-7.25575,3.994582e-13,-0.23266,-0.133698
C(soc_2021_2)[T.15-0000],0.102763,0.049093,2.093238,0.03632788,0.006543,0.198983
C(soc_2021_2)[T.17-0000],-0.057574,0.044722,-1.287387,0.1979594,-0.145227,0.030079
C(soc_2021_2)[T.19-0000],-0.104802,0.074381,-1.408983,0.1588402,-0.250586,0.040983
C(soc_2021_2)[T.21-0000],-0.320699,0.040431,-7.932071,2.220446e-15,-0.399941,-0.241456
C(soc_2021_2)[T.23-0000],-0.126672,0.297895,-0.425224,0.6706731,-0.710536,0.457191
C(soc_2021_2)[T.25-0000],-0.22896,0.03534,-6.478743,9.249024e-11,-0.298226,-0.159695
C(soc_2021_2)[T.27-0000],-0.273719,0.041304,-6.626945,3.427059e-11,-0.354673,-0.192765


In [11]:
# One-step estimator with the same categorical controls; `treatment_var`
# names the ML-imputed regressor in the formula.
res = one_step(
    formula="salary ~ wfh_wham + C(soc_2021_2) + C(employment_type_name)",
    data=SD_data,
    treatment_var="wfh_wham",
)
display(res.summary())

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
wfh_wham,0.290135,0.010862,26.711267,0.0,0.268846,0.311424
Intercept,10.946439,0.011201,977.31543,0.0,10.924486,10.968391
C(soc_2021_2)[T.13-0000],-0.148209,0.015713,-9.432081,0.0,-0.179007,-0.117412
C(soc_2021_2)[T.15-0000],0.169538,0.035844,4.729833,2.24705e-06,0.099284,0.239792
C(soc_2021_2)[T.17-0000],-0.146164,0.023462,-6.229773,4.671115e-10,-0.192149,-0.100179
C(soc_2021_2)[T.19-0000],-0.227685,0.038526,-5.909871,3.423755e-09,-0.303194,-0.152175
C(soc_2021_2)[T.21-0000],-0.263549,0.025164,-10.47325,0.0,-0.31287,-0.214228
C(soc_2021_2)[T.23-0000],-0.087676,0.084325,-1.039747,0.2984572,-0.25295,0.077597
C(soc_2021_2)[T.25-0000],-0.265532,0.036623,-7.250412,4.154455e-13,-0.337312,-0.193752
C(soc_2021_2)[T.27-0000],-0.29102,0.028751,-10.121975,0.0,-0.347371,-0.234668


In [None]:
# One-step estimator with a Gaussian-mixture specification and the same
# categorical controls as the cells above.
# Fix: the formula previously ended in `C(employment_type_name` without its
# closing parenthesis, leaving the formula string unbalanced.
# NOTE(review): `k` is presumably the number of mixture components, and
# `maxiter` / `nguess` presumably optimiser settings — confirm against the
# package documentation.
res = one_step_gaussian_mixture(
    formula="salary ~ wfh_wham + C(soc_2021_2) + C(employment_type_name)",
    data=SD_data,
    k=3,
    maxiter=150,
    nguess=30,
)
res.summary()

Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
wfh_wham,0.268864,0.012261,21.928057,0.0,0.244832,0.292895
Intercept,10.96936,0.007296,1503.496582,0.0,10.955061,10.98366
C(employment_type_name)[T.Part-time (≤ 32 hours)],-0.047279,0.002675,-17.671429,0.0,-0.052523,-0.042035
C(employment_type_name)[T.Part-time / full-time],-0.050777,0.002686,-18.904667,0.0,-0.056042,-0.045513
C(soc_2021_2)[T.13-0000],-0.152849,0.013216,-11.565246,0.0,-0.178753,-0.126946
C(soc_2021_2)[T.15-0000],0.195005,0.033479,5.82479,5.71843e-09,0.129389,0.260622
C(soc_2021_2)[T.17-0000],-0.108683,0.019523,-5.566925,2.592738e-08,-0.146947,-0.070418
C(soc_2021_2)[T.19-0000],-0.176315,0.040227,-4.383048,1.170303e-05,-0.255157,-0.097472
C(soc_2021_2)[T.21-0000],-0.284183,0.024749,-11.482588,0.0,-0.33269,-0.235676
C(soc_2021_2)[T.23-0000],0.131732,0.085691,1.537288,0.1242228,-0.03622,0.299683


Unnamed: 0,Estimate,Std. Error,z value,P>|z|,2.5%,97.5%
x1,5.491232,0.003874,1417.487915,0.0,5.483639,5.498825
x2,5.483846,0.003854,1422.927612,0.0,5.476293,5.4914
x3,-0.046814,0.002653,-17.643869,0.0,-0.052014,-0.041613
x4,-0.049911,0.002664,-18.736954,0.0,-0.055132,-0.04469
x5,-0.162013,0.013585,-11.925467,0.0,-0.18864,-0.135386
x6,0.198245,0.031297,6.334368,2.383165e-10,0.136904,0.259585
x7,-0.115059,0.018726,-6.144433,8.024965e-10,-0.151761,-0.078357
x8,-0.172348,0.048131,-3.580796,0.0003425493,-0.266683,-0.078012
x9,-0.290226,0.023263,-12.475692,0.0,-0.335822,-0.244631
x10,-0.027171,0.068294,-0.397861,0.6907327,-0.161025,0.106682
