In [None]:
from ValidMLInference import ols, ols_bcm, one_step, load_dataset
import pandas as pd
import numpy as np
from scipy import stats
import math

### The Dataset

The package ships with a toy slice of a larger dataset on work-from-home job postings, with the following columns:
1. `city_name` 
2. `naics_2022_2` - an industry code 
3. `id` - unique identifier of the job posting
4. `salary` 
5. `wfh_wham` - ML-generated indicator of whether the job offers work from home 
6. `soc_2021_2` - Bureau of Labor Statistics Standard Occupational Classification code
7. `employment_type_name` - indicates whether the position is full-time or part-time 

In [7]:
# Load the toy dataset that ships with the package and preview its first
# five rows (five is the default for `head`).
SD_data = load_dataset()
SD_data.head()

Unnamed: 0.1,Unnamed: 0,city_name,naics_2022_2,id,salary,wfh_wham,soc_2021_2,employment_type_name
0,1,"San Diego, CA",72,002e22ebe1b837ac6b0cebcbb720613138765f51,57500.0,0,11-0000,Full-time (> 32 hours)
1,2,"San Diego, CA",72,00442454060b60c1c0ad4ed78bc29111935f400b,31200.0,0,35-0000,Full-time (> 32 hours)
2,3,"San Diego, CA",72,007a1c1a527ed15006705379cec780aaae4930af,33280.0,0,35-0000,Part-time / full-time
3,4,"San Diego, CA",72,00991b69215b1cc14c08c4cdfa1b10bbbdf6ceba,40560.0,0,35-0000,Full-time (> 32 hours)
4,5,"San Diego, CA",72,00edf6dc0abb731a0befa73f6748ff3f5ce842f4,45760.0,0,11-0000,Full-time (> 32 hours)


For the purposes of this estimation, we log-transform the `salary` variable.

In [8]:
# Build the log-salary frame from a fresh load instead of overwriting the
# column in place, so that re-running this cell can never apply the log
# transform twice. `assign` returns a new DataFrame; the name `SD_data` is
# kept because the regression cells below read from it.
SD_data = load_dataset().assign(salary=lambda raw: np.log(raw['salary']))

# ML-classification error 
The variable `wfh_wham` describing whether the job posting offers remote work is not manually collected, but is imputed via ML methods. **Based on a sample of size 1000, we know the false positive rate (fpr) to be 0.009**

# Coefficient estimates for San Diego

We will estimate six coefficients in total, reproducing Table 1 from [Battaglia et al.](https://arxiv.org/abs/2402.15585). For each of three approaches -- the usual two-step strategy, the BCM correction, and the one-step strategy -- we report results twice: once controlling for occupation fixed effects and once not.

### Estimates without accounting for fixed effects
This section presents simple regression coefficients for models estimated without accounting for the `soc_2021_2` and `employment_type_name` variables.

In [None]:
# Rows of each model's summary table that are printed in the cells below.
rows = [
    "Intercept",
    "wfh_wham",
]

In [42]:
# Plain OLS of log salary on the ML-generated work-from-home indicator,
# with no fixed effects.
res = ols(
    formula="salary ~ wfh_wham",
    data=SD_data,
    intercept=True,
)
summary = res.summary()
# Show only the intercept and the wfh_wham coefficient.
print(summary.loc[rows])

            Estimate  Std. Error      z value  P>|z|       2.5%      97.5%
Intercept  10.655967    0.002589  4115.094402    0.0  10.650891  10.661042
wfh_wham    0.648514    0.024911    26.033376    0.0   0.599690   0.697339


In [43]:
# BCM-corrected OLS, no fixed effects. fpr is the false positive rate of the
# ML classifier (0.009); m is presumably the size of the sample it was
# measured on (1000), as stated in the markdown section on classification
# error.
res = ols_bcm(
    formula="salary ~ wfh_wham",
    data=SD_data,
    fpr=0.009,
    m=1000,
    intercept=True,
)
summary = res.summary()
# Show only the intercept and the wfh_wham coefficient.
print(summary.loc[rows])

            Estimate  Std. Error      z value         P>|z|       2.5%  \
Intercept  10.646261    0.004174  2550.611816  0.000000e+00  10.638080   
wfh_wham    1.052442    0.140035     7.515553  5.662137e-14   0.777978   

               97.5%  
Intercept  10.654442  
wfh_wham    1.326906  


In [44]:
# One-step estimator, no fixed effects; wfh_wham is named as the treatment
# variable.
res = one_step(
    formula='salary ~ wfh_wham',
    treatment_var="wfh_wham",
    data=SD_data,
    intercept=True,
)
summary = res.summary()
# Show only the intercept and the wfh_wham coefficient.
print(summary.loc[rows])

            Estimate  Std. Error      z value  P>|z|       2.5%      97.5%
Intercept  10.526436    0.001539  6840.253906    0.0  10.523420  10.529452
wfh_wham    0.486144    0.008413    57.781769    0.0   0.469654   0.502634


### Estimates accounting for fixed effects

The formula interface for the package's functions allows us to easily account for the categorical variables `soc_2021_2` and `employment_type_name`.

In [None]:
# Plain OLS with employment-type and occupation (SOC) categorical controls
# entered through C(...) terms in the formula.
res = ols(
    formula="salary ~ wfh_wham + C(employment_type_name) + C(soc_2021_2)",
    data=SD_data,
    intercept=True,
)
summary = res.summary()
# Show only the intercept and the wfh_wham coefficient; the fixed-effect
# rows are left out.
print(summary.loc[rows])


            Estimate  Std. Error      z value  P>|z|       2.5%      97.5%
Intercept  11.088260    0.009039  1226.680057    0.0  11.070543  11.105977
wfh_wham    0.363921    0.021544    16.892263    0.0   0.321696   0.406146


In [39]:
# BCM-corrected OLS with occupation (SOC) and employment-type categorical
# controls. fpr=0.009 and m=1000 match the values used in the
# no-fixed-effects BCM cell.
# NOTE(review): this call passes `treatment_variable`, whereas the one_step
# calls use `treatment_var` and the earlier ols_bcm call passes neither;
# `intercept` is also not passed explicitly here. Confirm the keyword names
# and defaults against the package API.
res = ols_bcm(
    formula="salary ~ wfh_wham + C(soc_2021_2) + C(employment_type_name)",
    treatment_variable="wfh_wham",
    data=SD_data,
    fpr=0.009,
    m=1000,
)
summary = res.summary()
# Show only the intercept and the wfh_wham coefficient; the fixed-effect
# rows are left out.
print(summary.loc[rows])

            Estimate  Std. Error      z value         P>|z|       2.5%  \
Intercept  11.074074    0.010335  1071.556274  0.000000e+00  11.053818   
wfh_wham    0.641276    0.099605     6.438210  1.208906e-10   0.446054   

               97.5%  
Intercept  11.094329  
wfh_wham    0.836497  


In [37]:
# One-step estimator with occupation (SOC) and employment-type categorical
# controls; wfh_wham is named as the treatment variable.
res = one_step(
    formula="salary ~ wfh_wham + C(soc_2021_2) + C(employment_type_name)",
    data=SD_data,
    treatment_var="wfh_wham",
)
summary = res.summary()
# Show only the intercept and the wfh_wham coefficient; the fixed-effect
# rows are left out.
print(summary.loc[rows])

            Estimate  Std. Error     z value  P>|z|       2.5%      97.5%
Intercept  10.946439    0.011201  977.315430    0.0  10.924486  10.968391
wfh_wham    0.290135    0.010862   26.711267    0.0   0.268846   0.311424


Note: the `one_step` estimates differ from those reported in the "Joint" column of Table 1 in the paper, as those were generated using a Gaussian mixture model that is not yet available in the package.