In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from mizani.formatters import percent_format
from plotnine import *
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm
from IPython.core.display import HTML
from stargazer.stargazer import Stargazer
import statsmodels.nonparametric.kernel_regression as loess

warnings.filterwarnings("ignore")


In [7]:
# Project root: the current working directory of the running kernel.
current_path = os.getcwd()
dirname = current_path

# Folder locations used by the rest of the notebook.
data_in = dirname  # input data is read from the project root
data_out = dirname  # derived data is written back to the project root
output = dirname + "/output/"
func = dirname + "/ch00-tech-prep/"

# Register the helper-function folder only once, so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if func not in sys.path:
    sys.path.append(func)

In [10]:
# Import the prewritten helper functions
from py_helper_functions import *

In [9]:

# Load the raw extract from the input folder into a single frame.
data_all = pd.read_csv(
    filepath_or_buffer=data_in + "/morg-2014-emp.csv",
)


In [11]:
# SELECT OCCUPATION
# Flag rows whose occ2012 code lies in the inclusive range 500-740 with
# sample = 1 and every other row with sample = 0. The range spans several
# occupation codes, not just two.
data_all.loc[data_all["occ2012"].between(500, 740), "sample"] = 1
data_all["sample"] = data_all["sample"].fillna(0)

In [12]:
# Keep only the flagged rows and renumber the index from zero.
data_all = data_all[data_all["sample"].eq(1)].reset_index(drop=True)

In [13]:
# Sanity check: after filtering, only sample == 1 rows should remain.
data_all.loc[:, "sample"].value_counts()

sample
1.0    3922
Name: count, dtype: int64

In [14]:
# Derived variables, built in dependency order:
#   female - 1 when sex == 2, else 0
#   w      - earnwke divided by uhours
#   lnw    - natural log of w
#   agesq  - age squared
data_all = data_all.assign(
    female=lambda d: d["sex"].eq(2).astype(int),
    w=lambda d: d["earnwke"] / d["uhours"],
    lnw=lambda d: np.log(d["w"]),
    agesq=lambda d: np.power(d["age"], 2),
)

In [16]:
# Value of the sample flag that defines the working data set.
i = 1

# Working frame: rows with sample == i, re-indexed from zero,
# then saved (without the index column) for later use.
data = data_all[data_all["sample"].eq(i)].reset_index(drop=True)
data.to_csv(data_out + "/earnings_inference.csv", index=False)

In [17]:
# ------------------------------------------------------------
# DISTRIBUTION OF EARNINGS
# Summary statistics for earnwke, uhours and w (= earnwke / uhours)
# ------------------------------------------------------------
data[["earnwke", "uhours", "w"]].describe()

Unnamed: 0,earnwke,uhours,w
count,3922.0,3922.0,3922.0
mean,1196.077634,41.022438,28.842154
std,671.841672,8.560748,16.570141
min,0.01,1.0,0.0005
25%,720.0,40.0,17.933333
50%,1057.0,40.0,25.277797
75%,1538.46,40.0,36.0575
max,2884.61,99.0,461.538


In [18]:
# Number of observations by the female indicator.
data.loc[:, "female"].value_counts()

female
1    2288
0    1634
Name: count, dtype: int64

In [19]:
# Cell counts for every occupation code x female combination.
data.groupby(by=["occ2012", "female"]).size()

occ2012  female
500      0          26
         1          22
510      0          11
         1           7
520      0          96
         1         114
530      0         128
         1         186
540      0         130
         1         208
565      0         140
         1         151
600      0         115
         1          13
630      0         181
         1         526
640      0          17
         1          59
650      0          61
         1          89
700      0          59
         1          47
710      0         408
         1         336
725      0          28
         1         118
726      0          32
         1          91
735      0         109
         1         172
740      0          93
         1         149
dtype: int64

In [20]:
##############################
# linear regressions
##############################

In [21]:
# Simple OLS of lnw on the female indicator, using the default
# (non-robust) covariance estimator.
reg1 = smf.ols(
    "lnw~female",
    data,
).fit()
reg1.summary()

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.025
Method:,Least Squares,F-statistic:,100.5
Date:,"Sat, 25 Nov 2023",Prob (F-statistic):,2.2199999999999997e-23
Time:,21:18:41,Log-Likelihood:,-3409.0
No. Observations:,3922,AIC:,6822.0
Df Residuals:,3920,BIC:,6835.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.3303,0.014,233.206,0.000,3.302,3.358
female,-0.1875,0.019,-10.027,0.000,-0.224,-0.151

0,1,2,3
Omnibus:,3104.647,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335414.411
Skew:,-3.082,Prob(JB):,0.0
Kurtosis:,47.883,Cond. No.,2.86


In [22]:
# Same specification as the plain OLS fit, but with the HC1
# heteroskedasticity-robust covariance estimator for the standard errors.
reg2 = smf.ols(
    "lnw~female",
    data,
).fit(cov_type="HC1")
reg2.summary()

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.025
Method:,Least Squares,F-statistic:,104.7
Date:,"Sat, 25 Nov 2023",Prob (F-statistic):,2.9199999999999998e-24
Time:,21:20:38,Log-Likelihood:,-3409.0
No. Observations:,3922,AIC:,6822.0
Df Residuals:,3920,BIC:,6835.0
Df Model:,1,,
Covariance Type:,HC1,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.3303,0.013,251.136,0.000,3.304,3.356
female,-0.1875,0.018,-10.230,0.000,-0.223,-0.152

0,1,2,3
Omnibus:,3104.647,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335414.411
Skew:,-3.082,Prob(JB):,0.0
Kurtosis:,47.883,Cond. No.,2.86


In [23]:
# Side-by-side table of the two fits: label the intercept "Constant"
# and list the female coefficient before it.
stargazer = Stargazer([reg1, reg2])
stargazer.rename_covariates({"Intercept": "Constant"})
stargazer.covariate_order(["female", "Intercept"])
stargazer

0,1,2
,,
,Dependent variable: lnw,Dependent variable: lnw
,,
,(1),(2)
,,
female,-0.187***,-0.187***
,(0.019),(0.018)
Constant,3.330***,3.330***
,(0.014),(0.013)
Observations,3922,3922
