In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

### Data Upload

In [181]:
data = pd.read_csv("morg-2014-emp.csv")

  data = pd.read_csv("morg-2014-emp.csv")


In [183]:
business_ops_codes = [500, 510, 520, 530, 540, 565, 600, 630,
                      640, 650, 700, 710, 725, 726, 735, 740]

In [185]:
data_occ = data[data['occ2012'].isin(business_ops_codes)]
data_occ

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,ownchild,chldpres,prcitshp,state,ind02,occ2012,class,unionmme,unioncov,lfsr94
0,3,2600310997690,January,AL,3151.6801,1692.00,40,43,1,,...,0,0,"Native, Born In US",63,Employment services (5613),630,"Private, For Profit",No,No,Employed-At Work
184,424,69997206074239,January,AK,374.8012,1057.00,42,39,1,,...,1,3,"Native, Born In US",94,Support activities for mining (213),530,"Private, For Profit",No,No,Employed-At Work
241,534,700209900646154,January,AK,332.7025,1557.69,40,41,1,,...,0,0,"Native, Born In US",94,"Justice, public order, and safety activities (...",740,Government - Federal,No,No,Employed-At Work
300,668,340446097003209,January,AZ,4272.6777,1634.61,40,42,1,,...,1,1,"Native, Born In US",86,** Construction (23),710,"Private, For Profit",No,No,Employed-At Work
344,759,70447007503540,January,AZ,4431.6759,817.00,40,39,1,,...,0,0,"Native, Born In US",86,Insurance carriers and related activities (524),540,"Private, For Profit",No,No,Employed-At Work
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149001,316451,185610760680506,December,WI,2072.8918,1500.00,40,39,1,,...,0,0,"Native, Born In US",3,Administration of economic programs and space ...,530,Government - Federal,No,No,Employed-At Work
149070,316578,155654140100907,December,WI,3814.6828,1211.53,45,42,1,,...,0,0,"Foreign Born, US Cit By Naturalization",3,Insurance carriers and related activities (524),540,"Private, For Profit",No,No,Employed-At Work
149083,316602,500010413026567,December,WI,4371.4088,1134.61,44,43,4,,...,0,0,"Native, Born In US",3,"**** Data processing, hosting, and related ser...",740,"Private, For Profit",No,No,Employed-At Work
149122,316664,103165050066104,December,WI,3948.9255,1700.00,50,40,1,,...,0,0,"Native, Born In US",3,"Structural metals, and tank and shipping conta...",530,"Private, For Profit",No,No,Employed-At Work


### Construct Variables

In [188]:
data_occ = data_occ.copy()

In [249]:
data_occ.loc[:, 'hourly_earn'] = data_occ['earnwke'] / data_occ['uhours']
data_occ.loc[:, 'education_years'] = data_occ['grade92']
data_occ.loc[:, 'experience'] = data_occ['age'] - data_occ['education_years'] - 6
data_occ.loc[data_occ['experience'] <= 0, 'experience'] = 0
data_occ.loc[:, 'female'] = (data_occ['sex'] == 2).astype(int)
data_occ.loc[:, 'ethnic_1'] = (data_occ['ethnic'] == 1).astype(int)

data_occ.loc[:, 'union'] = data_occ['unionmme'].map(lambda x: x.strip() if isinstance(x, str) else x)
data_occ.loc[:, 'union'] = data_occ['union'].map({"Yes": 1, "No": 0})

data_occ.loc[:, 'unioncov'] = data_occ['unioncov'].map(lambda x: x.strip() if isinstance(x, str) else x)
data_occ.loc[:, 'unioncov'] = data_occ['unioncov'].map({"Yes": 1, "No": 0})

data_occ.loc[:, 'lfsr94'] = data_occ['lfsr94'].map(lambda x: x.strip() if isinstance(x, str) else x)
data_occ.loc[:, 'lfsr94'] = data_occ['lfsr94'].map({"Employed-At Work": 1, "Employed-Absent": 2}).astype("Int64")

In [228]:
recode_prcitshp = {
    "Native, Born In US": 1,
    "Native, Born in PR or US Outlying Area": 2,
    "Native, Born Abroad Of US Parent(s)": 3,
    "Foreign Born, US Cit By Naturalization": 4,
    "Foreign Born, Not a US Citizen": 5}
data_occ['prcitshp_recode'] = data_occ['prcitshp'].map(lambda x: x.strip() if isinstance(x, str) else x)
data_occ['prcitshp_recode'] = data_occ['prcitshp_recode'].map(recode_prcitshp)

recode_class = {
    "Private, Nonprofit": 1,
    "Private, For Profit": 2,
    "Government - State": 3,
    "Foreign Born, Government - Local": 4,
    "Government - Federal": 5}
data_occ['class_recode'] = data_occ['class'].map(lambda x: x.strip() if isinstance(x, str) else x)
data_occ['class_recode'] = data_occ['class_recode'].map(recode_class)

In [230]:
data_occ

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,unioncov,lfsr94,hourly_earn,education_years,experience,female,union,prcitshp_recode,class_recode,ethnic_1
0,3,2600310997690,January,AL,3151.6801,1692.00,40,43,1,0.0,...,,,42.300000,43,0,1,0,1,2.0,0
184,424,69997206074239,January,AK,374.8012,1057.00,42,39,1,0.0,...,,,25.166667,39,3,0,0,1,2.0,0
241,534,700209900646154,January,AK,332.7025,1557.69,40,41,1,0.0,...,,,38.942250,41,10,1,0,1,5.0,0
300,668,340446097003209,January,AZ,4272.6777,1634.61,40,42,1,0.0,...,,,40.865250,42,0,1,0,1,2.0,0
344,759,70447007503540,January,AZ,4431.6759,817.00,40,39,1,0.0,...,,,20.425000,39,0,1,0,1,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149001,316451,185610760680506,December,WI,2072.8918,1500.00,40,39,1,0.0,...,,,37.500000,39,0,1,0,1,5.0,0
149070,316578,155654140100907,December,WI,3814.6828,1211.53,45,42,1,0.0,...,,,26.922889,42,7,1,0,4,2.0,0
149083,316602,500010413026567,December,WI,4371.4088,1134.61,44,43,4,0.0,...,,,25.786591,43,0,0,0,1,2.0,0
149122,316664,103165050066104,December,WI,3948.9255,1700.00,50,40,1,0.0,...,,,34.000000,40,4,0,0,1,2.0,0


### Establish OLS Regression

In [233]:
y = data_occ['hourly_earn']

In [235]:
x1 = sm.add_constant(data_occ[['education_years']])
model1 = sm.OLS(y, x1).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:            hourly_earn   R-squared:                       0.091
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     392.8
Date:                Sun, 02 Mar 2025   Prob (F-statistic):           2.16e-83
Time:                        01:58:00   Log-Likelihood:                -16389.
No. Observations:                3922   AIC:                         3.278e+04
Df Residuals:                    3920   BIC:                         3.279e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -84.4484      5.722    -

In [237]:
x2 = sm.add_constant(data_occ[['education_years', 'age', 'experience', 'female']])
model2 = sm.OLS(y, x2).fit()
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:            hourly_earn   R-squared:                       0.185
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     222.1
Date:                Sun, 02 Mar 2025   Prob (F-statistic):          4.68e-172
Time:                        01:58:01   Log-Likelihood:                -16175.
No. Observations:                3922   AIC:                         3.236e+04
Df Residuals:                    3917   BIC:                         3.239e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -97.5536      5.679    -

In [257]:
x3 = sm.add_constant(data_occ[['education_years', 'age', 'experience',
                               'female', 'union', 'ethnic_1']])
x3.replace([np.inf, -np.inf], np.nan, inplace=True)
x3 = x3.infer_objects(copy=False)
y_x3 = y.loc[x3.index]
model3 = sm.OLS(y_x3, x3).fit()
print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:            hourly_earn   R-squared:                       0.186
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     148.7
Date:                Sun, 02 Mar 2025   Prob (F-statistic):          2.10e-170
Time:                        02:04:05   Log-Likelihood:                -16173.
No. Observations:                3922   AIC:                         3.236e+04
Df Residuals:                    3915   BIC:                         3.240e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -96.0120      5.746    -

  x3.replace([np.inf, -np.inf], np.nan, inplace=True)


In [265]:
x3 = sm.add_constant(data_occ[['education_years', 'age', 'experience',
                               'female', 'union', 'ethnic_1']])
x3.replace([np.inf, -np.inf], np.nan, inplace=True)
x3 = x3.dropna()
y_x3 = y.loc[x3.index]
model3 = sm.OLS(y_x3, x3).fit()
print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:            hourly_earn   R-squared:                       0.186
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     148.7
Date:                Sun, 02 Mar 2025   Prob (F-statistic):          2.10e-170
Time:                        02:08:34   Log-Likelihood:                -16173.
No. Observations:                3922   AIC:                         3.236e+04
Df Residuals:                    3915   BIC:                         3.240e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -96.0120      5.746    -

  x3.replace([np.inf, -np.inf], np.nan, inplace=True)


In [279]:
data_occ.loc[:, 'experience_sq'] = data_occ['experience']**2
data_occ.loc[:, 'female_ethnic'] = data_occ['female'] * data_occ['ethnic']
x4 = sm.add_constant(data_occ[['education_years', 'age', 'experience', 'experience_sq',
                                 'female', 'union', 'ethnic_1', 'female_ethnic']])
x4.replace([np.inf, -np.inf], np.nan, inplace=True)
x4 = x4.dropna()
y_x4 = y.loc[x4.index]
model4 = sm.OLS(y_x4, x4).fit()
print(model4.summary())

                            OLS Regression Results                            
Dep. Variable:            hourly_earn   R-squared:                       0.186
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     111.6
Date:                Sun, 02 Mar 2025   Prob (F-statistic):          1.89e-168
Time:                        02:12:22   Log-Likelihood:                -16173.
No. Observations:                3922   AIC:                         3.236e+04
Df Residuals:                    3913   BIC:                         3.242e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -96.4373      5.762    -

  x4.replace([np.inf, -np.inf], np.nan, inplace=True)
