### Instrumental variables regression using the IV2SLS class of statsmodels

In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS


In [3]:
# Read the Panel Study of Income Dynamics (PSID) extract into a DataFrame.
# header=0 tells pandas that the first row of the CSV holds the column names.
df = pd.read_csv(filepath_or_buffer='PSID1976.csv', header=0)

In [5]:
# Keep only the rows where participation == 'yes'.
# .copy() gives df_1975 its own data, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_1975 = df.query("participation == 'yes'").copy()

In [6]:
# Relevance check for the instruments: regress education on the two candidate
# instruments, meducation and feducation.
reg_expr = "education ~ meducation + feducation"

In [7]:
# Build and train the first-stage OLS model (education on meducation and
# feducation). The F-statistic in the printed summary is used to verify that
# the coefficients of meducation and feducation are jointly significant.
olsr_model = smf.ols(data=df_1975, formula=reg_expr)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

                            OLS Regression Results                            
Dep. Variable:              education   R-squared:                       0.208
Model:                            OLS   Adj. R-squared:                  0.204
Method:                 Least Squares   F-statistic:                     55.83
Date:                Sun, 05 Nov 2023   Prob (F-statistic):           2.96e-22
Time:                        23:16:58   Log-Likelihood:                -910.64
No. Observations:                 428   AIC:                             1827.
Df Residuals:                     425   BIC:                             1839.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      9.4801      0.321     29.523      0.0

In [8]:
# Build the dependent variable column: the natural log of wage.
# assign() returns a new DataFrame carrying the extra column instead of
# writing into a frame that was derived from a filter, so pandas does not
# raise SettingWithCopyWarning here.
df_1975 = df_1975.assign(ln_wage=np.log(df_1975['wage']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1975['ln_wage'] = np.log(df_1975['wage'])


In [9]:
# Assemble the exog (regressor) matrix. Statsmodels expects it to hold every
# endogenous and exogenous regressor together with the constant; here that is
# the single regressor education plus the constant added by add_constant().
exog = add_constant(df_1975[['education']])

In [10]:
# Assemble the instruments matrix. Statsmodels expects it to hold all the
# instruments as well as the exog variables that are NOT being instrumented;
# here that is meducation and feducation plus the constant.
instruments = add_constant(df_1975[['meducation', 'feducation']])

In [11]:
# Set up the two-stage least squares model: ln_wage is the response, exog
# holds the regressors, and instruments holds the instrument set. Then fit it.
iv2sls_model = IV2SLS(
    endog=df_1975['ln_wage'],
    exog=exog,
    instrument=instruments,
)
iv2sls_model_results = iv2sls_model.fit()

In [12]:
# Show the summary table of the fitted 2SLS model
iv2sls_summary = iv2sls_model_results.summary()
print(iv2sls_summary)

                          IV2SLS Regression Results                           
Dep. Variable:                ln_wage   R-squared:                       0.084
Model:                         IV2SLS   Adj. R-squared:                  0.082
Method:                     Two Stage   F-statistic:                     2.464
                        Least Squares   Prob (F-statistic):              0.117
Date:                Sun, 05 Nov 2023                                         
Time:                        23:20:08                                         
No. Observations:                 428                                         
Df Residuals:                     426                                         
Df Model:                           1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5510      0.409      1.349      0.1

In [13]:
# Baseline for comparison with 2SLS: plain OLS of ln(wage) on education.
# NOTE: reg_expr, olsr_model and olsr_model_results are rebound here, so after
# this cell runs they no longer refer to the earlier regression of education
# on meducation and feducation.
reg_expr = "ln_wage ~ education"
olsr_model = smf.ols(data=df_1975, formula=reg_expr)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

                            OLS Regression Results                            
Dep. Variable:                ln_wage   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     56.93
Date:                Sun, 05 Nov 2023   Prob (F-statistic):           2.76e-13
Time:                        23:20:37   Log-Likelihood:                -441.26
No. Observations:                 428   AIC:                             886.5
Df Residuals:                     426   BIC:                             894.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1852      0.185     -1.000      0.3