In [11]:
import pandas as pd
from src.models.linreg import LinReg
from src.models.iv import IV
from src.displays.display_linear import display_models
from statsmodels.sandbox.regression.gmm import IV2SLS

In [12]:

# Load the toy father-education dataset (columns: wage, educ, ability,
# fathereduc) from the repository's data directory.
magic = pd.read_csv(
    '../data/father_education.csv',
)
# A bare trailing expression makes the notebook render the frame.
magic

Unnamed: 0,wage,educ,ability,fathereduc
0,146.347807,18.053194,348.240021,17.158322
1,147.599580,15.845486,181.160724,13.988533
2,161.820228,15.105207,337.367725,15.994311
3,105.082941,16.458131,106.458032,21.413172
4,167.562196,18.793815,301.510006,16.457630
...,...,...,...,...
995,157.149509,14.883227,461.290475,11.446059
996,166.003573,14.542572,292.700272,16.201575
997,155.693604,18.022844,278.575512,16.739413
998,199.074623,18.885769,346.645549,15.543786


In [3]:
"""Perfect world where we can measure individual ability"""
# Benchmark specification: wage regressed on both ability and educ. This is
# only possible because the toy dataset contains an `ability` column.
perfect_model = LinReg(
    df=magic,
    outcome='wage',
    independent=['ability', 'educ'],
)

# Show the model summary, requesting HTML content.
perfect_model.summary(content_type='html')

In [4]:
"""Naive model"""

# wage regressed on educ alone; ability is deliberately left out of the
# regressor list.
naive_model = LinReg(
    df=magic,
    outcome='wage',
    independent=['educ'],
)

# Show the model summary, requesting HTML content.
naive_model.summary(content_type='html')

To be a valid instrument, a variable must meet three criteria:

Relevance: The instrument is correlated with the policy variable.
Exclusion: The instrument affects the outcome only through the policy variable.
Exogeneity: The instrument isn’t correlated with anything else that drives the outcome (i.e. omitted variables).

In [5]:
# Relevance check: regress the policy variable (educ) on the candidate
# instrument (fathereduc).
relevence = LinReg(
    df=magic,
    outcome='educ',
    independent=['fathereduc'],
)

# Show the model summary, requesting HTML content.
relevence.summary(content_type='html')

In [6]:
"""Exclusion"""

# Regress the outcome (wage) directly on the instrument (fathereduc).
exclusion = LinReg(
    df=magic,
    outcome='wage',
    independent=['fathereduc'],
)

# Show the model summary, requesting HTML content.
exclusion.summary(content_type='html')

In [7]:
"""Luckily in this toy dataset we have ability"""

# Exogeneity check: regress the variable that is normally unobserved
# (ability) on the instrument (fathereduc).
# Stored under its own name so that it does not overwrite the `exclusion`
# model (wage on fathereduc) built earlier in the notebook.
exogeneity = LinReg(
    df=magic,
    outcome='ability',
    independent=['fathereduc'],
)

# Show the model summary, requesting HTML content.
exogeneity.summary(content_type='html')

In [8]:
"""first stage"""

# First stage of the manual two-stage procedure: educ regressed on the
# instrument fathereduc.
first_stage = LinReg(
    df=magic,
    outcome='educ',
    independent=['fathereduc'],
)

# Show the model summary, requesting HTML content.
first_stage.summary(content_type='html')

In [9]:
# Feed the raw fathereduc values to the first-stage model to obtain its
# predictions.
predicted_educ = first_stage.predict(
    magic['fathereduc'].values
)

# Attach the predictions as a new `independent_hat` column; `assign` returns
# a new frame, which is bound back to `magic`.
magic = magic.assign(independent_hat=predicted_educ)

# A bare trailing expression makes the notebook render the frame.
magic

Unnamed: 0,wage,educ,ability,fathereduc,independent_hat
0,146.347807,18.053194,348.240021,17.158322,17.382994
1,147.599580,15.845486,181.160724,13.988533,14.983731
2,161.820228,15.105207,337.367725,15.994311,16.501936
3,105.082941,16.458131,106.458032,21.413172,20.603558
4,167.562196,18.793815,301.510006,16.457630,16.852630
...,...,...,...,...,...
995,157.149509,14.883227,461.290475,11.446059,13.059292
996,166.003573,14.542572,292.700272,16.201575,16.658818
997,155.693604,18.022844,278.575512,16.739413,17.065916
998,199.074623,18.885769,346.645549,15.543786,16.160927


In [10]:
# Second stage of the manual two-stage procedure: wage regressed on the
# first-stage prediction (`independent_hat`) instead of educ itself.
# NOTE(review): standard errors from a hand-rolled second stage are generally
# not valid 2SLS standard errors, because the residuals are computed from the
# predicted regressor. Presumably the IV model should be used for inference;
# confirm against the LinReg / IV implementations.
second_stage = LinReg(
    df=magic,
    outcome='wage',
    independent=['independent_hat'],
)

# Display the summary, as the other model cells in this notebook do, so the
# second-stage estimate is visible where it is computed.
second_stage.summary(content_type='html')





In [13]:
# Instrumental-variables model on the toy data: educ is instrumented by
# fathereduc, with an empty list of controls.
iv = IV(
    df=magic,
    outcome='wage',
    independent=['educ'],
    controls=[],
    instruments=['fathereduc'],
)

# Show the model summary using the method's default arguments.
iv.summary()

In [12]:
# Compare the four toy-data specifications side by side.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'IV2SLS' object has no attribute 'pinv_wexog'".
# TODO: confirm that display_models supports the IV model before re-running.
display_models(
    [perfect_model, naive_model, second_stage, iv]
)

AttributeError: 'IV2SLS' object has no attribute 'pinv_wexog'

In [13]:
"""Lets practice on some real data"""

# Read the wage data and drop every row containing a missing value in one
# chained expression. This avoids mutating the frame in place with
# `inplace=True` and yields the same resulting frame.
wage = pd.read_csv('../data/wage.csv').dropna()

# A bare trailing expression makes the notebook render the frame.
wage

Unnamed: 0,wage,hours,IQ,KWW,educ,exper,tenure,age,married,black,south,urban,sibs,brthord,meduc,feduc,lwage
0,769,40,93,35,12,11,2,31,1,0,0,1,1,2.0,8.0,8.0,6.645091
2,825,40,108,46,14,11,9,33,1,0,0,1,1,2.0,14.0,14.0,6.715384
3,650,40,96,32,12,13,7,32,1,0,0,1,4,3.0,12.0,12.0,6.476973
4,562,40,74,27,11,14,5,34,1,0,0,1,10,6.0,6.0,11.0,6.331502
6,600,40,91,24,10,13,0,30,0,0,0,1,1,2.0,8.0,8.0,6.396930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924,1442,40,113,45,16,8,10,35,1,0,1,1,2,2.0,8.0,8.0,7.273787
925,645,45,93,39,12,11,3,35,1,0,1,0,7,7.0,7.0,8.0,6.469250
928,477,45,100,33,12,9,3,31,1,0,1,0,3,3.0,7.0,7.0,6.167517
929,664,60,82,30,16,10,9,34,1,1,1,1,3,4.0,16.0,16.0,6.498282


In [14]:
# wage regressed on educ alone, this time on the wage dataset.
# Note: this rebinds `naive_model`, replacing the toy-data model of the same
# name.
naive_model = LinReg(
    df=wage,
    outcome='wage',
    independent=['educ'],
)

# Show the model summary, requesting HTML content.
naive_model.summary(content_type='html')

In [15]:
"""Check for relevence of either parents education"""

# Regress educ on both candidate instruments (meduc and feduc) together.
# Note: this rebinds `relevence`, replacing the toy-data model of the same
# name.
relevence = LinReg(
    df=wage,
    outcome='educ',
    independent=['meduc', 'feduc'],
)

# Show the model summary, requesting HTML content.
relevence.summary(content_type='html')


In [16]:
# IV model on the wage data: educ instrumented by meduc and feduc, with an
# empty list of controls.
iv = IV(
    df=wage,
    outcome='wage',
    independent=['educ'],
    controls=[],
    instruments=['meduc', 'feduc'],
)

# Identical specification, additionally requesting 'hc0' standard errors.
iv_robust = IV(
    df=wage,
    outcome='wage',
    independent=['educ'],
    controls=[],
    instruments=['meduc', 'feduc'],
    standard_error_type='hc0',
)

# Side-by-side comparison of the naive regression and the two IV variants.
display_models(
    [naive_model, iv, iv_robust]
)

In [6]:
"""Lets now look at instrumental variables with controls"""

# Read the card data and keep only rows where the outcome, educ and the
# control variables are all present. Columns outside `subset` (e.g. fatheduc,
# IQ) may still contain NaN.
# Chained instead of `inplace=True` so the frame is never mutated in place;
# the resulting frame is the same.
# NOTE(review): the instrument `nearc4` used later in the notebook is not in
# this subset. Confirm it has no missing values.
card = pd.read_csv('../data/card.csv').dropna(
    subset=['lwage', 'educ', 'smsa66', 'exper', 'expersq', 'black', 'south66'],
)

# A bare trailing expression makes the notebook render the frame.
card

Unnamed: 0,id,nearc2,nearc4,educ,age,fatheduc,motheduc,weight,momdad14,sinmom14,...,smsa66,wage,enroll,KWW,IQ,married,libcrd14,exper,lwage,expersq
0,2,0,0,7,29,,,158413,1,0,...,1,548,0,15.0,,1.0,0.0,16,6.306275,256
1,3,0,0,12,27,8.0,8.0,380166,1,0,...,1,481,0,35.0,93.0,1.0,1.0,9,6.175867,81
2,4,0,0,12,34,14.0,12.0,367470,1,0,...,1,721,0,42.0,103.0,1.0,1.0,16,6.580639,256
3,5,1,1,11,27,11.0,12.0,380166,1,0,...,1,250,0,25.0,88.0,1.0,1.0,10,5.521461,100
4,6,1,1,12,34,8.0,7.0,367470,1,0,...,1,729,0,34.0,108.0,1.0,0.0,16,6.591674,256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3005,5218,0,1,12,25,8.0,12.0,82135,1,0,...,0,335,0,15.0,,1.0,0.0,7,5.814130,49
3006,5219,0,1,13,34,,,88765,1,0,...,0,481,0,43.0,,1.0,1.0,15,6.175867,225
3007,5220,0,1,12,24,11.0,,89271,0,0,...,0,500,0,25.0,109.0,1.0,0.0,6,6.214608,36
3008,5221,0,1,12,31,,,110376,1,0,...,0,713,0,32.0,107.0,1.0,1.0,13,6.569481,169


In [7]:
# lwage regressed on educ together with the five control variables, all
# passed as ordinary regressors.
# Note: this rebinds `naive_model`, replacing the earlier model of the same
# name.
naive_model = LinReg(
    df=card,
    outcome='lwage',
    independent=['educ', 'smsa66', 'exper', 'expersq', 'black', 'south66'],
)

# Show the model summary, requesting HTML content.
naive_model.summary(content_type='html')

In [9]:
# IV model on the card data: educ instrumented by nearc4, with the five
# remaining regressors passed as controls.
iv = IV(
    df=card,
    outcome='lwage',
    independent=['educ'],
    controls=['smsa66', 'exper', 'expersq', 'black', 'south66'],
    instruments=['nearc4'],
)

# Identical specification, additionally requesting 'hc0' standard errors.
iv_robust = IV(
    df=card,
    outcome='lwage',
    independent=['educ'],
    controls=['smsa66', 'exper', 'expersq', 'black', 'south66'],
    instruments=['nearc4'],
    standard_error_type='hc0',
)

# Side-by-side comparison of the naive regression and the two IV variants.
display_models(
    [naive_model, iv, iv_robust]
)