### filename: run_replication.ipynb

### description: 
    last updated: 2018-04-24
        - update how we record outputs from regressions
            - separate IVs into "Central IVs" and "Controls"
            - collapse categorical variable dummies --> one variable (mean of abs(dummies))
    
    last updated: 2016-07-08
    Run models from an article for the purposes of comparing results to "manual/gold standard" replications by Julianna. 
    

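### inputs:
    GSS data and article metadata loaded from '../../Data/' via GSSUtility.dataContainer

### outputs:
    '<ARTICLE_ID>_output.csv': one row per DV holding R-squared, adjusted R-squared and the model coefficients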

## TO-DOs:
    2018-05-09
        collapse all DVs into 1 regression?
    
    
@author: Misha


In [174]:
# ID of the article whose models this notebook re-runs; later cells use it to
# select the article and to name the output CSV
ARTICLE_ID = 4950

In [175]:
from __future__ import division
import pandas as pd
import pickle
import sys
sys.path.append('../')    
import GSSUtility as GU
import numpy as np
import statsmodels.formula.api as smf 
import random
from scipy.stats import pearsonr, ttest_ind, ttest_rel
import time
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sb
# overrides applied on top of seaborn's "darkgrid" style
custom_style = {'axes.facecolor': 'white',
                'grid.color': '0.15',
                'grid.linestyle':'-.'}
sb.set_style("darkgrid", rc=custom_style)

# Pick up edits to GSSUtility.py: the stale compiled file is deleted first,
# then the module is reloaded. The order of these two lines matters.
%rm ../GSSUtility.pyc # remove this file because otherwise it will be used instead of the updated .py file
reload(GU)

# data directory, relative to this notebook
pathToData = '../../Data/'
# container built by GSSUtility from that directory; later cells read
# dataCont.articleClasses and dataCont.df from it
dataCont = GU.dataContainer(pathToData)

In [176]:
################################
# ANALYSIS

articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=False, \
                                    centralIVs=False, nextYearBound=0, linearModels=False)            
article = [a for a in articlesToUse if a.articleID == ARTICLE_ID][0]
maxYearUsed = max(article.GSSYearsUsed)

print 'article id:', article.articleID
print
print 'GSS Years Used:', article.GSSYearsUsed
print
print 'DVs:', article.DVs
print
print 'IVs:', article.IVs
print
print 'Controls:', article.controls
print
print 'Central IVs:', article.centralIVs

article id: 4950

GSS Years Used: [1973, 1974, 1975, 1976, 1977, 1978, 1980, 1982]

DVs: ['SPDEN16', 'SPDEN', 'DENOM', 'RELIG', 'SPREL', 'SPREL16']

IVs: ['AGE', 'EDUC', 'RACE', 'DENOM16', 'AGEWED', 'MARITAL', 'OTHER', 'OTH16', 'SPOTHER', 'SPOTH16', 'RELIG16']

Controls: []

Central IVs: ['AGE', 'EDUC', 'DENOM16', 'AGEWED', 'MARITAL', 'OTHER', 'OTH16', 'SPOTHER', 'SPOTH16', 'RELIG16']


In [177]:
# Storage containers for the outputs.
# Names of the summary measures tracked in `output`.
outcomes = ['propSig_ControlVars', 'paramSizesNormed_ControlVars',
            'Rs', 'adjRs',
            'propSig_CentralVars', 'paramSizesNormed_CentralVars']

# every outcome starts with its own empty list
output = defaultdict(list)
for outcome in outcomes:
    output[outcome] = list()

# Right-hand side shared by all models: the unique IVs and controls.
# set() removes duplicates; the resulting order is whatever the set yields.
allPredictors = article.IVs + article.controls
RHS = list(set(allPredictors))

# one row per dependent variable; columns are added as the models are run
dfoutput = pd.DataFrame(index=article.DVs)

In [179]:
print 'Running article:', article.articleID, 'on', maxYearUsed

for DV in article.DVs:
    print DV, '~', RHS
    #     RHS.remove('AGEWED')

#         futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
#         nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound)

#             log.write('id'+str(article.articleID)+' year '+str(maxYearUsed))

    res = GU.runModel(dataCont, maxYearUsed, DV, RHS) #, 
#                                 custom_data=custom_data,
#                                 standardized=False) # models run on max year of data used
    
    if not res: 
        continue
    
#     # TREATMENT OF DUMMY VARIABLES
#     # If #(dummies) for a variable is > 1, then take the absolute value of coefficients of those dummies and avg them
#     # if #(dummies) == 1 or variable is continuous, do nothing
#     def f(x):
#         return x.abs().mean() if len(x) > 1 else x[0]
#     res.params = res.params.groupby(lambda x: x.split('[')[0].split(',')[0]).apply(f)
    
    
#     res.params = res.params.abs().groupby(lambda x: x.split('[')[0].split(',')[0]).mean()
#     res.pvalues = res.pvalues.groupby(lambda x: x.split('[')[0].split(',')[0]).mean()

    dfoutput.loc[DV, 'Rs'] = res.rsquared
    dfoutput.loc[DV, 'adjRs'] = res.rsquared_adj

    for col in res.params.index:          
        dfoutput.loc[DV, col] = res.params[col]
        
    break


Running article: 4950 on 1982
SPDEN16 ~ ['AGEWED', 'DENOM16', 'AGE', 'RELIG16', 'OTHER', 'MARITAL', 'SPOTH16', 'RACE', 'OTH16', 'EDUC', 'SPOTHER']
# cols before dropna(thresh(10%)): 12
# cols design AFTER dropna(thresh(10%)): 10
# rows before dropna(): 1530
Dropping column DENOM16 because it is constant
Dropping column RELIG16 because it is constant
Dropping column MARITAL because it is constant
# rows after dropna(): 57


In [183]:
# Debugging: run the RELIG model for 1982 directly, outside the loop over DVs
GU.runModel(dataCont, 1982, 'RELIG', RHS)

# cols before dropna(thresh(10%)): 12
# cols design AFTER dropna(thresh(10%)): 9
# rows before dropna(): 1860
DV is constant
# rows after dropna():

TypeError: object of type 'NoneType' has no len()

In [186]:
# Raw 1982 rows for RELIG plus all predictors, pulled straight from the data
# frame so the inputs to the model can be inspected.
design = dataCont.df.loc[1982, ['RELIG'] + RHS]

# Fixed seed so the displayed sample is the same on every run
# (an unseeded sample changes with each execution of the cell).
design.sample(10, random_state=0)

Unnamed: 0_level_0,RELIG,AGEWED,DENOM16,AGE,RELIG16,OTHER,MARITAL,SPOTH16,RACE,OTH16,EDUC,SPOTHER
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1982,1.0,25.0,38.0,64.0,1.0,,4.0,,1,,16.0,
1982,2.0,,,60.0,2.0,,5.0,,1,,18.0,
1982,2.0,27.0,,63.0,2.0,,3.0,,1,,10.0,
1982,1.0,,,20.0,,31.0,5.0,,1,,11.0,
1982,2.0,17.0,28.0,34.0,1.0,,1.0,,1,,12.0,
1982,1.0,21.0,18.0,38.0,1.0,,1.0,,2,,12.0,
1982,1.0,27.0,18.0,32.0,1.0,,1.0,,1,,12.0,
1982,2.0,22.0,,27.0,2.0,,1.0,,1,,16.0,
1982,2.0,,,22.0,2.0,,5.0,,1,,14.0,
1982,1.0,,18.0,24.0,1.0,,5.0,,2,,13.0,


In [194]:
# Keep only columns with at least 10% non-missing values, then keep only the
# rows that are complete on the remaining columns.
minNonMissing = int(0.1 * len(design))
design.dropna(axis=1, thresh=minNonMissing).dropna()

Unnamed: 0_level_0,RELIG,AGEWED,DENOM16,AGE,RELIG16,OTHER,MARITAL,RACE,EDUC
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1982,1.0,18.0,60.0,53.0,1.0,40.0,1.0,1,12.0
1982,1.0,23.0,60.0,53.0,1.0,40.0,1.0,1,18.0
1982,1.0,40.0,60.0,57.0,1.0,40.0,1.0,2,12.0
1982,1.0,18.0,60.0,21.0,1.0,95.0,1.0,1,12.0
1982,1.0,22.0,38.0,82.0,1.0,71.0,1.0,1,5.0
1982,1.0,17.0,60.0,40.0,1.0,96.0,3.0,1,12.0
1982,1.0,21.0,60.0,62.0,1.0,96.0,1.0,1,12.0
1982,1.0,26.0,60.0,71.0,1.0,81.0,1.0,1,12.0
1982,1.0,16.0,60.0,45.0,1.0,18.0,3.0,2,12.0
1982,1.0,21.0,60.0,73.0,1.0,32.0,1.0,1,8.0


In [146]:
#2506
# dfoutput.loc[['SPKRAC', 'ANTIREL', 'LIBHOMO', 'COLRAC'], :].mean().T
# MARITAL AVG = [6:10].mean()
# race avg = [3:5].mean()
# dfoutput.loc[['FEPOLY', 'FEPRES', 'FEFAM'], :].mean().T
# dfoutput.loc[['HOMOSEX', 'PREMARSX', 'XMARSEX'], :].mean().T
# dfoutput.loc[['NATENVIR','NATHEAL','NATEDUC', 'NATSOC'], :].mean().T
# dfoutput.loc[['NATRACE', 'NATCITY', 'NATFARE'], :].mean().T
# dfoutput.loc[['COMMUN', 'CHINA', 'RUSSIA'], :].mean().T[3:5].mean()

In [150]:
# Design frame for 1989: the current DV column followed by the predictors.
# NOTE(review): other cells read the data through dataCont.df -- confirm that
# GU.df refers to the same frame.
design = GU.df.loc[1989, [DV] + RHS]

# presumably the names of the nominal (categorical) variables in `design`;
# verify against GU.createFormula
nominals = GU.createFormula(dataCont, design, return_nominals=True)
non_nominals = list(set(design.columns[1:]) - set(nominals)) # converted to a list so it can be used for indexing; [1:] skips the first column (the DV)

In [155]:
# list the column names of the current design frame
design.columns

Index([u'XMARSEX', u'RELIG', u'PRESTIGE', u'ATTEND', u'AGE', u'EDUC',
       u'MEMCHURH', u'MARITAL', u'POSTLIFE', u'OTHER', u'REGION', u'XNORCSIZ',
       u'INCOME', u'DENOM', u'SEX', u'RELITEN', u'RACE'],
      dtype='object')

In [152]:
# preview the first rows of the design frame
design.head()

Unnamed: 0_level_0,XMARSEX,RELIG,PRESTIGE,ATTEND,AGE,EDUC,MEMCHURH,MARITAL,POSTLIFE,OTHER,REGION,XNORCSIZ,INCOME,DENOM,SEX,RELITEN,RACE
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1989,,2.0,,0.0,22.0,10.0,2.0,2.0,2.0,,2,1,,,1,2.0,3
1989,,4.0,26.0,0.0,40.0,6.0,2.0,1.0,2.0,,2,1,,,1,4.0,3
1989,1.0,4.0,52.0,0.0,36.0,16.0,2.0,5.0,,,2,1,,,1,4.0,1
1989,,4.0,55.0,0.0,26.0,16.0,2.0,5.0,,,2,1,12.0,,1,4.0,1
1989,2.0,1.0,51.0,0.0,38.0,16.0,2.0,3.0,,,2,1,12.0,70.0,1,2.0,1


In [135]:
# Save the per-DV results table next to the notebook, named after the article.
outputPath = '%d_output.csv' % ARTICLE_ID
dfoutput.to_csv(outputPath)

In [270]:
# Scratch check: correlation between two pairs of values.
# With only two points the Pearson correlation is always +1 or -1.
from scipy.stats import pearsonr
xValues = [0.359, 0.469]
yValues = [0.215, 1.166]
pearsonr(xValues, yValues)

(1.0, 0.0)

In [271]:
# Scratch arithmetic: average of two values.
scratchValues = np.array([0.0089, 0.0124])
scratchValues.mean()

0.01065