In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.imputation.mice as mic
import statsmodels.formula.api as smf
import statsmodels.discrete.discrete_model as smf_discrete
import patsy

sns.set()

  from pandas import Int64Index as NumericIndex


In [2]:
study_df = pd.read_csv("data/anticonflict_study_dataframe.csv",low_memory=False)
study_df.shape

(24471, 500)

In [3]:
relevant_vars = pd.read_csv("data/VariableInclusion250.txt",header=None)[0].unique()
addtl_vars = relevant_vars[-1].split("\t")
relevant_vars2 = np.concatenate([relevant_vars[:-1],addtl_vars])
relevant_vars3 = [col.translate({ord("("):"",ord(")"):""}).upper() for col in relevant_vars2]

study_df = study_df[relevant_vars3]
study_df.columns

Index(['HOBGW2', 'STFRW2', 'IPHNW2', 'SMPHW2', 'PHNIW2', 'CMPW2', 'ACTSSW2',
       'ACTSOW2', 'ACTTW2', 'ACTMW2',
       ...
       'MYHEW2', 'MYGSW2', 'SPRTW2', 'HOBGW2', 'STFRW2', 'CN1W2', 'CN2W2',
       'CN3W2', 'CN4W2', 'CN5W2'],
      dtype='object', length=369)

In [4]:
#coercing values which indicate errors or missingness to NaN
study_df = pd.DataFrame(study_df.apply(lambda x: np.where(x.isin([
    "-99              ", "-77              ","","-98              ","-97              ",
    "-55              ","-88              ","-66              ","[MASKED BY ICPSR]"]),np.nan,x)))

#dropping columns where over 50% of values are NaN and rows where all values are NaN
study_df.drop(columns=study_df.columns[(study_df.isna()).mean()>0.5].values, inplace=True)
#study_df.drop(columns=study_df.columns[(study_df.isna()).all(0)].values, inplace=True)
study_df = study_df[~(study_df.isna()).all(1)]

study_df.shape

(24471, 348)

In [5]:
mtest_df = study_df.sample(frac=0.05).reset_index(drop=True)
mtest_ids = mtest_df[["UID","SCHID"]]
mtest_df.drop(columns=["UID","SCHID","SCHOOL_ID","ID2W2","SCHIDW2","IDN","BLOCKNUMBER","BLOCKVAR","SCHTREAT","TREAT","SCHRB","STRB"],inplace=True)
mtest_noms = [col for col in mtest_df.columns if ("ST" in col) & ("CN" in col)]
mtest_df.drop(columns=mtest_noms,inplace=True)
mtest_df.shape

(1224, 336)

In [6]:
response_var_list = pd.read_csv("data/Response_VariableInclusion250.txt",header=None)[0]
response_var_names = [str(col).translate({ord("("):"",ord(")"):""}).upper() for col in response_var_list]

mtest_response = mtest_df[response_var_names]
mtest_response.isna().mean().sort_values()

DN2W2       0.100490
FLIHCW2     0.104575
CILW2       0.104575
CFLW2       0.104575
CSCAW2      0.104575
              ...   
DNCL11W2    0.431373
DNCL12W2    0.431373
DNCL14W2    0.431373
DNCL4W2     0.431373
DNCL9W2     0.431373
Length: 129, dtype: float64

In [7]:
de_var_names = [col for col in mtest_response.columns if "DE" in col]
de_imputed = mtest_response.loc[:,de_var_names].apply(lambda x: np.where(x.isna(),0,x.fillna(0).astype(int)))
mtest_response = pd.concat([mtest_response.drop(columns=de_var_names),de_imputed],axis=1)
mtest_response.head(3)

Unnamed: 0,DN1W2,DN2W2,DN3W2,DN4W2,DN5W2,DN6W2,DN7W2,DN8W2,DN9W2,DN10W2,...,DE43,DE44,DE45,DE46,DE47,DE48,DE49,DE50,DE51,DE98
0,(0) Never,(3) 2-3 times/week,(4) Every day,(3) 2-3 times/week,(4) Every day,(4) Every day,(0) Never,(4) Every day,(0) Never,(4) Every day,...,0,0,0,0,0,0,0,0,0,0
1,(1) 1-2 times/month,(1) 1-2 times/month,(3) 2-3 times/week,(0) Never,(0) Never,(0) Never,(0) Never,(1) 1-2 times/month,(1) 1-2 times/month,(0) Never,...,0,0,0,0,0,0,0,0,0,0
2,(4) Every day,(4) Every day,(4) Every day,(4) Every day,(3) 2-3 times/week,(4) Every day,(3) 2-3 times/week,(4) Every day,(1) 1-2 times/month,(2) About 1 time/week,...,0,0,0,0,0,0,0,0,0,0


In [8]:
r_dummy_df = pd.get_dummies(mtest_response,drop_first=True,prefix_sep='_DUMMY')
r_dummy_df.columns = ["c"+str(i)+"_"+(col.split()[0]).translate({ord("("):"",ord(")"):"",ord("["):"",ord("]"):""})\
                    for i, col in zip(range(r_dummy_df.shape[1]),r_dummy_df.columns)]
r_dummy_df = r_dummy_df.loc[:,~(r_dummy_df.var()<0.05).values]
r_dummy_df.head(3)

Unnamed: 0,c1_DE2,c22_DE23,c24_DE25,c25_DE26,c29_DE30,c30_DE31,c31_DE32,c33_DE34,c40_DE41,c44_DE45,...,c212_TOMELNW2_DUMMY1,c213_TOMERGW2_DUMMY1,c214_TOMERBW2_DUMMY1,c215_TOMEMFW2_DUMMY1,c216_TOMEPMW2_DUMMY1,c217_TOMETHPW2_DUMMY1,c218_TOMEREW2_DUMMY1,c219_TOMESGW2_DUMMY1,c220_TOMESUW2_DUMMY1,c221_TOMEHDW2_DUMMY1
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,1


In [9]:
imputed_df = mic.MICEData(r_dummy_df)
mice_col = "c190_CILW2_DUMMY1"
mice_formula = "" + mice_col + " ~ " + ("+").join([(str(col)) for col in r_dummy_df.columns\
                                            if (col != mice_col)])
imputed_df

<statsmodels.imputation.mice.MICEData at 0x7ff528c03340>

In [10]:
#patsy.dmatrices(mice_formula, r_dummy_df)

In [11]:
game_mice = mic.MICE(model_class=smf_discrete.Logit, model_formula=mice_formula,
                 data=imputed_df)
results = game_mice.fit(10, 10)
print(results.params)

Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.294111
  

## 