In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.imputation.mice as mic
import statsmodels.formula.api as smf
import statsmodels.discrete.discrete_model as smf_discrete
import statsmodels.api as sm
import patsy
import miceforest as mf

sns.set()

  from pandas import Int64Index as NumericIndex


In [2]:
# Load the raw study extract. low_memory=False makes pandas infer dtypes from
# the whole file at once rather than chunk by chunk (avoids mixed-type columns).
study_df = pd.read_csv(
    "data/anticonflict_study_dataframe.csv",
    low_memory=False,
)
study_df.shape

(24471, 500)

In [3]:
# Distinct entries of the inclusion list (first column of a header-less file).
relevant_vars = pd.read_csv("data/VariableInclusion250.txt", header=None)[0].unique()

# The last entry packs several names separated by tabs; split it and append
# the pieces to the other entries.
addtl_vars = relevant_vars[-1].split("\t")
relevant_vars2 = np.concatenate([relevant_vars[:-1], addtl_vars])

# Normalise names: strip parentheses and upper-case.
relevant_vars3 = [
    name.replace("(", "").replace(")", "").upper()
    for name in relevant_vars2
]

# Restrict the study frame to the listed variables.
study_df = study_df.loc[:, relevant_vars3]

In [4]:
# Values that indicate errors or missingness; every matching cell becomes NaN.
missing_codes = [
    "-99              ", "-77              ", "", "-98              ", "-97              ",
    "-55              ", "-88              ", "-66              ", "[MASKED BY ICPSR]",
]
study_df = pd.DataFrame(
    study_df.apply(lambda column: np.where(column.isin(missing_codes), np.nan, column))
)

# Drop columns where over 50% of values are NaN, then rows where all values are NaN.
mostly_missing_cols = study_df.columns[study_df.isna().mean() > 0.5]
study_df = study_df.drop(columns=mostly_missing_cols)
study_df = study_df[~study_df.isna().all(axis=1)]

study_df.shape

(24471, 348)

In [5]:
# Draw a seeded 15% sample of rows and renumber them from zero.
mtest_df = study_df.sample(frac=0.15, random_state=396).reset_index(drop=True)

# Keep the two identifier columns aside before they are removed below.
mtest_ids = mtest_df[["UID", "SCHID"]]

# Columns removed from the working frame by name.
excluded_cols = [
    "UID", "SCHID", "SCHOOL_ID", "ID2W2", "SCHIDW2", "IDN",
    "BLOCKNUMBER", "BLOCKVAR", "SCHTREAT", "TREAT", "SCHRB", "STRB",
]
mtest_df = mtest_df.drop(columns=excluded_cols)

# Also remove every remaining column whose name contains both "ST" and "CN".
mtest_noms = [col for col in mtest_df.columns if "ST" in col and "CN" in col]
mtest_df = mtest_df.drop(columns=mtest_noms)
mtest_df.shape

(3671, 336)

In [6]:
# Peek at the ten column labels at positions 9 through 18 of the working frame.
mtest_df.columns[9:19]

Index(['ACTMW2', 'ACTAW2', 'ACTCW2', 'ACTDW2', 'SMFBW2', 'SMSCW2', 'SMTWW2',
       'SMTUW2', 'SMINW2', 'SMAFMW2'],
      dtype='object')

In [7]:
# Response variable names come from the first column of a header-less file.
response_var_list = pd.read_csv("data/Response_VariableInclusion250.txt", header=None)[0]

# Normalise names the same way as the study columns: no parentheses, upper-case.
response_var_names = [
    str(raw_name).replace("(", "").replace(")", "").upper()
    for raw_name in response_var_list
]

mtest_response = mtest_df.loc[:, response_var_names]

# Share of missing values per response variable, lowest first.
mtest_response.isna().mean().sort_values()

CILW2       0.100245
CBNPW2      0.100518
CFLW2       0.100518
CSCAW2      0.100518
CLHCW2      0.100518
              ...   
PNCL10W2    0.427676
PNCL2W2     0.427676
PNCL11W2    0.427949
PNCL12W2    0.427949
PNCL13W2    0.427949
Length: 129, dtype: float64

In [8]:
# Columns whose name contains "DE": missing entries become 0 and the column is
# cast to int (filling first already yields 0 wherever the value was missing).
de_var_names = [name for name in mtest_response.columns if "DE" in name]
de_imputed = mtest_response[de_var_names].apply(
    lambda column: column.fillna(0).astype(int)
)

# Swap the original DE columns for their filled versions (appended at the end).
mtest_response = pd.concat(
    [mtest_response.drop(columns=de_var_names), de_imputed],
    axis=1,
)
mtest_response.head(2)

Unnamed: 0,DN1W2,DN2W2,DN3W2,DN4W2,DN5W2,DN6W2,DN7W2,DN8W2,DN9W2,DN10W2,...,DE43,DE44,DE45,DE46,DE47,DE48,DE49,DE50,DE51,DE98
0,(0) Never,(3) 2-3 times/week,(1) 1-2 times/month,(0) Never,(1) 1-2 times/month,(0) Never,(1) 1-2 times/month,(1) 1-2 times/month,(0) Never,(1) 1-2 times/month,...,0,0,0,0,0,0,0,0,0,0
1,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# One-hot encode the categorical responses; dummy_na=True adds a "<var>_DUMMYnan"
# indicator column per encoded variable.
r_dummy_df = pd.get_dummies(
    mtest_response,
    drop_first=True,
    prefix_sep='_DUMMY',
    dummy_na=True,
)

# Shorten each label to its first whitespace-delimited token and delete round
# and square brackets, e.g. "DN1W2_DUMMY(1) 1-2 times/month" -> "DN1W2_DUMMY1".
bracket_stripper = str.maketrans("", "", "()[]")
r_dummy_df.columns = [
    label.split()[0].translate(bracket_stripper)
    for label in r_dummy_df.columns
]

def retain_nans(x, dummy_df=None, prefix_sep="_DUMMY"):
    """Put NaN back into a dummy column on rows where its variable was missing.

    With ``pd.get_dummies(..., dummy_na=True)`` a missing category is encoded
    as 0 in every level column and 1 in a ``<variable><prefix_sep>nan``
    indicator column. This function looks up that indicator for column ``x``
    and replaces the flagged rows of ``x`` with NaN.

    Parameters
    ----------
    x : pd.Series
        Column being processed; ``x.name`` identifies the encoded variable.
    dummy_df : pd.DataFrame, optional
        Frame that holds the indicator columns. When omitted, the module-level
        ``r_dummy_df`` is used, which is what this function always did before.
        Pass the frame explicitly when processing any other dummy frame,
        otherwise no indicator will be found.
    prefix_sep : str, optional
        Separator that was given to ``pd.get_dummies``.

    Returns
    -------
    numpy.ndarray or pd.Series
        An array with NaN on flagged rows, or ``x`` itself (unchanged) when
        the column has no matching indicator.
    """
    indicator_frame = r_dummy_df if dummy_df is None else dummy_df

    name = x.name
    # Columns that were not dummy-encoded have no indicator; pass them through.
    if not isinstance(name, str) or prefix_sep not in name:
        return x

    # Split on the separator instead of chopping one character, so level codes
    # longer than one character still resolve to the right indicator.
    variable = name.rpartition(prefix_sep)[0]
    indicator_name = variable + prefix_sep + "nan"

    # Leave the indicator column itself, and columns without one, untouched.
    if indicator_name == name or indicator_name not in indicator_frame.columns:
        return x

    return np.where(indicator_frame[indicator_name] == 1, np.nan, x)

# Re-insert NaNs column by column, then discard the "...nan" indicator columns.
r_dummy_df = r_dummy_df.apply(retain_nans)
is_indicator = r_dummy_df.columns.str.endswith("nan")
r_dummy_df = r_dummy_df.loc[:, ~is_indicator]

# Ten columns with the highest share of missing values.
r_dummy_df.isna().mean().sort_values().tail(10)

PNCL4W2_DUMMY1     0.427676
PNCL5W2_DUMMY1     0.427676
PNCL8W2_DUMMY1     0.427676
PNCL7W2_DUMMY1     0.427676
PNCL10W2_DUMMY1    0.427676
PNCL6W2_DUMMY1     0.427676
PNCL9W2_DUMMY1     0.427676
PNCL11W2_DUMMY1    0.427949
PNCL12W2_DUMMY1    0.427949
PNCL13W2_DUMMY1    0.427949
dtype: float64

In [10]:
# Wrap the dummy-coded response frame in a statsmodels MICEData container.
mice_imputed_df = mic.MICEData(r_dummy_df)

In [11]:
def fit_mice(dummy_df, mice_col, imp_df=None, datasets=5, iterations=5, random_state=396):
    """Impute ``dummy_df`` with a miceforest kernel and return the completed datasets.

    The kernel imputes the whole frame at once, so no per-column model is
    built here.

    Parameters
    ----------
    dummy_df : pd.DataFrame
        Frame containing the missing values to impute.
    mice_col : str
        Accepted for backward compatibility; not used by the imputation.
    imp_df : object, optional
        Accepted for backward compatibility; not used by the imputation.
    datasets : int, optional
        Number of imputed datasets the kernel maintains and the number of
        completed datasets returned.
    iterations : int, optional
        Number of MICE iterations to run.
    random_state : int, optional
        Seed handed to the kernel.

    Returns
    -------
    list
        ``kernel.complete_data(k)`` for ``k`` in ``range(datasets)``.
    """
    # NOTE(review): the keyword names below follow the miceforest release this
    # notebook was written against; confirm them if the package is upgraded.
    kernel = mf.ImputationKernel(
        dummy_df,
        datasets=datasets,
        save_all_iterations=True,
        random_state=random_state,
    )
    kernel.mice(iterations)

    # One completed copy of the data per imputed dataset.
    return [kernel.complete_data(k) for k in range(datasets)]
        
#if numeric_var:
#return np.mean(np.array(results),axis=0)
#else:
#return_indices = np.random.randint(10,size=dummy_df.shape[0])
#return (np.array(results))[:,return_indices]

# fit_mice returns a list of completed datasets, so despite its name
# response_imputed_df is a list and has to be indexed (response_imputed_df[k])
# before any DataFrame method is called on it.
response_imputed_df = fit_mice(r_dummy_df, "DN1W2_DUMMY1")
#r_dummy_df.apply(lambda x: fit_mice(r_dummy_df, x.name))

AttributeError: 'list' object has no attribute 'head'

In [17]:
# Write each of the five completed response datasets to its own file.
for k in range(5):
    output_path = "data/imputed_response_df_iter" + str(k)
    response_imputed_df[k].to_csv(output_path)

In [19]:
# Predictor frame: everything in the working sample except the response variables.
exo_mtest_df = mtest_df.drop(response_var_names, axis="columns")

# Share of missing values per predictor, lowest first.
exo_mtest_df.isna().mean().sort_values()

NUMMALE          0.000000
LEP              0.000000
REDUCED_LUNCH    0.000000
FREE_LUNCH       0.000000
DENSITY          0.000000
                   ...   
CN1W2            0.416508
DNCL13W2         0.421956
TRD              0.454100
ST9              0.476437
CN1              0.477527
Length: 207, dtype: float64

In [22]:
# One-hot encode the categorical predictors; dummy_na=True adds a
# "<var>_DUMMYnan" indicator column per encoded variable.
e_dummy_raw = pd.get_dummies(exo_mtest_df, drop_first=True, prefix_sep='_DUMMY', dummy_na=True)
e_dummy_raw.columns = [
    col.split()[0].translate({ord("("): "", ord(")"): "", ord("["): "", ord("]"): ""})
    for col in e_dummy_raw.columns
]


def _restore_predictor_nans(col):
    """Return ``col`` with NaN on rows flagged by its own ``..._DUMMYnan`` indicator.

    The indicator is looked up in ``e_dummy_raw`` (this frame), not in the
    response frame: retain_nans reads r_dummy_df by default, which has no
    indicator columns for the predictors, so calling it here would silently
    leave missing categories encoded as observed zeros.
    """
    variable, sep, _ = col.name.rpartition("_DUMMY")
    indicator_name = variable + sep + "nan"
    # Pass through columns that were not dummy-encoded, the indicator columns
    # themselves, and any column without a matching indicator.
    if not sep or indicator_name == col.name or indicator_name not in e_dummy_raw.columns:
        return col
    return np.where(e_dummy_raw[indicator_name] == 1, np.nan, col)


e_dummy_df = e_dummy_raw.apply(_restore_predictor_nans)
e_dummy_df = e_dummy_df.loc[:, ~e_dummy_df.columns.str.endswith("nan")]

# Prefix every column with a unique "c<position>" tag.
e_dummy_df.columns = ["c" + str(i) + col for i, col in enumerate(e_dummy_df.columns)]

# fit_mice imputes the whole frame; its second argument does not select a
# column, so the un-prefixed label is harmless here.
e_imputed_df = fit_mice(e_dummy_df, "LEP")

# Save the imputed *predictor* datasets (the original loop wrote the response
# datasets into these files instead).
for k in range(5):
    e_imputed_df[k].to_csv(("data/imputed_predictor_df_iter" + str(k)))

## 