In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import miceforest as mf

# Switch matplotlib figures in this notebook to seaborn's default styling.
sns.set()

In [2]:
# Read the full study export. low_memory=False has pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
study_df0 = pd.read_csv(
    "data/anticonflict_study_dataframe.csv",
    low_memory=False,
)

# All cleaning below happens on a separate copy, so study_df0 stays as loaded.
study_df = study_df0.copy()
study_df.shape

(24471, 500)

In [3]:
# Sentinel strings that indicate errors or missingness. They are matched
# exactly, including the trailing padding.
missing_codes = [
    "-99              ", "-77              ", "", "-98              ", "-97              ",
    "-55              ", "-88              ", "-66              ", "[MASKED BY ICPSR]",
]


def _codes_to_nan(column):
    """Return the column's values with every sentinel code replaced by NaN."""
    return np.where(column.isin(missing_codes), np.nan, column)


study_df = pd.DataFrame(study_df.apply(_codes_to_nan))

# Disabled step: dropping columns where over 50% of values are NaN.
#study_df.drop(columns=study_df.columns[(study_df.isna()).mean()>0.5].values, inplace=True)

# Drop rows where all values are NaN.
has_any_value = ~study_df.isna().all(axis=1)
study_df = study_df[has_any_value]

# Drop rows where the student ID is a placeholder rather than a real ID.
placeholder_id = (study_df["ID"] == "--blank--") | (study_df["ID"] == "999")
study_df = study_df[~placeholder_id]

# Every remaining row must have an ID and both treatment columns.
study_df = study_df.dropna(subset=["ID","TREAT","SCHTREAT"])

study_df.shape

(22721, 500)

In [4]:
# Inspect the column labels of the cleaned study frame.
study_df.columns

Index(['Column1', 'SCHID', 'UID', 'SCHTREAT', 'SCHRB', 'ID', 'TREAT', 'STRB',
       'SID1', 'SID3',
       ...
       'AM_F', 'PI_M', 'PI_F', 'MU_M', 'MU_F', 'ROW_TOTAL', 'FREE_LUNCH',
       'REDUCED_LUNCH', 'LEP', 'MIGRANT'],
      dtype='object', length=500)

In [5]:
# The inclusion list has no header row; its first column holds the variable
# names, de-duplicated here.
relevant_vars = pd.read_csv(
    "data/Limited_VariableInclusion250_v2.csv",
    header=None,
)[0].unique()
#addtl_vars = relevant_vars[-1].split("\t")
#relevant_vars2 = np.concatenate([relevant_vars[:-1],addtl_vars])

# Normalise the listed names: remove parentheses and upper-case them.
paren_stripper = str.maketrans("", "", "()")
relevant_vars3 = [name.translate(paren_stripper).upper() for name in relevant_vars]

In [6]:
# Independent copy of the cleaned study frame, restricted to the listed variables.
mtest_df = study_df.loc[:, relevant_vars3].copy()
#mtest_df.drop(columns=["ID","TREAT","SCHTREAT"], inplace=True)

#mtest_df.drop(columns=["SCHOOL_ID","ID2W2","SCHIDW2","IDN",
#"BLOCKNUMBER","BLOCKVAR","SCHTREAT","TREAT","SCHRB","STRB","RAWSIZE"],inplace=True)

# Remove every column whose name contains both "ST" and "CN".
mtest_noms = [name for name in mtest_df.columns if "ST" in name and "CN" in name]
mtest_df = mtest_df.drop(columns=mtest_noms)
mtest_df.shape

(22721, 227)

In [8]:
# First column of the header-less response-variable list.
response_var_list = pd.read_csv(
    "data/Response_VariableInclusion250.csv",
    header=None,
)[0]

# Same normalisation as the frame's column names: no parentheses, upper case.
drop_parens = str.maketrans("", "", "()")
response_var_names = [str(raw).translate(drop_parens).upper() for raw in response_var_list]

mtest_response = mtest_df.loc[:, response_var_names]

In [9]:
# Response columns whose name contains "DE".
de_var_names = [col for col in mtest_response.columns if "DE" in col]

# Recode missing DE entries to 0 and cast the columns to int.
# This is done with frame-level fillna/astype on purpose: DataFrame.apply with a
# function that returns a bare ndarray (e.g. np.where) rebuilds the result on a
# fresh 0..n-1 index. mtest_response still carries the gapped index left by the
# earlier row filtering, so the axis=1 concat below would then align the DE
# values with the wrong rows and introduce extra, partly-empty rows.
de_imputed = mtest_response.loc[:, de_var_names].fillna(0).astype(int)

# Swap the recoded DE columns in for the originals (they end up last).
mtest_response = pd.concat([mtest_response.drop(columns=de_var_names), de_imputed], axis=1)
mtest_response.head(2)

Unnamed: 0,DN1W2,DN2W2,DN3W2,DN4W2,DN5W2,DN6W2,DN7W2,DN8W2,DN9W2,DN10W2,...,DE43,DE44,DE45,DE46,DE47,DE48,DE49,DE50,DE51,DE98
0,(3) 2-3 times/week,(4) Every day,(4) Every day,(3) 2-3 times/week,(2) About 1 time/week,(1) 1-2 times/month,(2) About 1 time/week,(3) 2-3 times/week,(3) 2-3 times/week,(3) 2-3 times/week,...,0,0,0,0,0,0,3,0,0,0
1,(3) 2-3 times/week,(4) Every day,(0) Never,(3) 2-3 times/week,(4) Every day,(3) 2-3 times/week,(1) 1-2 times/month,(1) 1-2 times/month,(3) 2-3 times/week,(1) 1-2 times/month,...,0,0,0,0,0,0,11,0,0,0


In [10]:
# One-hot encode the responses. dummy_na=True adds a "<var>_DUMMYnan" indicator
# per encoded variable; drop_first omits each variable's first level.
r_dummy_df = pd.get_dummies(
    mtest_response,
    prefix_sep='_DUMMY',
    dummy_na=True,
    drop_first=True,
)

# Keep only the first whitespace-delimited token of each label and remove
# round/square brackets, e.g. "X_DUMMY(3) 2-3 times/week" -> "X_DUMMY3".
bracket_stripper = str.maketrans("", "", "()[]")
r_dummy_df.columns = [
    label.split()[0].translate(bracket_stripper) for label in r_dummy_df.columns
]

def retain_nans(x, dummy_df=None, prefix_sep="_DUMMY"):
    """Set a dummy column to NaN wherever its variable's "...nan" indicator is 1.

    Parameters
    ----------
    x : pandas.Series
        One dummy column; ``x.name`` is used to locate the indicator column.
    dummy_df : pandas.DataFrame, optional
        Frame holding the "...nan" indicator columns. Defaults to the
        notebook-level ``r_dummy_df`` so existing ``retain_nans(x)`` calls
        behave as before.
    prefix_sep : str, optional
        Separator between variable name and level in the dummy names. The
        indicator is looked up as ``<variable><prefix_sep>nan``, which also
        resolves multi-character levels such as ``"..._DUMMY10"``.

    Returns
    -------
    pandas.Series
        ``x`` itself when no indicator column is found for it; otherwise a new
        Series on ``x``'s index with NaN where the indicator equals 1. A Series
        is returned rather than a bare array so ``DataFrame.apply`` keeps the
        row index.
    """
    if dummy_df is None:
        dummy_df = r_dummy_df

    name = x.name
    if not isinstance(name, str):
        # No string name to derive an indicator from: leave the column alone.
        return x

    candidates = []
    if prefix_sep and prefix_sep in name:
        stem, sep, _level = name.rpartition(prefix_sep)
        candidates.append(stem + sep + "nan")
    # Original rule: drop the last character and append "nan".
    candidates.append(name[:-1] + "nan")

    for indicator_name in candidates:
        # A column is never masked by itself, so indicator columns pass through.
        if indicator_name != name and indicator_name in dummy_df.columns:
            indicator = dummy_df[indicator_name]
            masked = np.where(indicator == 1, np.nan, x)
            return pd.Series(masked, index=x.index, name=name)

    return x

# Run retain_nans over every column, then discard the columns whose name ends
# in "nan" (the missingness indicators).
r_dummy_df = r_dummy_df.apply(retain_nans)
is_nan_indicator = r_dummy_df.columns.str.endswith("nan")
r_dummy_df = r_dummy_df.loc[:, ~is_nan_indicator]

# The ten columns with the largest share of missing values.
missing_share = r_dummy_df.isna().mean()
missing_share.sort_values().tail(10)

DNCL10W2_DUMMY1    0.419788
DNCL2W2_DUMMY1     0.419788
DNCL9W2_DUMMY1     0.419788
DNCL8W2_DUMMY1     0.419788
DNCL6W2_DUMMY1     0.419788
DNCL5W2_DUMMY1     0.419788
DNCL4W2_DUMMY1     0.419788
DNCL1W2_DUMMY1     0.419788
DNCL3W2_DUMMY1     0.419788
DNCL7W2_DUMMY1     0.419788
dtype: float64

In [11]:
def fit_mice(dummy_df, n_datasets=5, n_iterations=5, random_state=396):
    """Run MICE with miceforest and return the completed datasets.

    Parameters
    ----------
    dummy_df : pandas.DataFrame
        Frame to impute; passed straight to ``mf.ImputationKernel``.
    n_datasets : int, optional
        Number of datasets the kernel maintains and the number of completed
        frames returned. Default 5.
    n_iterations : int, optional
        Number of MICE iterations passed to ``kernel.mice``. Default 5.
    random_state : int, optional
        Seed passed to the kernel. Default 396.

    Returns
    -------
    list
        ``kernel.complete_data(k)`` for ``k = 0 .. n_datasets - 1``, in order.
    """
    kernel = mf.ImputationKernel(
        dummy_df,
        datasets=n_datasets,
        save_all_iterations=True,
        random_state=random_state,
    )

    kernel.mice(n_iterations)

    # One shared count drives both the kernel and this loop, so the two
    # cannot drift apart when the number of datasets is changed.
    return [kernel.complete_data(k) for k in range(n_datasets)]
        
#if numeric_var:
#return np.mean(np.array(results),axis=0)
#else:
#return_indices = np.random.randint(10,size=dummy_df.shape[0])
#return (np.array(results))[:,return_indices]

In [12]:
#response_imputed_df = fit_mice(r_dummy_df)

In [13]:
# Write the un-imputed response dummies to disk.
r_dummy_df.to_csv("data/full_unimputed_response_df.csv")
# Disabled: export of the imputed response datasets. It needs response_imputed_df,
# whose fit_mice call is commented out in this notebook.
#for k in range(5):
 #   response_imputed_df[k].to_csv(("data/full_imputed_response_df_iter"+str(k)+".csv"))

In [14]:
# Inspect the column labels of the reduced modelling frame.
mtest_df.columns

Index(['UID', 'ID', 'TREAT', 'SCHTREAT', 'SCHID', 'AGEC_NEW', 'GENDER', 'GR',
       'COLL', 'CELL',
       ...
       'DE98', 'RTSM1', 'RTSM2', 'RTSM3', 'RTSM4', 'RTSM5', 'RTSM6', 'RTSM7',
       'RTSM8', 'RTSM9'],
      dtype='object', length=227)

In [15]:
#exo_mtest_df = mtest_df.drop(columns=response_var_names)

# Entries 5..11 of the normalised inclusion list.
background_cols = relevant_vars3[5:12]

# For each response name containing "W2", take the name with "W2" removed,
# provided study_df has a column by that name.
baseline_cols = []
for name in response_var_names:
    if "W2" not in name:
        continue
    baseline_name = name.replace("W2","")
    if baseline_name in study_df.columns:
        baseline_cols.append(baseline_name)

exo_mtest_df = pd.concat(
    [mtest_df.loc[:, background_cols], study_df[baseline_cols]],
    axis=1,
)
#exo_mtest_df.drop(columns=["UID","ID","TREAT","SCHTREAT","SCHID"],inplace=True)

# Share of missing values per predictor, ascending.
exo_mtest_df.isna().mean().sort_values()

GENDER      0.000880
GR          0.000880
AGEC_NEW    0.012411
COLL        0.026055
PN1         0.029576
              ...   
DNCL3       0.291492
DNCL1       0.291492
DNCL6       0.291492
DNCL5       0.291492
DNCL7       0.291492
Length: 72, dtype: float64

In [16]:
# One-hot encode the predictors (each variable's first level is omitted).
# dtype=float lets the dummy columns hold NaN.
e_dummy_df = pd.get_dummies(exo_mtest_df, drop_first=True, dtype=float)
#e_dummy_df.columns = [("_".join(col.split())) for col in (e_dummy_df.columns)]

# get_dummies writes a missing answer as all-zero dummies, which is identical to
# the omitted first level. Put NaN back into every dummy of a variable wherever
# that variable is missing in exo_mtest_df, so that MICE imputes those cells.
# Missingness is read from exo_mtest_df directly; retain_nans(x) cannot be used
# here because it looks indicators up in r_dummy_df, not in this frame.
for source_col in exo_mtest_df.columns:
    if source_col in e_dummy_df.columns:
        # Column was not encoded by get_dummies; it still carries its own NaNs.
        continue
    # Re-encode the single column to get exactly the dummy names it produced.
    level_cols = pd.get_dummies(exo_mtest_df[[source_col]], drop_first=True).columns
    e_dummy_df.loc[exo_mtest_df[source_col].isna(), level_cols] = np.nan

#e_dummy_df.columns = ["c"+str(i)+"_"+col for i, col in zip(range(e_dummy_df.shape[1]),e_dummy_df.columns)]

e_imputed_df = fit_mice(e_dummy_df)

# One CSV per completed dataset returned by fit_mice.
for k, imputed in enumerate(e_imputed_df):
    imputed.to_csv("data/full_imputed_predictor_df_iter"+str(k)+".csv")

In [17]:
# Identifier and treatment columns, plus every column whose name contains
# "RTSM", in study_df's column order.
id_and_treatment = {"ID", "SCHID", "TREAT", "SCHTREAT", "UID"}
roots_vars = [col for col in study_df if col in id_and_treatment or "RTSM" in col]

# Cast the three identifier columns to int.
study_df_trt = study_df.loc[:, roots_vars].astype({"ID": int, "SCHID": int, "UID": int})

# One-hot encode the remaining non-numeric columns, omitting each first level.
study_df_trt = pd.get_dummies(study_df_trt, drop_first=True)
#study_df_trt.columns = [("_".join(col.split())).translate({ord("("):"",ord(")"):"",ord("["):"",ord("]"):""})\
#for col in (study_df_trt.columns)]


study_df_trt.to_csv("data/full_unimputed_trt_df.csv")