In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import miceforest as mf

# Switch on seaborn's default theme so later matplotlib/seaborn figures share one style.
sns.set()

In [2]:
# Read the full study export. low_memory=False makes pandas parse the file in
# one pass instead of chunk by chunk.
study_df = pd.read_csv(
    "data/anticonflict_study_dataframe.csv",
    low_memory=False,
)
# Display (rows, columns) as the cell output.
study_df.shape

(24471, 500)

In [3]:
# Unique entries of the first column of the inclusion list.
relevant_vars = pd.read_csv(
    "data/VariableInclusion250.txt",
    header=None,
)[0].unique()

# The last entry is split on tabs into separate names, which then replace it.
addtl_vars = relevant_vars[-1].split("\t")
relevant_vars2 = np.concatenate([relevant_vars[:-1], addtl_vars])

# Normalise each name: remove parentheses, then upper-case.
paren_stripper = str.maketrans("", "", "()")
relevant_vars3 = [name.translate(paren_stripper).upper() for name in relevant_vars2]

# Keep only the listed columns, in list order.
study_df = study_df[relevant_vars3]

In [4]:
# Exact cell values that indicate errors or missingness; each one becomes NaN.
missing_codes = [
    "-99              ", "-77              ", "", "-98              ", "-97              ",
    "-55              ", "-88              ", "-66              ", "[MASKED BY ICPSR]",
]


def _codes_to_nan(col):
    """Return the column's values with every sentinel code replaced by NaN."""
    return np.where(col.isin(missing_codes), np.nan, col)


study_df = pd.DataFrame(study_df.apply(_codes_to_nan))

# Drop columns where more than half of the values are NaN ...
mostly_missing = study_df.columns[study_df.isna().mean() > 0.5]
study_df = study_df.drop(columns=mostly_missing)
# ... and rows in which every remaining value is NaN.
study_df = study_df.dropna(how="all")

study_df.shape

(24471, 348)

In [5]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
mtest_df = study_df.sample(frac=1, random_state=396).reset_index(drop=True)
#pd.Series(mtest_df.index).to_csv("tenth_sample_indices.csv")

# Keep the two id columns aside before they are removed from the working frame.
mtest_ids = mtest_df[["UID", "SCHID"]]

# Remove the explicitly listed columns (UID, SCHID, TREAT, ...).
id_and_design_cols = [
    "UID", "SCHID", "SCHOOL_ID", "ID2W2", "SCHIDW2", "IDN",
    "BLOCKNUMBER", "BLOCKVAR", "SCHTREAT", "TREAT", "SCHRB", "STRB",
]
mtest_df = mtest_df.drop(columns=id_and_design_cols)

# Also remove every column whose name contains both "ST" and "CN".
mtest_noms = [name for name in mtest_df.columns if "ST" in name and "CN" in name]
mtest_df = mtest_df.drop(columns=mtest_noms)

mtest_df.shape

(24471, 336)

In [6]:
# First column of the response-variable list file.
response_var_list = pd.read_csv(
    "data/Response_VariableInclusion250.txt",
    header=None,
)[0]

# Same normalisation as the column names: no parentheses, upper-case.
drop_parens = str.maketrans("", "", "()")
response_var_names = [str(name).translate(drop_parens).upper() for name in response_var_list]

mtest_response = mtest_df[response_var_names]

# Share of missing values per response column, smallest first.
response_missing_share = mtest_response.isna().mean()
response_missing_share.sort_values()

DN2W2       0.100650
CILW2       0.101263
FLIHCW2     0.101263
CBNPW2      0.101304
CFLW2       0.101304
              ...   
DNCL11W2    0.428262
DNCL12W2    0.428262
DNCL14W2    0.428262
DNCL3W2     0.428262
DNCL8W2     0.428262
Length: 129, dtype: float64

In [7]:
# Columns whose name contains "DE" get missing values replaced by 0 and are cast to int.
de_var_names = [name for name in mtest_response.columns if "DE" in name]
de_imputed = mtest_response[de_var_names].fillna(0).astype(int)

# Re-attach the filled columns after the remaining response columns.
other_response_cols = mtest_response.drop(columns=de_var_names)
mtest_response = pd.concat([other_response_cols, de_imputed], axis=1)
mtest_response.head(2)

Unnamed: 0,DN1W2,DN2W2,DN3W2,DN4W2,DN5W2,DN6W2,DN7W2,DN8W2,DN9W2,DN10W2,...,DE43,DE44,DE45,DE46,DE47,DE48,DE49,DE50,DE51,DE98
0,(1) 1-2 times/month,(2) About 1 time/week,(0) Never,(2) About 1 time/week,(0) Never,(1) 1-2 times/month,(1) 1-2 times/month,(2) About 1 time/week,(0) Never,(4) Every day,...,0,0,0,0,0,0,0,0,0,0
1,(0) Never,(1) 1-2 times/month,(1) 1-2 times/month,(0) Never,(0) Never,(2) About 1 time/week,(0) Never,(0) Never,(1) 1-2 times/month,(2) About 1 time/week,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# One-hot encode the response frame: the first level of each variable is dropped,
# an extra "<var>_DUMMYnan" indicator marks missing values, and "_DUMMY" separates
# the variable name from the level.
r_dummy_df = pd.get_dummies(
    mtest_response,
    drop_first=True,
    prefix_sep='_DUMMY',
    dummy_na=True,
)

# Shorten each column name to the text before its first whitespace and strip
# round/square brackets, e.g. "DN1W2_DUMMY(1) 1-2 times/month" -> "DN1W2_DUMMY1".
bracket_stripper = str.maketrans("", "", "()[]")
r_dummy_df.columns = [name.split()[0].translate(bracket_stripper) for name in r_dummy_df.columns]

def retain_nans(x, dummy_df=None):
    """Put NaN back into a dummy column wherever the source value was missing.

    Dummy columns are expected to be named "<var>_DUMMY<level>", with a sibling
    "<var>_DUMMYnan" column that is 1 on rows where <var> was missing. On those
    rows the dummy is 0 after encoding, which is indistinguishable from "not
    this level"; this function turns them into NaN instead.

    Parameters
    ----------
    x : pandas.Series
        One column of the dummy frame; ``x.name`` is used to find its indicator.
    dummy_df : pandas.DataFrame, optional
        Frame holding the "<var>_DUMMYnan" indicator columns. Defaults to the
        notebook-level ``r_dummy_df`` so existing one-argument calls still work.

    Returns
    -------
    numpy.ndarray or pandas.Series
        An array with NaN on missing rows when an indicator exists; otherwise
        ``x`` unchanged (non-dummy columns and the indicator columns themselves).
    """
    if dummy_df is None:
        dummy_df = r_dummy_df

    # Split on the separator instead of chopping one character, so levels such
    # as "10" or "Yes" still resolve to the right indicator column.
    prefix, sep, level = str(x.name).rpartition("_DUMMY")
    if not sep or level == "nan":
        return x

    nan_col = prefix + sep + "nan"
    # Explicit membership test instead of a bare except that hid every failure.
    if nan_col not in dummy_df.columns:
        return x

    return np.where(dummy_df[nan_col] == 1, np.nan, x)

# Run retain_nans over every column so rows flagged by a "...nan" indicator
# become NaN in the matching dummy columns.
r_dummy_df = r_dummy_df.apply(retain_nans)

# The indicator columns (names ending in "nan") are no longer needed.
is_nan_indicator = r_dummy_df.columns.str.endswith("nan")
r_dummy_df = r_dummy_df.loc[:, ~is_nan_indicator]

# Ten columns with the largest share of missing values.
dummy_missing_share = r_dummy_df.isna().mean().sort_values()
dummy_missing_share.tail(10)

DNCL3W2_DUMMY1     0.428262
DNCL4W2_DUMMY1     0.428262
DNCL8W2_DUMMY1     0.428262
DNCL10W2_DUMMY1    0.428262
DNCL5W2_DUMMY1     0.428262
DNCL6W2_DUMMY1     0.428262
DNCL6W2_DUMMY2     0.428262
DNCL7W2_DUMMY1     0.428262
DNCL2W2_DUMMY1     0.428262
DNCL7W2_DUMMY3     0.428262
dtype: float64

In [9]:
def fit_mice(dummy_df, n_datasets=5, n_iterations=5, random_state=396):
    """Impute ``dummy_df`` with miceforest and return every completed dataset.

    Parameters
    ----------
    dummy_df : pandas.DataFrame
        Frame with NaN where values should be imputed.
    n_datasets : int, default 5
        Number of datasets the kernel maintains; also the number of frames
        returned. Used for both the kernel and the collection loop so the two
        can no longer drift apart.
    n_iterations : int, default 5
        Value passed to ``kernel.mice``.
    random_state : int, default 396
        Seed handed to the kernel.

    Returns
    -------
    list
        ``kernel.complete_data(k)`` for ``k`` in ``0 .. n_datasets - 1``.
    """
    kernel = mf.ImputationKernel(
        dummy_df,
        datasets=n_datasets,
        save_all_iterations=True,
        random_state=random_state,
    )

    kernel.mice(n_iterations)

    return [kernel.complete_data(k) for k in range(n_datasets)]
        
#if numeric_var:
#return np.mean(np.array(results),axis=0)
#else:
#return_indices = np.random.randint(10,size=dummy_df.shape[0])
#return (np.array(results))[:,return_indices]

In [10]:
# Despite the "_df" suffix this is a list: fit_mice returns one completed frame per imputed dataset.
response_imputed_df = fit_mice(r_dummy_df)

In [11]:
# Write each of the five completed response frames to its own CSV.
for k in range(5):
    out_path = "data/full_imputed_response_df_iter" + str(k) + ".csv"
    response_imputed_df[k].to_csv(out_path)

In [12]:
# Everything in mtest_df that is not a response variable.
exo_mtest_df = mtest_df.drop(columns=response_var_names)

# Share of missing values per remaining column, smallest first.
exo_missing_share = exo_mtest_df.isna().mean()
exo_missing_share.sort_values()

NUMMALE     0.000000
RAWSIZE     0.000000
INCSIZE     0.000000
DIST_ID     0.000000
LEP         0.000000
              ...   
CN1W2       0.416575
DNCL13W2    0.428262
TRD         0.449062
ST9         0.470475
CN1         0.473540
Length: 207, dtype: float64

In [13]:
# One-hot encode the non-response columns with the same options as the response
# frame: first level dropped, "<var>_DUMMYnan" indicator added for missing values.
e_dummy_raw = pd.get_dummies(exo_mtest_df,drop_first=True,prefix_sep='_DUMMY',dummy_na=True)
e_dummy_raw.columns = [(col.split()[0]).translate({ord("("):"",ord(")"):"",ord("["):"",ord("]"):""})
                       for col in e_dummy_raw.columns]


def _restore_exo_nans(col):
    """Set a dummy to NaN on rows flagged by this frame's own "<var>_DUMMYnan" column.

    retain_nans is not used here because it reads its indicators from
    r_dummy_df (the response frame). That frame has no indicator columns for
    these variables, so the lookup fails and missing categories stay encoded
    as all-zero dummies that the imputer never touches.
    """
    prefix, sep, level = str(col.name).rpartition("_DUMMY")
    nan_col = prefix + sep + "nan"
    # Leave non-dummy columns and the indicator columns themselves alone.
    if not sep or level == "nan" or nan_col not in e_dummy_raw.columns:
        return col
    return np.where(e_dummy_raw[nan_col] == 1, np.nan, col)


e_dummy_df = e_dummy_raw.apply(_restore_exo_nans)
e_dummy_df = e_dummy_df.loc[:,~e_dummy_df.columns.str.endswith("nan")]

# Prefix every column with its position ("c0_", "c1_", ...) so names are unique.
e_dummy_df.columns = ["c"+str(i)+"_"+col for i, col in enumerate(e_dummy_df.columns)]

e_imputed_df = fit_mice(e_dummy_df)

# Write each of the five completed predictor frames to its own CSV.
for k in range(5):
    e_imputed_df[k].to_csv(("data/full_imputed_predictor_df_iter"+str(k)+".csv"))

## Helper: dummy-encoding train and test frames with shared columns

In [16]:
# Toy frames for the demo below: "Green" appears only in the test frame and
# "Red" only in the train frame.
train_df = pd.DataFrame(dict(
    Cat_Var=["Blue", "Red", "Red", "Blue"],
    Num_Var=list(range(4)),
))
test_df = pd.DataFrame(dict(
    Cat_Var=["Green", "Green", "Blue"],
    Num_Var=[8, 0, 2],
))

def dummies_test_train_split(train_df,test_df):
    """One-hot encode a train and a test frame against a shared set of levels.

    Both frames are stacked, encoded together with ``pd.get_dummies(...,
    drop_first=True)``, and split apart again, so both outputs carry the same
    dummy columns even when a level occurs in only one of the inputs.

    The inputs are not modified. The split is positional (the first
    ``len(train_df)`` rows are the train part), so no helper column is written
    into the caller's frames and an existing "isTest" column is not clobbered.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames, each keeping its original index.
    """
    n_train = len(train_df)

    full_df = pd.concat([train_df, test_df])
    dummies_df = pd.get_dummies(full_df, drop_first=True)

    # concat and get_dummies both preserve row order, so slicing by position
    # recovers the two parts; copy so the results are independent frames.
    dummy_train_df = dummies_df.iloc[:n_train].copy()
    dummy_test_df = dummies_df.iloc[n_train:].copy()

    return dummy_train_df, dummy_test_df

# Demo on the toy frames: "Green" occurs only in test_df, yet the encoded
# training frame still gets a Cat_Var_Green column (all zeros).
dummy_train_df, dummy_test_df = dummies_test_train_split(train_df, test_df)
dummy_train_df.head()

Unnamed: 0,Num_Var,Cat_Var_Green,Cat_Var_Red
0,0,0,0
1,1,0,1
2,2,0,1
3,3,0,0
