# Índice

1. [Definición del *dataset*](#data)
1. [Funciones de limpieza de datos](#functions)
1. [Definición de características](#features)
1. [Procesado de la *pipeline*](#pipe)
1. [Procesado respecto al *outcome*](#outcome)

<a id='data'></a>
# 1. Definición del dataset

In [133]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn 

In [134]:
# Load the raw data.
df = pd.read_stata('Cash_and_Childhood_Development_Replication/macoursetal_main.dta')

# Outcome variables.
outcome_columns = [
    "a5sscore_i_06", "a6smemory_p1_06", "a6smemory_p2_06",
    "a7a_delay_06", "a7b_delay_06", "a7c_delay_06", "a7d_delay_06",
    "a9sgrossmotor_06", "height_06", "weight_06",
    "z_tvip_06", "z_social_06", "z_language_06", "z_finmotor_06",
    "z_memory_06", "z_grmotor_06", "z_legmotor_06", "z_behavior_06",
    "z_height_06", "z_weight_06", "z_all_06",
]
df_out = df[outcome_columns]

# Input variables.
#     hogarid_old is kept as the household identifier for the household-level
#     imputation; it is not meant to be used as an input feature.
input_columns = [
    "hogarid_old", "s1age_head_05", "s1hhsize_05", "s1hhsz_undr5_05", "s1hhsz_5_14_05",
    "s1hhsz_15_24_05", "s1hhsz_25_64_05", "s1hhsz_65plus_05", "s1male_head_05",
    "s2mother_inhs_05", "s3ap5_rooms_h_05",
    "s3ap23_stime_h_05", "s3ap24_htime_h_05", "s3ap25_hqtime_h_05",
    "s3atoilet_hh_05", "s3awater_access_hh_05",
    "s3aelectric_hh_05", "s4p6_vitamina_i_05", "s4p7_parasite_i_05",
    "s11ownland_hh_05", "cons_food_pc_05", "cons_tot_pc_05",
    "height_05", "a10whz_05", "weight_05", "yrsedfath", "age_transfer", "bweight",
    "s4p7_parasite_i_06", "T", "male", "ed_mom",
    "MUN1", "MUN2", "MUN3", "MUN4", "MUN5", "MUN6",
    "com_haz_05", "com_waz_05", "com_vit_05", "com_deworm_05",
    "vitamiron_06", "propfood_05", "prstap_f_05", "pranimalprot_f_05", "prfruitveg_f_05",
]

# Select the inputs and rename the column 'T' to 'tr' in one step.
df1 = df[input_columns].rename(columns={'T': 'tr'})

<a id='functions'></a>
# 2. Funciones de limpieza de datos

In [135]:
# Copy of the dataframe
def copy_df(df):
    """Return a copy of ``df`` so that later in-place steps do not touch the input."""
    duplicate = df.copy()
    return duplicate

In [136]:
# Drop records that have too few non-missing values
def drop_ind_missing(df, delta):
    """Drop, in place, the rows whose non-missing count is below ``delta`` of the columns.

    A row is kept only when it has at least ``delta * number_of_columns``
    non-missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is modified in place.
    delta : float
        Minimum fraction of non-missing values a row needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filtering.
    """
    min_non_missing = delta * df.shape[1]
    df.dropna(axis="index", thresh=min_non_missing, inplace=True)
    return df

In [137]:
# Drop records with age_transfer < age
def drop_out_missing(df, age=-11):
    """Drop, in place, the rows whose ``age_transfer`` is strictly below ``age``.

    Rows with a missing ``age_transfer`` are kept, because the comparison is
    False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``age_transfer`` column; it is modified in place.
    age : number, default -11
        Cut-off; rows with ``age_transfer < age`` are removed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filtering.
    """
    below_cutoff = df["age_transfer"] < age
    df.drop(index=df.index[below_cutoff], inplace=True)
    return df

In [138]:
# Encoding of categorical variables
def ordinal(df, cols):
    """Replace the columns ``cols`` of ``df`` with their ordinal encoding.

    The encoding is produced by scikit-learn's ``OrdinalEncoder`` fitted on
    ``df[cols]``. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    cols : list-like of str
        Columns to encode. May be empty (for example the result of a
        ``select_dtypes`` call that matched nothing), in which case ``df`` is
        returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # ``len`` rather than truthiness: ``cols`` may be a pandas Index, whose
    # truth value is ambiguous. Fitting an encoder on zero columns would raise.
    if len(cols) == 0:
        return df

    encoder = OrdinalEncoder()
    df[cols] = encoder.fit_transform(df[cols])
    return df

In [148]:
# Imputación de valores faltantes

# Median imputation
def median_imputer(df, cols):
    """Fill the missing values of each column in ``cols`` with that column's median.

    ``df`` is modified in place. A column with no observed value is left
    unchanged, since its median is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    cols : iterable of str
        Numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in cols:
        # Assign the filled column back explicitly. Calling
        # ``df[col].fillna(..., inplace=True)`` operates on an intermediate
        # object and leaves ``df`` unchanged under pandas Copy-on-Write.
        df[col] = df[col].fillna(df[col].median())
    return df

# Mode imputation
def mode_imputer(df, cols):
    """Fill the missing values of each column in ``cols`` with that column's mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    cols : iterable of str
        Columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in cols:
        modes = df[col].mode()
        if modes.empty:
            # No observed value at all: there is nothing to impute with.
            continue
        # Assign the filled column back explicitly. Calling
        # ``df[col].fillna(..., inplace=True)`` operates on an intermediate
        # object and leaves ``df`` unchanged under pandas Copy-on-Write.
        df[col] = df[col].fillna(modes.iloc[0])
    return df


# Household-level imputation (mode / mean within the household unit)

def _hh_first_mode(values):
    """Return the most frequent observed value of ``values`` (smallest on ties), or NaN."""
    modes = values.mode()  # NaN is excluded; tied modes come back sorted ascending
    return modes.iloc[0] if len(modes) else np.nan


def _hh_impute(df, cols, statistic):
    """Fill gaps in ``cols`` with a per-household statistic of the observed values.

    Households are identified by the ``hogarid_old`` column. ``statistic`` is
    anything accepted by ``SeriesGroupBy.agg`` and must yield NaN for a
    household without observed values. The statistic is computed once per
    column, before any value of that column is filled. ``df`` is modified in place.
    """
    for col in cols:
        missing = df[col].isna().to_numpy()
        if not missing.any():
            continue
        # One value per household instead of one household scan per row.
        per_household = df.groupby("hogarid_old")[col].agg(statistic)
        candidates = df["hogarid_old"].map(per_household).to_numpy(dtype=float)
        # Only fill rows whose household has at least one observed value.
        fillable = missing & ~np.isnan(candidates)
        if fillable.any():
            # Cast to the column dtype so the assignment does not upcast it
            # (assumes a NumPy float column, as the NaN-based logic requires).
            df.loc[fillable, col] = candidates[fillable].astype(df[col].dtype)
    return df


def hh_mode_imputer(df, cols):
    """Impute missing values with the mode observed within the same household.

    For every column in ``cols``, a missing entry is replaced by the most
    frequent non-missing value among the rows sharing its ``hogarid_old``
    (the smallest value on ties). Entries whose household has no observed
    value are left missing. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``hogarid_old`` column.
    cols : iterable of str
        Numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    return _hh_impute(df, cols, _hh_first_mode)


def hh_mean_imputer(df, cols):
    """Impute missing values with the mean observed within the same household.

    For every column in ``cols``, a missing entry is replaced by the mean of
    the non-missing values among the rows sharing its ``hogarid_old``.
    Entries whose household has no observed value are left missing.
    ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``hogarid_old`` column.
    cols : iterable of str
        Numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    return _hh_impute(df, cols, "mean")
    

In [140]:
# Omisión de valores atípicos

def outlier_removal(df, cols, factor=3):
    """Return ``df`` without the rows that hold an outlier in any column of ``cols``.

    For each column the quartiles Q1 and Q3 are computed ignoring NaN, and a
    value is an outlier when it is below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. Missing values are never flagged. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    cols : iterable of str
        Numeric columns to inspect.
    factor : number, default 3
        Multiplier applied to the interquartile range to build the fences.

    Returns
    -------
    pandas.DataFrame
        New frame with the outlier rows dropped.
    """
    flagged = np.zeros(len(df), dtype=bool)
    for col in cols:
        values = df[col]
        q1, q3 = np.nanpercentile(values, [25, 75])
        iqr = q3 - q1
        low, high = q1 - factor * iqr, q3 + factor * iqr
        # Comparisons against NaN are False, so missing values are kept.
        flagged |= ((values < low) | (values > high)).to_numpy(dtype=bool)
    return df.drop(index=df.index[flagged])

In [141]:
# Normalización y estandarización

def normalize(df, cols):
    """Rescale the columns ``cols`` of ``df`` with scikit-learn's ``MinMaxScaler``.

    The scaler is fitted on ``df[cols]`` and the transformed values overwrite
    those columns in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    selected = df[cols]
    df[cols] = MinMaxScaler().fit(selected).transform(selected)
    return df

def standardize(df, cols):
    """Standardize the columns ``cols`` of ``df`` with scikit-learn's ``StandardScaler``.

    The scaler is fitted on ``df[cols]`` and the transformed values overwrite
    those columns in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    selected = df[cols]
    df[cols] = StandardScaler().fit(selected).transform(selected)
    return df

<a id='features'></a>
# 3. Definición de características

A continuación se definen los conjuntos de características según las estrategias a implementar.

In [149]:
# Columns handed to median_imputer / mode_imputer in the pipeline (global,
# whole-sample imputation).
median_cols = ["prfruitveg_f_05","prstap_f_05","pranimalprot_f_05","propfood_05","cons_food_pc_05","ed_mom","yrsedfath",
               "bweight","weight_05","height_05","a10whz_05"]
mode_cols = ["s2mother_inhs_05","s4p7_parasite_i_06","vitamiron_06","s4p7_parasite_i_05","s4p6_vitamina_i_05"]

# Columns handed to hh_mean_imputer / hh_mode_imputer (imputation within the
# household). The pipeline runs these before the global imputers above, so the
# global ones only fill what the household step left missing.
hh_mean_cols = ["prfruitveg_f_05","prstap_f_05","pranimalprot_f_05","propfood_05","cons_food_pc_05","ed_mom","yrsedfath"]
hh_mode_cols = ["s2mother_inhs_05","s4p7_parasite_i_06","vitamiron_06","s4p7_parasite_i_05","s4p6_vitamina_i_05"]

# Columns of df1 that are neither numeric, boolean nor object dtype; these are
# the ones passed to the ordinal encoder.
categorical_cols=df1.select_dtypes(exclude=["number","bool_","object_"]).columns

# Columns inspected by outlier_removal.
outlier_cols=["cons_food_pc_05","cons_tot_pc_05","s3ap24_htime_h_05","bweight","pranimalprot_f_05","prfruitveg_f_05"]

# Columns passed to standardize (StandardScaler).
normal_cols =["height_05","a10whz_05","weight_05"]

# Columns passed to normalize (MinMaxScaler).
Nnormal_cols = ['s1age_head_05', 's3ap23_stime_h_05', 's3ap24_htime_h_05', 's3ap25_hqtime_h_05', 'cons_food_pc_05', 
                'cons_tot_pc_05', 'yrsedfath','age_transfer', 'bweight', 'ed_mom', "com_haz_05","com_waz_05"]



<a id='pipe'></a>
# 4. Procesado de la *pipeline*

In [150]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Run the cleaning stages, in order, on a copy of df1 so that df1 itself is
# left untouched by the in-place steps.
cleaned_df = (
    copy_df(df1)
    # Row filters
    .pipe(drop_ind_missing, delta=0.8)
    .pipe(drop_out_missing)
    # Encoding of the categorical columns
    .pipe(ordinal, cols=categorical_cols)
    # Imputation: household level first, then whole-sample for what remains
    .pipe(hh_mean_imputer, cols=hh_mean_cols)
    .pipe(hh_mode_imputer, cols=hh_mode_cols)
    .pipe(median_imputer, cols=median_cols)
    .pipe(mode_imputer, cols=mode_cols)
    # Outlier rows are removed before any scaling is fitted
    .pipe(outlier_removal, cols=outlier_cols)
    # Scaling
    .pipe(standardize, cols=normal_cols)
    .pipe(normalize, cols=Nnormal_cols)
)

<a id='outcome'></a>
# 5. Procesado respecto al *outcome*

In [152]:
# Attach the outcome to the dataframe, dropping the individuals whose outcome is NaN.
# cleaned_df.index holds index *labels* inherited from df, so df_out is looked up
# with .loc (label-based). A positional .iloc lookup only gives the same rows
# while df_out keeps a default 0..n-1 index.
outcome = df_out.loc[cleaned_df.index, "z_all_06"]
cleaned_df = cleaned_df.loc[outcome.notna()].assign(z_all_06=outcome)

# Drop the identifier columns.
cp_cols = ["hogarid_old"]
cleaned_df = cleaned_df.drop(columns=cp_cols)

# Remove the rows whose outcome is an outlier.
cleaned_df = outlier_removal(cleaned_df, ["z_all_06"])

In [154]:
import os  
# Make sure the output folder exists (no error if it is already there), then
# write the cleaned dataset to CSV. No `index` argument is passed to to_csv,
# so pandas' default handling of the index applies.
os.makedirs('processed_data', exist_ok=True)  
cleaned_df.to_csv("processed_data/cleaned_df.csv")
