# Índice

1. [Exploración inicial de características](#explo)
1. [Variables de respuesta](#outcome)
   1. [Datos faltantes](#nanout)
1. [Variables independientes](#input)
   1. [Correlaciones](#corr)
   1. [Gestión de datos faltantes](#nan)
   1. [Valores atípicos](#outlier)
   1. [Reescalado de datos](#scale)
1. [Resumen](#summary)

Notebook imitant:
    https://towardsdatascience.com/causal-machine-learning-for-econometrics-causal-forests-5ab3aec825a7

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn 

In [2]:
# Read the raw Stata file into `df`; the column subsets used later in the
# notebook (df_out, df1) are selected from this frame.

df = pd.read_stata('Cash_and_Childhood_Development_Replication/macoursetal_main.dta')  

In [3]:
# "hogarid_old", "cp_old", "unique_05" and "cpmom_06" will be used as the index.
# df_out keeps only the `_06`-suffixed columns listed below
# (presumably the response/outcome variables — TODO confirm against the codebook).
df_out=df[["a5sscore_i_06","a6smemory_p1_06","a6smemory_p2_06","a7a_delay_06","a7b_delay_06",
        "a7c_delay_06","a7d_delay_06","a9sgrossmotor_06","height_06","weight_06","z_tvip_06","z_social_06",
        "z_language_06","z_finmotor_06","z_memory_06","z_grmotor_06","z_legmotor_06","z_behavior_06","z_height_06",
        "z_weight_06","z_all_06"]]


# Income variables

#2-6: Treiem variables:

"hogarid_old", "cp_old", "unique_05", "itt_all_i", "TREAT1", "TREAT2", "TREAT3", "TREAT4", "com_tvip_05", "com_control_05", "com_notvip", "sample06", "weighted_05", "itt_i"

In [4]:
# df1 keeps the household id ("hogarid_old", used later to group rows when
# imputing) together with the columns listed below, selected from `df`.
# NOTE(review): these look like the independent/input variables, with "T" as the
# treatment flag and MUN1..MUN6 as municipality dummies — confirm against the codebook.
df1=df[["hogarid_old","s1age_head_05","s1hhsize_05","s1hhsz_undr5_05","s1hhsz_5_14_05",
        "s1hhsz_15_24_05","s1hhsz_25_64_05","s1hhsz_65plus_05","s1male_head_05","s2mother_inhs_05","s3ap5_rooms_h_05",
        "s3ap23_stime_h_05","s3ap24_htime_h_05","s3ap25_hqtime_h_05","s3atoilet_hh_05","s3awater_access_hh_05",
        "s3aelectric_hh_05","s4p6_vitamina_i_05","s4p7_parasite_i_05","s11ownland_hh_05","cons_food_pc_05","cons_tot_pc_05",
        "height_05","a10whz_05","weight_05","yrsedfath","age_transfer","bweight",
        "s4p7_parasite_i_06","T","male","ed_mom","MUN1","MUN2","MUN3","MUN4",
        "MUN5","MUN6","com_haz_05","com_waz_05","com_vit_05","com_deworm_05",
        "vitamiron_06", "propfood_05","prstap_f_05","pranimalprot_f_05","prfruitveg_f_05"]]

In [5]:
# Rename column 'T' to 'tr'. The result is rebound instead of using
# inplace=True: df1 was produced by selecting columns from df, and renaming
# that selection in place raises pandas' SettingWithCopyWarning.
df1 = df1.rename(columns={'T': 'tr'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.rename(columns = {'T':'tr'}, inplace = True)


In [6]:
def copy_df(df):
    """Return a copy of ``df`` made with ``DataFrame.copy()`` (default arguments)."""
    duplicate = df.copy()
    return duplicate

In [7]:
def drop_ind_missing(df, delta):
    """Drop, in place, the rows that have too few non-missing values.

    A row is kept only when at least ``delta * len(df.columns)`` of its cells
    are non-NA. Equivalently, rows with more than ``(1 - delta) * 100`` % of
    their columns missing are removed (with ``delta=0.8`` a row may miss at
    most 20 % of the columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is modified in place.
    delta : float
        Minimum fraction of columns that must be non-missing in a kept row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the rows were dropped.
    """
    # `thresh` is documented as an integer count of required non-NA values.
    # Rounding the fractional product up keeps exactly the same rows, because
    # the per-row non-NA count is an integer.
    min_non_na = int(np.ceil(len(df.columns) * delta))
    df.dropna(axis=0, thresh=min_non_na, inplace=True)
    return df

In [8]:
def drop_out_missing(df, age=-11):
    """Drop, in place, every row whose ``age_transfer`` is below ``age``.

    Rows whose ``age_transfer`` is NaN do not satisfy the comparison and are
    kept. The same ``df`` object is returned.
    """
    below_cutoff = (df["age_transfer"] < age).to_numpy()
    labels_to_drop = df.index[below_cutoff]
    df.drop(index=labels_to_drop, inplace=True)
    return df

In [9]:
def ordinal(df, cols):
    """Replace ``cols`` of ``df`` with their ordinal encoding.

    A fresh ``OrdinalEncoder`` is fitted on ``df[cols]`` and the encoded values
    overwrite those same columns. ``df`` is returned.
    """
    encoded = OrdinalEncoder().fit_transform(df[cols])
    df[cols] = encoded
    return df

In [10]:
#imputem NaNs

def mean_imputer(df, cols):
    """Fill the missing values of each column in ``cols`` with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are overwritten.
    cols : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns imputed.
    """
    for col in cols:
        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: that form mutates a temporary
        # selection, which warns in recent pandas and has no effect on `df`
        # once Copy-on-Write is enabled.
        df[col] = df[col].fillna(df[col].mean())
    return df

def _most_frequent(values):
    """Return the smallest most-frequent non-missing value of ``values`` (NaN if none)."""
    modes = values.mode(dropna=True)
    return modes.iloc[0] if len(modes) else np.nan


def hh_mf_imputer(df, cols):
    """Fill missing values with the most frequent value inside the household.

    Rows are grouped by ``hogarid_old``. For every column in ``cols``, each
    missing cell receives the most frequent *observed* value of that column
    within the same household; ties go to the smallest value. Cells stay
    missing when the household has no observed value for the column or when
    ``hogarid_old`` itself is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Must contain ``hogarid_old``. Modified in place.
    cols : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns imputed.
    """
    household = df["hogarid_old"]
    for col in cols:
        missing = df[col].isna().to_numpy()
        if not missing.any():
            continue
        # Missing values are excluded from the mode. Counting NaN as a
        # candidate would let it win whenever a household has more missing
        # than observed cells, leaving those cells unimputed.
        mode_by_household = df.groupby("hogarid_old")[col].agg(_most_frequent)
        candidate = household.map(mode_by_household).to_numpy()
        fill = missing & ~pd.isna(candidate)
        # Positional boolean mask, so the assignment needs no index alignment.
        df.loc[fill, col] = candidate[fill]
    return df
                            
                        

In [11]:
def outlier_removal(df, cols, factor=3):
    """Return ``df`` without the rows that are IQR outliers in any of ``cols``.

    For each column the quartiles Q1 and Q3 are computed ignoring NaNs; a value
    is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. NaN cells are never treated as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    cols : sequence of str
        Names of the numeric columns to check.
    factor : float, default 3
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows whose index label belongs to an outlier
        removed.
    """
    is_outlier = np.zeros(len(df), dtype=bool)
    for col in cols:
        values = df[col].to_numpy()
        q1, q3 = np.nanpercentile(values, [25, 75])
        spread = factor * (q3 - q1)
        # Comparisons against NaN are False, so missing cells are kept.
        is_outlier |= (values < q1 - spread) | (values > q3 + spread)
    return df.drop(index=df.index[is_outlier])

In [12]:
def normalize(df, cols):
    """Rescale ``cols`` of ``df`` with a freshly fitted ``MinMaxScaler``.

    The scaled values overwrite the same columns of ``df``, which is returned.
    """
    rescaled = MinMaxScaler().fit_transform(df[cols])
    df[cols] = rescaled
    return df

def standardize(df, cols):
    """Standardize ``cols`` of ``df`` with a freshly fitted ``StandardScaler``.

    The transformed values overwrite the same columns of ``df``, which is returned.
    """
    standardized = StandardScaler().fit_transform(df[cols])
    df[cols] = standardized
    return df

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Columns whose missing values are filled with the column mean only.
mean_cols=["bweight", "height_05", "a10whz_05", "weight_05"]
# Columns imputed first with the most frequent value inside the household,
# then with the column mean for whatever is still missing.
hh_mf_cols=["s1age_head_05","s1hhsize_05","s1hhsz_undr5_05","s1hhsz_5_14_05","s1hhsz_15_24_05","s1hhsz_25_64_05",
            "s1hhsz_65plus_05","s1male_head_05","s2mother_inhs_05","s3ap5_rooms_h_05","s3atoilet_hh_05",
            "s3awater_access_hh_05","s3aelectric_hh_05","s11ownland_hh_05","s4p7_parasite_i_06","ed_mom","vitamiron_06",
           "s4p6_vitamina_i_05", "s4p7_parasite_i_05", "cons_food_pc_05", "yrsedfath", "propfood_05", "prstap_f_05"
            ,  "pranimalprot_f_05",  "prfruitveg_f_05"]

# Every column that is neither numeric, boolean nor object dtype is ordinal-encoded.
categorical_cols=df1.select_dtypes(exclude=["number","bool_","object_"]).columns
outlier_cols=["pranimalprot_f_05", "bweight", "prfruitveg_f_05"]

# Columns passed to standardize() (StandardScaler).
normal_cols =["height_05","a10whz_05","weight_05","com_haz_05","com_waz_05"]

# Columns passed to normalize() (MinMaxScaler).
# 'com_tvip_05' and 'com_control_05' used to be listed here, but they are not
# among the columns selected into df1, so normalize() failed with a KeyError.
Nnormal_cols = ['s1age_head_05', 's3ap23_stime_h_05', 's3ap24_htime_h_05', 's3ap25_hqtime_h_05', 'cons_food_pc_05',
                'cons_tot_pc_05', 'yrsedfath','age_transfer', 'bweight', 'ed_mom']


# The pipeline starts from a copy because several steps modify their input in place.
cleaned_df=(df1.pipe(copy_df)
            .pipe(drop_ind_missing, 0.8)
            .pipe(drop_out_missing)
            .pipe(ordinal, categorical_cols)
            .pipe(hh_mf_imputer, hh_mf_cols)
            .pipe(mean_imputer, mean_cols+hh_mf_cols)
            .pipe(outlier_removal, outlier_cols) # outlier removal
            .pipe(standardize, normal_cols) # standardization
            .pipe(normalize, Nnormal_cols)) # min-max normalization
                         

TODO: guardar `cleaned_df` como pickle.