In [20]:
import requests
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

INTERIM_PATH="data/interim/ENIGH"
EXTRACT_PATH="data/raw/DATOS EDUCACION/ENCUESTA NACIONAL DE INGRESOS Y GASTOS EN LOS HOGARES/Extracted"

YEAR_DICT={
        '2016':[0,1,2,3],
        '2018':[4,5,6,7],
        '2020':[8,9,10,11],
        '2022':[12,13,14,15]
    }

YEARS=['2016','2018','2020','2022']


def ingresos(dataframes,year):
    index=YEAR_DICT[year][0]
    ingresos=dataframes[index][2]
    return ingresos

def poblacion(dataframes,year):
    index=YEAR_DICT[year][1]
    poblacion=dataframes[index][2]
    return poblacion

def gastos_hogar(dataframes,year):
    index=YEAR_DICT[year][2]
    gastos_hogar=dataframes[index][2]
    return gastos_hogar

def gastos_persona(dataframes,year):
    index=YEAR_DICT[year][3]
    gastos_persona=dataframes[index][2]
    return gastos_persona
    


def transformation(dataframes,year):
    df=poblacion(dataframes,year)[[
        'folioviv', 'foliohog', 'numren', 'parentesco', 'sexo', 'edad',
       'madre_hog', 'madre_id', 'padre_hog', 'padre_id',
       'alfabetism','asis_esc', 'nivel', 'grado', 'tipoesc',
       'nivelaprob', 'gradoaprob','antec_esc','edo_conyug', 'pareja_hog', 'conyuge_id'
    ]]
    

    counts = df['folioviv'].value_counts().reset_index()
    counts.columns = ['folioviv', 'counts']
    df= pd.merge(df, counts, on='folioviv')
    df=df.rename(columns={'counts':'integrantes'})

# Generation Classification

    bins = [1901,1924,1945, 1964, 1980, 1996, 2012, 2024 ]  # Define the edges of the bins
    labels = ['Greatest','Silent','Baby Boomer', 'X', 'Milenial', 'Z','Alpha']  # Define labels for the categories

    df['año_nacimiento']=int(year)-df['edad']
    df['generacion'] = pd.cut(df['año_nacimiento'], bins=bins, labels=labels, right=True)


# Income
    join_df=ingresos(dataframes,year)[[
        'folioviv','numren','foliohog',
        'clave', 'mes_1', 'mes_2', 'mes_3','mes_4', 'mes_5', 'mes_6',
        'ing_1', 'ing_2', 'ing_3', 'ing_4', 'ing_5','ing_6','ing_tri']]

# Bills
    gastos_p=gastos_persona(dataframes,year)[ 
        ['folioviv','numren', 
         'clave', 'tipo_gasto',
         'cantidad', 'gasto', 'costo', 'gasto_tri',
    
        ] ]

    #gastos['gasto_tri'].replace(' ', 0, inplace=True)
    #gastos['gasto_tri']=gastos['gasto_tri'].astype('float')
    #total_gasto_tri = gastos.groupby('folioviv', as_index=False)['gasto_tri'].sum().reset_index()
    df=pd.merge(df, gastos_p, on=['folioviv','numren'])

    
# Population and Income dataframes merge

    df=pd.merge(df, join_df, on=['folioviv','foliohog','numren'])




    return df


In [2]:
os.chdir("..")

In [76]:
print(f"<< Extraction Dataframes from {EXTRACT_PATH} >>")
dataframes=[]
for year in YEARS:
    paths={
    "Ingresos":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_ingresos_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_ingresos_enigh_{year}_ns.csv",
    "Poblacion":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_poblacion_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_poblacion_enigh_{year}_ns.csv",
    "Gastos Hogar":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastoshogar_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_gastoshogar_enigh_{year}_ns.csv",
    "Gastos Personal":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastospersona_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_gastospersona_enigh_{year}_ns.csv" }
    if year=='2022':
        paths={
        "Ingresos":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_ingresos_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_ingresos_enigh{year}_ns.csv",
        "Poblacion":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_poblacion_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_poblacion_enigh{year}_ns.csv",
        "Gastos Hogar":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastoshogar_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_gastoshogar_enigh{year}_ns.csv",
        "Gastos Personal":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastospersona_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_gastospersona_enigh{year}_ns.csv"}


    print(f" Extracting data for the year {year}...")
    for key,path in paths.items():
        print(f"{key} table is being read...")
        df=pd.read_csv(path)
        #df=df[df['entidad']==26]
        dataframes.append( (year, key, df) ) 
        print(f"{key} dataframe has been loaded to the dataframe list succesfully!")




<< Extraction Dataframes from data/raw/DATOS EDUCACION/ENCUESTA NACIONAL DE INGRESOS Y GASTOS EN LOS HOGARES/Extracted >>
 Extracting data for the year 2016...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...
Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...
Gastos Personal dataframe has been loaded to the dataframe list succesfully!
 Extracting data for the year 2018...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...


  df=pd.read_csv(path)


Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...


  df=pd.read_csv(path)


Gastos Personal dataframe has been loaded to the dataframe list succesfully!
 Extracting data for the year 2020...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...


  df=pd.read_csv(path)


Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...


  df=pd.read_csv(path)


Gastos Personal dataframe has been loaded to the dataframe list succesfully!
 Extracting data for the year 2022...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...
Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...
Gastos Personal dataframe has been loaded to the dataframe list succesfully!


In [228]:
print("<< Starting income,population and house bills merging >>")

print(" Merging for year 2022 taking place...")
df_2022=transformation(dataframes,'2022')
print(" Merging for year 2022 completed!")

print(" Merging for year 2020 taking place...")
df_2020=transformation(dataframes,'2020')
print(" Merging for year 2020 completed!")

print(" Merging for year 2018 taking place...")
df_2018=transformation(dataframes,'2018')
print(" Merging for year 2018 completed!")

print(" Merging for year 2016 taking place...")
df_2016=transformation(dataframes,'2016')
print(" Merging for year 2016 completed!")

<< Starting income,population and house bills merging >>
 Merging for year 2022 taking place...
 Merging for year 2022 completed!
 Merging for year 2020 taking place...
 Merging for year 2020 completed!
 Merging for year 2018 taking place...
 Merging for year 2018 completed!
 Merging for year 2016 taking place...
 Merging for year 2016 completed!


# Formatting

madre_id: es tipo int,  se refiere al valor de 'numren' del padre del hogar, el 0 significa que no se encuentra la madre del entrevistado en el hogar.El valor 99 proveniente de validación se etiqueta como “no especificado” o “perdido por el usuario”

padre_id: es tipo int, se refiere al valor de 'numren' del padre del hogar, el 0 significa que no se encuentra el padre del entrevistado en el hogar.El valor 99  proveniente de validación se etiqueta como “no especificado” o “perdido por el usuario”. 

alfabetism: es tipo categoría(int),el valor 0 significa que el entrevistado tiene menos de 3 años,1 sí y 2 no.

mes_n: tipo categoría(int), mes en el que recibió el entrevistado ingresos en el n-esimo mes, n va de 1 a 6. 

ing_n:tipo float, cantidad de ingreso que el entrevistado recibió en el mes_n. 




In [235]:
def null_imputation(df):
    ZERO=['madre_id','padre_id','conyuge_id','alfabetism',
          'edo_conyug','pareja_hog','antec_esc',
          'gradoaprob','nivelaprob','tipoesc','grado','nivel','asis_esc',
          'cantidad','gasto','gasto_tri','costo',
          'mes_1','mes_2','mes_3','mes_4','mes_5','mes_6',
          'ing_1','ing_2','ing_3','ing_4','ing_5','ing_6',
          ] 
    

    LOST_DATA=['madre_id','padre_id','conyuge_id'
               
               ]
    
    for column in df.columns:
        #print(column)
        if pd.api.types.is_categorical_dtype(df[column]):
            continue
        if pd.api.types.is_integer_dtype(df[column]):
            df[column]=df[column].fillna(0)
            #print(f"{column} filled with {0}")
            continue
        if pd.api.types.is_float_dtype(df[column]):
            df[column]=df[column].fillna(0)
            #print(f"{column} filled with {0}")
            continue
        if pd.api.types.is_object_dtype(df[column]):
            df[column]=df[column].fillna('0')
            #print(f"{column} filled with {'0'}")
            continue
        if pd.api.types.is_string_dtype(df[column]):
            df[column]=df[column].fillna('0')
            #print(f"{column} filled with {'0'}")
            continue

    for column in ZERO:
        df[column]=df[column].replace(' ','0')
        #print(f'Blank strings dealt with in column :{column} ')
    for column in LOST_DATA:
        df[column]=df[column].replace('&','99')
        #print(f'Lost data dealt with in column :{column} ')
    return df

def set_format(df):
    FORMATS={

    'int':['folioviv', 'foliohog', 'numren', 'parentesco', 'sexo', 'edad','año_nacimiento',
           'alfabetism','conyuge_id','padre_id','madre_hog', 'madre_id', 'padre_hog','edo_conyug',
           'mes_1', 'mes_2', 'mes_3','mes_4', 'mes_5', 'mes_6',
           'integrantes','pareja_hog',
           'grado','nivel','asis_esc','tipoesc','nivelaprob','gradoaprob','antec_esc',] ,
    
    'string':['generacion','clave_y','clave_x','tipo_gasto'],
    
    'float':['costo','gasto', 'gasto_tri','ing_tri','ing_1', 'ing_2', 'ing_3', 'ing_4', 'ing_5','ing_6','cantidad'],
    }
    columns_with_errors=[]
    for column in df.columns:
        for format,columns in FORMATS.items():
            #print(f"{column} | {format}  | {columns}")
            if column in columns:
                try:
                    #print(column)
                    df[column]=df[column].astype(format)
                    #print(f'{column} guardada como {format}')
                    break
                except ValueError as e:
                    columns_with_errors.append([column,e])
    return df,columns_with_errors

In [234]:
print("<< Starting Formatting >>")

print(" Formatting for year 2022 taking place...")
df_2022=null_imputation(df_2022)
df_2022,errors_2022=set_format(df_2022)
print(f" Formatting for year 2022 completed with {len(errors_2022)} errors!")

print(" Formatting for year 2020 taking place...")
df_2020=null_imputation(df_2020)
df_2020,errors_2020=set_format(df_2020)
print(f" Formatting for year 2020 completed with {len(errors_2020)} errors!")

print(" Formatting for year 2018 taking place...")
df_2018=null_imputation(df_2018)
df_2018,errors_2018=set_format(df_2018)
print(f" Formatting for year 2018 completed with {len(errors_2018)} errors!")

print(" Formatting for year 2016 taking place...")
df_2016=null_imputation(df_2016)
df_2016,errors_2016=set_format(df_2016)
print(f" Formatting for year 2016 completed with {len(errors_2016)} errors!")

<< Starting Formatting >>
 Formatting for year 2022 taking place...
 Formatting for year 2022 completed with 0 errors!
 Formatting for year 2020 taking place...
 Formatting for year 2020 completed with 0 errors!
 Formatting for year 2018 taking place...
 Formatting for year 2018 completed with 0 errors!
 Formatting for year 2016 taking place...
 Formatting for year 2016 completed with 0 errors!


In [74]:
def generacion_conyuge(df,folioviv,conyuge_id,edo_conyug):
    if (conyuge_id==' '):
        if (edo_conyug in ['6','5','4','3']) or (edo_conyug in [ 6.,  5.,  3.,  4.]):
            return 'No tiene conyuge'
        if (edo_conyug==' ')or (math.isnan(edo_conyug)):
            return "Tiene menos de 12 años"
        if (edo_conyug in ['2','1']) or (edo_conyug in [2.,1.]) :
            return 'No se sabe que significa'
        
    if conyuge_id=='&':
        return "Dato perdido o no especificado"
    
    try :
        if math.isnan(conyuge_id):
            if (edo_conyug in ['6','5','4','3']) or (edo_conyug in [ 6.,  5.,  3.,  4.]):
                return 'No tiene conyuge'
            if (edo_conyug==' ')or (math.isnan(edo_conyug)):
                return "Tiene menos de 12 años"
            if (edo_conyug in ['2','1']) or (edo_conyug in [2.,1.]) :
                return 'No se sabe que significa'
    except ValueError:
        return 'No se sabe'
    
    work_df=df[df['folioviv']==folioviv]

    
    conyuge_row=work_df[work_df['numren']==int(conyuge_id)].reset_index()
    if conyuge_row.empty:
        return "No se sabe"
    generacion_conyuge=conyuge_row.loc[0, 'generacion']
    
    return generacion_conyuge

def delta_calculator(generacion_1,generacion_2):
    for case in ['No tiene conyuge','Tiene menos de 12 años','No se sabe que significa','Dato perdido o no especificado','No se sabe']:
        if (generacion_1==case or generacion_2==case):
            return case
    
    
    
    generation_dict={'Greatest':0,
                     'Silent':1,
                     'Baby Boomer':2,
                     'X':3,
                     'Milenial':4,
                     'Z':5,
                     'Alpha':6}
    
    delta=abs(generation_dict[generacion_1]-generation_dict[generacion_2])
    return delta

def generational_transformation(df):
    df['generacion_conyuge']=df.apply(lambda row: generacion_conyuge(df, row['folioviv'], row['conyuge_id'],row['edo_conyug']), axis=1)
    df['delta']=df.apply(lambda row: delta_calculator( row['generacion'], row['generacion_conyuge']), axis=1)
    return df


In [75]:
print("<< Starting generational transformations >>")

print(" Generational transformations for year 2022 taking place...")
df_2022=generational_transformation(df_2022)
print("Transformations for year 2022 completed!")

print(" Generational transformations for year 2020 taking place...")
df_2020=generational_transformation(df_2020)
print("Transformations for year 2020 completed!")

print(" Generational transformations for year 2018 taking place...")
df_2018=generational_transformation(df_2018)
print("Transformations for year 2018 completed!")

print(" Generational transformations for year 2016 taking place...")
df_2016=generational_transformation(df_2016)
print("Transformations for year 2016 completed!")

<< Starting generational transformations >>
 Generational transformations for year 2022 taking place...


TypeError: must be real number, not str

In [12]:
os.makedirs(INTERIM_PATH,exist_ok=True)
print(f"<< Loading processed data to :{INTERIM_PATH} >>")

print("Loading 2022 data...")
df_2022.to_csv(f"{INTERIM_PATH}/2022.csv")
print("ENIGH 2022 data loaded successfully!!")

print("Loading 2020 data...")
df_2020.to_csv(f"{INTERIM_PATH}/2020.csv")
print("ENIGH 2020 data loaded successfully!!")

print("Loading 2018 data...")
df_2018.to_csv(f"{INTERIM_PATH}/2018.csv")
print("ENIGH 2018 data loaded successfully!!")

print("Loading 2016 data...")
df_2016.to_csv(f"{INTERIM_PATH}/2016.csv")
print("ENIGH 2016 data loaded successfully!!")

<< Loading processed data to :data/interim/ENIGH >>
Loading 2022 data...
ENIGH 2022 data loaded successfully!!
Loading 2020 data...
ENIGH 2020 data loaded successfully!!
Loading 2018 data...
ENIGH 2018 data loaded successfully!!
Loading 2016 data...
ENIGH 2016 data loaded successfully!!
