In [2]:
import requests
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

INTERIM_PATH="data/interim/ENIGH"
EXTRACT_PATH="data/raw/DATOS EDUCACION/ENCUESTA NACIONAL DE INGRESOS Y GASTOS EN LOS HOGARES/Extracted"

YEAR_DICT={
        '2016':[0,1,2,3],
        '2018':[4,5,6,7],
        '2020':[8,9,10,11],
        '2022':[12,13,14,15]
    }

YEARS=['2016','2018','2020','2022']


def ingresos(dataframes,year):
    index=YEAR_DICT[year][0]
    ingresos=dataframes[index][2]
    return ingresos

def poblacion(dataframes,year):
    index=YEAR_DICT[year][1]
    poblacion=dataframes[index][2]
    return poblacion

def gastos_hogar(dataframes,year):
    index=YEAR_DICT[year][2]
    gastos_hogar=dataframes[index][2]
    return gastos_hogar

def gastos_persona(dataframes,year):
    index=YEAR_DICT[year][3]
    gastos_persona=dataframes[index][2]
    return gastos_persona
    


def transformation(dataframes,year):
    df=poblacion(dataframes,year)
    

    counts = df['folioviv'].value_counts().reset_index()
    counts.columns = ['folioviv', 'counts']
    df= pd.merge(df, counts, on='folioviv')
    df=df.rename(columns={'counts':'integrantes'})

# Generation Classification

    bins = [1901,1924,1945, 1964, 1980, 1996, 2012, 2024 ]  # Define the edges of the bins
    labels = ['Greatest','Silent','Baby Boomer', 'X', 'Milenial', 'Z','Alpha']  # Define labels for the categories

    df['año_nacimiento']=int(year)-df['edad']
    df['generacion'] = pd.cut(df['año_nacimiento'], bins=bins, labels=labels, right=True)


# Income
    join_df=ingresos(dataframes,year)[['folioviv','numren','foliohog',
                              'ing_tri','clave']]


    counts = join_df['folioviv'].value_counts().reset_index()
    counts.columns = ['folioviv', 'numero_de_ingresos_hogar']
    join_df= pd.merge(join_df, counts, on='folioviv')

    total_ing_tri = ingresos(dataframes,year).groupby('folioviv', as_index=False)['ing_tri'].sum()
    join_df= pd.merge(join_df, total_ing_tri, on='folioviv', suffixes=('', '_hogar_total'))

    ing_tri_ind=join_df.groupby(['folioviv','numren','foliohog'])['ing_tri'].sum().reset_index()
    join_df=pd.merge(join_df, ing_tri_ind, on=['folioviv','numren','foliohog'], suffixes=('', '_individual_total'))

# Bills
    gastos=gastos_hogar(dataframes,year='2022')[ ['folioviv','entidad',
       'clave','gasto','gasto_tri'
       ] ]

    gastos['gasto_tri'].replace(' ', 0, inplace=True)
    gastos['gasto_tri']=gastos['gasto_tri'].astype('float')
    total_gasto_tri = gastos.groupby('folioviv', as_index=False)['gasto_tri'].sum().reset_index()
    df=pd.merge(df, total_gasto_tri, on='folioviv', suffixes=('', '_hogar_total'))

    
# Population and Income dataframes merge

    df=pd.merge(df, join_df, on=['folioviv','foliohog','numren'])

# Income Classification
    bins = [-1, 19999, 59999, 99999, 139999, 189999, 200000, float('inf')]  
    labels = ['0-19,999', '20,000-59,999', '60,000-99,999', '100,000-139,999', '140,000-189,999', '190,000-200,000', '200,000 +']


    df['niveles_hogar'] = pd.cut(df['ing_tri_hogar_total'], bins=bins, labels=labels, right=False)
    df['niveles_individual'] = pd.cut(df['ing_tri_individual_total'], bins=bins, labels=labels, right=False)



    return df


In [6]:
os.chdir("..")

In [8]:
print(f"<< Extraction Dataframes from {EXTRACT_PATH} >>")
dataframes=[]
for year in YEARS:
    paths={
    "Ingresos":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_ingresos_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_ingresos_enigh_{year}_ns.csv",
    "Poblacion":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_poblacion_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_poblacion_enigh_{year}_ns.csv",
    "Gastos Hogar":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastoshogar_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_gastoshogar_enigh_{year}_ns.csv",
    "Gastos Personal":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastospersona_enigh_{year}_ns/conjunto_de_datos/conjunto_de_datos_gastospersona_enigh_{year}_ns.csv" }
    if year=='2022':
        paths={
        "Ingresos":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_ingresos_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_ingresos_enigh{year}_ns.csv",
        "Poblacion":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_poblacion_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_poblacion_enigh{year}_ns.csv",
        "Gastos Hogar":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastoshogar_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_gastoshogar_enigh{year}_ns.csv",
        "Gastos Personal":f"{EXTRACT_PATH}/{year}/conjunto_de_datos_gastospersona_enigh{year}_ns/conjunto_de_datos/conjunto_de_datos_gastospersona_enigh{year}_ns.csv"}


    print(f" Extracting data for the year {year}...")
    for key,path in paths.items():
        print(f"{key} table is being read...")
        df=pd.read_csv(path)
        #df=df[df['entidad']==26]
        dataframes.append( (year, key, df) ) 
        print(f"{key} dataframe has been loaded to the dataframe list succesfully!")




<< Extraction Dataframes from data/raw/DATOS EDUCACION/ENCUESTA NACIONAL DE INGRESOS Y GASTOS EN LOS HOGARES/Extracted >>
 Extracting data for the year 2016...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...
Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...
Gastos Personal dataframe has been loaded to the dataframe list succesfully!
 Extracting data for the year 2018...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...


  df=pd.read_csv(path)


Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...


  df=pd.read_csv(path)


Gastos Personal dataframe has been loaded to the dataframe list succesfully!
 Extracting data for the year 2020...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...


  df=pd.read_csv(path)


Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...


  df=pd.read_csv(path)


Gastos Personal dataframe has been loaded to the dataframe list succesfully!
 Extracting data for the year 2022...
Ingresos table is being read...
Ingresos dataframe has been loaded to the dataframe list succesfully!
Poblacion table is being read...


  df=pd.read_csv(path)


Poblacion dataframe has been loaded to the dataframe list succesfully!
Gastos Hogar table is being read...
Gastos Hogar dataframe has been loaded to the dataframe list succesfully!
Gastos Personal table is being read...
Gastos Personal dataframe has been loaded to the dataframe list succesfully!


In [9]:
def generacion_conyuge(df,folioviv,conyuge_id):
    if (conyuge_id==' '):
        return "No se sabe"
    if conyuge_id=='&':
        return "Que pedo"
    
    try:
        if math.isnan(float(conyuge_id)):
            return "No se sabe"
    except ValueError:
        # Manejo del caso donde conyuge_id no puede ser convertido a float
        return "No se sabe"
    
    work_df=df[df['folioviv']==folioviv]

    
    conyuge_row=work_df[work_df['numren']==int(conyuge_id)].reset_index()
    if conyuge_row.empty:
        return "No se sabe"
    generacion_conyuge=conyuge_row.loc[0, 'generacion']
    
    return generacion_conyuge

def delta_calculator(generacion_1,generacion_2):
    if (generacion_1=="No se sabe" or generacion_2=="No se sabe"):
        return "No se sabe"
    if (generacion_1=="Que pedo" or generacion_2=="Que pedo"):
        return "Que pedo"
    
    generation_dict={'Greatest':0,
                     'Silent':1,
                     'Baby Boomer':2,
                     'X':3,
                     'Milenial':4,
                     'Z':5,
                     'Alpha':6}
    delta=abs(generation_dict[generacion_1]-generation_dict[generacion_2])
    return delta

def generational_transformation(df):
    df['generacion_conyuge']=df.apply(lambda row: generacion_conyuge(df, row['folioviv'], row['conyuge_id']), axis=1)
    df['delta']=df.apply(lambda row: delta_calculator( row['generacion'], row['generacion_conyuge']), axis=1)
    return df


In [10]:
print("<< Starting income,population and house bills merging >>")

print(" Merging for year 2022 taking place...")
df_2022=transformation(dataframes,'2022')
print(" Merging for year 2022 completed!")

print(" Merging for year 2020 taking place...")
df_2020=transformation(dataframes,'2020')
print(" Merging for year 2020 completed!")

print(" Merging for year 2018 taking place...")
df_2018=transformation(dataframes,'2018')
print(" Merging for year 2018 completed!")

print(" Merging for year 2016 taking place...")
df_2016=transformation(dataframes,'2016')
print(" Merging for year 2016 completed!")

<< Starting income,population and house bills merging >>
 Merging for year 2022 taking place...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri']=gastos['gasto_tri'].ast

 Merging for year 2022 completed!
 Merging for year 2020 taking place...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri']=gastos['gasto_tri'].ast

 Merging for year 2020 completed!
 Merging for year 2018 taking place...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri']=gastos['gasto_tri'].ast

 Merging for year 2018 completed!
 Merging for year 2016 taking place...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri'].replace(' ', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gastos['gasto_tri']=gastos['gasto_tri'].ast

 Merging for year 2016 completed!


In [11]:
print("<< Starting generational transformations >>")

print(" Generational transformations for year 2022 taking place...")
df_2022=generational_transformation(df_2022)
print("Transformations for year 2022 completed!")

print(" Generational transformations for year 2020 taking place...")
df_2020=generational_transformation(df_2020)
print("Transformations for year 2020 completed!")

print(" Generational transformations for year 2018 taking place...")
df_2018=generational_transformation(df_2018)
print("Transformations for year 2018 completed!")

print(" Generational transformations for year 2016 taking place...")
df_2016=generational_transformation(df_2016)
print("Transformations for year 2016 completed!")

<< Starting generational transformations >>
 Generational transformations for year 2022 taking place...
Transformations for year 2022 completed!
 Generational transformations for year 2020 taking place...
Transformations for year 2020 completed!
 Generational transformations for year 2018 taking place...
Transformations for year 2018 completed!
 Generational transformations for year 2016 taking place...
Transformations for year 2016 completed!


In [12]:
os.makedirs(INTERIM_PATH,exist_ok=True)
print(f"<< Loading processed data to :{INTERIM_PATH} >>")

print("Loading 2022 data...")
df_2022.to_csv(f"{INTERIM_PATH}/2022.csv")
print("ENIGH 2022 data loaded successfully!!")

print("Loading 2020 data...")
df_2020.to_csv(f"{INTERIM_PATH}/2020.csv")
print("ENIGH 2020 data loaded successfully!!")

print("Loading 2018 data...")
df_2018.to_csv(f"{INTERIM_PATH}/2018.csv")
print("ENIGH 2018 data loaded successfully!!")

print("Loading 2016 data...")
df_2016.to_csv(f"{INTERIM_PATH}/2016.csv")
print("ENIGH 2016 data loaded successfully!!")

<< Loading processed data to :data/interim/ENIGH >>
Loading 2022 data...
ENIGH 2022 data loaded successfully!!
Loading 2020 data...
ENIGH 2020 data loaded successfully!!
Loading 2018 data...
ENIGH 2018 data loaded successfully!!
Loading 2016 data...
ENIGH 2016 data loaded successfully!!
