In [4]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Cargar los csv como dataframes
df_2015 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2015.csv")
df_2016 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2016.csv")
df_2017 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2017.csv")
df_2018 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2018.csv")
df_2019 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2019.csv")
df_2020 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2020.csv")
df_2021 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2021.csv")
df_2022 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2022.csv")
df_2023 = pd.read_csv("DATASET_ORIGINAL/World Happiness Report 2023.csv")

In [3]:
# Función para exploración general de datos
def exploracion_datos(df):
    print('_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________\n')
    print(df.info())
    
    print('___________________ FORMA DEL DATAFRAME ____________________\n')
    print(f"El número de filas que tenemos es de {df.shape[0]}.\nEl número de columnas es de {df.shape[1]}\n")
    
    print('_______________ NULOS, ÚNICOS Y DUPLICADOS _________________\n')
    print('La cantidad de valores NULOS por columna es de:\n')
    print(df.isnull().sum())
    print('____________________________________________________________\n')

    print('El porcentaje de valores NULOS por columna es de:\n')
    porcentaje_nulos = (df.isnull().sum() / df.shape[0]) * 100
    porcentaje_nulos = porcentaje_nulos.round(2)
    print(porcentaje_nulos)
    print('____________________________________________________________\n')

    print('La cantidad de valores ÚNICOS por columna es de:\n')
    for columna in df.columns:
        cantidad_unicos = df[columna].nunique()
        print(f'La columna {columna} tiene {cantidad_unicos} valores únicos.')
    print('____________________________________________________________\n')

    print('La cantidad de valores DUPLICADOS por columna es de:\n')
    for columna in df.columns:
        cantidad_duplicados = df.duplicated(subset=columna).sum()
        print(f'La columna {columna} tiene {cantidad_duplicados} valores duplicados.')
    print('____________________________________________________________\n')
    
    print('____________________ RESUMEN ESTADÍSTICO ____________________\n')
    print('___________________ Variables Numéricas _____________________\n')
    if df.select_dtypes(include=[float, int]).empty:
        print("No hay variables numéricas para mostrar el resumen estadístico.")
    else:
        print(df.describe().T)
    print('____________________________________________________________\n')
    print('___________________ Variables Categóricas _____________________\n')
    if df.select_dtypes(include=[object]).empty:
        print("No hay variables categóricas para mostrar el resumen estadístico.")
    else:
        print('__________________ Variables Categóricas ____________________\n')
        print(df.describe(include='object').T)

In [4]:
exploracion_datos(df_2015)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    f

In [5]:
exploracion_datos(df_2016)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    f

In [6]:
exploracion_datos(df_2017)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    f

In [7]:
exploracion_datos(df_2018)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
None
___________________ FORMA DEL DATAFRAME ____________________

El número de filas que tenemos es de 156.
El número de 

In [8]:
exploracion_datos(df_2019)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
None
___________________ FORMA DEL DATAFRAME ____________________

El número de filas que tenemos es de 156.
El número de 

In [9]:
exploracion_datos(df_2020)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                153 non-null    object 
 1   Regional indicator                          153 non-null    object 
 2   Ladder score                                153 non-null    float64
 3   Standard error of ladder score              153 non-null    float64
 4   upperwhisker                                153 non-null    float64
 5   lowerwhisker                                153 non-null    float64
 6   Logged GDP per capita                       153 non-null    float64
 7   Social support                              153 non-null    float64
 8   Healthy life expectancy                     153 non-null    float64
 9   Freedom to make life choi

In [10]:
exploracion_datos(df_2021)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                149 non-null    object 
 1   Regional indicator                          149 non-null    object 
 2   Ladder score                                149 non-null    float64
 3   Standard error of ladder score              149 non-null    float64
 4   upperwhisker                                149 non-null    float64
 5   lowerwhisker                                149 non-null    float64
 6   Logged GDP per capita                       149 non-null    float64
 7   Social support                              149 non-null    float64
 8   Healthy life expectancy                     149 non-null    float64
 9   Freedom to make life choi

In [11]:
exploracion_datos(df_2022)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   RANK                                        146 non-null    int64  
 1   Country                                     146 non-null    object 
 2   Happiness score                             146 non-null    float64
 3   Whisker-high                                146 non-null    float64
 4   Whisker-low                                 146 non-null    float64
 5   Dystopia (1.83) + residual                  146 non-null    float64
 6   Explained by: GDP per capita                146 non-null    float64
 7   Explained by: Social support                146 non-null    float64
 8   Explained by: Healthy life expectancy       146 non-null    float64
 9   Explained by: Freedom to 

In [12]:
exploracion_datos(df_2023)

_____________ INFORMACIÓN GENERAL DEL DATAFRAME ____________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 19 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                137 non-null    object 
 1   Ladder score                                137 non-null    float64
 2   Standard error of ladder score              137 non-null    float64
 3   upperwhisker                                137 non-null    float64
 4   lowerwhisker                                137 non-null    float64
 5   Logged GDP per capita                       137 non-null    float64
 6   Social support                              137 non-null    float64
 7   Healthy life expectancy                     136 non-null    float64
 8   Freedom to make life choices                137 non-null    float64
 9   Generosity               

In [13]:
# Función para chequear si hay valores negativos en las columnas numéricas
def neg_values(df):
    neg_counts = {}

    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            neg_counts[column] = (df[column] < 0).sum()

    return neg_counts

In [14]:
# Llamar a la función para contar valores negativos en datos 2015
neg_counts_2015 = neg_values(df_2015)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2015.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Happiness Rank': 0 valores negativos
Columna 'Happiness Score': 0 valores negativos
Columna 'Standard Error': 0 valores negativos
Columna 'Economy (GDP per Capita)': 0 valores negativos
Columna 'Family': 0 valores negativos
Columna 'Health (Life Expectancy)': 0 valores negativos
Columna 'Freedom': 0 valores negativos
Columna 'Trust (Government Corruption)': 0 valores negativos
Columna 'Generosity': 0 valores negativos
Columna 'Dystopia Residual': 0 valores negativos


In [15]:
# Llamar a la función para contar valores negativos en datos 2016
neg_counts_2016 = neg_values(df_2016)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2016.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Happiness Rank': 0 valores negativos
Columna 'Happiness Score': 0 valores negativos
Columna 'Lower Confidence Interval': 0 valores negativos
Columna 'Upper Confidence Interval': 0 valores negativos
Columna 'Economy (GDP per Capita)': 0 valores negativos
Columna 'Family': 0 valores negativos
Columna 'Health (Life Expectancy)': 0 valores negativos
Columna 'Freedom': 0 valores negativos
Columna 'Trust (Government Corruption)': 0 valores negativos
Columna 'Generosity': 0 valores negativos
Columna 'Dystopia Residual': 0 valores negativos


In [16]:
# Llamar a la función para contar valores negativos
neg_counts_2017 = neg_values(df_2017)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2017.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Happiness.Rank': 0 valores negativos
Columna 'Happiness.Score': 0 valores negativos
Columna 'Whisker.high': 0 valores negativos
Columna 'Whisker.low': 0 valores negativos
Columna 'Economy..GDP.per.Capita.': 0 valores negativos
Columna 'Family': 0 valores negativos
Columna 'Health..Life.Expectancy.': 0 valores negativos
Columna 'Freedom': 0 valores negativos
Columna 'Generosity': 0 valores negativos
Columna 'Trust..Government.Corruption.': 0 valores negativos
Columna 'Dystopia.Residual': 0 valores negativos


In [17]:
# Llamar a la función para contar valores negativos
neg_counts_2018 = neg_values(df_2018)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2018.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Overall rank': 0 valores negativos
Columna 'Score': 0 valores negativos
Columna 'GDP per capita': 0 valores negativos
Columna 'Social support': 0 valores negativos
Columna 'Healthy life expectancy': 0 valores negativos
Columna 'Freedom to make life choices': 0 valores negativos
Columna 'Generosity': 0 valores negativos
Columna 'Perceptions of corruption': 0 valores negativos


In [18]:
# Llamar a la función para contar valores negativos
neg_counts_2019 = neg_values(df_2019)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2019.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Overall rank': 0 valores negativos
Columna 'Score': 0 valores negativos
Columna 'GDP per capita': 0 valores negativos
Columna 'Social support': 0 valores negativos
Columna 'Healthy life expectancy': 0 valores negativos
Columna 'Freedom to make life choices': 0 valores negativos
Columna 'Generosity': 0 valores negativos
Columna 'Perceptions of corruption': 0 valores negativos


In [19]:
# Llamar a la función para contar valores negativos
neg_counts_2020 = neg_values(df_2020)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2020.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Ladder score': 0 valores negativos
Columna 'Standard error of ladder score': 0 valores negativos
Columna 'upperwhisker': 0 valores negativos
Columna 'lowerwhisker': 0 valores negativos
Columna 'Logged GDP per capita': 0 valores negativos
Columna 'Social support': 0 valores negativos
Columna 'Healthy life expectancy': 0 valores negativos
Columna 'Freedom to make life choices': 0 valores negativos
Columna 'Generosity': 87 valores negativos
Columna 'Perceptions of corruption': 0 valores negativos
Columna 'Ladder score in Dystopia': 0 valores negativos
Columna 'Explained by: Log GDP per capita': 0 valores negativos
Columna 'Explained by: Social support': 0 valores negativos
Columna 'Explained by: Healthy life expectancy': 0 valores negativos
Columna 'Explained by: Freedom to make life choices': 0 valores negativos
Columna 'Explained by: Generosity': 0 valores negativos
Columna 'Explained by: Perceptions of corruption': 0 valores negativos
C

In [20]:
# Llamar a la función para contar valores negativos
neg_counts_2021 = neg_values(df_2021)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2021.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Ladder score': 0 valores negativos
Columna 'Standard error of ladder score': 0 valores negativos
Columna 'upperwhisker': 0 valores negativos
Columna 'lowerwhisker': 0 valores negativos
Columna 'Logged GDP per capita': 0 valores negativos
Columna 'Social support': 0 valores negativos
Columna 'Healthy life expectancy': 0 valores negativos
Columna 'Freedom to make life choices': 0 valores negativos
Columna 'Generosity': 86 valores negativos
Columna 'Perceptions of corruption': 0 valores negativos
Columna 'Ladder score in Dystopia': 0 valores negativos
Columna 'Explained by: Log GDP per capita': 0 valores negativos
Columna 'Explained by: Social support': 0 valores negativos
Columna 'Explained by: Healthy life expectancy': 0 valores negativos
Columna 'Explained by: Freedom to make life choices': 0 valores negativos
Columna 'Explained by: Generosity': 0 valores negativos
Columna 'Explained by: Perceptions of corruption': 0 valores negativos
C

In [21]:
# Llamar a la función para contar valores negativos
neg_counts_2022 = neg_values(df_2022)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2022.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'RANK': 0 valores negativos
Columna 'Happiness score': 0 valores negativos
Columna 'Whisker-high': 0 valores negativos
Columna 'Whisker-low': 0 valores negativos
Columna 'Dystopia (1.83) + residual': 0 valores negativos
Columna 'Explained by: GDP per capita': 0 valores negativos
Columna 'Explained by: Social support': 0 valores negativos
Columna 'Explained by: Healthy life expectancy': 0 valores negativos
Columna 'Explained by: Freedom to make life choices': 0 valores negativos
Columna 'Explained by: Generosity': 0 valores negativos
Columna 'Explained by: Perceptions of corruption': 0 valores negativos


In [22]:
# Llamar a la función para contar valores negativos
neg_counts_2023 = neg_values(df_2023)

# Mostrar los resultados
print("Número de valores negativos por columna:")
for column, count in neg_counts_2023.items():
    print(f"Columna '{column}': {count} valores negativos")

Número de valores negativos por columna:
Columna 'Ladder score': 0 valores negativos
Columna 'Standard error of ladder score': 0 valores negativos
Columna 'upperwhisker': 0 valores negativos
Columna 'lowerwhisker': 0 valores negativos
Columna 'Logged GDP per capita': 0 valores negativos
Columna 'Social support': 0 valores negativos
Columna 'Healthy life expectancy': 0 valores negativos
Columna 'Freedom to make life choices': 0 valores negativos
Columna 'Generosity': 67 valores negativos
Columna 'Perceptions of corruption': 0 valores negativos
Columna 'Ladder score in Dystopia': 0 valores negativos
Columna 'Explained by: Log GDP per capita': 0 valores negativos
Columna 'Explained by: Social support': 0 valores negativos
Columna 'Explained by: Healthy life expectancy': 0 valores negativos
Columna 'Explained by: Freedom to make life choices': 0 valores negativos
Columna 'Explained by: Generosity': 0 valores negativos
Columna 'Explained by: Perceptions of corruption': 0 valores negativos
C

In [23]:
""""# Función para estandarizar nombres de columnas
def estandarizar_columnas(df):
    columnas = {
        'Country': 'Country',
        'Country name': 'Country',
        'Country or region': 'Country',
        'Regional indicator': 'Region',
        'Region': 'Region',
        'Happiness Rank': 'Happiness Rank',
        'Overall rank': 'Happiness Rank',
        'RANK': 'Happiness Rank',
        'Happiness Score': 'Happiness Score',
        'Score': 'Happiness Score',
        'Ladder score': 'Happiness Score',
        'Standard Error': 'Standard Error',
        'Standard error of ladder score': 'Standard Error',
        'Economy (GDP per Capita)': 'Economy (GDP per Capita)',
        'Logged GDP per capita': 'Economy (GDP per Capita)',
        'GDP per capita': 'Economy (GDP per Capita)',
        'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
        'Family': 'Family',
        'Social support': 'Family',
        'Health (Life Expectancy)': 'Health (Life Expectancy)',
        'Healthy life expectancy': 'Health (Life Expectancy)',
        'Health..Life.Expectancy.': 'Health (Life Expectancy)',
        'Freedom': 'Freedom',
        'Freedom to make life choices': 'Freedom',
        'Trust (Government Corruption)': 'Trust (Government Corruption)',
        'Perceptions of corruption': 'Trust (Government Corruption)',
        'Trust..Government.Corruption.': 'Trust (Government Corruption)',
        'Generosity': 'Generosity',
        'Dystopia Residual': 'Dystopia Residual',
        'Dystopia + residual': 'Dystopia Residual',
        'Dystopia (1.83) + residual': 'Dystopia Residual',
        'Ladder score in Dystopia': 'Dystopia Residual',
        'Lower Confidence Interval': 'Lower Confidence Interval',
        'Upper Confidence Interval': 'Upper Confidence Interval',
        'Whisker.low': 'Lower Confidence Interval',
        'lowerwhisker': 'Lower Confidence Interval',
        'Whisker.high': 'Upper Confidence Interval',
        'upperwhisker': 'Upper Confidence Interval',
        'Explained by: Log GDP per capita': 'Economy (GDP per Capita)',
        'Explained by: Social support': 'Family',
        'Explained by: Healthy life expectancy': 'Health (Life Expectancy)',
        'Explained by: Freedom to make life choices': 'Freedom',
        'Explained by: Generosity': 'Generosity',
        'Explained by: Perceptions of corruption': 'Trust (Government Corruption)',
    }
    return df.rename(columns=columnas)"""

'"# Función para estandarizar nombres de columnas\ndef estandarizar_columnas(df):\n    columnas = {\n        \'Country\': \'Country\',\n        \'Country name\': \'Country\',\n        \'Country or region\': \'Country\',\n        \'Regional indicator\': \'Region\',\n        \'Region\': \'Region\',\n        \'Happiness Rank\': \'Happiness Rank\',\n        \'Overall rank\': \'Happiness Rank\',\n        \'RANK\': \'Happiness Rank\',\n        \'Happiness Score\': \'Happiness Score\',\n        \'Score\': \'Happiness Score\',\n        \'Ladder score\': \'Happiness Score\',\n        \'Standard Error\': \'Standard Error\',\n        \'Standard error of ladder score\': \'Standard Error\',\n        \'Economy (GDP per Capita)\': \'Economy (GDP per Capita)\',\n        \'Logged GDP per capita\': \'Economy (GDP per Capita)\',\n        \'GDP per capita\': \'Economy (GDP per Capita)\',\n        \'Economy..GDP.per.Capita.\': \'Economy (GDP per Capita)\',\n        \'Family\': \'Family\',\n        \'Social

In [24]:
"""# Suponiendo que los DataFrames se llaman df_2015, df_2016, ..., df_2023
dataframes = [df_2015, df_2016, df_2017, df_2018, df_2019, df_2020, df_2021, df_2022, df_2023]

# Estandarizar columnas
dataframes = [estandarizar_columnas(df) for df in dataframes]"""

'# Suponiendo que los DataFrames se llaman df_2015, df_2016, ..., df_2023\ndataframes = [df_2015, df_2016, df_2017, df_2018, df_2019, df_2020, df_2021, df_2022, df_2023]\n\n# Estandarizar columnas\ndataframes = [estandarizar_columnas(df) for df in dataframes]'

In [7]:
def drop_col(df, lista_columnas):
  
    # Verificar si las columnas existen en el DataFrame
    columnas_a_eliminar = [col for col in lista_columnas if col in df.columns]
    
    if not columnas_a_eliminar:
        print("Ninguna de las columnas especificadas se encontró en el DataFrame.")
        return df
    
    # Eliminar las columnas
    df_sin_columnas = df.drop(columns=columnas_a_eliminar)
    return df_sin_columnas

In [8]:
# Lista de columnas a eliminar de datos 2015
drop_col_2015 = ['Dystopia Residual', 'Dystopia Residual', 'Standard Error']

# Llamando a la función
df_2015_col_drop = drop_col(df_2015, drop_col_2015)
print(df_2015_col_drop.columns)

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity'],
      dtype='object')


In [9]:
# Lista de columnas a eliminar de datos 2016
drop_col_2016 = ['Lower Confidence Interval', 'Upper Confidence Interval', 'Dystopia Residual']

# Llamando a la función
df_2016_col_drop = drop_col(df_2016, drop_col_2016)
print(df_2016_col_drop.columns)

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity'],
      dtype='object')


In [10]:
# Lista de columnas a eliminar de datos 2017
drop_col_2017 = ['Whisker.high', 'Whisker.low', 'Dystopia.Residual']

# Llamando a la función
df_2017_col_drop = drop_col(df_2017, drop_col_2017)
print(df_2017_col_drop.columns)

Index(['Country', 'Happiness.Rank', 'Happiness.Score',
       'Economy..GDP.per.Capita.', 'Family', 'Health..Life.Expectancy.',
       'Freedom', 'Generosity', 'Trust..Government.Corruption.'],
      dtype='object')


In [11]:
# Lista de columnas a eliminar de datos 2020
drop_col_2020 = ['upperwhisker', 'lowerwhisker', 'Ladder score in Dystopia', 'Dystopia + residual', 'Standard error of ladder score']

# Llamando a la función
df_2020_col_drop = drop_col(df_2020, drop_col_2020)
print(df_2020_col_drop.columns)

Index(['Country name', 'Regional indicator', 'Ladder score',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Explained by: Log GDP per capita',
       'Explained by: Social support', 'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption'],
      dtype='object')


In [12]:
# Lista de columnas a eliminar de datos 2021
drop_col_2021 = ['upperwhisker', 'lowerwhisker', 'Ladder score in Dystopia', 'Dystopia + residual', 'Standard error of ladder score']

# Llamando a la función
df_2021_col_drop = drop_col(df_2021, drop_col_2021)
print(df_2021_col_drop.columns)

Index(['Country name', 'Regional indicator', 'Ladder score',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Explained by: Log GDP per capita',
       'Explained by: Social support', 'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption'],
      dtype='object')


In [13]:
# Lista de columnas a eliminar de datos 2022
drop_col_2022 = ['Whisker-high', 'Whisker-low', 'Dystopia (1.83) + residual']

# Llamando a la función
df_2022_col_drop = drop_col(df_2022, drop_col_2022)
print(df_2022_col_drop.columns)

Index(['RANK', 'Country', 'Happiness score', 'Explained by: GDP per capita',
       'Explained by: Social support', 'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption'],
      dtype='object')


In [14]:
# Lista de columnas a eliminar de datos 2023
drop_col_2023 = ['upperwhisker', 'lowerwhisker', 'Dystopia + residual', 'Ladder score in Dystopia', 'Standard error of ladder score']

# Llamando a la función
df_2023_col_drop = drop_col(df_2023, drop_col_2023)
print(df_2023_col_drop.columns)

Index(['Country name', 'Ladder score', 'Logged GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Explained by: Log GDP per capita',
       'Explained by: Social support', 'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption'],
      dtype='object')


In [15]:
# Renombrar las columnas para cada DataFrame
df_2015_col_drop = df_2015_col_drop.rename(columns={
    'Economy (GDP per Capita)': 'GDP per capita',
    'Family': 'Social support',
    'Health (Life Expectancy)': 'Healthy life expectancy',
    'Freedom': 'Freedom to make life choices',
    'Trust (Government Corruption)': 'Perceptions of corruption'})

In [16]:
df_2015_col_drop['Año'] = 2015

In [17]:
df_2015_col_drop

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,Año
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2015
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015
3,Norway,Western Europe,4,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2015
4,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015
...,...,...,...,...,...,...,...,...,...,...,...
153,Rwanda,Sub-Saharan Africa,154,3.465,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628,2015
154,Benin,Sub-Saharan Africa,155,3.340,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260,2015
155,Syria,Middle East and Northern Africa,156,3.006,0.66320,0.47489,0.72193,0.15684,0.18906,0.47179,2015
156,Burundi,Sub-Saharan Africa,157,2.905,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727,2015


In [18]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2015_col_drop.to_csv("happy_2015.csv", index=False)

In [20]:
# Renombrar las columnas para cada DataFrame
df_2016_col_drop = df_2016_col_drop.rename(columns={
    'Economy (GDP per Capita)': 'GDP per capita',
    'Family': 'Social support',
    'Health (Life Expectancy)': 'Healthy life expectancy',
    'Freedom': 'Freedom to make life choices',
    'Trust (Government Corruption)': 'Perceptions of corruption'})

In [21]:
df_2016_col_drop['Año'] = 2016

In [22]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2016_col_drop.to_csv("happy_2016.csv", index=False)

In [23]:
# Renombrar las columnas para cada DataFrame
df_2017_col_drop = df_2017_col_drop.rename(columns={
    'Happiness.Rank': 'Happiness Rank',
    'Happiness.Score': 'Happiness Score',
    'Economy..GDP.per.Capita.': 'GDP per capita',
    'Family': 'Social support',
    'Health..Life.Expectancy.': 'Healthy life expectancy',
    'Freedom': 'Freedom to make life choices',
    'Trust..Government.Corruption.': 'Perceptions of corruption'})

In [24]:
df_2017_col_drop['Año'] = 2017

In [25]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2017_col_drop.to_csv("happy_2017.csv", index=False)

In [26]:
# Renombrar las columnas para cada DataFrame
df_2018 = df_2018.rename(columns={
    'Overall rank': 'Happiness Rank',
    'Score': 'Happiness Score',
    })

In [27]:
df_2018['Año'] = 2018

In [28]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2018.to_csv("happy_2018.csv", index=False)

In [29]:
# Renombrar las columnas para cada DataFrame
df_2019 = df_2019.rename(columns={
    'Overall rank': 'Happiness Rank',
    'Score': 'Happiness Score',
    })

In [30]:
df_2019['Año'] = 2019

In [31]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2019.to_csv("happy_2019.csv", index=False)

In [32]:
# Renombrar las columnas para cada DataFrame
df_2020_col_drop = df_2020_col_drop.rename(columns={
    'Country name': 'Country',
    'Regional indicator': 'Region',
    'Ladder score': 'Happiness Score',
    'Logged GDP per capita': 'GDP per capita'})

In [33]:
df_2020_col_drop['Año'] = 2020

In [34]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2020_col_drop.to_csv("happy_2020.csv", index=False)

In [35]:
# Renombrar las columnas para cada DataFrame
df_2021_col_drop = df_2021_col_drop.rename(columns={
    'Country name': 'Country',
    'Regional indicator': 'Region',
    'Ladder score': 'Happiness Score',
    'Logged GDP per capita': 'GDP per capita'})

In [36]:
df_2021_col_drop['Año'] = 2021

In [37]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2021_col_drop.to_csv("happy_2021.csv", index=False)

In [38]:
# Renombrar las columnas para cada DataFrame
df_2022_col_drop = df_2022_col_drop.rename(columns={
    'RANK': 'Happiness Rank'})

In [39]:
df_2022_col_drop['Año'] = 2022

In [40]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2022_col_drop.to_csv("happy_2022.csv", index=False)

In [41]:
# Renombrar las columnas para cada DataFrame
df_2023_col_drop = df_2023_col_drop.rename(columns={
    'Country name': 'Country',
    'Ladder score': 'Happiness Score',
    'Logged GDP per capita': 'GDP per capita'})

In [42]:
df_2023_col_drop['Año'] = 2023

In [43]:
 # Guardar el DataFrame limpio como un archivo CSV
df_2023_col_drop.to_csv("happy_2023.csv", index=False)

In [42]:
"""# Combinar los DataFrames
df_combined = pd.concat(dataframes, ignore_index=True)

# Guardar el DataFrame combinado
df_combined.to_csv('happiness_combined.csv', index=False)

df_combined.head()"""

"# Combinar los DataFrames\ndf_combined = pd.concat(dataframes, ignore_index=True)\n\n# Guardar el DataFrame combinado\ndf_combined.to_csv('happiness_combined.csv', index=False)\n\ndf_combined.head()"

In [58]:
df = pd.read_csv("DF_LIMPIOS/happy_2015.csv")
df.sample(10)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,Año
148,Chad,Sub-Saharan Africa,149,3.667,0.34193,0.76062,0.1501,0.23501,0.05269,0.18386,2015
23,Singapore,Southeastern Asia,24,6.798,1.52186,1.02,1.02525,0.54252,0.4921,0.31105,2015
124,Kenya,Sub-Saharan Africa,125,4.419,0.36471,0.99876,0.41435,0.42215,0.05839,0.37542,2015
25,Germany,Western Europe,26,6.75,1.32792,1.29937,0.89186,0.61477,0.21843,0.28214,2015
7,Sweden,Western Europe,8,7.364,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2015
102,Lebanon,Middle East and Northern Africa,103,4.839,1.02564,0.80001,0.83947,0.33916,0.04582,0.21854,2015
21,Oman,Middle East and Northern Africa,22,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2015
120,Nepal,Southern Asia,121,4.514,0.35997,0.86449,0.56874,0.38282,0.05907,0.32296,2015
105,Tajikistan,Central and Eastern Europe,106,4.786,0.39047,0.85563,0.57379,0.47216,0.15072,0.22974,2015
96,Lesotho,Sub-Saharan Africa,97,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2015


In [64]:
# Cargar los csv como dataframes
happy_2015 = pd.read_csv("DF_LIMPIOS/happy_2015.csv")
happy_2016 = pd.read_csv("DF_LIMPIOS/happy_2016.csv")
happy_2017 = pd.read_csv("DF_LIMPIOS/happy_2017.csv")
happy_2018 = pd.read_csv("DF_LIMPIOS/happy_2018.csv")
happy_2019 = pd.read_csv("DF_LIMPIOS/happy_2019.csv")
happy_2020 = pd.read_csv("DF_LIMPIOS/happy_2020.csv")
happy_2021 = pd.read_csv("DF_LIMPIOS/happy_2021.csv")
happy_2022 = pd.read_csv("DF_LIMPIOS/happy_2022.csv")
happy_2023 = pd.read_csv("DF_LIMPIOS/happy_2023.csv")

In [None]:
# Lista de DataFrames
dataframes = [
    happy_2015, happy_2016, happy_2017, happy_2018,
    happy_2019, happy_2020, happy_2021, happy_2022, 
    happy_2023
]

# Columnas en común
common_columns = [
    'Country', 'Happiness Score', 'GDP per capita', 'Social support',
    'Healthy life expectancy', 'Freedom to make life choices', 
    'Generosity', 'Año'
]

# Asegurarse de que todas las DataFrames contengan las columnas comunes
for df in dataframes:
    for column in common_columns:
        if column not in df.columns:
            df[column] = None  # Añadir la columna con valores None si no existe en el DataFrame

# Concatenar todos los DataFrames en uno solo
combined_df = pd.concat(dataframes, axis=0, join='outer', ignore_index=False)


In [78]:
combined_df.to_csv("todos_los_años_happy.csv", index=False)

In [77]:

# Imprimir el DataFrame resultante
combined_df.sample(20)


Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,Año,Country or region,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Happiness score,Explained by: GDP per capita
120,Kenya,Sub-Saharan Africa,,4.583,8.029776,0.702652,60.096931,0.829748,0.831499,0.294682,2020,,0.476413,0.905078,0.536313,0.519181,0.393902,0.067201,,
10,Israel,Middle East and Northern Africa,11.0,7.267,1.33766,0.99537,0.84917,0.36432,0.08728,0.32288,2016,,,,,,,,,
27,,,28.0,6.419,0.986,1.474,0.675,0.493,0.088,0.11,2018,Brazil,,,,,,,,
32,Taiwan Province of China,,33.0,6.422,1.433627,1.384565,0.793984,0.361467,0.063829,0.25836,2017,,,,,,,,,
116,,,117.0,4.456,1.01,0.971,0.536,0.304,0.095,0.148,2018,Iraq,,,,,,,,
134,Niger,,135.0,4.028,0.161925,0.993025,0.268505,0.363659,0.138573,0.228674,2017,,,,,,,,,
77,Nepal,,,5.36,8.256,0.748,61.847,0.808,0.783,0.146,2023,,0.979,1.027,0.281,0.567,0.215,0.104,,
148,Tanzania,Sub-Saharan Africa,149.0,3.666,0.47155,0.77623,0.357,0.3176,0.05099,0.31472,2016,,,,,,,,,
39,Slovakia,,40.0,6.098,1.325394,1.505059,0.712733,0.295817,0.024211,0.136544,2017,,,,,,,,,
63,China,,,5.818,9.738,0.836,68.689,0.882,0.727,-0.041,2023,,1.51,1.249,0.468,0.666,0.115,0.145,,


In [73]:
def exploracion_df(df):
    
    print(' Filas y Columnas del DATAFRAME \n')
    print(f"El número de filas que tenemos es de {df.shape[0]}.\nEl número de columnas es de {df.shape[1]}\n")
    print('____________________________________________________________\n')
    
    print(' Nombre de todas las Columnas del DATAFRAME: \n')
    print(df.columns)
    print('____________________________________________________________\n')
    
    print('INFORMACIÓN GENERAL DEL DATAFRAME \n')
    print(df.info())
    print('____________________________________________________________\n')
    
    print('Ver los NULOS del DataFrame \n')
    print(f'Los nulos: --> {df.isnull().sum().mean() * 100} \n')
    for columna in df.columns:
        cantidad_valores_the_null = df[columna].isnull().mean() * 100
        print(f'La columna {columna}: {cantidad_valores_the_null}')
    print('____________________________________________________________\n')
    
    print('Valores ÚNICOS por columna:\n')
    for columna in df.columns:
        cantidad_valores_unicos = df[columna].unique()
        print(f'La columna {columna}: {len(cantidad_valores_unicos)}')
        print(f'La columna {columna}: {cantidad_valores_unicos}')
        
    print('____________________________________________________________\n')
    
    print('Valores DUPLICADOS por columna es de:\n')
    for columna in df.columns:
        cantidad_duplicados = df[columna].duplicated().sum()
        print(f'La columna {columna}: {cantidad_duplicados}')
    print('____________________________________________________________\n')
  
    print('--> RESUMEN ESTADÍSTICO \n')
    
    try:
        numeric_summary = df.describe().select_dtypes(include=['number']).T
        if not numeric_summary.empty:
            print('<<< Variables Numéricas >>> \n')
            print(f'{numeric_summary} \n')     
    except:
        print('No hay variables numéricas en el DataFrame.')
    
    try:
        categorical_summary = df.describe(include='object').T
        if not categorical_summary.empty:
            print('<<< Variables Categóricas >>> \n')
            print(f'{categorical_summary} \n')     
    except:
        print('No hay variables categóricas en el DataFrame.')

In [75]:
exploracion_df(combined_df)

 Filas y Columnas del DATAFRAME 

El número de filas que tenemos es de 1367.
El número de columnas es de 20

____________________________________________________________

 Nombre de todas las Columnas del DATAFRAME: 

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Perceptions of corruption',
       'Generosity', 'Año', 'Country or region',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Happiness score', 'Explained by: GDP per capita'],
      dtype='object')
____________________________________________________________

INFORMACIÓN GENERAL DEL DATAFRAME 

<class 'pandas.core.frame.DataFrame'>
Index: 1367 entries, 0 to 136
Data columns (total 20 column