### EXPLORATORY DATA ANALYSIS

In this EDA, you will find the entire process of ETL, feature selection, and model training to predict happiness scores based on certain characteristics.

Import necessary libraries.

In [3]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

Read data 2015.

In [4]:
data_2015_csv = '../data/2015.csv' 
data_2015 = pd.read_csv(data_2015_csv, delimiter=',') 
print(data_2015.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [7]:
data_2015.head(5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [33]:
data_2015.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Happiness Rank,158.0,79.493671,45.754363,1.0,40.25,79.5,118.75,158.0
Happiness Score,158.0,5.375734,1.14501,2.839,4.526,5.2325,6.24375,7.587
Standard Error,158.0,0.047885,0.017146,0.01848,0.037268,0.04394,0.0523,0.13693
Economy (GDP per Capita),158.0,0.846137,0.403121,0.0,0.545808,0.910245,1.158448,1.69042
Family,158.0,0.991046,0.272369,0.0,0.856823,1.02951,1.214405,1.40223
Health (Life Expectancy),158.0,0.630259,0.247078,0.0,0.439185,0.696705,0.811013,1.02525
Freedom,158.0,0.428615,0.150693,0.0,0.32833,0.435515,0.549092,0.66973
Trust (Government Corruption),158.0,0.143422,0.120034,0.0,0.061675,0.10722,0.180255,0.55191
Generosity,158.0,0.237296,0.126685,0.0,0.150553,0.21613,0.309883,0.79588
Dystopia Residual,158.0,2.098977,0.55355,0.32858,1.75941,2.095415,2.462415,3.60214


Read data 2016.

In [5]:
data_2016_csv = '../data/2016.csv' 
data_2016 = pd.read_csv(data_2016_csv, delimiter=',') 
print(data_2016.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

In [8]:
data_2016.head(5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [32]:
data_2016.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Happiness Rank,157.0,78.980892,45.46603,1.0,40.0,79.0,118.0,157.0
Happiness Score,157.0,5.382185,1.141674,2.905,4.404,5.314,6.269,7.526
Lower Confidence Interval,157.0,5.282395,1.148043,2.732,4.327,5.237,6.154,7.46
Upper Confidence Interval,157.0,5.481975,1.136493,3.078,4.465,5.419,6.434,7.669
Economy (GDP per Capita),157.0,0.95388,0.412595,0.0,0.67024,1.0278,1.27964,1.82427
Family,157.0,0.793621,0.266706,0.0,0.64184,0.84142,1.02152,1.18326
Health (Life Expectancy),157.0,0.557619,0.229349,0.0,0.38291,0.59659,0.72993,0.95277
Freedom,157.0,0.370994,0.145507,0.0,0.25748,0.39747,0.48453,0.60848
Trust (Government Corruption),157.0,0.137624,0.111038,0.0,0.06126,0.10547,0.17554,0.50521
Generosity,157.0,0.242635,0.133756,0.0,0.15457,0.22245,0.31185,0.81971


Read data 2017.

In [9]:
data_2017_csv = '../data/2017.csv' 
data_2017 = pd.read_csv(data_2017_csv, delimiter=',') 
print(data_2017.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

In [10]:
data_2017.head(5)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [31]:
data_2017.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Happiness.Rank,155.0,78.0,44.888751,1.0,39.5,78.0,116.5,155.0
Happiness.Score,155.0,5.354019,1.13123,2.693,4.5055,5.279,6.1015,7.537
Whisker.high,155.0,5.452326,1.118542,2.864884,4.608172,5.370032,6.1946,7.62203
Whisker.low,155.0,5.255713,1.14503,2.521116,4.374955,5.193152,6.006527,7.479556
Economy..GDP.per.Capita.,155.0,0.984718,0.420793,0.0,0.663371,1.064578,1.318027,1.870766
Family,155.0,1.188898,0.287263,0.0,1.042635,1.253918,1.414316,1.610574
Health..Life.Expectancy.,155.0,0.551341,0.237073,0.0,0.369866,0.606042,0.723008,0.949492
Freedom,155.0,0.408786,0.149997,0.0,0.303677,0.437454,0.516561,0.658249
Generosity,155.0,0.246883,0.13478,0.0,0.154106,0.231538,0.323762,0.838075
Trust..Government.Corruption.,155.0,0.12312,0.101661,0.0,0.057271,0.089848,0.153296,0.464308


Read data 2018.

In [11]:
data_2018_csv = '../data/2018.csv' 
data_2018 = pd.read_csv(data_2018_csv, delimiter=',') 
print(data_2018.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
None


In [12]:
data_2018.head(5)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [30]:
data_2018.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Overall rank,156.0,78.5,45.177428,1.0,39.75,78.5,117.25,156.0
Score,156.0,5.375917,1.119506,2.905,4.45375,5.378,6.1685,7.632
GDP per capita,156.0,0.891449,0.391921,0.0,0.61625,0.9495,1.19775,2.096
Social support,156.0,1.213237,0.302372,0.0,1.06675,1.255,1.463,1.644
Healthy life expectancy,156.0,0.597346,0.247579,0.0,0.42225,0.644,0.77725,1.03
Freedom to make life choices,156.0,0.454506,0.162424,0.0,0.356,0.487,0.5785,0.724
Generosity,156.0,0.181006,0.098471,0.0,0.1095,0.174,0.239,0.598
Perceptions of corruption,155.0,0.112,0.096492,0.0,0.051,0.082,0.137,0.457


Read data 2019.

In [14]:
data_2019_csv = '../data/2019.csv' 
data_2019 = pd.read_csv(data_2019_csv, delimiter=',') 
print(data_2019.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
None


In [15]:
data_2019.head(5)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [29]:
data_2019.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Overall rank,156.0,78.5,45.177428,1.0,39.75,78.5,117.25,156.0
Score,156.0,5.407096,1.11312,2.853,4.5445,5.3795,6.1845,7.769
GDP per capita,156.0,0.905147,0.398389,0.0,0.60275,0.96,1.2325,1.684
Social support,156.0,1.208814,0.299191,0.0,1.05575,1.2715,1.4525,1.624
Healthy life expectancy,156.0,0.725244,0.242124,0.0,0.54775,0.789,0.88175,1.141
Freedom to make life choices,156.0,0.392571,0.143289,0.0,0.308,0.417,0.50725,0.631
Generosity,156.0,0.184846,0.095254,0.0,0.10875,0.1775,0.24825,0.566
Perceptions of corruption,156.0,0.110603,0.094538,0.0,0.047,0.0855,0.14125,0.453


Comparison.

In [22]:
for year, data in {2015: data_2015, 2016: data_2016, 2017: data_2017, 2018: data_2018, 2019: data_2019}.items():
    print(f"Año {year}: {data.shape[0]} filas, {data.shape[1]} columnas")

Año 2015: 158 filas, 12 columnas
Año 2016: 157 filas, 13 columnas
Año 2017: 155 filas, 12 columnas
Año 2018: 156 filas, 9 columnas
Año 2019: 156 filas, 9 columnas


In [28]:

# Crear un conjunto con todas las columnas únicas de todos los DataFrames
all_columns = list(set().union(*[df.columns for df in [data_2015, data_2016, data_2017, data_2018, data_2019]]))

# Crear un DataFrame de comparación
comparison_df = pd.DataFrame({year: [col in df.columns for col in all_columns] for year, df in dfs.items()}, index=all_columns)

# Reemplazar True por 'X' y False por vacío
comparison_df = comparison_df.replace(True, '*').replace(False, '')

# Mostrar el resultado
print(comparison_df)


                              2015 2016 2017 2018 2019
Dystopia Residual                *    *               
Country                          *    *    *          
Happiness.Rank                             *          
Happiness Score                  *    *               
Freedom to make life choices                    *    *
Freedom                          *    *    *          
Score                                           *    *
Whisker.high                               *          
Health (Life Expectancy)         *    *               
Happiness Rank                   *    *               
GDP per capita                                  *    *
Overall rank                                    *    *
Perceptions of corruption                       *    *
Standard Error                   *                    
Dystopia.Residual                          *          
Healthy life expectancy                         *    *
Upper Confidence Interval             *               
Economy..G

### Model training

### Regression Models