In [44]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [25]:
df_2015 = pd.read_csv('../data/2015.csv')
df_2016 = pd.read_csv('../data/2016.csv')
df_2017 = pd.read_csv('../data/2017.csv')
df_2018 = pd.read_csv('../data/2018.csv')
df_2019 = pd.read_csv('../data/2019.csv')

In [21]:
# Vamos a ver ahora las columnas comunes que tienen todos los datasets
common_cols = set(df_2015.columns)
for df in [df_2016, df_2017, df_2018, df_2019]:
    common_cols = common_cols.intersection(set(df.columns))

print("Columnas comunes en todos los datasets:")
print(common_cols)

Columnas comunes en todos los datasets:
{'Generosity'}


In [22]:
print(df_2015.columns)
print(df_2016.columns)
print(df_2017.columns)
print(df_2018.columns)
print(df_2019.columns)

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Standard Error', 'Economy (GDP per Capita)', 'Family',
       'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
       'Generosity', 'Dystopia Residual'],
      dtype='object')
Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Lower Confidence Interval', 'Upper Confidence Interval',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity',
       'Dystopia Residual'],
      dtype='object')
Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.', 'Dystopia.Residual'],
      dtype='object')
Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Fre

In [26]:
# Borramos las columnas que no son comunes
df_2015.drop(columns=['Region', 'Standard Error', 'Dystopia Residual'], inplace=True)
df_2016.drop(columns=['Region', 'Lower Confidence Interval', 'Upper Confidence Interval', 'Dystopia Residual'], inplace=True)
df_2017.drop(columns=['Whisker.high', 'Dystopia.Residual'], inplace=True)

In [28]:
print(df_2015.columns)
print(df_2016.columns)
print(df_2017.columns)
print(df_2018.columns)
print(df_2019.columns)

Index(['Country', 'Happiness Rank', 'Happiness Score',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity'],
      dtype='object')
Index(['Country', 'Happiness Rank', 'Happiness Score',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity'],
      dtype='object')
Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.low',
       'Economy..GDP.per.Capita.', 'Family', 'Health..Life.Expectancy.',
       'Freedom', 'Generosity', 'Trust..Government.Corruption.'],
      dtype='object')
Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')
Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healt

In [32]:
# Cambiamos los nombres de las columnas para que sean iguales en todos los datasets
new_column_names_2017 = {'Country': 'Country',
 'Happiness.Rank': 'Happiness Rank',
 'Happiness.Score': 'Happiness Score',
 'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
 'Family': 'Family',
 'Health..Life.Expectancy.': 'Health (Life Expectancy)',
 'Freedom': 'Freedom',
 'Trust..Government.Corruption.': 'Trust (Government Corruption)',
 'Generosity': 'Generosity'}

df_2017.rename(columns=new_column_names_2017, inplace=True)

new_column_names_2018 = {'Country or region': 'Country',
 'Overall rank': 'Happiness Rank',
 'Score': 'Happiness Score',
 'GDP per capita': 'Economy (GDP per Capita)',
 'Social support': 'Family',
 'Healthy life expectancy': 'Health (Life Expectancy)',
 'Freedom to make life choices': 'Freedom',
 'Perceptions of corruption': 'Trust (Government Corruption)',
 'Generosity': 'Generosity'}

df_2018.rename(columns=new_column_names_2018, inplace=True)

new_column_names_2019 = {'Country or region': 'Country',
 'Overall rank': 'Happiness Rank',
 'Score': 'Happiness Score',
 'GDP per capita': 'Economy (GDP per Capita)',
 'Social support': 'Family',
 'Healthy life expectancy': 'Health (Life Expectancy)',
 'Freedom to make life choices': 'Freedom',
 'Perceptions of corruption': 'Trust (Government Corruption)',
 'Generosity': 'Generosity'}

df_2019.rename(columns=new_column_names_2019, inplace=True)



In [33]:
common_cols = set(df_2015.columns)
for df in [df_2016, df_2017, df_2018, df_2019]:
    common_cols = common_cols.intersection(set(df.columns))

print("Columnas comunes en todos los datasets:")
print(common_cols)

Columnas comunes en todos los datasets:
{'Economy (GDP per Capita)', 'Generosity', 'Country', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Happiness Score', 'Happiness Rank'}


In [35]:
# Ordenamos ahora para que todos los datasets tengan las mismas columnas en el mismo orden
column_order = ['Country', 'Happiness Rank', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity']
df_2015 = df_2015[column_order]
df_2016 = df_2016[column_order]
df_2017 = df_2017[column_order]
df_2018 = df_2018[column_order] 
df_2019 = df_2019[column_order]

In [None]:
# Vamos a estandarizar todas las variables de todos los datasets excepto 'Country' y 'Happiness Rank', porque 'Happiness Rank' es una variable ordinal que no queremos estandarizar y 'Country' es una variable categ√≥rica.
scaler = StandardScaler()
for df in [df_2015, df_2016, df_2017, df_2018, df_2019]:
    cols_to_scale = df.columns.difference(['Country', 'Happiness Rank'])
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


In [43]:
# Vamos a guardar los datasets preprocesados
df_2015.to_csv('../data//2015_processed.csv', index=False)
df_2016.to_csv('../data//2016_processed.csv', index=False)
df_2017.to_csv('../data//2017_processed.csv', index=False)
df_2018.to_csv('../data//2018_processed.csv', index=False)
df_2019.to_csv('../data//2019_processed.csv', index=False)