In [1]:
%matplotlib notebook

In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as sts
from pandas_profiling import ProfileReport

In [3]:
path = os.path.join("Resources")
df_dict = {}
for file in os.listdir(path):
    name = "df_"  +  str(file[:4])
    df_dict[name] = pd.read_csv(os.path.join("Resources", file))

In [4]:
#loading the datasets into their respective variables
df_2020 = df_dict["df_2020"]
df_2019 = df_dict["df_2019"]
df_2018 = df_dict["df_2018"]
df_2017 = df_dict["df_2017"]
df_2016 = df_dict["df_2016"]
df_2015 = df_dict["df_2018"]

# Organizing the Data
The datasets, though they point to generally the same things, have differences in the naming of their columns. Some datasets tend to have data that isn't present in other datasets. We rename the columns according to the 2019 naming convention, since that dataset has the least number of columns.

In [5]:
df_2019.rename(columns={"Country or region": "Country"}, inplace=True)
df_2019.head()

Unnamed: 0,Overall rank,Country,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [6]:
#Making changes to the 2015 Dataset
df_2015.rename(columns={"Country or region": "Country"}, inplace=True)
df_2015.head()

Unnamed: 0,Overall rank,Country,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [7]:
column_renames_2016 = {"Happiness Score": "Score","Happiness Rank": "Overall Rank","Economy (GDP per Capita)": "GDP per capita", "Family": "Social support", "Freedom": "Freedom to make life choices", "Health (Life Expectancy)": "Healthy life expectancy", "Trust (Government Corruption)": "Perceptions of corruption"}
df_2016.rename(columns=column_renames_2016, inplace=True)
df_2016.drop(columns=["Region", "Lower Confidence Interval", "Upper Confidence Interval", "Dystopia Residual"], inplace=True)
df_2016.head()

Unnamed: 0,Country,Overall Rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Perceptions of corruption,Generosity
0,Denmark,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171
1,Switzerland,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083
2,Iceland,3,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678
3,Norway,4,7.498,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895
4,Finland,5,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492


In [8]:
column_renames_2017 = {"Happiness.Rank": "Overall Rank","Happiness.Score": "Score", "Economy..GDP.per.Capita.": "GDP per capita", "Family": "Social support", "Health..Life.Expectancy.": "Healthy life expectancy","Freedom": "Freedom to make life choices", "Trust..Government.Corruption.":"Perceptions of corruption"}
df_2017.rename(columns=column_renames_2017, inplace=True)
df_2017.drop(columns=["Whisker.high", "Whisker.low", "Dystopia.Residual"])
df_2017.head()

Unnamed: 0,Country,Overall Rank,Score,Whisker.high,Whisker.low,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [18]:
#we shall discuss this on Monday
df_2020.columns

Index(['Country name', 'Regional indicator', 'Ladder score',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual'],
      dtype='object')