In [45]:
# Initial imports
import pandas as pd
import plotly.express as px
import hvplot.pandas
from path import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.formula.api import ols

In [13]:
pd.set_option('display.max_rows', None)

In [34]:
# Load the World Happiness Report 2022.csv dataset.
file_path = "Resources/2021.csv"
happiness_2021_df = pd.read_csv(file_path)
display(happiness_2021_df)

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,-0.098,0.186,2.43,1.446,1.106,0.741,0.691,0.124,0.481,3.253
1,Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,0.03,0.179,2.43,1.502,1.108,0.763,0.686,0.208,0.485,2.868
2,Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,0.025,0.292,2.43,1.566,1.079,0.816,0.653,0.204,0.413,2.839
3,Iceland,Western Europe,7.554,0.059,7.67,7.438,10.878,0.983,73.0,0.955,0.16,0.673,2.43,1.482,1.172,0.772,0.698,0.293,0.17,2.967
4,Netherlands,Western Europe,7.464,0.027,7.518,7.41,10.932,0.942,72.4,0.913,0.175,0.338,2.43,1.501,1.079,0.753,0.647,0.302,0.384,2.798
5,Norway,Western Europe,7.392,0.035,7.462,7.323,11.053,0.954,73.3,0.96,0.093,0.27,2.43,1.543,1.108,0.782,0.703,0.249,0.427,2.58
6,Sweden,Western Europe,7.363,0.036,7.433,7.293,10.867,0.934,72.7,0.945,0.086,0.237,2.43,1.478,1.062,0.763,0.685,0.244,0.448,2.683
7,Luxembourg,Western Europe,7.324,0.037,7.396,7.252,11.647,0.908,72.6,0.907,-0.034,0.386,2.43,1.751,1.003,0.76,0.639,0.166,0.353,2.653
8,New Zealand,North America and ANZ,7.277,0.04,7.355,7.198,10.643,0.948,73.4,0.929,0.134,0.242,2.43,1.4,1.094,0.785,0.665,0.276,0.445,2.612
9,Austria,Western Europe,7.268,0.036,7.337,7.198,10.906,0.934,73.3,0.908,0.042,0.481,2.43,1.492,1.062,0.782,0.64,0.215,0.292,2.784


In [35]:
happiness_2021_df.dtypes

Country name                                   object
Regional indicator                             object
Ladder score                                  float64
Standard error of ladder score                float64
upperwhisker                                  float64
lowerwhisker                                  float64
Logged GDP per capita                         float64
Social support                                float64
Healthy life expectancy                       float64
Freedom to make life choices                  float64
Generosity                                    float64
Perceptions of corruption                     float64
Ladder score in Dystopia                      float64
Explained by: Log GDP per capita              float64
Explained by: Social support                  float64
Explained by: Healthy life expectancy         float64
Explained by: Freedom to make life choices    float64
Explained by: Generosity                      float64
Explained by: Perceptions of

In [None]:
#Note: The choice of Dystopia as a benchmark permits every real country to
    #have a positive (or at least zero) contribution from each of the six factors.
    #Since our objective is to fit the best predictive model, we should not use Dystopia.

In [36]:
cleaned_2021_df = happiness_2021_df.drop(['Regional indicator', 'upperwhisker', 'lowerwhisker','Ladder score in Dystopia', 'Explained by: Log GDP per capita', 'Explained by: Social support', 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices', 'Explained by: Generosity', 'Explained by: Perceptions of corruption', 'Dystopia + residual'], axis = 1)
cleaned_2021_df

Unnamed: 0,Country name,Ladder score,Standard error of ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,Finland,7.842,0.032,10.775,0.954,72.0,0.949,-0.098,0.186
1,Denmark,7.62,0.035,10.933,0.954,72.7,0.946,0.03,0.179
2,Switzerland,7.571,0.036,11.117,0.942,74.4,0.919,0.025,0.292
3,Iceland,7.554,0.059,10.878,0.983,73.0,0.955,0.16,0.673
4,Netherlands,7.464,0.027,10.932,0.942,72.4,0.913,0.175,0.338
5,Norway,7.392,0.035,11.053,0.954,73.3,0.96,0.093,0.27
6,Sweden,7.363,0.036,10.867,0.934,72.7,0.945,0.086,0.237
7,Luxembourg,7.324,0.037,11.647,0.908,72.6,0.907,-0.034,0.386
8,New Zealand,7.277,0.04,10.643,0.948,73.4,0.929,0.134,0.242
9,Austria,7.268,0.036,10.906,0.934,73.3,0.908,0.042,0.481


In [18]:
cleaned_2021_df.shape

(149, 9)

In [3]:
# Load the income inequality dataset.
file_path = "Resources/csvData.csv"
income_inequality_df = pd.read_csv(file_path)
income_inequality_df

Unnamed: 0,country,netGini,wealthGini,medianIncome,povertyRate
0,South Africa,57.7,86.7,4.7,35.9
1,Namibia,55.0,91.0,3.5,47.0
2,Sri Lanka,51.4,66.5,5.5,16.1
3,China,51.0,78.9,7.7,12.1
4,Zambia,49.5,81.0,1.6,74.3
...,...,...,...,...,...
101,Finland,25.6,76.7,43.5,6.3
102,Czech Republic,25.6,64.8,24.3,5.9
103,Denmark,25.3,80.9,44.7,5.5
104,Norway,24.9,80.5,63.8,8.1


In [6]:
# Load the country coordinates dataset.
file_path = "Resources/country-coordinates-world.csv"
country_coordinates_df = pd.read_csv(file_path)
country_coordinates_df

Unnamed: 0,latitude,longitude,Country
0,33.939110,67.709953,Afghanistan
1,41.153332,20.168331,Albania
2,28.033886,1.659626,Algeria
3,-14.270972,-170.132217,American Samoa
4,42.546245,1.601554,Andorra
...,...,...,...
239,-13.768752,-177.156097,Wallis and Futuna
240,24.215527,-12.885834,Western Sahara
241,15.552727,48.516388,Yemen
242,-13.133897,27.849332,Zambia


In [7]:
happiness_df.shape

(146, 12)

In [8]:
income_inequality_df.shape

(106, 5)

In [30]:
file_path = "Resources/Unemployment_data_2021.csv"
country_unempl_df = pd.read_csv(file_path)
country_unempl_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,6.562179,6.445456,6.405195,6.490041,6.610205,6.714955,6.731163,6.914353,7.563187,8.111783
2,Afghanistan,AFG,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,11.341,11.193,11.142,11.127,11.158,11.18,11.152,11.217,11.71,13.283
3,Africa Western and Central,AFW,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,4.637602,4.410216,4.688088,4.626737,5.567017,6.019505,6.041092,6.063362,6.774914,6.839009
4,Angola,AGO,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,7.347,7.366,7.372,7.392,7.412,7.408,7.421,7.421,8.333,8.53
5,Albania,ALB,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,13.38,15.87,18.049999,17.190001,15.42,13.62,12.3,11.47,13.329,11.819
6,Andorra,AND,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,,,,,,,,,,
7,Arab World,ARB,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,10.663129,10.735393,10.872227,10.965901,10.761115,10.896379,10.502336,10.006154,11.487321,11.625344
8,United Arab Emirates,ARE,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,2.185,2.044,1.911,1.768,1.64,2.46,2.352,2.23,3.188,3.358
9,Argentina,ARG,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,7.22,7.1,7.27,7.521,8.111,8.35,9.22,9.84,11.46,10.902


In [31]:
country_unempl_df.dtypes

Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
1960              float64
1961              float64
1962              float64
1963              float64
1964              float64
1965              float64
1966              float64
1967              float64
1968              float64
1969              float64
1970              float64
1971              float64
1972              float64
1973              float64
1974              float64
1975              float64
1976              float64
1977              float64
1978              float64
1979              float64
1980              float64
1981              float64
1982              float64
1983              float64
1984              float64
1985              float64
1986              float64
1987              float64
1988              float64
1989              float64
1990              float64
1991              float64
1992              float64
1993              float64
1994        

In [32]:
country_unempl_df = country_unempl_df[['Country Name', '2021']]
country_unempl_df

Unnamed: 0,Country Name,2021
0,Aruba,
1,Africa Eastern and Southern,8.111783
2,Afghanistan,13.283
3,Africa Western and Central,6.839009
4,Angola,8.53
5,Albania,11.819
6,Andorra,
7,Arab World,11.625344
8,United Arab Emirates,3.358
9,Argentina,10.902


In [25]:
country_unempl_df.shape

(266, 2)

In [26]:
country_unempl_df.isnull().sum()

Country Name     0
2021            31
dtype: int64

In [37]:
country_unempl_df = country_unempl_df.rename(columns = {'Country Name':'Country name', '2021':'Unemployment rate'})
country_unempl_df

Unnamed: 0,Country name,Unemployment rate
0,Aruba,
1,Africa Eastern and Southern,8.111783
2,Afghanistan,13.283
3,Africa Western and Central,6.839009
4,Angola,8.53
5,Albania,11.819
6,Andorra,
7,Arab World,11.625344
8,United Arab Emirates,3.358
9,Argentina,10.902


In [38]:
happiness_2 = pd.merge(cleaned_2021_df,country_unempl_df, how='left' )
happiness_2

Unnamed: 0,Country name,Ladder score,Standard error of ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Unemployment rate
0,Finland,7.842,0.032,10.775,0.954,72.0,0.949,-0.098,0.186,7.525
1,Denmark,7.62,0.035,10.933,0.954,72.7,0.946,0.03,0.179,4.798
2,Switzerland,7.571,0.036,11.117,0.942,74.4,0.919,0.025,0.292,5.321
3,Iceland,7.554,0.059,10.878,0.983,73.0,0.955,0.16,0.673,5.401
4,Netherlands,7.464,0.027,10.932,0.942,72.4,0.913,0.175,0.338,4.01
5,Norway,7.392,0.035,11.053,0.954,73.3,0.96,0.093,0.27,4.985
6,Sweden,7.363,0.036,10.867,0.934,72.7,0.945,0.086,0.237,8.661
7,Luxembourg,7.324,0.037,11.647,0.908,72.6,0.907,-0.034,0.386,5.227
8,New Zealand,7.277,0.04,10.643,0.948,73.4,0.929,0.134,0.242,4.124
9,Austria,7.268,0.036,10.906,0.934,73.3,0.908,0.042,0.481,6.301


In [39]:
happiness_2.shape

(149, 10)

In [40]:
happiness_2.isnull().sum()

Country name                       0
Ladder score                       0
Standard error of ladder score     0
Logged GDP per capita              0
Social support                     0
Healthy life expectancy            0
Freedom to make life choices       0
Generosity                         0
Perceptions of corruption          0
Unemployment rate                 18
dtype: int64

In [43]:
happiness_3 = happiness_2.dropna()
happiness_3

Unnamed: 0,Country name,Ladder score,Standard error of ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Unemployment rate
0,Finland,7.842,0.032,10.775,0.954,72.0,0.949,-0.098,0.186,7.525
1,Denmark,7.62,0.035,10.933,0.954,72.7,0.946,0.03,0.179,4.798
2,Switzerland,7.571,0.036,11.117,0.942,74.4,0.919,0.025,0.292,5.321
3,Iceland,7.554,0.059,10.878,0.983,73.0,0.955,0.16,0.673,5.401
4,Netherlands,7.464,0.027,10.932,0.942,72.4,0.913,0.175,0.338,4.01
5,Norway,7.392,0.035,11.053,0.954,73.3,0.96,0.093,0.27,4.985
6,Sweden,7.363,0.036,10.867,0.934,72.7,0.945,0.086,0.237,8.661
7,Luxembourg,7.324,0.037,11.647,0.908,72.6,0.907,-0.034,0.386,5.227
8,New Zealand,7.277,0.04,10.643,0.948,73.4,0.929,0.134,0.242,4.124
9,Austria,7.268,0.036,10.906,0.934,73.3,0.908,0.042,0.481,6.301


In [44]:
happiness_3.shape

(131, 10)

In [47]:
y = happiness_3['Ladder score']
X = happiness_3[['Logged GDP per capita', 
                'Social support', 
                'Healthy life expectancy', 
                'Freedom to make life choices', 
                'Generosity', 
                'Perceptions of corruption'
               ]]

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()

est.summary()

0,1,2,3
Dep. Variable:,Ladder score,R-squared:,0.781
Model:,OLS,Adj. R-squared:,0.771
Method:,Least Squares,F-statistic:,73.74
Date:,"Sat, 07 May 2022",Prob (F-statistic):,1.55e-38
Time:,00:26:30,Log-Likelihood:,-98.919
No. Observations:,131,AIC:,211.8
Df Residuals:,124,BIC:,232.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.1922,0.670,-3.274,0.001,-3.518,-0.867
Logged GDP per capita,0.2293,0.095,2.416,0.017,0.041,0.417
Social support,2.7225,0.718,3.790,0.000,1.301,4.144
Healthy life expectancy,0.0419,0.015,2.832,0.005,0.013,0.071
Freedom to make life choices,1.5237,0.533,2.859,0.005,0.469,2.578
Generosity,0.2259,0.345,0.655,0.513,-0.456,0.908
Perceptions of corruption,-0.7930,0.296,-2.679,0.008,-1.379,-0.207

0,1,2,3
Omnibus:,15.339,Durbin-Watson:,1.631
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.362
Skew:,-0.764,Prob(JB):,0.00017
Kurtosis:,3.921,Cond. No.,1190.0


In [46]:
y = happiness_3['Ladder score']
X = happiness_3[['Logged GDP per capita', 
                'Social support', 
                'Healthy life expectancy', 
                'Freedom to make life choices', 
                'Generosity', 
                'Perceptions of corruption',
                'Unemployment rate'
               ]]

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()

est.summary()

0,1,2,3
Dep. Variable:,Ladder score,R-squared:,0.795
Model:,OLS,Adj. R-squared:,0.783
Method:,Least Squares,F-statistic:,68.02
Date:,"Sat, 07 May 2022",Prob (F-statistic):,2.81e-39
Time:,00:25:34,Log-Likelihood:,-94.717
No. Observations:,131,AIC:,205.4
Df Residuals:,123,BIC:,228.4
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.7941,0.666,-2.694,0.008,-3.112,-0.476
Logged GDP per capita,0.2801,0.094,2.980,0.003,0.094,0.466
Social support,2.8253,0.699,4.040,0.000,1.441,4.210
Healthy life expectancy,0.0341,0.015,2.327,0.022,0.005,0.063
Freedom to make life choices,1.1388,0.535,2.127,0.035,0.079,2.199
Generosity,-0.0044,0.345,-0.013,0.990,-0.687,0.678
Perceptions of corruption,-0.7123,0.289,-2.463,0.015,-1.285,-0.140
Unemployment rate,-0.0261,0.009,-2.855,0.005,-0.044,-0.008

0,1,2,3
Omnibus:,15.797,Durbin-Watson:,1.731
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.659
Skew:,-0.808,Prob(JB):,0.000146
Kurtosis:,3.788,Cond. No.,1220.0


In [53]:
cleaned_2021_df['Country name'].unique().difference(country_unempl_df['Country name'].unique())

AttributeError: 'numpy.ndarray' object has no attribute 'difference'

In [55]:
import numpy as np
# union of the series
union = pd.Series(np.union1d(cleaned_2021_df['Country name'], country_unempl_df['Country name']))
  
# intersection of the series
intersect = pd.Series(np.intersect1d(cleaned_2021_df['Country name'], country_unempl_df['Country name']))
  
# uncommon elements in both the series 
notcommon = union[~union.isin(intersect)]

notcommon

1                            Africa Eastern and Southern
2                             Africa Western and Central
5                                         American Samoa
6                                                Andorra
7                                                 Angola
8                                    Antigua and Barbuda
9                                             Arab World
12                                                 Aruba
16                                          Bahamas, The
19                                              Barbados
22                                                Belize
24                                               Bermuda
25                                                Bhutan
30                                British Virgin Islands
31                                     Brunei Darussalam
35                                            Cabo Verde
39                                Caribbean small states
40                             