In [22]:
#Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#Import data
fifteen = pd.read_csv("../data/2015.csv")
sixteen = pd.read_csv("../data/2016.csv")
seventeen = pd.read_csv("../data/2017.csv")
eighteen = pd.read_csv("../data/2018.csv")
nineteen = pd.read_csv("../data/2019.csv")

In [3]:
# Rename columns
seventeen = seventeen.rename(columns = {
    "Happiness.Rank":"Happiness Rank", "Economy..GDP.per.Capita.":"Economy (GDP per Capita)", 
    "Health..Life.Expectancy.":"Health (Life Expectancy)","Trust..Government.Corruption.":"Trust (Government Corruption)"
})

eighteen = eighteen.rename(columns = {
    "Country or region":"Country", "Overall rank":"Happiness Rank", "GDP per capita":"Economy (GDP per Capita)",
    "Social support":"Family", "Healthy life expectancy":"Health (Life Expectancy)", "Freedom to make life choices":"Freedom",
    "Perceptions of corruption":"Trust (Government Corruption)"
})
nineteen = nineteen.rename(columns = {
    "Country or region":"Country", "Overall rank":"Happiness Rank", "GDP per capita":"Economy (GDP per Capita)",
    "Social support":"Family", "Healthy life expectancy":"Health (Life Expectancy)", "Freedom to make life choices":"Freedom",
    "Perceptions of corruption":"Trust (Government Corruption)"
})

In [4]:

dfs = [fifteen, sixteen, seventeen, eighteen, nineteen]
for df in dfs:
    print(df.columns)

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Standard Error', 'Economy (GDP per Capita)', 'Family',
       'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
       'Generosity', 'Dystopia Residual'],
      dtype='object')
Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Lower Confidence Interval', 'Upper Confidence Interval',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity',
       'Dystopia Residual'],
      dtype='object')
Index(['Country', 'Happiness Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy (GDP per Capita)', 'Family',
       'Health (Life Expectancy)', 'Freedom', 'Generosity',
       'Trust (Government Corruption)', 'Dystopia.Residual'],
      dtype='object')
Index(['Happiness Rank', 'Country', 'Score', 'Economy (GDP per Capita)',
       'Family', 'Health (Life Expectancy)', 'Freedom', 'Gene

In [5]:
type(dfs)

list

In [6]:
df = pd.concat(dfs) # build pandas dataframe from list(dfs)
df = df.sort_values(by = ['Country']) # sort by country

In [7]:
# drop columns that are not necessary/helpful in analysis
df = df.drop(columns=['Happiness Score','Standard Error','Dystopia Residual','Lower Confidence Interval','Upper Confidence Interval','Happiness.Score','Whisker.high','Whisker.low','Dystopia.Residual','Score',])

df = df.reset_index() # reset the indext

# Drop Northern Cyprus because it is not recognised as a country (apart from by Turkey)
df = df.drop(index=df.iloc[np.where(df['Country'] == 'Northern Cyprus')].index)

df = df.drop(index=df.iloc[np.where(df['Country'] == 'Northern Macedonia')].index)

df.at[625,['Country']] = 'Somaliland Region'

In [8]:
type(df)

pandas.core.frame.DataFrame

In [9]:
# Pre-process countries
countries = list(set(df['Country'].values))
countries = sorted(countries)

In [10]:
# Drop countries with NaN's
new_regions = []
for i in range(len(countries)):
        country_index = np.where(df['Country'] == countries[i]) # Get df of current country
        current_country_df = df.iloc[country_index]
        if all(pd.isnull(current_country_df['Region'].values)): # if all null for regions, drop country from df
            country_index = np.where(df['Country'] == countries[i])[0]
            for vals in country_index:
                try:
                    df = df.drop(index = vals)
                except KeyError:
                    pass
        else:
            region_name_pos = np.where([type(x) == str for x in current_country_df["Region"].values])[0][0]
            region = current_country_df["Region"].values[region_name_pos]
            current_country_df = current_country_df.replace(np.nan, region)
            new_regions.append(current_country_df["Region"].values)

In [11]:
new_regions = list(new_regions)
k = 0
region_vals = []
for arr in new_regions:
    for vals in arr:
        region_vals.append(vals)
        k+=1
        
len(region_vals)
df["Region"] = region_vals
df.head()

Unnamed: 0,index,Country,Region,Happiness Rank,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,153,Afghanistan,Southern Asia,154,0.38227,0.11037,0.17344,0.1643,0.07112,0.31268
1,152,Afghanistan,Southern Asia,153,0.31982,0.30285,0.30335,0.23414,0.09719,0.3651
2,144,Afghanistan,Southern Asia,145,0.332,0.537,0.255,0.085,0.036,0.191
3,153,Afghanistan,Southern Asia,154,0.35,0.517,0.361,0.0,0.025,0.158
4,140,Afghanistan,Southern Asia,141,0.401477,0.581543,0.180747,0.10618,0.061158,0.311871


In [12]:
# One Hot Encode regions
ohe = OneHotEncoder(sparse=False)
ohe.fit(df[["Region"]])
df.head()

Unnamed: 0,index,Country,Region,Happiness Rank,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,153,Afghanistan,Southern Asia,154,0.38227,0.11037,0.17344,0.1643,0.07112,0.31268
1,152,Afghanistan,Southern Asia,153,0.31982,0.30285,0.30335,0.23414,0.09719,0.3651
2,144,Afghanistan,Southern Asia,145,0.332,0.537,0.255,0.085,0.036,0.191
3,153,Afghanistan,Southern Asia,154,0.35,0.517,0.361,0.0,0.025,0.158
4,140,Afghanistan,Southern Asia,141,0.401477,0.581543,0.180747,0.10618,0.061158,0.311871


In [13]:
df[ohe.get_feature_names_out()] = ohe.transform(df[["Region"]])
df.drop(columns="Region", inplace=True)
df.head()

Unnamed: 0,index,Country,Happiness Rank,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Region_Australia and New Zealand,Region_Central and Eastern Europe,Region_Eastern Asia,Region_Latin America and Caribbean,Region_Middle East and Northern Africa,Region_North America,Region_Southeastern Asia,Region_Southern Asia,Region_Sub-Saharan Africa,Region_Western Europe
0,153,Afghanistan,154,0.38227,0.11037,0.17344,0.1643,0.07112,0.31268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,152,Afghanistan,153,0.31982,0.30285,0.30335,0.23414,0.09719,0.3651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,144,Afghanistan,145,0.332,0.537,0.255,0.085,0.036,0.191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,153,Afghanistan,154,0.35,0.517,0.361,0.0,0.025,0.158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,140,Afghanistan,141,0.401477,0.581543,0.180747,0.10618,0.061158,0.311871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
happiness_benchmark = []
for rank in df['Happiness Rank'].values:
    if rank <= 20:
        happiness_benchmark.append(1)
    else:
        happiness_benchmark.append(0)
df['Happiness Rank'] = happiness_benchmark
df=df.sample(frac=1)

In [15]:
df.head()

Unnamed: 0,index,Country,Happiness Rank,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Region_Australia and New Zealand,Region_Central and Eastern Europe,Region_Eastern Asia,Region_Latin America and Caribbean,Region_Middle East and Northern Africa,Region_North America,Region_Southeastern Asia,Region_Southern Asia,Region_Sub-Saharan Africa,Region_Western Europe
375,53,Latvia,0,1.260749,1.404715,0.638567,0.325708,0.073843,0.153075,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,86,Azerbaijan,0,1.024,1.161,0.603,0.43,0.176,0.031,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,49,Belize,0,0.907975,1.081418,0.450192,0.547509,0.096581,0.240016,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
139,25,Chile,0,1.159,1.369,0.92,0.357,0.056,0.187,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
507,134,Niger,0,0.161925,0.993025,0.268505,0.363659,0.138573,0.228674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
df = df.drop(columns = ['Country', 'index'])
df = df.dropna(axis=0)
output = df['Happiness Rank']

In [17]:
df.head()

Unnamed: 0,Happiness Rank,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Region_Australia and New Zealand,Region_Central and Eastern Europe,Region_Eastern Asia,Region_Latin America and Caribbean,Region_Middle East and Northern Africa,Region_North America,Region_Southeastern Asia,Region_Southern Asia,Region_Sub-Saharan Africa,Region_Western Europe
375,0,1.260749,1.404715,0.638567,0.325708,0.073843,0.153075,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0,1.024,1.161,0.603,0.43,0.176,0.031,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,0,0.907975,1.081418,0.450192,0.547509,0.096581,0.240016,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
139,0,1.159,1.369,0.92,0.357,0.056,0.187,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0,0.161925,0.993025,0.268505,0.363659,0.138573,0.228674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
X = df.loc[:, df.columns != "Happiness Rank"]
y = df["Happiness Rank"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [26]:
rfc = RandomForestClassifier()
logreg = LogisticRegression()
dtc = DecisionTreeClassifier(max_depth = 2)

In [27]:
rfc.fit(X_train, y_train)
logreg.fit(X_train, y_train)
dtc.fit(X_train, y_train)

In [30]:
preds_dtc = dtc.predict(X_test)
preds_rfc = rfc.predict(X_test)
preds_logreg = logreg.predict(X_test)

In [36]:
print("RFC: ", accuracy_score(y_true = y_test, y_pred = preds_rfc))
print("DTC: ", accuracy_score(y_true = y_test, y_pred = preds_dtc))
print("LogReg: ", accuracy_score(y_true = y_test, y_pred = preds_logreg))

RFC:  0.961038961038961
DTC:  0.8831168831168831
LogReg:  0.948051948051948
