In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Significance level.
# NOTE(review): not referenced in the cells visible here — presumably consumed
# by later inference cells (TODO confirm).  0.11 is an unusual value; confirm it
# is intentional rather than a typo for 0.1.
ALPHA = 0.11

---

In [3]:
def PlotSimpleRegression(data, variable, ax, target="Life Ladder"):
    """Draw a one-predictor regression fit together with its uncertainty bands.

    Four layers are drawn on ``ax``: the observations, the predicted mean,
    the prediction interval and the confidence interval of the mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``variable``, ``target``, ``"mean"``,
        ``"mean_ci_lower"``, ``"mean_ci_upper"``, ``"obs_ci_lower"`` and
        ``"obs_ci_upper"``.  The caller's frame is not modified.
        NOTE(review): these column names look like a statsmodels prediction
        summary joined to the observations — confirm against the caller.
    variable : str
        Name of the predictor column plotted on the x axis.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    target : str, default "Life Ladder"
        Name of the column holding the observed response.

    Returns
    -------
    matplotlib.axes.Axes
        The same ``ax`` that was passed in.
    """
    # All layers must share one x ordering, otherwise the line and the bands
    # zig-zag.  ``sort_values`` already returns a new frame, so no explicit
    # copy is needed to protect the caller's data.
    data = data.sort_values(variable).reset_index(drop=True)

    # Scatterplot of the observations
    sns.scatterplot(
        data=data,
        x=variable,
        y=target,
        ax=ax,
        label="Observations"
    )

    # Plot predicted mean
    ax.plot(
        data[variable],
        data["mean"],
        color="k",
        label="Prediction"
    )

    # Plot prediction interval (band for individual observations)
    ax.fill_between(
        data[variable],
        data["obs_ci_lower"],
        data["obs_ci_upper"],
        color="rebeccapurple",
        alpha=0.5,
        label="Prediction interval"
    )

    # Plot confidence interval (band for the mean)
    ax.fill_between(
        data[variable],
        data["mean_ci_lower"],
        data["mean_ci_upper"],
        color="pink",
        alpha=0.5,
        label="Confidence interval"
    )

    ax.legend(frameon=False)
    ax.spines[['right', 'top']].set_visible(False)

    return ax

In [4]:
def PlotCompareYHatY(data, ax, target="Life Ladder"):
    """Plot predicted against observed values with prediction-interval bars.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``target``, ``"mean"``, ``"obs_ci_lower"``
        and ``"obs_ci_upper"``.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    target : str, default "Life Ladder"
        Name of the column holding the observed response (x axis).

    Returns
    -------
    matplotlib.axes.Axes
        The same ``ax`` that was passed in.
    """
    observed = data[target]
    predicted = data["mean"]

    # Distance from the prediction to each interval bound, as a (2, N) array
    # (row 0: downward, row 1: upward).  Using both bounds instead of
    # mirroring the upper one keeps the bars correct even when the interval
    # is not symmetric around the prediction.
    interval = np.vstack([
        (predicted - data["obs_ci_lower"]).to_numpy(),
        (data["obs_ci_upper"] - predicted).to_numpy(),
    ])

    # ``fmt="o"`` already draws the point markers, so a separate scatter
    # layer would only paint the same points a second time.
    ax.errorbar(
        observed,
        predicted,
        yerr=interval,
        fmt="o",
        color="k"
    )

    # Identity line: points on it are predicted exactly.
    lowest, highest = observed.min(), observed.max()
    ax.plot(
        [lowest, highest]
        , [lowest, highest]
        , color='r'
        , linestyle='--'
    )

    ax.set_xlabel(r"$Y$")
    ax.set_ylabel(r"$\hat{Y}$")
    ax.spines[['right', 'top']].set_visible(False)

    return ax

---

Reading and preprocessing data

In [5]:
# Load the two input tables; both paths are relative to the working directory.
# NOTE(review): presumably the World Happiness Report 2024 panel — confirm source/version.
DataWhr2024 = pd.read_csv("DataWhr2024.csv")
# This file is parsed with ';' as the field separator.
# NOTE(review): presumably the UN M49 country/region classification — confirm source/version.
UnM49 = pd.read_csv("UnM49.csv", sep=';')

In [6]:
# Collapse long-form entries onto a short name: every row whose
# "Country name" starts with the prefix is rewritten in place.
for _prefix, _short_name in (
    ("Hong", "Hong Kong"),
    ("Somaliland", "Somaliland"),
    ("Taiwan", "Taiwan"),
):
    _starts_with_prefix = DataWhr2024["Country name"].str.startswith(_prefix)
    DataWhr2024.loc[_starts_with_prefix, "Country name"] = _short_name

In [7]:
# Keep only the naming / grouping columns, under the names the following
# cells use.  Renaming *before* selecting makes the cell safe to re-run:
# ``rename`` ignores labels that are no longer present, whereas selecting the
# original labels a second time raises ``KeyError``.
UnM49 = UnM49.rename(
    columns={
        'Country or Area': 'Country name',
        'Sub-region Name': 'Subregion',
        'Region Name': 'Continent',
    }
)[['Country name', 'Subregion', 'Continent']]

In [8]:
# Row label -> replacement "Country name" (presumably the spelling used in
# DataWhr2024, so that the two tables can be joined on it — TODO confirm).
# NOTE(review): the keys are row labels of the file as loaded; if the file's
# row order ever changes, the wrong rows get renamed without any error.
for _row_label, _country in {
    97: "Bolivia",
    33: "Congo (Brazzaville)",
    34: "Congo (Kinshasa)",
    124: "Hong Kong",
    125: "Macao",
    126: "North Korea",
    145: "Iran",
    46: "Ivory Coast",
    133: "Laos",
    129: "South Korea",
    173: "Moldova",
    217: "Netherlands",
    175: "Russia",
    164: "Syria",
    26: "Tanzania",
    116: "United States",
    193: "United Kingdom",
    111: "Venezuela",
    140: "Vietnam",
}.items():
    UnM49.loc[_row_label, "Country name"] = _country

In [9]:
# Extra territories appended to the region table (presumably present in the
# happiness data but absent from the UN list — TODO confirm).
# A descriptive name is used instead of ``_``, which IPython reserves for the
# last cell output.
ExtraRegions = pd.DataFrame(
    {
        "Country name": ["Kosovo", "Somaliland", "Taiwan"],
        "Subregion": ["Southern Europe", "Sub-Saharan Africa", "Eastern Asia"],
        "Continent": ["Europe", "Africa", "Asia"],
    }
)

# Remove any earlier copy of these rows before appending, so re-running the
# cell does not duplicate them (duplicate names would multiply rows in a
# later join on "Country name").  ``ignore_index`` renumbers the result 0..n-1.
UnM49 = pd.concat(
    [
        UnM49[~UnM49["Country name"].isin(ExtraRegions["Country name"])],
        ExtraRegions,
    ],
    ignore_index=True,
)

Merging the datasets

In [10]:
# Data: happiness records joined with their region labels.
# The key and join type are spelled out instead of relying on pandas'
# "all shared columns" default.  The join is inner, so rows whose
# "Country name" has no match in UnM49 are dropped.
Dat = pd.merge(DataWhr2024, UnM49, on="Country name", how="inner")

# Data of 2023, renumbered from 0
Dat2023 = Dat.loc[Dat["year"] == 2023].reset_index(drop=True)

In a previous analysis, I found that Afghanistan behaves as a leverage point, while Botswana and Sri Lanka behave as outliers. Thus, we will not consider these countries in our analyses.

In [11]:
# Show the three observations that are excluded from the analysis.  Selecting
# by country name rather than by row label keeps the cell valid if the row
# order of the data changes.
Dat2023[Dat2023["Country name"].isin(["Afghanistan", "Botswana", "Sri Lanka"])]

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Subregion,Continent
0,Afghanistan,2023,1.446,,0.368,55.2,0.228,,0.738,0.261,0.46,Southern Asia,Asia
13,Botswana,2023,3.332,9.673,0.701,55.0,0.741,-0.264,0.814,0.657,0.247,Sub-Saharan Africa,Africa
115,Sri Lanka,2023,3.602,9.364,0.79,67.4,0.754,0.05,0.922,0.709,0.353,Southern Asia,Asia


In [12]:
# Exclude the leverage point and the two outliers by country name.  Unlike
# dropping hard-coded row labels, this does not depend on the row order of
# the data and can be re-run without raising.  The remaining rows keep their
# original index labels; ``.copy()`` yields an independent frame.
Dat2023 = Dat2023[
    ~Dat2023["Country name"].isin(["Afghanistan", "Botswana", "Sri Lanka"])
].copy()

More preprocessing

In [13]:
# Response vector
Y = Dat2023.loc[:, "Life Ladder"]

# Design matrix: the eight candidate predictors, passed through
# ``sm.add_constant`` so that the model gets an intercept term.
X = sm.add_constant(
    Dat2023.loc[
        :,
        [
            'Log GDP per capita',
            'Social support',
            'Healthy life expectancy at birth',
            'Freedom to make life choices',
            'Generosity',
            'Perceptions of corruption',
            'Positive affect',
            'Negative affect',
        ],
    ]
)

---