In [24]:
# Import All Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import ipywidgets as widgets
import sklearn.linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Input, Dense

In [25]:
# Load both raw inputs into DataFrames.
# The WEO export is parsed as tab-delimited text even though its file name ends in .xls.
hls_all_raw = pd.read_csv(filepath_or_buffer="HSL_ALL.csv")
weo_raw = pd.read_csv(filepath_or_buffer="WEOOct2023all.xls", delimiter="\t")

In [26]:
# Show the raw indicator column, then build and show the five-column working view
hls_columns = ["Country", "Indicator", "Type of indicator", "Time", "Value"]
print(hls_all_raw["Indicator"])
print("\n===========================================================\n")
hls_slice = pd.DataFrame(hls_all_raw, columns=hls_columns)
print(hls_slice)

0                  Household income
1                  Household income
2                  Household income
3                  Household income
4                  Household income
                    ...            
17544                      Earnings
17545                      Earnings
17546    Satisfaction with time use
17547    Satisfaction with time use
17548    Satisfaction with time use
Name: Indicator, Length: 17549, dtype: object


         Country                   Indicator Type of indicator  Time  \
0      Australia            Household income           Average  2004   
1      Australia            Household income           Average  2005   
2      Australia            Household income           Average  2006   
3      Australia            Household income           Average  2007   
4      Australia            Household income           Average  2008   
...          ...                         ...               ...   ...   
17544    Germany                    Earnings       D

In [27]:
# Restrict the working view to the "Life satisfaction" indicator and summarise it:
# the rows themselves, the row count, and the distinct countries covered.
divider = "\n===========================================================\n"
is_life_satisfaction = hls_all_raw["Indicator"] == "Life satisfaction"
hls_ls = hls_slice.loc[is_life_satisfaction]
print(hls_ls)
print(divider, "Total records:", len(hls_ls), sep="\n")

# Compute the distinct countries once and reuse them for both the count and the listing
ls_countries = hls_ls["Country"].unique()
print(divider, "Total Unique Countries:", len(ls_countries), sep="\n")

print(divider, "Country List", ls_countries, sep="\n")

         Country          Indicator Type of indicator  Time     Value
180    Australia  Life satisfaction           Average  2014  7.600000
181    Australia  Life satisfaction           Average  2019  7.500000
182    Australia  Life satisfaction           Average  2020  7.200000
489      Austria  Life satisfaction           Average  2013  7.800000
490      Austria  Life satisfaction           Average  2018  8.002416
...          ...                ...               ...   ...       ...
15227     Canada  Life satisfaction       Deprivation  2017  2.800000
15228     Canada  Life satisfaction       Deprivation  2018  2.500000
15229     Canada  Life satisfaction       Deprivation  2019  2.500000
15230     Canada  Life satisfaction       Deprivation  2020  2.800000
15231     Canada  Life satisfaction       Deprivation  2021  2.800000

[224 rows x 5 columns]


Total records:
224


Total Unique Countries:
35


Country List
['Australia' 'Austria' 'Belgium' 'Canada' 'Czechia' 'Denmark' 'Finland'

In [28]:
# Keep the "Average" life-satisfaction rows observed from 2009 to 2022 (inclusive).
# Both conditions are evaluated on hls_ls itself, so the boolean mask and the frame
# being indexed always share the same index (no reliance on implicit re-alignment).
in_study_period = hls_ls["Time"].between(2009, 2022)
is_average = hls_ls["Type of indicator"] == "Average"
hls_train = hls_ls.loc[in_study_period & is_average]
print("\n===========================================================\n")
print("Total records:")
print(len(hls_train))

print("\n===========================================================\n")
print("Total Unique Countries:")
print(len(hls_train["Country"].unique()))

print("\n===========================================================\n")
print("Record:")
print(hls_train)



Total records:
118


Total Unique Countries:
35


Record:
         Country          Indicator Type of indicator  Time     Value
180    Australia  Life satisfaction           Average  2014  7.600000
181    Australia  Life satisfaction           Average  2019  7.500000
182    Australia  Life satisfaction           Average  2020  7.200000
489      Austria  Life satisfaction           Average  2013  7.800000
490      Austria  Life satisfaction           Average  2018  8.002416
...          ...                ...               ...   ...       ...
11425  Lithuania  Life satisfaction           Average  2018  6.382413
11426  Lithuania  Life satisfaction           Average  2021  7.000000
11894   Slovenia  Life satisfaction           Average  2013  6.963051
11895   Slovenia  Life satisfaction           Average  2018  7.310714
11896   Slovenia  Life satisfaction           Average  2021  7.500000

[118 rows x 5 columns]


In [29]:
# Select the WEO rows whose subject code contains "LUR" (used downstream as the
# unemployment measurement) and keep the country plus the yearly columns 2009-2019.
# na=False makes rows with a missing subject code count as "no match"; without it the
# mask would contain NaN and .loc would refuse to index with it.
weo_year_columns = [str(year) for year in range(2009, 2020)]
weo_selected_measurement = weo_raw.loc[weo_raw['WEO Subject Code'].str.contains("LUR", na=False)]
weo_selected_measurement_2s = pd.DataFrame(weo_selected_measurement, columns=['Country'] + weo_year_columns)

# NOTE(review): the life-satisfaction data is filtered to 2009-2022 but only 2009-2019
# is selected here, so the inner join later discards 2020-2022 -- confirm this is intended.
weo_selected_measurement_2s.describe()

Unnamed: 0,Country,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
count,196,110.0,112.0,112.0,112.0,112.0,112.0,112,112.0,113.0,112,110.0
unique,196,107.0,108.0,109.0,110.0,110.0,107.0,110,108.0,113.0,108,108.0
top,Afghanistan,5.9,3.3,7.9,8.1,3.1,10.6,7,9.541,13.7,9,5.2
freq,1,3.0,2.0,2.0,2.0,2.0,2.0,2,2.0,1.0,2,2.0


In [30]:
# Reshape the wide WEO table (one column per year) into one row per (Country, Year)
weo_transposed_data = weo_selected_measurement_2s.melt(
    id_vars=['Country'],
    var_name='Year',
    value_name="Unemployment Measurement",
)
weo_transposed_data['Year'] = pd.to_numeric(weo_transposed_data['Year'])

# Give the HLS side the same key name and a numeric year so both frames join cleanly
hls_train = hls_train.rename(columns={"Time": "Year"})
hls_train['Year'] = pd.to_numeric(hls_train['Year'])

# Join on Country and Year, then give the HLS value column a descriptive name
merged_train_data = hls_train.merge(weo_transposed_data, on=["Country", "Year"])
merged_train_data = merged_train_data.rename(columns={"Value": "Happiness Measurement"})

In [31]:
# Summarise every column of the merged frame (numeric and non-numeric) before it is cleaned for training
merged_train_data.describe(include='all')

Unnamed: 0,Country,Indicator,Type of indicator,Year,Happiness Measurement,Unemployment Measurement
count,81,81,81,81.0,81.0,81.0
unique,31,1,1,,,76.0
top,Canada,Life satisfaction,Average,,,7.033
freq,11,81,81,,,2.0
mean,,,,2015.358025,7.348162,
std,,,,2.633005,0.711794,
min,,,,2009.0,5.7,
25%,,,,2013.0,6.951438,
50%,,,,2015.0,7.6,
75%,,,,2018.0,7.948925,


In [32]:
# Discard rows that carry no unemployment value, then re-inspect the summary
merged_train_data = merged_train_data.dropna(axis="index", how="any", subset=['Unemployment Measurement'])
merged_train_data.describe(include='all')

Unnamed: 0,Country,Indicator,Type of indicator,Year,Happiness Measurement,Unemployment Measurement
count,81,81,81,81.0,81.0,81.0
unique,31,1,1,,,76.0
top,Canada,Life satisfaction,Average,,,7.033
freq,11,81,81,,,2.0
mean,,,,2015.358025,7.348162,
std,,,,2.633005,0.711794,
min,,,,2009.0,5.7,
25%,,,,2013.0,6.951438,
50%,,,,2015.0,7.6,
75%,,,,2018.0,7.948925,


In [33]:
# Convert the unemployment values to float, then remove incomplete and duplicated rows.
merged_train_data['Unemployment Measurement'] = merged_train_data['Unemployment Measurement'].astype(float)
merged_train_data = (
    merged_train_data
    .dropna()
    .drop_duplicates()
    # Renumber the rows 0..n-1 so that row labels and row positions agree: the outlier
    # step further down feeds np.where positions to DataFrame.drop, which reads them
    # as index labels.
    .reset_index(drop=True)
)
merged_train_data.describe(include='all')

Unnamed: 0,Country,Indicator,Type of indicator,Year,Happiness Measurement,Unemployment Measurement
count,81,81,81,81.0,81.0,81.0
unique,31,1,1,,,
top,Canada,Life satisfaction,Average,,,
freq,11,81,81,,,
mean,,,,2015.358025,7.348162,7.437765
std,,,,2.633005,0.711794,4.512644
min,,,,2009.0,5.7,2.358
25%,,,,2013.0,6.951438,4.881
50%,,,,2015.0,7.6,6.408
75%,,,,2018.0,7.948925,8.325


In [46]:
# Drop the Outliers: compute the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) per variable.
# The quartiles are read from the data rather than hand-copied from a describe() table,
# so they stay correct if the inputs change (the copied happiness Q3 had been typed as
# 7.94 although describe() reported 7.948925).
# NOTE: because the fences now follow the data, run this cell on the cleaned,
# not-yet-filtered frame (i.e. as part of a top-to-bottom run).
x_q1, x_q3 = merged_train_data['Unemployment Measurement'].quantile([0.25, 0.75])
x_iqr = x_q3 - x_q1
x_upper_fence = x_q3 + 1.5 * x_iqr
x_lower_fence = x_q1 - 1.5 * x_iqr

y_q1, y_q3 = merged_train_data['Happiness Measurement'].quantile([0.25, 0.75])
y_iqr = y_q3 - y_q1
y_upper_fence = y_q3 + 1.5 * y_iqr
y_lower_fence = y_q1 - 1.5 * y_iqr


In [47]:
# Locate the rows lying outside the fences for each variable.
# np.where returns a tuple whose first element holds the matching row positions.
unemployment_values = merged_train_data['Unemployment Measurement']
happiness_values = merged_train_data['Happiness Measurement']

x_outliers = np.where(unemployment_values.gt(x_upper_fence) | unemployment_values.lt(x_lower_fence))
y_outliers = np.where(happiness_values.gt(y_upper_fence) | happiness_values.lt(y_lower_fence))

In [48]:
# Remove every row flagged as an outlier in either variable.
# x_outliers[0] / y_outliers[0] are row *positions* (np.where output) whereas
# DataFrame.drop works on index *labels*, so translate positions to labels first.
# Taking the union means a row flagged for both variables is dropped once instead of
# raising KeyError on a second drop.
outlier_positions = np.union1d(x_outliers[0], y_outliers[0])
merged_train_data = merged_train_data.drop(index=merged_train_data.index[outlier_positions])

In [49]:
# Training[80%] and Test[20%] Sets of the Data
# X is a single-feature column matrix (n, 1); Y is the matching target column.
X = np.array(merged_train_data["Unemployment Measurement"]).reshape(-1,1)
# Y for Fitting in Sci-Kit Learn
Y = np.array(merged_train_data["Happiness Measurement"]).reshape(-1,1)

# Flat 1-D views used for plotting and as the regression target
x = X.flatten()
y = Y.flatten()

# Training and Test Data (fixed random_state keeps the split repeatable)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# plot data
# Each figure is captured in its own Output widget so it can be displayed later.
out1 = widgets.Output()
with out1:
  plt.scatter(x, y)
  plt.xlabel('Unemployment')
  plt.ylabel('Happiness')
  plt.title("Data Plot: 2009 - 2022")
  plt.show()

# Fit linear model
model_linear = sklearn.linear_model.LinearRegression()
model_linear.fit(X_train, Y_train)
# Fit regression model
model_regr_1 = DecisionTreeRegressor(max_depth=2)
model_regr_2 = DecisionTreeRegressor(max_depth=6)
model_regr_1.fit(X_train, Y_train)
model_regr_2.fit(X_train, Y_train)

# Plot predictions - for Linear Regression
# Evaluate the fitted line on 15 evenly spaced points spanning the observed
# unemployment range, so the prediction line overlays the data instead of sitting
# at 0.00-0.14 on the x axis.
predict_x = np.linspace(x.min(), x.max(), 15).reshape(-1, 1)
# Predict Y - for Linear Regression
predict_y = model_linear.predict(predict_x)
# Predict - Decision Tree (kept in X_test order so they line up with Y_test)
y_1 = model_regr_1.predict(X_test)
y_2 = model_regr_2.predict(X_test)

# Linear Regression
out2 = widgets.Output()
with out2:
  plt.scatter(predict_x, predict_y)
  plt.scatter(x, y)
  plt.xlabel('Unemployment')
  plt.ylabel('Happiness')
  plt.title("Prediction Line: 2009 - 2022")
  plt.show()

# Decision Tree
out3 = widgets.Output()
with out3:
  # plt.plot joins points in array order; sort by unemployment so each tree's
  # prediction is drawn as a left-to-right step curve rather than a zig-zag.
  test_order = np.argsort(X_test[:, 0])
  plt.scatter(x, y, s=50, edgecolor="black", c="darkorange", label="data")
  plt.plot(X_test[test_order], y_1[test_order], color="blue", label="max_depth=2", linewidth=2)
  plt.plot(X_test[test_order], y_2[test_order], color="violet", label="max_depth=6", linewidth=2)
  plt.xlabel('Unemployment')
  plt.ylabel('Happiness')
  plt.title("Decision Tree Regressor: 2009 - 2022")
  plt.show()


In [50]:
# Neural Network
# Seed the Python, NumPy and backend random generators so weight initialisation and
# batch shuffling -- and therefore the training run -- are repeatable across
# Restart & Run All (device-level non-determinism may still apply).
keras.utils.set_random_seed(42)

# One scalar input -> six hidden layers of 250 units -> one scalar output.
tf_model = Sequential()
tf_model.add(Input(shape=(1,)))
# NOTE(review): the capitalised activation names are resolved by Keras from strings;
# whether they are accepted depends on the installed Keras version -- confirm when upgrading.
for hidden_activation in ("relu", "LeakyReLU", "ThresholdedReLU", "PReLU", "ELU", "relu"):
    tf_model.add(Dense(250, activation=hidden_activation))
tf_model.add(Dense(1))

tf_model.compile(loss='MSE', optimizer='Adam', metrics=['mse'])
# The fit call is the cell's last expression, so its History object is the cell output.
tf_model.fit(X_train, Y_train, batch_size=256, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7f7d3c4395d0>

In [51]:
# Neural-network fit on the training data: predicted vs. actual, then the residuals.
out4 = widgets.Output()
with out4:
    # Predictions stay in X_train order because later cells compare py with Y_train.
    py = tf_model.predict(X_train)
    # plt.plot joins points in array order, so draw everything sorted by unemployment;
    # otherwise the lines criss-cross the figure.
    train_order = np.argsort(X_train[:, 0])
    X_train_sorted = X_train[train_order]
    plt.plot(X_train_sorted, py[train_order])
    plt.plot(X_train_sorted, Y_train[train_order])
    plt.title("Model Fit")
    plt.show()
    # Residuals (actual - predicted), computed in one vectorised step
    es = np.ravel(Y_train) - np.ravel(py)
    plt.plot(X_train_sorted, es[train_order])
    plt.title("Error")
    plt.show()
    

In [52]:
# Score every model on the same held-out test split so the numbers are comparable.

# Linear Regression
# Predict on X_test itself; the plotting grid used for the prediction line is not
# paired with Y_test and must not be scored against it.
linear_test_pred = model_linear.predict(X_test)
mse_linear = round(mean_squared_error(Y_test, linear_test_pred), 2)
rmse_linear = round(root_mean_squared_error(Y_test, linear_test_pred), 2)
# Coefficient of determination (R^2)
coed_linear = round(r2_score(Y_test, linear_test_pred), 2)

# Decision Tree
mse_decision_tree_1 = round(mean_squared_error(Y_test, y_1), 2)
mse_decision_tree_2 = round(mean_squared_error(Y_test, y_2), 2)

#Neural Networks
# Evaluated on the test split like the other models (not on the data it was trained on)
nn_test_pred = np.ravel(tf_model.predict(X_test))
mse_nn = round(mean_squared_error(Y_test, nn_test_pred), 2)

out5 = widgets.Output()
with out5:
    # Normal Plot
    plt.scatter(X_test,Y_test)
    # Linear Regression
    plt.scatter(X_test, linear_test_pred, label=f"MSE Linear Regresssion: {mse_linear}")
    # Decision Tree (distinct colours so the two depths can be told apart)
    plt.scatter(X_test, y_1, color="blue", label=f"MSE Descision Tree Regresssor 1: {mse_decision_tree_1}")
    plt.scatter(X_test, y_2, color="violet", label=f"MSE Descision Tree Regresssor 2: {mse_decision_tree_2}")
    # Neural Network
    plt.scatter(X_test, nn_test_pred, label=f"MSE Neural Network: {mse_nn}")
    plt.xlabel('Unemployment')
    plt.ylabel('Happiness')
    plt.title("Prediction Line: 2009 - 2022")
    plt.legend()
    plt.show()
    
    

In [53]:
# Render the captured raw-data scatter plot
display(widgets.HBox(children=[out1]))

HBox(children=(Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<Figure size 640x480 wi…

In [54]:
# Render the captured linear-regression prediction plot
display(widgets.HBox(children=[out2]))

HBox(children=(Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<Figure size 640x480 wi…

In [55]:
# Render the captured decision-tree regressor plot
display(widgets.HBox(children=[out3]))

HBox(children=(Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<Figure size 640x480 wi…

In [56]:
# Render the captured neural-network fit and error plots
display(widgets.HBox(children=[out4]))

HBox(children=(Output(),))

In [57]:
# Render the captured model-comparison plot
display(widgets.HBox(children=[out5]))

HBox(children=(Output(),))