In [1]:
# Import All Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import ipywidgets as widgets
import sklearn.linear_model

In [2]:
# Load the two raw input tables used by the rest of the notebook.
# HSL_ALL.csv is parsed with the default comma delimiter.
hls_all_raw = pd.read_csv(filepath_or_buffer="HSL_ALL.csv")
# Despite its .xls extension, this file is parsed as tab-delimited text.
weo_raw = pd.read_csv(filepath_or_buffer="WEOOct2023all.xls", delimiter='\t')

In [3]:
# First look at both raw tables; the '\n' item puts blank lines between them.
for raw_item in (hls_all_raw, '\n', weo_raw):
    print(raw_item)

      LOCATION    Country TYPE_VAR Type of indicator VARIABLE  \
0          AUS  Australia  AVERAGE           Average      1_1   
1          AUS  Australia  AVERAGE           Average      1_1   
2          AUS  Australia  AVERAGE           Average      1_1   
3          AUS  Australia  AVERAGE           Average      1_1   
4          AUS  Australia  AVERAGE           Average      1_1   
...        ...        ...      ...               ...      ...   
17544      DEU    Germany      DEP       Deprivation      2_8   
17545      DEU    Germany      DEP       Deprivation      2_8   
17546      MEX     Mexico      DEP       Deprivation      4_4   
17547      CAN     Canada      DEP       Deprivation      4_4   
17548      CAN     Canada      DEP       Deprivation      4_4   

                        Indicator   WB Current/Future Well-being  SEX  \
0                Household income  CWB        Current Well-being  TOT   
1                Household income  CWB        Current Well-being  TOT   


In [4]:
# Build a five-column working copy of the raw table, then show the raw
# "Indicator" column followed by that working copy.
slice_columns = ["Country", "Indicator", "Type of indicator", "Time", "Value"]
hls_slice = pd.DataFrame(hls_all_raw, columns=slice_columns)

for shown in (
    hls_all_raw["Indicator"],
    "\n===========================================================\n",
    hls_slice,
):
    print(shown)

0                  Household income
1                  Household income
2                  Household income
3                  Household income
4                  Household income
                    ...            
17544                      Earnings
17545                      Earnings
17546    Satisfaction with time use
17547    Satisfaction with time use
17548    Satisfaction with time use
Name: Indicator, Length: 17549, dtype: object


         Country                   Indicator Type of indicator  Time  \
0      Australia            Household income           Average  2004   
1      Australia            Household income           Average  2005   
2      Australia            Household income           Average  2006   
3      Australia            Household income           Average  2007   
4      Australia            Household income           Average  2008   
...          ...                         ...               ...   ...   
17544    Germany                    Earnings       D

In [5]:
# Keep only the "Life satisfaction" rows of the working slice, then report
# the row count, the number of distinct countries and the country names.
is_life_satisfaction = hls_all_raw["Indicator"] == "Life satisfaction"
hls_ls = hls_slice.loc[is_life_satisfaction]
ls_countries = hls_ls["Country"].unique()
divider = "\n===========================================================\n"

print(hls_ls)
for heading, payload in (
    ("Total records:", len(hls_ls)),
    ("Total Unique Countries:", len(ls_countries)),
    ("Country List", ls_countries),
):
    # Each report section is: divider, heading, value.
    print(divider)
    print(heading)
    print(payload)

         Country          Indicator Type of indicator  Time     Value
180    Australia  Life satisfaction           Average  2014  7.600000
181    Australia  Life satisfaction           Average  2019  7.500000
182    Australia  Life satisfaction           Average  2020  7.200000
489      Austria  Life satisfaction           Average  2013  7.800000
490      Austria  Life satisfaction           Average  2018  8.002416
...          ...                ...               ...   ...       ...
15227     Canada  Life satisfaction       Deprivation  2017  2.800000
15228     Canada  Life satisfaction       Deprivation  2018  2.500000
15229     Canada  Life satisfaction       Deprivation  2019  2.500000
15230     Canada  Life satisfaction       Deprivation  2020  2.800000
15231     Canada  Life satisfaction       Deprivation  2021  2.800000

[224 rows x 5 columns]


Total records:
224


Total Unique Countries:
35


Country List
['Australia' 'Austria' 'Belgium' 'Canada' 'Czechia' 'Denmark' 'Finland'

In [6]:
# Filter to year 2013, keeping only the "Average" type-of-indicator rows.
# Both conditions are evaluated on hls_ls and combined into a single mask, so
# the boolean indexer always carries the same index as the frame it selects
# from (no implicit realignment between two different frames).
is_2013 = hls_ls["Time"] == 2013
is_average = hls_ls["Type of indicator"] == "Average"
hls_train = hls_ls.loc[is_2013 & is_average]

print("\n===========================================================\n")
print("Total records:")
print(len(hls_train))

print("\n===========================================================\n")
print("Total Unique Countries:")
print(len(hls_train["Country"].unique()))

print("\n===========================================================\n")
print("Record:")
print(hls_train)



Total records:
29


Total Unique Countries:
29


Record:
               Country          Indicator Type of indicator  Time     Value
489            Austria  Life satisfaction           Average  2013  7.800000
835            Belgium  Life satisfaction           Average  2013  7.600000
1158            Canada  Life satisfaction           Average  2013  8.000000
1484           Czechia  Life satisfaction           Average  2013  6.879920
1822           Denmark  Life satisfaction           Average  2013  8.010768
2165           Finland  Life satisfaction           Average  2013  8.045856
2510            France  Life satisfaction           Average  2013  7.053189
2854           Germany  Life satisfaction           Average  2013  7.254309
3187            Greece  Life satisfaction           Average  2013  6.160032
3539           Hungary  Life satisfaction           Average  2013  6.112792
3849           Iceland  Life satisfaction           Average  2013  7.948925
4147           Ireland  Life 

In [7]:
# Select the WEO rows for one subject code, then keep the Country column plus
# the yearly columns 2009-2019.
# "LUR" is the series whose 2013 column is later renamed to
# "Unemployment Measurement"; "NGDP_RPCH" is the alternative subject code that
# was previously tried on this line.
# na=False makes rows with a missing subject code count as non-matching
# (otherwise the mask contains NaN, which .loc refuses to index with), and
# regex=False matches the code as literal text.
is_selected_subject = weo_raw['WEO Subject Code'].str.contains("LUR", na=False, regex=False)
weo_selected_measurement = weo_raw.loc[is_selected_subject]
# NOTE: despite the "_2013" suffix, this frame carries every year from 2009 to 2019.
weo_selected_measurement_2013 = pd.DataFrame(weo_selected_measurement, columns=['Country', '2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019'])

weo_selected_measurement_2013.describe()

Unnamed: 0,Country,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
count,196,110.0,112.0,112.0,112.0,112.0,112.0,112,112.0,113.0,112,110.0
unique,196,107.0,108.0,109.0,110.0,110.0,107.0,110,108.0,113.0,108,108.0
top,Afghanistan,5.9,3.3,7.9,8.1,3.1,10.6,7,9.541,13.7,9,5.2
freq,1,3.0,2.0,2.0,2.0,2.0,2.0,2,2.0,1.0,2,2.0


In [8]:
# Clean data for training: join the happiness rows with the WEO rows on
# "Country" (pd.merge's default inner join, so a country missing from either
# table is dropped), give the two measurement columns descriptive names, and
# keep only the three columns the model needs.
merged_train_data = pd.DataFrame(
    hls_train.merge(weo_selected_measurement_2013, on="Country").rename(
        columns={"Value": "Happiness Measurement", "2013": "Unemployment Measurement"}
    ),
    columns=['Country', 'Happiness Measurement', 'Unemployment Measurement'],
)

merged_train_data.describe(include='all')

Unnamed: 0,Country,Happiness Measurement,Unemployment Measurement
count,27,27.0,27.0
unique,27,,27.0
top,Austria,,5.35
freq,1,,1.0
mean,,7.199823,
std,,0.702092,
min,,5.7,
25%,,6.714711,
50%,,7.3154,
75%,,7.843941,


In [9]:
# Discard rows whose unemployment value is missing (NaN), then summarise
# every column of what remains.
required_columns = ['Unemployment Measurement']
merged_train_data = merged_train_data.dropna(axis=0, how='any', subset=required_columns)
merged_train_data.describe(include='all')

Unnamed: 0,Country,Happiness Measurement,Unemployment Measurement
count,27,27.0,27.0
unique,27,,27.0
top,Austria,,5.35
freq,1,,1.0
mean,,7.199823,
std,,0.702092,
min,,5.7,
25%,,6.714711,
50%,,7.3154,
75%,,7.843941,


In [10]:
# Cast the unemployment column to float (the summaries above report it with
# unique/top rather than mean/std, i.e. it is not numeric yet), then remove
# rows containing any missing value and any exact duplicate rows.
merged_train_data = (
    merged_train_data
    .astype({'Unemployment Measurement': float})
    .dropna()
    .drop_duplicates()
)
merged_train_data.describe(include='all')

Unnamed: 0,Country,Happiness Measurement,Unemployment Measurement
count,27,27.0,27.0
unique,27,,
top,Austria,,
freq,1,,
mean,,7.199823,10.079778
std,,0.702092,5.884017
min,,5.7,3.1
25%,,6.714711,7.0025
50%,,7.3154,8.55
75%,,7.843941,11.819


In [11]:
# Drop the Outliers, step 1: compute the 1.5 * IQR fences for unemployment.
# The quartiles are taken from the data itself rather than copied by hand from
# the describe() output, so the fences stay correct if the inputs or the
# filters above change. Series.quantile and describe() both use linear
# interpolation by default, so this reproduces the 25% / 75% figures shown in
# the summary above.
# Run this before the outlier rows are removed: re-running it afterwards
# would recompute the fences on the already-trimmed data.
unemployment_q1, unemployment_q3 = merged_train_data['Unemployment Measurement'].quantile([0.25, 0.75])
iqr = unemployment_q3 - unemployment_q1
upper_fence = unemployment_q3 + 1.5 * iqr
lower_fence = unemployment_q1 - 1.5 * iqr

In [12]:
# Drop the Outliers, step 2: flag every row whose unemployment value lies
# outside the fences.
# The following cell hands outliers[0] to DataFrame.drop, which looks rows up
# by index label. np.where would return row positions instead, and those only
# coincide with labels while the frame still has an untouched 0..n-1 index, so
# the index labels are collected directly. The 1-tuple wrapper keeps the
# existing `outliers[0]` access pattern working.
unemployment = merged_train_data['Unemployment Measurement']
outside_fences = (unemployment > upper_fence) | (unemployment < lower_fence)
outliers = (merged_train_data.index[outside_fences.to_numpy()].to_numpy(),)

In [13]:
# Remove the flagged rows. DataFrame.drop matches the values in outliers[0]
# against the frame's index labels (axis 0), not against row positions.
flagged_rows = outliers[0]
merged_train_data = merged_train_data.drop(index=flagged_rows)

In [14]:
# Fit a one-feature linear regression (unemployment -> happiness) on the
# cleaned 2013 data and show the training scatter next to the fitted
# predictions.

# Column vectors of shape (n, 1) for scikit-learn, plus nested-list copies
# that are handed to matplotlib.
X = merged_train_data["Unemployment Measurement"].to_numpy().reshape(-1, 1)
Y = merged_train_data["Happiness Measurement"].to_numpy().reshape(-1, 1)
x, y = X.tolist(), Y.tolist()

# fit linear model
model = sklearn.linear_model.LinearRegression()
model.fit(X, Y)

# Prediction grid: 0.00, 0.01, ..., 9.00 as single-feature rows.
predict_x = [[step / 100] for step in range(901)]
predict_y = model.predict(predict_x)

# Left panel: the raw training points.
out1 = widgets.Output()
with out1:
    data_ax = plt.subplots()[1]
    data_ax.scatter(x, y)
    data_ax.set(xlabel='Unemployment', ylabel='Happiness', title="Data Plot: 2013")
    plt.show()

# Right panel: predictions drawn first, training points drawn on top.
out2 = widgets.Output()
with out2:
    fit_ax = plt.subplots()[1]
    fit_ax.scatter(predict_x, predict_y)
    fit_ax.scatter(x, y)
    fit_ax.set(xlabel='Unemployment', ylabel='Happiness', title="Prediction Line: 2013 -")
    plt.show()

# Lay the two captured figures out side by side.
display(widgets.HBox([out1, out2]))

HBox(children=(Output(), Output()))