In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
# Load the five raw yearly CSV files (all read with ISO-8859-1 encoding).
data2018, data2019, data2020, data2021, data2022 = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        '../data_files/raw/2018.csv',
        '../data_files/raw/2019.csv',
        '../data_files/raw/2020.csv',
        '../data_files/raw/2021.csv',
        '../data_files/raw/2022.csv',
    )
)

In [None]:
# Function to clean column names
def format_columns(df, year):
    """Return a copy of ``df`` with normalised column names and a leading ``year`` column.

    Column names are stripped of surrounding whitespace, lower-cased, and have
    spaces replaced by underscores. The input frame is left untouched, so the
    calling cell can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.
    year : scalar
        Value written to every row of the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``year`` as its first column.
    """
    formatted = df.copy()
    formatted.columns = (
        formatted.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
    )
    # insert() raises ValueError if the column already exists (e.g. when the
    # frame was formatted before), so replace any existing 'year' column.
    if 'year' in formatted.columns:
        formatted = formatted.drop(columns='year')
    formatted.insert(0, 'year', year)
    return formatted

In [None]:
# Normalise the headers of every yearly frame and tag it with its report year.
data2018, data2019, data2020, data2021, data2022 = (
    format_columns(frame, year)
    for frame, year in (
        (data2018, 2018),
        (data2019, 2019),
        (data2020, 2020),
        (data2021, 2021),
        (data2022, 2022),
    )
)

In [None]:
# Rename the 2018 'score' column to 'ladder_score' (the name the 2020 frame uses).
data2018 = data2018.rename({'score': 'ladder_score'}, axis='columns')

In [None]:
# Rename the 2019 'score' column to 'ladder_score' (the name the 2020 frame uses).
data2019 = data2019.rename({'score': 'ladder_score'}, axis='columns')

In [None]:
# Sort data2020 by ladder_score (highest first) so row position reflects the
# ranking, renumber the index from 0, then add a 1-based rank column after 'year'.
data2020 = data2020.sort_values('ladder_score', ascending=False).reset_index(drop=True)
data2020.insert(loc=1, column="rank", value=range(1, len(data2020) + 1))

In [None]:
# Columns to keep for 2020; every other column is discarded by the selection below.
columns_to_keep_2020 = [
    'year', 'rank', 'country_name', 'ladder_score',
    'explained_by:_log_gdp_per_capita', 'explained_by:_social_support',
    'explained_by:_healthy_life_expectancy',
    'explained_by:_freedom_to_make_life_choices',
    'explained_by:_generosity', 'explained_by:_perceptions_of_corruption',
    'dystopia_+_residual',
]
data2020 = data2020[columns_to_keep_2020]
data2020.head(5)

In [None]:
# Positional rename of the 2018 columns to short names
# (the frame must have exactly these 10 columns, in this order).
data2018.columns = [
    "year", "rank", "country", "happiness",
    "GDP", "social_support", "healthy",
    "freedom", "generosity", "corruption",
]

In [None]:
# Positional rename of the 2019 columns to short names
# (the frame must have exactly these 10 columns, in this order).
data2019.columns = [
    "year", "rank", "country", "happiness",
    "GDP", "social_support", "healthy",
    "freedom", "generosity", "corruption",
]

In [None]:
# Positional rename of the 2020 columns to short names
# (the frame must have exactly these 11 columns, in this order).
data2020.columns = [
    "year", "rank", "country", "happiness",
    "GDP", "social_support", "healthy",
    "freedom", "generosity", "corruption",
    "dystopia",
]

In [None]:
# Positional rename of all 21 columns of the 2021 frame to short names.
# The "exp_*" names label the second group of factor columns, which is the
# group kept by the later column selection.
data2021.columns = [
    "year", "country", "region",
    "happiness", "std", "high", "low",
    "GDP", "social_support", "healthy", "freedom", "generosity", "corruption",
    "dystopia",
    "exp_GDP", "exp_social", "exp_healthy", "exp_freedom", "exp_generosity",
    "exp_corruption",
    "dystopia_residual",
]

In [None]:
# Columns to keep for 2021; every other column is discarded by the selection below.
columns_to_keep_2021 = [
    'year', 'country', 'happiness',
    'exp_GDP', 'exp_social', 'exp_healthy',
    'exp_freedom', 'exp_generosity', 'exp_corruption',
    'dystopia_residual',
]
data2021 = data2021[columns_to_keep_2021]
data2021.head(5)

In [None]:
# Add a 1-based 'rank' column to 2021 as the second column.
# The frame is sorted by happiness first (as is done for 2020) so the rank is
# tied to the score rather than to the row order of the raw file. mergesort is
# stable, so an already-sorted file keeps its original order.
# drop/sort/reset_index return new frames, which avoids the
# SettingWithCopyWarning raised when assigning into a column-subset frame;
# dropping any existing 'rank' keeps the cell re-runnable.
data2021 = (
    data2021
    .drop(columns='rank', errors='ignore')
    .sort_values(by='happiness', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)
data2021.insert(1, 'rank', range(1, len(data2021) + 1))
data2021.columns

In [None]:
# Positional rename of the 11 remaining 2021 columns to the shared short names.
data2021.columns = [
    "year", "rank", "country", "happiness",
    "GDP", "social_support", "healthy",
    "freedom", "generosity", "corruption",
    "dystopia",
]
data2021.columns

In [None]:
# Positional rename of the 13 columns of the 2022 frame, then discard the
# 'high' and 'low' columns.
data2022.columns = [
    "year", "rank", "country", "happiness",
    "high", "low",
    "dystopia",
    "GDP", "social_support", "healthy", "freedom", "generosity", "corruption",
]
data2022 = data2022.drop(['high', 'low'], axis='columns')
data2022.columns

In [None]:
# Clean the country names (remove literal '*' characters) and delete the last row.
data2022['country'] = data2022['country'].str.replace('*', '', regex=False)
# NOTE(review): index label 146 is presumably the trailing non-country row of
# the raw file — confirm against the CSV.
# errors='ignore' keeps this cell re-runnable: once the row is gone, a plain
# drop(146) would raise KeyError.
data2022 = data2022.drop(index=146, errors='ignore')
# Show only the end of the frame, which is where the removed row used to be.
data2022.tail()

In [None]:
# Reorder the 2022 columns so they follow the same order as the other years.
new_column_order = [
    'year', 'rank', 'country', 'happiness',
    'GDP', 'social_support', 'healthy',
    'freedom', 'generosity', 'corruption',
    'dystopia',
]
data2022 = data2022[new_column_order]

In [None]:
# Columns whose values should be converted to a numeric dtype.
columns_to_numerical = [
    'happiness',
    'GDP',
    'social_support',
    'healthy',
    'freedom',
    'generosity',
    'corruption',
    'dystopia',
]

In [None]:
# Convert the decimal-comma strings of the 2022 frame to floats.
# Work on an explicit copy so assigning columns does not hit a
# column-subset frame (SettingWithCopyWarning).
data2022 = data2022.copy()
for col in columns_to_numerical:
    # astype(str) makes the conversion tolerant of missing values and of
    # columns that are already float (e.g. when this cell is re-run);
    # strings that are not valid numbers still raise ValueError.
    data2022[col] = (
        data2022[col]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .astype(float)
    )

In [None]:
# Suffix every 2019 column except 'year' and 'country' with the year.
data2019.columns = [
    "year", "rank2019", "country", "happiness2019",
    "GDP2019", "social_support2019", "healthy2019",
    "freedom2019", "generosity2019", "corruption2019",
]

In [None]:
# Suffix every 2020 column except 'year' and 'country' with the year.
data2020.columns = [
    "year", "rank2020", "country", "happiness2020",
    "GDP2020", "social_support2020", "healthy2020",
    "freedom2020", "generosity2020", "corruption2020",
    "dystopia2020",
]

In [None]:
# Suffix every 2021 column except 'year' and 'country' with the year.
data2021.columns = [
    "year", "rank2021", "country", "happiness2021",
    "GDP2021", "social_support2021", "healthy2021",
    "freedom2021", "generosity2021", "corruption2021",
    "dystopia2021",
]

In [None]:
# Suffix every 2022 column except 'year' and 'country' with the year.
data2022.columns = [
    "year", "rank2022", "country", "happiness2022",
    "GDP2022", "social_support2022", "healthy2022",
    "freedom2022", "generosity2022", "corruption2022",
    "dystopia2022",
]

In [None]:
# Mean, median and standard deviation of the happiness score for each year.
# Only the 2018 frame still uses the un-suffixed 'happiness' column name.
happiness_by_year = [
    ('2018', data2018['happiness']),
    ('2019', data2019['happiness2019']),
    ('2020', data2020['happiness2020']),
    ("2021", data2021['happiness2021']),
    ("2022", data2022['happiness2022']),
]
results = {
    'year': [label for label, _ in happiness_by_year],
    'mean': [scores.mean() for _, scores in happiness_by_year],
    'median': [scores.median() for _, scores in happiness_by_year],
    'std': [scores.std() for _, scores in happiness_by_year],
}
df_summary = pd.DataFrame(results)
df_summary

In [None]:
# Stack the 2018 and 2019 rows into one long table.
# data2019's columns carry a '2019' suffix at this point, while data2018's do
# not. concat aligns columns by name, so without removing the suffix the result
# would contain both sets of columns, each half-filled with NaN.
# (A merge on "country" would instead give one wide row per country.)
data2019_unsuffixed = data2019.rename(
    columns=lambda name: name[:-len('2019')] if name.endswith('2019') else name
)
# ignore_index avoids duplicated index labels in the stacked frame.
pd.concat([data2018, data2019_unsuffixed], axis=0, ignore_index=True)

In [None]:
# Line plot of the yearly mean happiness score taken from df_summary.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_summary['year'], df_summary['mean'], label='Mean', marker='o')
ax.set_title('Mean of Happiness score Over Years')
ax.set_xlabel('Year')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
# Write each cleaned yearly frame to the clean data folder, without the index column.
for frame, csv_path in (
    (data2018, '../data_files/clean/2018_clean.csv'),
    (data2019, '../data_files/clean/2019_clean.csv'),
    (data2020, '../data_files/clean/2020_clean.csv'),
    (data2021, '../data_files/clean/2021_clean.csv'),
    (data2022, '../data_files/clean/2022_clean.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Multiple regression 2022
# y is the dependent variable; X holds the four independent variables.
y = data2022['happiness2022']
# add_constant adds a constant column to X so the model has an intercept term b0:
#   y_pred = b0 * 1 + b1 * GDP2022 + b2 * social_support2022 + ...
X = sm.add_constant(
    data2022[['GDP2022', 'social_support2022', 'healthy2022', "freedom2022"]]
)
# Fit ordinary least squares and print the regression summary.
model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Columns of the 2022 frame to include in the correlation matrix
columns_list = [
    'happiness2022',
    'GDP2022',
    'social_support2022',
    'healthy2022',
    'freedom2022',
    "generosity2022",
    "corruption2022",
]
# Pairwise correlations between those columns
correlation_matrix = data2022.loc[:, columns_list].corr()

In [None]:
# Annotated heatmap of correlation_matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Correlation Matrix')
# Tilt the x tick labels; keep the y tick labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
plt.show()

In [None]:
# Scatter plot of happiness2022 against GDP2022.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data2022['GDP2022'], data2022['happiness2022'], alpha=0.7, edgecolors='w', s=100)
ax.set_title('GDP and Happiness Score 2022')
ax.set_xlabel('GDP')
# The original label had a stray closing parenthesis ('Happiness Score)').
ax.set_ylabel('Happiness Score')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of happiness2022 against freedom2022.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data2022['freedom2022'], data2022['happiness2022'], alpha=0.7, edgecolors='w', s=100)
ax.set_title('Freedom and Happiness Score 2022')
ax.set_xlabel('Freedom')
# The original label had a stray closing parenthesis ('Happiness Score)').
ax.set_ylabel('Happiness Score')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of happiness2022 against healthy2022.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data2022['healthy2022'], data2022['happiness2022'], alpha=0.7, edgecolors='w', s=100)
ax.set_title('Healthy and Happiness Score 2022')
ax.set_xlabel('Healthy')
# The original label had a stray closing parenthesis ('Happiness Score)').
ax.set_ylabel('Happiness Score')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of happiness2022 against social_support2022.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data2022['social_support2022'], data2022['happiness2022'], alpha=0.7, edgecolors='w', s=100)
ax.set_title('Social support and Happiness Score 2022')
ax.set_xlabel('Social Support')
# The original label had a stray closing parenthesis ('Happiness Score)').
ax.set_ylabel('Happiness Score')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of happiness2022 against generosity2022.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data2022['generosity2022'], data2022['happiness2022'], alpha=0.7, edgecolors='w', s=100)
ax.set_title('Generosity and Happiness Score 2022')
ax.set_xlabel('Generosity')
# The original label had a stray closing parenthesis ('Happiness Score)').
ax.set_ylabel('Happiness Score')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of happiness2022 against corruption2022.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data2022['corruption2022'], data2022['happiness2022'], alpha=0.7, edgecolors='w', s=100)
ax.set_title('Corruption and Happiness Score 2022')
ax.set_xlabel('Corruption')
# The original label had a stray closing parenthesis ('Happiness Score)').
ax.set_ylabel('Happiness Score')
ax.grid(True)
plt.show()