In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
import math
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pickle

In [79]:
df = pd.read_csv('../data/processed/features_selection.csv') 

In [80]:
df.head()

Unnamed: 0,gdp_per_capita,family,freedom,life_expectancy,social_support,score
0,1.39651,1.34951,0.66557,0.94143,1.652631,7.587
1,1.30232,1.40223,0.62877,0.94784,1.628286,7.561
2,1.32548,1.36058,0.64938,0.87464,1.620469,7.527
3,1.459,1.33095,0.66973,0.88521,1.650376,7.522
4,1.32629,1.32261,0.63297,0.90563,1.607299,7.427


# Model Training

## Selection of features

In [81]:
X = df[['gdp_per_capita', 'family', 'freedom', 'life_expectancy', 'social_support']]

vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data

Unnamed: 0,Variable,VIF
0,gdp_per_capita,20.414279
1,family,26.17572
2,freedom,11.961291
3,life_expectancy,20.537147
4,social_support,43.62277


In [82]:
def evaluate_model(name, X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = metrics.r2_score(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    return {"Modelo": name, "R²": r2, "MAE": mae, "RMSE": rmse}

y = df['score']

sets = {
    "Modelo 1: GDP + Social Support": ['gdp_per_capita', 'social_support'],
    "Modelo 2: GDP + Life Expectancy + Social Support": ['gdp_per_capita', 'life_expectancy', 'social_support'],
    "Modelo 3: GDP + Family + Freedom": ['gdp_per_capita', 'family', 'freedom'],
    "Modelo 4: GDP + Family + Life Expectancy": ['gdp_per_capita', 'family', 'life_expectancy'],
    "Modelo 5: Todas las variables": ['gdp_per_capita', 'family', 'freedom', 'life_expectancy', 'social_support']
}

results = []
for name, features in sets.items():
    X = df[features].dropna()
    y_ = y.loc[X.index]
    results.append(evaluate_model(name, X, y_))

df_results = pd.DataFrame(results)
print(df_results)

                                             Modelo        R²       MAE  \
0                    Modelo 1: GDP + Social Support  0.769728  0.411021   
1  Modelo 2: GDP + Life Expectancy + Social Support  0.776812  0.408609   
2                  Modelo 3: GDP + Family + Freedom  0.692096  0.487746   
3          Modelo 4: GDP + Family + Life Expectancy  0.704246  0.481329   
4                     Modelo 5: Todas las variables  0.803618  0.380818   

       RMSE  
0  0.536197  
1  0.527886  
2  0.620029  
3  0.607672  
4  0.495170  


In [83]:
X = df[['gdp_per_capita', 'life_expectancy', 'social_support']].dropna()

vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data

Unnamed: 0,Variable,VIF
0,gdp_per_capita,19.008588
1,life_expectancy,20.435076
2,social_support,18.651086


## Model Election

In [84]:
features = ['gdp_per_capita', 'life_expectancy', 'social_support']
df_model = df.dropna(subset=features + ['score']).copy()

X = df_model[features]
y = df_model['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = metrics.r2_score(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

print(f'R²={r2:.3f}\nMAE={mae:.3f}\nRMSE={rmse:.3f}')

R²=0.777
MAE=0.409
RMSE=0.528


## Export

In [85]:
with open("../models/happiness_score_lr_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [86]:
df_export = df[['gdp_per_capita', 'life_expectancy', 'social_support', 'score']]
df_export.to_csv("../data/processed/Happiness_Score.csv", index=False)
df_export.head()

Unnamed: 0,gdp_per_capita,life_expectancy,social_support,score
0,1.39651,0.94143,1.652631,7.587
1,1.30232,0.94784,1.628286,7.561
2,1.32548,0.87464,1.620469,7.527
3,1.459,0.88521,1.650376,7.522
4,1.32629,0.90563,1.607299,7.427
