## ***MODEL TRAINING***

### **Required Libraries**

In [90]:
import sys
import os
project_path = os.path.abspath("../../workshop_3")
sys.path.append(project_path)

In [91]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

### **Load data**

In [92]:
df = pd.read_csv("../data/output/happiness_merged.csv")
df.head(5)

Unnamed: 0,year,country,continent,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,corruption,generosity
0,2015,Switzerland,Europe,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015,Iceland,Europe,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2015,Denmark,Europe,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015,Norway,Europe,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015,Canada,North America,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


### **Dummy variables**

In [93]:
def create_dummy(df):
    """
    Crea dummy variables para las columnas categóricas especificadas.
    
    Parámetros:
    - df (pd.DataFrame): El dataframe que contiene las columnas categóricas.

    Retorna:
    - pd.DataFrame: El dataframe con las dummy variables agregadas.
    """
    dummies = pd.get_dummies(df["continent"], drop_first=True)
    
    df = df.drop(columns=["continent"])
    
    df = pd.concat([df, dummies], axis=1)
    
    return df

In [94]:
df = create_dummy(df)
df.head(5)

Unnamed: 0,year,country,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,corruption,generosity,America,Asia,Central America,Europe,North America,Oceania,South America
0,2015,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,False,False,False,True,False,False,False
1,2015,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,False,False,False,True,False,False,False
2,2015,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,False,False,False,True,False,False,False
3,2015,Norway,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,False,False,False,True,False,False,False
4,2015,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,False,False,False,False,True,False,False


In [95]:
df.info()   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     782 non-null    int64  
 1   country                  782 non-null    object 
 2   happiness_score          782 non-null    float64
 3   gdp_per_capita           782 non-null    float64
 4   social_support           782 non-null    float64
 5   healthy_life_expectancy  782 non-null    float64
 6   freedom                  782 non-null    float64
 7   corruption               782 non-null    float64
 8   generosity               782 non-null    float64
 9   America                  782 non-null    bool   
 10  Asia                     782 non-null    bool   
 11  Central America          782 non-null    bool   
 12  Europe                   782 non-null    bool   
 13  North America            782 non-null    bool   
 14  Oceania                  7

In [96]:
rename = {
    'America' :'america',
    'Asia' : 'asia',
    'Europe' : 'europe',
    'Oceania' : 'oceania',
    'Central America' : 'central_america',
    'North America' : 'north_america',
    'South America' : 'south_america'

}
df.rename(columns=rename, inplace=True)
df.head(5)

Unnamed: 0,year,country,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,corruption,generosity,america,asia,central_america,europe,north_america,oceania,south_america
0,2015,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,False,False,False,True,False,False,False
1,2015,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,False,False,False,True,False,False,False
2,2015,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,False,False,False,True,False,False,False
3,2015,Norway,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,False,False,False,True,False,False,False
4,2015,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,False,False,False,False,True,False,False


### **Data division**

In [97]:
X = df.drop(columns=["happiness_score", "country"])
y = df["happiness_score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (625, 14)
X_test shape: (157, 14)


## **Model training**

### **Models**

In [98]:
modelos = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor()
}

In [102]:
resultados = []

for nombre, modelo in modelos.items():
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", modelo)
    ])
    
    pipeline.fit(X_train, y_train)
    
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    
    train_rmse = mean_squared_error(y_train, y_train_pred)
    test_rmse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    resultados.append({
        "Modelo": nombre,
        "Train MSE": train_rmse,
        "Test MSE": test_rmse,
        "Train R2": train_r2,
        "Test R2": test_r2
    })

In [103]:
resultados_df = pd.DataFrame(resultados).sort_values(by="Test R2", ascending=False)
print("\n Resultados del Entrenamiento:")
print(resultados_df)


 Resultados del Entrenamiento:
              Modelo  Train MSE  Test MSE  Train R2   Test R2
1      Random Forest   0.029483  0.170735  0.976817  0.862949
3            XGBoost   0.000126  0.171540  0.999901  0.862303
2  Gradient Boosting   0.093289  0.183958  0.926644  0.852334
0  Linear Regression   0.239794  0.214016  0.811442  0.828206


### **Save Model**

In [104]:
model = '../model/model.pkl'

with open(model, 'wb') as file:
    pickle.dump(pipeline, file)

print(f"Modelo guardado en: {model}")

Modelo guardado en: ../model/model.pkl


## **Conclusion**

During the training and evaluation phase of regression models, four algorithms were compared: Random Forest, XGBoost, Gradient Boosting and Linear Regression. The final model selection was based on the analysis of performance metrics in both the training and test sets.

The Random Forest model presented an outstanding balance between accuracy and generalization. It obtained a test MSE of 0.1707 and a test R² of 0.8629, indicating that it explains approximately 86% of the variance in the unseen data. Although XGBoost showed a very close test MSE (0.1715) and a similar R² (0.8623), its extremely low performance on the training MSE (0.0001) and near perfect R² (0.9999) suggest possible overfitting, which may compromise its robustness in productive environments.

Gradient Boosting also performed solidly, but with a slightly higher MSE Test (0.1839) and lower R² (0.8523), which places it below the previous ones. Finally, linear regression was the poorest performing model, with a significantly higher MSE (0.2140) and lower R² (0.8282), which was expected given its inability to model complex nonlinear relationships.

In conclusion, **Random Forest** was selected as the final model due to its generalization capacity, high predictive power and stability against overfitting, positioning it as the best option for use within the streaming system implemented in the project.