## ***MODEL TRAINING***

### **Required Libraries**

In [77]:
import sys
import os
# Resolve the workshop root (relative to the current working directory) and
# put it on the import path — presumably so project-local modules can be
# imported from this notebook; confirm against later cells.
project_path = os.path.abspath("../../workshop_3")
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if project_path not in sys.path:
    sys.path.append(project_path)

In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

### **Read data**

In [79]:
# Load the merged happiness CSV from the project's output folder and preview
# the first five rows (five is the default for .head()).
df = pd.read_csv("../data/output/happiness_merged.csv")
df.head()

Unnamed: 0,year,country,continent,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,corruption,generosity
0,2015,Switzerland,Europe,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015,Iceland,Europe,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2015,Denmark,Europe,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015,Norway,Europe,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015,Canada,North America,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


### **Dummy variables**

In [80]:
def create_dummy(df, column="continent"):
    """
    Replace a categorical column with one-hot (dummy) indicator columns.

    The first category level is dropped (``drop_first=True``), so rows that
    belong to that level are represented by every indicator being false.

    Parameters:
    - df (pd.DataFrame): Dataframe containing the categorical column. It is
      not modified; a new dataframe is returned.
    - column (str): Name of the categorical column to encode. Defaults to
      "continent", the column this function previously hard-coded.

    Returns:
    - pd.DataFrame: ``df`` without ``column`` and with the indicator columns
      appended on the right, named after the category values.

    Raises:
    - KeyError: If ``column`` is not present in ``df``.
    """
    dummies = pd.get_dummies(df[column], drop_first=True)

    # Drop the original categorical column and append its indicators in a
    # single expression; the caller's dataframe is left untouched.
    return pd.concat([df.drop(columns=[column]), dummies], axis=1)

In [81]:
# One-hot encode the continent column. Note: this rebinds `df`, and because
# create_dummy drops "continent", running this cell a second time raises
# KeyError — reload the data first if it must be re-run.
df = df.pipe(create_dummy)
df.head()

Unnamed: 0,year,country,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,corruption,generosity,America,Asia,Central America,Europe,North America,Oceania,South America
0,2015,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,False,False,False,True,False,False,False
1,2015,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,False,False,False,True,False,False,False
2,2015,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,False,False,False,True,False,False,False
3,2015,Norway,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,False,False,False,True,False,False,False
4,2015,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,False,False,False,False,True,False,False


In [82]:
# Check column dtypes and non-null counts after the dummy encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     782 non-null    int64  
 1   country                  782 non-null    object 
 2   happiness_score          782 non-null    float64
 3   gdp_per_capita           782 non-null    float64
 4   social_support           782 non-null    float64
 5   healthy_life_expectancy  782 non-null    float64
 6   freedom                  782 non-null    float64
 7   corruption               782 non-null    float64
 8   generosity               782 non-null    float64
 9   America                  782 non-null    bool   
 10  Asia                     782 non-null    bool   
 11  Central America          782 non-null    bool   
 12  Europe                   782 non-null    bool   
 13  North America            782 non-null    bool   
 14  Oceania                  7

### **Data division**

In [83]:
# Target is the happiness score; features are every remaining column except
# the country and year columns.
y = df["happiness_score"]
X = df.drop(columns=["happiness_score", "country", "year"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=77,
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (547, 13)
X_test shape: (235, 13)


## **Model training**

### **Models**

In [84]:
# Candidate regressors to compare, keyed by the display name used in the
# results table. The stochastic estimators are seeded with the same value as
# the train/test split (77) so the reported metrics are reproducible on a
# fresh "Restart & Run All"; without a seed the tree ensembles can yield
# different scores on every run.
modelos = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=77),
    "Gradient Boosting": GradientBoostingRegressor(random_state=77),
    "XGBoost": XGBRegressor(random_state=77)
}

In [85]:
# Fit every candidate model on the training split and collect train/test
# metrics, one dict per model, for the comparison table.
resultados = []

for nombre, modelo in modelos.items():
    # Scaling is a pipeline step, so the scaler is fitted on the training
    # data only and then re-applied unchanged when predicting on the test set.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", modelo)
    ])

    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # mean_squared_error is called with its defaults here, i.e. plain MSE
    # (no square root), so the variables are named *_mse to match the
    # "Train MSE" / "Test MSE" keys they are reported under.
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    resultados.append({
        "Modelo": nombre,
        "Train MSE": train_mse,
        "Test MSE": test_mse,
        "Train R2": train_r2,
        "Test R2": test_r2
    })

In [90]:
# Build the comparison table and rank the models by test-set R2, best first.
resultados_df = pd.DataFrame(resultados)
resultados_df = resultados_df.sort_values(by="Test R2", ascending=False)

print("\n Resultados del Entrenamiento:")
print(resultados_df)


 Resultados del Entrenamiento:
              Modelo  Train MSE  Test MSE  Train R2   Test R2
2  Gradient Boosting   0.089863  0.192603  0.929207  0.848129
1      Random Forest   0.030327  0.195096  0.976108  0.846163
0  Linear Regression   0.250576  0.209521  0.802598  0.834789
3            XGBoost   0.000055  0.213305  0.999957  0.831805
