In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

data = pd.read_csv("../data/Model_Data.csv")

# Separar variables dependientes e independientes
X = data.drop(columns=['Happiness Score'])
y = data['Happiness Score']

# Realizar la partición de los datos en 70% entrenamiento y 30% prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Instanciar los modelos de regresión
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

# Entrenar y evaluar cada modelo
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)  # Entrenamiento
    y_pred = model.predict(X_test)  # Predicción
    r2 = r2_score(y_test, y_pred)  # Cálculo de R^2
    results[name] = r2  # Guardar resultado

results

{'Linear Regression': 0.7631982762214953,
 'Ridge': 0.764932422361992,
 'Lasso': -0.0010128125203199279,
 'ElasticNet': -0.0010128125203199279,
 'Decision Tree': 0.6161375078327344,
 'Random Forest': 0.8157932154005578,
 'Support Vector Regressor': -0.007542022649132507}

In [5]:
from sklearn.model_selection import GridSearchCV

# Definir el espacio de búsqueda para los hiperparámetros
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Instanciar GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # Número de pliegues para la validación cruzada
                           n_jobs=-1,  # Utilizar todos los núcleos disponibles
                           scoring='r2')

# Ajustar el modelo
grid_search.fit(X_train, y_train)

# Mostrar los mejores hiperparámetros
print("Mejores Hiperparámetros:", grid_search.best_params_)

# Evaluar el modelo ajustado
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
r2_best = r2_score(y_test, y_pred_best)
print("R² del modelo optimizado:", r2_best)


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
246 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\L

Mejores Hiperparámetros: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
R² del modelo optimizado: 0.8148766106896578


In [6]:
import numpy as np

# Obtener importancias de características
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Mostrar las características más importantes
for f in range(X.shape[1]):
    print(f"{f + 1}. Característica: {X.columns[indices[f]]} (Importancia: {importances[indices[f]]})")


1. Característica: Health (Life Expectancy) (Importancia: 0.29159933702799695)
2. Característica: Economy (GDP per Capita) (Importancia: 0.2884084456453942)
3. Característica: Social support (Importancia: 0.14561332537483956)
4. Característica: Freedom (Importancia: 0.11995972162382947)
5. Característica: Trust (Government Corruption) (Importancia: 0.07940218541105797)
6. Característica: Generosity (Importancia: 0.05960343914474325)
7. Característica: Year (Importancia: 0.015413545772138429)


In [7]:
from xgboost import XGBRegressor

# Instanciar y ajustar el modelo XGBoost
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R² con XGBoost:", r2_xgb)


R² con XGBoost: 0.7825527918390888


In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ahora usa X_scaled en lugar de X para tus modelos


In [9]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(best_rf, X, y, cv=5, scoring='r2')
print("R² promedio con validación cruzada:", np.mean(cross_val_scores))


R² promedio con validación cruzada: 0.7941562526576961
