In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ahora usa X_scaled en lugar de X para tus modelos

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Cargar el archivo de datos
file_path = '../data/Model_Data.csv'
data = pd.read_csv(file_path)

# Convertir la columna 'Country' a variables dummy
data = pd.get_dummies(data, columns=['Country'], drop_first=True)

# Separar variables predictoras y variable objetivo
X = data.drop(columns=['Happiness Score'])
y = data['Happiness Score']

# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir los modelos de regresión
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Elastic Net Regression": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "K-Nearest Neighbors Regression": KNeighborsRegressor(n_neighbors=5),
    "Random Forest Regression": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost Regression": XGBRegressor(n_estimators=100, random_state=42)
}

# Entrenar los modelos y evaluar su rendimiento
results = {}
for model_name, model in models.items():
    # Entrenar el modelo
    model.fit(X_train, y_train)
    
    # Realizar predicciones en el conjunto de prueba
    y_pred = model.predict(X_test)
    
    # Calcular métricas
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {"Mean Squared Error": mse, "R2 Score": r2}

# Mostrar los resultados de los modelos
for model_name, metrics in results.items():
    print(f"{model_name}: MSE = {metrics['Mean Squared Error']:.4f}, R2 Score = {metrics['R2 Score']:.4f}")


Linear Regression: MSE = 0.0755, R2 Score = 0.9393
Ridge Regression: MSE = 0.0941, R2 Score = 0.9243
Lasso Regression: MSE = 0.4664, R2 Score = 0.6249
Elastic Net Regression: MSE = 0.4189, R2 Score = 0.6632
K-Nearest Neighbors Regression: MSE = 0.1789, R2 Score = 0.8562
Random Forest Regression: MSE = 0.1942, R2 Score = 0.8438
XGBoost Regression: MSE = 0.1386, R2 Score = 0.8885


In [18]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Configurar la búsqueda de hiperparámetros para XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_model = XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=50, scoring='r2', cv=5, random_state=42, n_jobs=-1)

random_search.fit(X_train, y_train)

# Mejor estimador y mejor puntuación
best_model = random_search.best_estimator_
print("Mejor puntuación R2:", random_search.best_score_)
print("Mejores parámetros:", random_search.best_params_)


Mejor puntuación R2: 0.8816568845342886
Mejores parámetros: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


In [6]:
from sklearn.model_selection import GridSearchCV

# Definir el espacio de búsqueda para los hiperparámetros
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Instanciar GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # Número de pliegues para la validación cruzada
                           n_jobs=-1,  # Utilizar todos los núcleos disponibles
                           scoring='r2')

# Ajustar el modelo
grid_search.fit(X_train, y_train)

# Mostrar los mejores hiperparámetros
print("Mejores Hiperparámetros:", grid_search.best_params_)

# Evaluar el modelo ajustado
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
r2_best = r2_score(y_test, y_pred_best)
print("R² del modelo optimizado:", r2_best)


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
215 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\juanm\AppData\Local\Programs\Python\Python311\L

Mejores Hiperparámetros: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
R² del modelo optimizado: 0.8148766106896578


In [None]:
import numpy as np

# Obtener importancias de características
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Mostrar las características más importantes
for f in range(X.shape[1]):
    print(f"{f + 1}. Característica: {X.columns[indices[f]]} (Importancia: {importances[indices[f]]})")


1. Característica: Health (Life Expectancy) (Importancia: 0.29159933702799695)
2. Característica: Economy (GDP per Capita) (Importancia: 0.2884084456453942)
3. Característica: Social support (Importancia: 0.14561332537483956)
4. Característica: Freedom (Importancia: 0.11995972162382947)
5. Característica: Trust (Government Corruption) (Importancia: 0.07940218541105797)
6. Característica: Generosity (Importancia: 0.05960343914474325)
7. Característica: Year (Importancia: 0.015413545772138429)


IndexError: index 7 is out of bounds for axis 0 with size 7

In [15]:
from xgboost import XGBRegressor

# Instanciar y ajustar el modelo XGBoost
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R² con XGBoost:", r2_xgb)


R² con XGBoost: 0.888515610668252


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ahora usa X_scaled en lugar de X para tus modelos


In [10]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(best_rf, X, y, cv=5, scoring='r2')
print("R² promedio con validación cruzada:", np.mean(cross_val_scores))


R² promedio con validación cruzada: 0.7941562526576961
