In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Cargar los datos
file_path = '../data/Model_Data.csv'
data = pd.read_csv(file_path)

# Realizar codificación one-hot para la columna 'Country'
data = pd.get_dummies(data, columns=['Country'], drop_first=True)

# Separar las características (X) y la variable objetivo (y)
X = data.drop(columns=['Happiness Score'])
y = data['Happiness Score']

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Definir los modelos a evaluar
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Elastic Net Regression": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "K-Nearest Neighbors Regression": KNeighborsRegressor(n_neighbors=5),
    "Random Forest Regression": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost Regression": XGBRegressor(n_estimators=100, random_state=42)
}

# Evaluación de modelos
results = {}
best_model_name = None
best_model = None
best_r2_score = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {"R2 Score": r2}
    
    # Verificar si este modelo tiene el mejor R2 score
    if r2 > best_r2_score:
        best_r2_score = r2
        best_model = model
        best_model_name = model_name

# Imprimir los resultados de R2 para cada modelo
for model_name, metrics in results.items():
    print(f"{model_name}: R2 Score = {metrics['R2 Score']:.4f}")

print(f"\nBest model: {best_model_name} with R2 Score = {best_r2_score:.4f}")


Linear Regression: R2 Score = 0.9392
Ridge Regression: R2 Score = 0.9280
Lasso Regression: R2 Score = 0.6092
Elastic Net Regression: R2 Score = 0.6485
K-Nearest Neighbors Regression: R2 Score = 0.8527
Random Forest Regression: R2 Score = 0.8538
XGBoost Regression: R2 Score = 0.8758

Best model: Linear Regression with R2 Score = 0.9392


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

# Cargar los datos
file_path = '../data/Model_Data.csv'
data = pd.read_csv(file_path)

# Realizar codificación one-hot para la columna 'Country'
data = pd.get_dummies(data, columns=['Country'], drop_first=True)

# Separar las características (X) y la variable objetivo (y)
X = data.drop(columns=['Happiness Score'])
y = data['Happiness Score']

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Definir los modelos y sus hiperparámetros para la búsqueda
models = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}
    },
    "Ridge Regression": {
        "model": Ridge(),
        "params": {"alpha": [0.1, 1.0, 10.0]}
    },
    "Lasso Regression": {
        "model": Lasso(),
        "params": {"alpha": [0.01, 0.1, 1.0]}
    },
    "Elastic Net Regression": {
        "model": ElasticNet(),
        "params": {"alpha": [0.01, 0.1, 1.0], "l1_ratio": [0.2, 0.5, 0.8]}
    },
    "K-Nearest Neighbors Regression": {
        "model": KNeighborsRegressor(),
        "params": {"n_neighbors": [3, 5, 7, 10]}
    },
    "Random Forest Regression": {
        "model": RandomForestRegressor(random_state=42),
        "params": {"n_estimators": [200, 300, 500], "max_depth": [None, 10, 20]}
    },
    "Gradient Boosting Regression": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {"n_estimators": [200, 300, 500], "learning_rate": [0.05, 0.1, 0.2], "max_depth": [3, 5, 7]}
    },
    "XGBoost Regression": {
        "model": XGBRegressor(random_state=42),
        "params": {"n_estimators": [200, 300, 500], "learning_rate": [0.05, 0.1, 0.2], "max_depth": [3, 5, 7]}
    }
}

# Evaluación de modelos y búsqueda de hiperparámetros
results = {}
best_model_name = None
best_model = None
best_r2_score = float('-inf')

for model_name, model_dict in models.items():
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Solo escalado
        ('model', model_dict["model"])
    ])
    
    # Configurar GridSearchCV con los hiperparámetros del modelo actual
    grid = GridSearchCV(pipe, {'model__' + key: value for key, value in model_dict["params"].items()},
                        cv=5, scoring='r2', n_jobs=-1)
    grid.fit(X_train, y_train)
    
    # Predicciones y evaluación del modelo
    y_pred = grid.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {"R2 Score": r2, "Best Params": grid.best_params_}
    
    # Guardar el mejor modelo según el R2 Score
    if r2 > best_r2_score:
        best_r2_score = r2
        best_model = grid.best_estimator_
        best_model_name = model_name

# Prueba con VotingRegressor usando los mejores modelos
voting_reg = VotingRegressor(estimators=[
    ('KNN', KNeighborsRegressor(n_neighbors=3)),
    ('RandomForest', RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42)),
    ('XGBoost', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42))
])
voting_reg.fit(X_train, y_train)
y_pred_voting = voting_reg.predict(X_test)
r2_voting = r2_score(y_test, y_pred_voting)

# Prueba con StackingRegressor usando los mejores modelos
stacking_reg = StackingRegressor(
    estimators=[
        ('KNN', KNeighborsRegressor(n_neighbors=3)),
        ('RandomForest', RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42)),
        ('XGBoost', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42))
    ],
    final_estimator=GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)
)
stacking_reg.fit(X_train, y_train)
y_pred_stacking = stacking_reg.predict(X_test)
r2_stacking = r2_score(y_test, y_pred_stacking)

# Agregar los resultados de VotingRegressor y StackingRegressor
results["Voting Regressor"] = {"R2 Score": r2_voting, "Best Params": "N/A"}
results["Stacking Regressor"] = {"R2 Score": r2_stacking, "Best Params": "N/A"}

if r2_voting > best_r2_score:
    best_r2_score = r2_voting
    best_model = voting_reg
    best_model_name = "Voting Regressor"

if r2_stacking > best_r2_score:
    best_r2_score = r2_stacking
    best_model = stacking_reg
    best_model_name = "Stacking Regressor"

# Imprimir los resultados de R2 y los mejores parámetros para cada modelo
for model_name, metrics in results.items():
    print(f"{model_name}: R2 Score = {metrics['R2 Score']:.4f}, Best Params = {metrics['Best Params']}")

print(f"\nBest model: {best_model_name} with R2 Score = {best_r2_score:.4f}")


Linear Regression: R2 Score = 0.9392, Best Params = {}
Ridge Regression: R2 Score = 0.9444, Best Params = {'model__alpha': 10.0}
Lasso Regression: R2 Score = 0.9339, Best Params = {'model__alpha': 0.01}
Elastic Net Regression: R2 Score = 0.9435, Best Params = {'model__alpha': 0.01, 'model__l1_ratio': 0.2}
K-Nearest Neighbors Regression: R2 Score = 0.5460, Best Params = {'model__n_neighbors': 10}
Random Forest Regression: R2 Score = 0.8534, Best Params = {'model__max_depth': 20, 'model__n_estimators': 500}
Gradient Boosting Regression: R2 Score = 0.8994, Best Params = {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 500}
XGBoost Regression: R2 Score = 0.9075, Best Params = {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 500}
Voting Regressor: R2 Score = 0.8855, Best Params = N/A
Stacking Regressor: R2 Score = 0.8445, Best Params = N/A

Best model: Ridge Regression with R2 Score = 0.9444


In [20]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Cargar los datos
data = pd.read_csv('../data/Model_Data.csv')

# Definir las características y la variable objetivo
X = data.drop(columns=['Happiness Score'])
y = data['Happiness Score']

# Preprocesar los datos: aplicar One-Hot Encoding a la columna 'Country' y escalar el resto
preprocessor = ColumnTransformer(
    transformers=[
        ('country', OneHotEncoder(handle_unknown='ignore'), ['Country']),  # Codificar la columna 'Country' y manejar categorías desconocidas
        ('num', SimpleImputer(strategy='mean'), X.columns.difference(['Country', 'Year']))  # Imputar valores faltantes para otras columnas numéricas
    ])

# Crear un diccionario de modelos
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

# Dividir los datos en entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Variables para almacenar el mejor modelo y su rendimiento
best_model = None
best_r2 = float('-inf')  # Inicializamos con el peor valor posible para R^2
best_model_name = ""  # Para almacenar el nombre del mejor modelo

# Entrenar y evaluar cada modelo
for model_name, model in models.items():
    # Crear un pipeline para cada modelo
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # Entrenar el modelo
    pipeline.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = pipeline.predict(X_test)
    
    # Evaluar el modelo
    r2 = r2_score(y_test, y_pred)
    
    print(f'\nModelo: {model_name}')
    print(f'R^2 Score: {r2}')
    
    # Si el modelo actual es el mejor, lo guardamos
    if r2 > best_r2:
        best_r2 = r2
        best_model = pipeline
        best_model_name = model_name

print(f"\nBest model: {best_model_name} with R2 Score = {best_r2:.4f}")




Modelo: Linear Regression
R^2 Score: 0.9495858523063235

Modelo: Random Forest
R^2 Score: 0.8486986222615859

Modelo: Gradient Boosting
R^2 Score: 0.842152498124545

Modelo: AdaBoost
R^2 Score: 0.7726456924117253

Modelo: Support Vector Regression
R^2 Score: 0.9237282170578003

Modelo: K-Nearest Neighbors
R^2 Score: 0.8937572721799011

Best model: Linear Regression with R2 Score = 0.9496


In [21]:
# Guardar el mejor modelo en un archivo .pkl
with open('../Model/Regression_Model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print("Modelo guardado exitosamente en 'Model/Regression_Model'")

Modelo guardado exitosamente en 'Model/Regression_Model'
