In [53]:
# python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import funciones as fn

In [54]:
# Name of the column the model is trained to predict
target_column = "shares"

# Locations of the training and validation splits (relative to the notebook)
train_path = "dataset/train.csv"
val_path = "dataset/val.csv"

# Read each split into its own DataFrame
df_train = pd.read_csv(filepath_or_buffer=train_path)
df_val = pd.read_csv(filepath_or_buffer=val_path)

In [55]:
# Feature columns are chosen by hand through helpers in the project-local
# `funciones` module (imported as `fn`).
numeric_cols = fn.columnas_numericas()
# Alternative column selections kept for experimentation (currently disabled):
#numeric_cols = fn.columnas_cantidad()
#ohe_cols = fn.columnas_one_hot()
# Concatenation of two helper results; judging by the helper names these are
# presumably the day-of-week and topic one-hot columns — verify in `funciones`.
ohe_cols = fn.columnas_one_hot_dia_semana() + fn.columnas_one_hot_tematica()
# NOTE(review): this flag is not read anywhere in the visible cells — confirm
# whether it is still used further down or can be removed.
use_imputer_for_ohe = False  # True if the one-hot columns contain NaNs

In [56]:

# Fail fast if any selected feature column or the target is absent from a
# split; the training split is checked first, then validation.
required_cols = numeric_cols + ohe_cols + [target_column]
for split_name, split_df in (("train", df_train), ("val", df_val)):
    missing = [col for col in required_cols if col not in split_df.columns]
    if missing:
        raise ValueError(f"Faltan columnas en `{split_name}`: {missing}")

In [57]:

# Split each frame into features (everything except the target) and target
X_train, y_train = df_train.drop(columns=[target_column]), df_train[target_column]
X_val, y_val = df_val.drop(columns=[target_column]), df_val[target_column]

In [58]:
# Assemble one preprocessing entry per column group that is actually in use
transformers = []

if numeric_cols:
    # Numeric features go through a StandardScaler
    num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
    transformers += [('num', num_pipeline, numeric_cols)]

if ohe_cols:
    # The given one-hot columns are forwarded untouched
    transformers += [('ohe_given', 'passthrough', ohe_cols)]

# With neither group defined there would be nothing to feed the model
if len(transformers) == 0:
    raise ValueError('No se definieron columnas numéricas ni OHE.')

# Any column not listed in a group above is dropped
preprocessor = ColumnTransformer(transformers, remainder='drop')

# Base regressor with a fixed seed.
# NOTE(review): both the forest and the search below request all cores
# (n_jobs=-1) — confirm this nesting does not oversubscribe the machine.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Preprocessing followed by the model; the 'reg' step name is what the
# 'reg__' prefix of the search-space keys refers to.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('reg', rf),
])

# Hyperparameter search space: candidate values for the forest
param_distributions = dict(
    reg__n_estimators=[1000, 2000],
    reg__max_depth=[5, 10, 15, 20, 30, None],
    reg__min_samples_split=[2, 5, 10, 20],
    reg__min_samples_leaf=[1, 2, 4, 8],
    reg__max_features=['sqrt', 'log2', 0.5, 0.75],
    reg__bootstrap=[True, False],
)

# Randomized search: 5 sampled candidates, 3-fold CV, ranked by R2
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=5,
    scoring='r2',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split and keep the best fitted pipeline.
# y_train is passed exactly as loaded; no log1p is applied in this notebook.
# NOTE(review): the evaluation code applies expm1 to predictions and to y_val,
# so the target is presumably stored already log1p-transformed in the CSV —
# confirm against the dataset preparation step.
best_model = random_search.fit(X_train, y_train).best_estimator_

# Report the winning configuration and its mean cross-validated R2
print("Mejores parámetros:", random_search.best_params_)
print("Mejor R2 (CV):", random_search.best_score_)

# Evaluation helper; scores the tuned `best_model` unless another model is given
def evaluar(nombre, X, y_true, model=None):
    """Print regression metrics for a model's back-transformed predictions.

    Predictions are passed through ``np.expm1`` before scoring, so the model
    is expected to predict on the log1p scale, while ``y_true`` must already
    be on the original scale (the caller is responsible for that).

    Args:
        nombre: Label printed in the report header.
        X: Feature data handed to the model's ``predict`` method.
        y_true: Ground-truth targets on the original (non-log) scale.
        model: Fitted estimator exposing ``predict``. When ``None`` (the
            default) the module-level ``best_model`` is used, which keeps
            existing three-argument calls working unchanged.

    Returns:
        None. MSE, RMSE, MAE and R2 are printed with four decimals.
    """
    # Resolve the estimator at call time so the function no longer depends
    # exclusively on the global produced by the search cell.
    estimator = best_model if model is None else model

    y_pred = np.expm1(estimator.predict(X))  # undo log1p on the predictions
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"--- {nombre} ---")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2: {r2:.4f}")

# Evaluate on the validation split. y_val is passed through expm1 so that it
# is on the same scale as the back-transformed predictions inside `evaluar`.
# NOTE(review): this assumes the target in the CSV is stored log1p-transformed — confirm.
evaluar('validation', X_val, np.expm1(y_val))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END reg__bootstrap=False, reg__max_depth=30, reg__max_features=sqrt, reg__min_samples_leaf=2, reg__min_samples_split=20, reg__n_estimators=1000; total time= 1.9min
[CV] END reg__bootstrap=False, reg__max_depth=30, reg__max_features=sqrt, reg__min_samples_leaf=2, reg__min_samples_split=20, reg__n_estimators=1000; total time= 1.9min
[CV] END reg__bootstrap=False, reg__max_depth=5, reg__max_features=0.5, reg__min_samples_leaf=8, reg__min_samples_split=10, reg__n_estimators=1000; total time= 1.9min
[CV] END reg__bootstrap=False, reg__max_depth=5, reg__max_features=0.5, reg__min_samples_leaf=8, reg__min_samples_split=10, reg__n_estimators=1000; total time= 1.9min
[CV] END reg__bootstrap=False, reg__max_depth=5, reg__max_features=0.5, reg__min_samples_leaf=8, reg__min_samples_split=10, reg__n_estimators=1000; total time= 1.9min
[CV] END reg__bootstrap=False, reg__max_depth=30, reg__max_features=sqrt, reg__min_samples_leaf=2, re