In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Add the parent of the current working directory (the project root, per the
# original note) to the import path -- presumably where the `helpers` package
# imported later in this notebook lives.
# Guarded so that re-running this cell does not pile up duplicate entries.
PROJECT_ROOT = os.path.dirname(os.getcwd())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import pandas as pd
# Load the preprocessed train/test CSVs and split each into features (X)
# and target (y). Paths and the target column are named once here so they
# cannot drift apart between the four places that use them.
TRAIN_CSV = '../temp/train_processed.csv'
TEST_CSV = '../temp/test_processed.csv'
TARGET_COL = 'popularity'

print("Cargando datos procesados...")
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Separate X and y
X_train = train_df.drop(columns=[TARGET_COL])
y_train = train_df[TARGET_COL]

X_test = test_df.drop(columns=[TARGET_COL])
y_test = test_df[TARGET_COL]

# Both splits must expose the same feature columns in the same order;
# catching a mismatch here is far cheaper than after a long training run.
assert list(X_train.columns) == list(X_test.columns), (
    "train and test feature columns differ"
)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

Cargando datos procesados...
X_train: (333345, 43)
X_test: (83336, 43)


In [3]:
from helpers.trainer import ModelTrainer
# Gather the constructor arguments in one mapping, then build the trainer
# from it. Keys are the keyword names ModelTrainer is called with.
trainer_inputs = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'target_col': 'popularity',
}
trainer = ModelTrainer(**trainer_inputs)

✓ ModelTrainer inicializado - Target: popularity
  Train: 333,345 muestras, 43 features
  Test: 83,336 muestras
  Target range: [0.0, 96.0]
  Target mean: 29.3 ± 15.8


In [4]:
# Hyperparameter grid handed to the training cell as `param_grid`.
# Edit it freely, or drop it and pass None to use the default grid.
# Every entry holds a single candidate, so the search evaluates exactly
# one combination.
custom_grid = dict(
    n_estimators=[200],
    max_depth=[None],
    min_samples_split=[10],
    min_samples_leaf=[2],
    max_features=['sqrt'],
)



In [5]:
# Training settings for the random forest run. `param_grid` may be
# custom_grid or None (None selects the trainer's default grid).
rf_training_args = {
    'model_name': 'randomforest',
    'cv': 5,
    'param_grid': custom_grid,
}
# Left as the last expression so the returned value is displayed by the cell.
trainer.train_model(**rf_training_args)


ENTRENANDO: RANDOMFOREST
Grid de búsqueda:
  n_estimators: [200]
  max_depth: [None]
  min_samples_split: [10]
  min_samples_leaf: [2]
  max_features: ['sqrt']

 Buscando mejores hiperparámetros con 5-Fold CV...
  Total combinaciones: 1

  [1/1] Probando: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
    MSE CV: 152.0561 (±0.8494)
    ✓ Nuevo mejor score: 152.0561

✓ Mejores parámetros encontrados: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
✓ Mejor MSE en CV: 152.0561

 Reentrenando en train completo (333,345 muestras)...
✓ Reentrenamiento completado

Métricas en Train Set:
  MSE: 69.8772
  RMSE: 8.3593
  MAE: 6.5338
  R²: 0.7188

Métricas en Test Set (evaluación final):
  MSE: 153.0176
  RMSE: 12.3700
  MAE: 9.7815
  R²: 0.3864

Tiempo total: 643.6s


{'model_name': 'randomforest',
 'best_params': {'n_estimators': 200,
  'max_depth': None,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt'},
 'cv_score': 152.05613259431198,
 'train_metrics': {'mse': 69.87716733559922,
  'rmse': 8.359256386521425,
  'mae': 6.5338138258002,
  'r2': 0.7187936980932406},
 'test_metrics': {'mse': 153.0175518266421,
  'rmse': 12.370026347047208,
  'mae': 9.781476748083614,
  'r2': 0.3863895453119731},
 'training_time': 643.634944,
 'model': RandomForestRegressor(max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=10, n_estimators=200, n_jobs=-1,
                       random_state=42)}

In [6]:
# Show which models the trainer has stored results for, then enforce the
# expectation that the random forest trained above is among them.
print(trainer.results.keys())
assert 'randomforest' in trainer.results, (
    "'randomforest' missing from trainer.results - run the training cell first"
)

dict_keys(['randomforest'])


In [7]:
# Look up the stored random forest entry; this lookup only succeeds if the
# model was trained and registered under 'randomforest'.
results = trainer.results['randomforest']

# Format and print one metric at a time, in the original order.
cv_line = f"RandomForest CV MSE: {results['cv_score']:.4f}"
print(cv_line)
r2_line = f"RandomForest Test R²: {results['test_metrics']['r2']:.4f}"
print(r2_line)

RandomForest CV MSE: 152.0561
RandomForest Test R²: 0.3864


In [8]:
# Persist the fitted random forest. The output directory is created first so
# a fresh checkout cannot fail here after a long training run; whether
# save_model creates missing directories itself is not visible from this
# notebook, and makedirs(exist_ok=True) is harmless if it does.
MODEL_PATH = '../models/randomforest_model.joblib'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
trainer.save_model(model_name='randomforest', path=MODEL_PATH)

✓ Modelo guardado: ../models/randomforest_model.joblib
✓ Metadata guardada: ../models/randomforest_model_metadata.joblib
