In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Agregar la carpeta raíz del proyecto al path
sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
import pandas as pd
print("Cargando datos procesados...")
train_df = pd.read_csv('../temp/train_processed_ordinal.csv')
test_df = pd.read_csv('../temp/test_processed_ordinal.csv')

# 2. Separar X e y
X_train = train_df.drop(columns=['popularity'])
y_train = train_df['popularity']

X_test = test_df.drop(columns=['popularity'])
y_test = test_df['popularity']

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

Cargando datos procesados...
X_train: (564787, 14)
X_test: (141198, 14)


In [3]:
from helpers.trainer import ModelTrainer
trainer = ModelTrainer(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    target_col='popularity'
)

✓ ModelTrainer inicializado - Target: popularity
  Train: 564,787 muestras, 14 features
  Test: 141,198 muestras
  Target range: [11.0, 96.0]
  Target mean: 28.0 ± 13.1


In [6]:
custom_grid = {
            'n_estimators': [200, 300],
            'learning_rate': [0.01, 0.05],
            'max_depth': [7, 10],
            'num_leaves': [50],
            'min_child_samples': [20, 30],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
} # Pueden configurar esta grid como deseen o quitarla para usar la por defecto

In [7]:
trainer.train_model(
    model_name='lightgbm',
    cv=5,
    param_grid=custom_grid, # Pueden pasar custom_grid o None para usar la grid por defecto
)


ENTRENANDO: LIGHTGBM
Grid de búsqueda:
  n_estimators: [200, 300]
  learning_rate: [0.01, 0.05]
  max_depth: [7, 10]
  num_leaves: [50]
  min_child_samples: [20, 30]
  subsample: [0.8, 1.0]
  colsample_bytree: [0.8, 1.0]

 Buscando mejores hiperparámetros con 5-Fold CV...
  Total combinaciones: 64

  [1/64] Probando: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 50, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8}
    MSE CV: 127.9526 (±0.5184)
    ✓ Nuevo mejor score: 127.9526

  [2/64] Probando: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 50, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 1.0}
    MSE CV: 128.9359 (±0.3841)

  [3/64] Probando: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 50, 'min_child_samples': 20, 'subsample': 1.0, 'colsample_bytree': 0.8}
    MSE CV: 127.9526 (±0.5184)

  [4/64] Probando: {'n_estimators': 200, 'learning_rate': 0.01, 'max_de

{'model_name': 'lightgbm',
 'best_params': {'n_estimators': 300,
  'learning_rate': 0.05,
  'max_depth': 10,
  'num_leaves': 50,
  'min_child_samples': 30,
  'subsample': 0.8,
  'colsample_bytree': 0.8},
 'cv_score': np.float64(107.13892065436237),
 'train_metrics': {'mse': 105.10552569717535,
  'rmse': np.float64(10.25209859966121),
  'mae': 8.134306087294632,
  'r2': 0.3890444899195359},
 'test_metrics': {'mse': 106.7624659089692,
  'rmse': np.float64(10.332592409892554),
  'mae': 8.201571196745286,
  'r2': 0.37880331703715775},
 'training_time': 2103.759105,
 'model': LGBMRegressor(colsample_bytree=0.8, learning_rate=0.05, max_depth=10,
               min_child_samples=30, n_estimators=300, n_jobs=-1, num_leaves=50,
               random_state=42, subsample=0.8, verbose=-1)}

In [8]:
print(trainer.results.keys())  #Debe incluir el modelo entrenado

dict_keys(['lightgbm'])


In [9]:
results = trainer.results['lightgbm'] #No da error si el modelo fue entrenado correctamente
print(f"LightGBM CV MSE: {results['cv_score']:.4f}")
print(f"LightGBM Test R²: {results['test_metrics']['r2']:.4f}")

LightGBM CV MSE: 107.1389
LightGBM Test R²: 0.3788


In [10]:
trainer.save_model(model_name='lightgbm', path='../models/lightgbm_model_ordinal.joblib')

✓ Modelo guardado: ../models/lightgbm_model_ordinal.joblib
✓ Metadata guardada: ../models/lightgbm_model_ordinal_metadata.joblib


In [11]:
from helpers.trainer import load_model
# Para cargar el modelo guardado posteriormente
loaded_model,metadata = load_model(path='../models/lightgbm_model_ordinal.joblib')

loaded_model


✓ Modelo: lightgbm
  CV MSE: 107.1389
  Test R²: 0.3788
  Test RMSE: 10.3326


In [12]:
#Test predicción para un dato
i=15
sample = X_test.iloc[[i]]  # Tomar la primera fila como ejemplo
prediction = loaded_model.predict(sample)
print(f"Predicción para el dato de test: {prediction[0]:.4f}")
print(f"Valor real: {y_test.iloc[i]:.4f}")

Predicción para el dato de test: 31.2583
Valor real: 25.0000


In [13]:
#Dataframe de prediccion y reales
test_predictions = loaded_model.predict(X_test)
#Añadir el error RMSE punto a punto
rmse_pointwise = ((y_test - test_predictions) ** 2) ** 0.5

results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': test_predictions,
    'RMSE': rmse_pointwise
})
results_df.head(10)

#mostrar dato con mayor error
max_error_row = results_df.loc[results_df['RMSE'].idxmax()]
print("Dato con mayor error de predicción:")
print(max_error_row)
#características del dato con mayor error
features_max_error = X_test.loc[results_df['RMSE'].idxmax()]
print("Características del dato con mayor error:")
print(features_max_error)

#mostrar dato con menor error
min_error_row = results_df.loc[results_df['RMSE'].idxmin()]
print("Dato con menor error de predicción:")
print(min_error_row)
#características del dato con menor error
features_min_error = X_test.loc[results_df['RMSE'].idxmin()]
print("Características del dato con menor error:")
print(features_min_error)


Dato con mayor error de predicción:
Actual       89.000000
Predicted    32.636606
RMSE         56.363394
Name: 133044, dtype: float64
Características del dato con mayor error:
acousticness         0.487310
danceability        -0.137497
energy              -1.202293
instrumentalness    -0.629710
key                  1.044597
liveness            -0.011191
loudness            -0.545201
mode                 0.768310
speechiness         -0.523834
tempo                0.882651
time_signature      -1.982617
valence             -0.703165
duration_ms         -0.459769
genre               27.000000
Name: 133044, dtype: float64
Dato con menor error de predicción:
Actual       27.000000
Predicted    27.000249
RMSE          0.000249
Name: 127990, dtype: float64
Características del dato con menor error:
acousticness        0.872276
danceability       -0.165075
energy             -1.056309
instrumentalness   -0.635218
key                 0.482369
liveness           -0.634693
loudness            0.330