In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Agregar la carpeta raíz del proyecto al path
sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
import pandas as pd
print("Cargando datos procesados...")
train_df = pd.read_csv('../temp/train_processed.csv')
test_df = pd.read_csv('../temp/test_processed.csv')

# 2. Separar X e y
X_train = train_df.drop(columns=['popularity'])
y_train = train_df['popularity']

X_test = test_df.drop(columns=['popularity'])
y_test = test_df['popularity']

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

Cargando datos procesados...
X_train: (363671, 63)
X_test: (90919, 63)


In [3]:
from helpers.trainer import ModelTrainer
trainer = ModelTrainer(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    target_col='popularity'
)

✓ ModelTrainer inicializado - Target: popularity
  Train: 363,671 muestras, 63 features
  Test: 90,919 muestras
  Target range: [16.0, 100.0]
  Target mean: 33.2 ± 12.4


In [7]:
custom_grid = {
    "num_leaves": [31, 63],            # 2
    "max_depth": [-1, 10],             # 2
    "learning_rate": [0.03, 0.07],     # 2
    "n_estimators": [600],             # 1
    "min_child_samples": [20, 60],     # 2
    "subsample": [0.8, 1.0],           # 2
    "colsample_bytree": [0.8, 1.0],    # 2
    "reg_lambda": [0, 1],              # 2
}

In [8]:
trainer.train_model(
    model_name='lightgbm',
    cv=5,
    param_grid=custom_grid, # Pueden pasar custom_grid o None para usar la grid por defecto
)


ENTRENANDO: LIGHTGBM
Grid de búsqueda:
  num_leaves: [31, 63]
  max_depth: [-1, 10]
  learning_rate: [0.03, 0.07]
  n_estimators: [600]
  min_child_samples: [20, 60]
  subsample: [0.8, 1.0]
  colsample_bytree: [0.8, 1.0]
  reg_lambda: [0, 1]

 Buscando mejores hiperparámetros con 5-Fold CV...
  Total combinaciones: 128

  [1/128] Probando: {'num_leaves': 31, 'max_depth': -1, 'learning_rate': 0.03, 'n_estimators': 600, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 0}
    MSE CV: 105.7072 (±0.9627)
    ✓ Nuevo mejor score: 105.7072

  [2/128] Probando: {'num_leaves': 31, 'max_depth': -1, 'learning_rate': 0.03, 'n_estimators': 600, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1}
    MSE CV: 105.6519 (±0.9392)
    ✓ Nuevo mejor score: 105.6519

  [3/128] Probando: {'num_leaves': 31, 'max_depth': -1, 'learning_rate': 0.03, 'n_estimators': 600, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 1.0, 'reg_la

{'model_name': 'lightgbm',
 'best_params': {'num_leaves': 63,
  'max_depth': 10,
  'learning_rate': 0.07,
  'n_estimators': 600,
  'min_child_samples': 60,
  'subsample': 0.8,
  'colsample_bytree': 1.0,
  'reg_lambda': 1},
 'cv_score': np.float64(103.60327437955964),
 'train_metrics': {'mse': 94.86815460458946,
  'rmse': np.float64(9.740028470419862),
  'mae': 7.726843068205907,
  'r2': 0.38366102466873375},
 'test_metrics': {'mse': 104.26120825747586,
  'rmse': np.float64(10.210837784309172),
  'mae': 8.087636519159853,
  'r2': 0.3279483061780104},
 'training_time': 2478.10262,
 'model': LGBMRegressor(learning_rate=0.07, max_depth=10, min_child_samples=60,
               n_estimators=600, n_jobs=-1, num_leaves=63, random_state=42,
               reg_lambda=1, subsample=0.8, verbose=-1)}

In [9]:
print(trainer.results.keys())  #Debe incluir el modelo entrenado

dict_keys(['lightgbm'])


In [10]:
results = trainer.results['lightgbm'] #No da error si el modelo fue entrenado correctamente
print(f"LightGBM CV MSE: {results['cv_score']:.4f}")
print(f"LightGBM Test R²: {results['test_metrics']['r2']:.4f}")

LightGBM CV MSE: 103.6033
LightGBM Test R²: 0.3279


In [11]:
trainer.save_model(model_name='lightgbm', path='../models/lightgbm_model.joblib')

✓ Modelo guardado: ../models/lightgbm_model.joblib
✓ Metadata guardada: ../models/lightgbm_model_metadata.joblib


In [12]:
from helpers.trainer import load_model
# Para cargar el modelo guardado posteriormente
loaded_model,metadata = load_model(path='../models/lightgbm_model.joblib')

loaded_model


✓ Modelo: lightgbm
  CV MSE: 103.6033
  Test R²: 0.3279
  Test RMSE: 10.2108


In [13]:
#Test predicción para un dato
i=15
sample = X_test.iloc[[i]]  # Tomar la primera fila como ejemplo
prediction = loaded_model.predict(sample)
print(f"Predicción para el dato de test: {prediction[0]:.4f}")
print(f"Valor real: {y_test.iloc[i]:.4f}")

Predicción para el dato de test: 32.2237
Valor real: 26.0000


In [14]:
#Dataframe de prediccion y reales
test_predictions = loaded_model.predict(X_test)
#Añadir el error RMSE punto a punto
rmse_pointwise = ((y_test - test_predictions) ** 2) ** 0.5

results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': test_predictions,
    'RMSE': rmse_pointwise
})
results_df.head(10)

#mostrar dato con mayor error
max_error_row = results_df.loc[results_df['RMSE'].idxmax()]
print("Dato con mayor error de predicción:")
print(max_error_row)
#características del dato con mayor error
features_max_error = X_test.loc[results_df['RMSE'].idxmax()]
print("Características del dato con mayor error:")
print(features_max_error)

#mostrar dato con menor error
min_error_row = results_df.loc[results_df['RMSE'].idxmin()]
print("Dato con menor error de predicción:")
print(min_error_row)
#características del dato con menor error
features_min_error = X_test.loc[results_df['RMSE'].idxmin()]
print("Características del dato con menor error:")
print(features_min_error)


Dato con mayor error de predicción:
Actual       91.000000
Predicted    29.618644
RMSE         61.381356
Name: 18229, dtype: float64
Características del dato con mayor error:
acousticness       -0.530759
danceability       -0.548240
energy             -0.782282
instrumentalness    0.735861
key                -1.486062
                      ...   
genre_soul          0.000000
genre_spanish       0.000000
genre_swedish       0.000000
genre_techno        0.000000
genre_trance        0.000000
Name: 18229, Length: 63, dtype: float64
Dato con menor error de predicción:
Actual       30.000000
Predicted    29.999819
RMSE          0.000181
Name: 31147, dtype: float64
Características del dato con menor error:
acousticness       -0.690133
danceability       -0.043702
energy             -1.171257
instrumentalness    0.717460
key                -0.924551
                      ...   
genre_soul          0.000000
genre_spanish       0.000000
genre_swedish       0.000000
genre_techno        0.000000
g