In [0]:
%restart_python

In [0]:
import pandas as pd
import os
import warnings
import yaml

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import mlflow
import mlflow.sklearn
# Suprimir los warnings para una salida más limpia
warnings.filterwarnings('ignore')

# Opciones para una mejor visualización
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

print("¡Bibliotecas importadas con éxito!")

  from google.protobuf import service as _service


¡Bibliotecas importadas con éxito!


## Cargue de los datos:

Se hace el cargue de los datos previamente preprocesados y separados en los conjuntos de entrenamiento y prueba. Las variables a incluir en el modelo corresponden a las que describen las canciones de manera numérica
| Variable           | Tipo       | Descripción                                                                 | Codificación                       | Normalización                      |
|--------------------|------------|-----------------------------------------------------------------------------|------------------------------------|------------------------------------|
| popularity         | Numeric    | Variable Objetivo: Puntuación de popularidad en Spotify (0–100)              | -                                  | -                                  |
| duration_ms        | Numeric    | Duración de la pista en milisegundos                                         | -                                  | Standard Scaler<br>Outliers: 4.9%<br>Range ratio: 609.98 |
| explicit           | Boolean    | Indica si la pista contiene contenido explícito                              | True/False                         | -                                  |
| danceability       | float      | Qué tan adecuada es la pista para bailar (0.0–1.0).                          | -                                  | -                                  |
| energy             | float      | Nivel de intensidad y actividad de la pista (0.0–1.0).                       | -                                  | -                                  |
| key                | Categorical| Tono musical (0 = Do, 1 = Do♯/Re♭, …, 11 = Si)                               | Label Encoding (cardinalidad: 12)  | -                                  |
| loudness           | Numeric    | Volumen general en dB. Valores altos indican mayor volumen.                  | -                                  | Robust Scaler<br>Outliers: 5.4%<br>Range ratio: ∞ |
| mode_1             | Dummy Var. | Modalidad mayor (1 = mayor, 0 = no)                                          | One-Hot Encoding (para `mode`)     | -                                  |
| speechiness        | float      | Presencia de palabras habladas en la pista (0.0–1.0).                        | -                                  | -                                  |
| acousticness       | float      | Confianza de que la pista es acústica (0.0–1.0).                             | -                                  | -                                  |
| instrumentalness   | float      | Probabilidad de que la pista no contenga voces (0.0–1.0).                    | -                                  | -                                  |
| liveness           | float      | Presencia de una audiencia en la grabación (0.0–1.0).                        | -                                  | -                                  |
| valence            | float      | Positividad musical (0.0 = triste, 1.0 = feliz).                             | -                                  | -                                  |
| tempo              | Numeric    | Tempo estimado en BPM                                                        | -                                  | Standard Scaler<br>Outliers: 0.5%<br>Range ratio: ∞ |
| time_signature_1   | Dummy Var. | Compás 1/4 (1 si el compás es 1/4, 0 en caso contrario)                      | One-Hot Encoding (para `time_signature`) | - |
| time_signature_3   | Dummy Var. | Compás 3/4 (1 si el compás es 3/4, 0 en caso contrario)                      | One-Hot Encoding (para `time_signature`) | - |
| time_signature_4   | Dummy Var. | Compás 4/4 (1 si el compás es 4/4, 0 en caso contrario)                      | One-Hot Encoding (para `time_signature`) | - |
| time_signature_5   | Dummy Var. | Compás 5/4 (1 si el compás es 5/4, 0 en caso contrario)                      | One-Hot Encoding (para `time_signature`) | - |
| track_genre        | String     | Etiqueta de género asignada a la pista     | Label Encoding (cardinalidad: 114) | -                                  |      

In [0]:
# Define the path to the dataset
config = yaml.safe_load(open("params.yaml"))["prepare"]

output_path_train=config['output_path_train']
output_path_test=config['output_path_test']

X_train_path = os.path.join(output_path_train, 'X_train.csv')
X_test_path = os.path.join(output_path_test, 'X_test.csv')
y_train_path = os.path.join(output_path_train, 'y_train.csv')
y_test_path = os.path.join(output_path_test, 'y_test.csv')


# Load the dataset
try:
    X_train = pd.read_csv(X_train_path)
    X_test = pd.read_csv(X_test_path)
    y_train = pd.read_csv(y_train_path)
    y_test = pd.read_csv(y_test_path)
    print(f"\n✓ Datasets loaded successfully!")
    print(f"Datasets shape: \n X_train: {X_train.shape} \n y_train:{y_train.shape}  \n X_test:{X_test.shape}  \n y_test:{y_test.shape} ")
except Exception as e:
    print(f"✗ Error loading dataset: {e}")




✓ Datasets loaded successfully!
Datasets shape: 
 X_train: (91199, 18) 
 y_train:(91199, 1)  
 X_test:(22800, 18)  
 y_test:(22800, 1) 


## Experimento 1:



In [0]:
X_train.head()

Unnamed: 0,duration_ms,explicit,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,mode_1,time_signature_1,time_signature_3,time_signature_4,time_signature_5
0,-0.284617,False,0.769,0.802,2,0.28004,0.0617,0.154,1.2e-05,0.0731,0.696,-0.675648,74,False,False,False,True,False
1,0.051902,False,0.557,0.703,10,0.986427,0.0277,0.203,0.000156,0.0927,0.228,0.930321,49,False,False,False,True,False
2,-1.021152,True,0.904,0.601,4,0.288024,0.198,0.0305,0.0,0.615,0.761,-1.171209,51,True,False,False,True,False
3,-0.267925,False,0.731,0.798,9,0.216168,0.294,0.0315,8.6e-05,0.274,0.898,1.663455,9,True,False,False,True,False
4,-0.204343,False,0.352,0.969,5,-0.099202,0.171,1.6e-05,0.863,0.238,0.376,-0.374162,42,True,False,False,True,False


In [0]:
X_train.columns


Index(['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'track_genre', 'mode_1', 'time_signature_1',
       'time_signature_3', 'time_signature_4', 'time_signature_5'],
      dtype='object')

In [0]:
y_test

Unnamed: 0,popularity
0,42
1,45
2,19
3,47
4,21
...,...
22795,0
22796,17
22797,67
22798,63


In [0]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt

Definición de los grid search para hallar los mejores hiperparámetros sobre los modelos Random Forest y Gradient Boosting, registrando los experimentos en MLflow:

In [0]:
models_and_parameters = {
    
    "RandomForest": (
        RandomForestRegressor(random_state=42),
        {"n_estimators": [100, 200],
         "max_depth": [10, 20, None]}
    ),
    
    "GradientBoosting": (
        GradientBoostingRegressor(random_state=42),
        {"n_estimators": [100, 200],
         "learning_rate": [0.05, 0.1],
         "max_depth": [3, 5]}
    )
}


In [0]:

for model_name, (model, param_grid) in models_and_parameters.items():
    with mlflow.start_run(run_name=model_name):
        print(f"Entrenando {model_name}...")

        # Grid search
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=3,
            scoring="neg_mean_squared_error",
            n_jobs=-1
        )
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_

        # Predicciones
        y_pred = best_model.predict(X_test)

        # Métricas
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)

        # Log en MLflow
        mlflow.log_params(grid.best_params_)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # Guardar modelo
        mlflow.sklearn.log_model(best_model, model_name)

        print(f"{model_name} → RMSE: {rmse:.2f}, R2: {r2:.2f}")

        # =============================
        # Importancia de variables
        # =============================
        if hasattr(best_model, "feature_importances_"):
            importances = best_model.feature_importances_
            feature_names = X_train.columns if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(importances))]

            # Crear dataframe ordenado
            importance_df = pd.DataFrame({
                "feature": feature_names,
                "importance": importances
            }).sort_values(by="importance", ascending=False)

            # Guardar como csv en MLflow
            importance_csv = f"{model_name}_feature_importances.csv"
            importance_df.to_csv(importance_csv, index=False)
            mlflow.log_artifact(importance_csv)

            # Gráfico
            plt.figure(figsize=(8, 5))
            importance_df.set_index("feature").head(15).plot(kind="barh", legend=False)
            plt.gca().invert_yaxis()
            plt.title(f"Top 15 Importancias - {model_name}")
            plt.tight_layout()
            plt.savefig(f"{model_name}_feature_importances.png")
            mlflow.log_artifact(f"{model_name}_feature_importances.png")
            plt.close()

Entrenando RandomForest...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


RandomForest → RMSE: 15.37, R2: 0.52
Entrenando GradientBoosting...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GradientBoosting → RMSE: 18.16, R2: 0.33


<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

In [0]:
import mlflow.xgboost
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import numpy as np

In [0]:
def categorize_popularity(y):
    return pd.cut(
        y,
        bins=[-np.inf, 30, 60, np.inf],
        labels=[0, 1, 2]
    )

In [0]:
y_train_cat=categorize_popularity(y_train['popularity'])
y_test_cat=categorize_popularity(y_test['popularity'])

In [0]:
y_train_cat

0        1
1        1
2        0
3        1
4        0
        ..
91194    1
91195    1
91196    1
91197    2
91198    0
Name: popularity, Length: 91199, dtype: category
Categories (3, int64): [0 < 1 < 2]

In [0]:
models_and_parameters = {
    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        {"n_estimators": [50, 100], "learning_rate": [0.5, 1.0]}
    ),
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
        {"n_estimators": [100, 200], "max_depth": [3, 5], "learning_rate": [0.05, 0.1]}
    )
}

# ====================================
# 3. Experimentos con MLflow
# ====================================
mlflow.set_registry_uri("databricks-uc")
mlflow.set_experiment("/song_popularity_prediction_cat")

for model_name, (model, param_grid) in models_and_parameters.items():
    with mlflow.start_run(run_name=model_name):
        print(f"Entrenando {model_name}...")

        # Grid search
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=3,
            scoring="f1_macro",
            n_jobs=-1
        )
        grid.fit(X_train, y_train_cat)

        best_model = grid.best_estimator_

        # Predicciones
        y_pred = best_model.predict(X_test)

        # Métricas
        acc = accuracy_score(y_test_cat, y_pred)
        f1 = f1_score(y_test_cat, y_pred, average="macro")

        print(f"{model_name} → Acc: {acc:.3f}, F1-macro: {f1:.3f}")
        print(classification_report(y_test_cat, y_pred))

        # Log en MLflow
        mlflow.log_params(grid.best_params_)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_macro", f1)

        # Guardar modelo
        if model_name == "XGBoost":
            mlflow.xgboost.log_model(best_model, model_name)
        else:
            mlflow.sklearn.log_model(best_model, model_name)

        # =============================
        # Importancia de variables
        # =============================
        if hasattr(best_model, "feature_importances_"):
            importances = best_model.feature_importances_
            feature_names = X_train.columns if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(importances))]

            # DataFrame ordenado
            importance_df = pd.DataFrame({
                "feature": feature_names,
                "importance": importances
            }).sort_values(by="importance", ascending=False)

            # Guardar CSV en MLflow
            importance_csv = f"{model_name}_feature_importances.csv"
            importance_df.to_csv(importance_csv, index=False)
            mlflow.log_artifact(importance_csv)

            # Gráfico
            plt.figure(figsize=(8, 5))
            importance_df.set_index("feature").head(15).plot(kind="barh", legend=False)
            plt.gca().invert_yaxis()
            plt.title(f"Top 15 Importancias - {model_name}")
            plt.tight_layout()
            plt.savefig(f"{model_name}_feature_importances.png")
            mlflow.log_artifact(f"{model_name}_feature_importances.png")
            plt.close()

Entrenando AdaBoost...
AdaBoost → Acc: 0.598, F1-macro: 0.448
              precision    recall  f1-score   support

           0       0.61      0.70      0.65     10467
           1       0.59      0.64      0.61      9645
           2       0.47      0.04      0.08      2688

    accuracy                           0.60     22800
   macro avg       0.56      0.46      0.45     22800
weighted avg       0.58      0.60      0.57     22800





Entrenando XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGBoost → Acc: 0.717, F1-macro: 0.596
              precision    recall  f1-score   support

           0       0.74      0.80      0.77     10467
           1       0.71      0.77      0.74      9645
           2       0.62      0.18      0.28      2688

    accuracy                           0.72     22800
   macro avg       0.69      0.59      0.60     22800
weighted avg       0.71      0.72      0.70     22800





<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

In [0]:
importances = best_model.feature_importances_
feature_names = X_train.columns if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(importances))]
# Crear dataframe ordenado
importance_df = pd.DataFrame({
"feature": feature_names,
"importance": importances}).sort_values(by="importance", ascending=False)
importance_df

Unnamed: 0,feature,importance
12,track_genre,0.228872
8,instrumentalness,0.081911
7,acousticness,0.076135
16,time_signature_4,0.071251
1,explicit,0.060934
0,duration_ms,0.053854
5,loudness,0.052993
3,energy,0.052531
2,danceability,0.050552
10,valence,0.049867
