In [1]:
import pandas as pd
import numpy as np

# Read data

In [2]:
df = pd.read_parquet("df_train_ensemble_2022-07-01_2025-10-01_V.1.0.parquet")

In [3]:
df.head()

Unnamed: 0,reporting_month,ISRC,spotify,release_type,continent,zone,quantity,unit_price,mechanical_fee,share_rate,revenue
0,2022/09/01,CA-5KR-00-21353,Other,Music Release,Europe,Italy,1,7e-06,0.0,0.765,5e-06
1,2022/09/01,GX-5MX-22-31727,Other,Music Release,Europe,Turkey,1,7e-06,0.0,0.765,5e-06
2,2022/09/01,GX-5MX-22-31730,Other,Music Release,LATAM,Uruguay,1,1e-05,0.0,0.765,8e-06
3,2022/09/01,CA-5KR-21-13899,Other,Music Release,Europe,Bosnia and herzegovi,5,2e-06,0.0,0.765,8e-06
4,2022/09/01,CA-5KR-21-77573,Other,Music Release,Asia,Jordan,1,1.4e-05,0.0,0.765,1.1e-05


# Delete columns

In [6]:
df.drop(columns = ['zone', 'unit_price', 'mechanical_fee', 'share_rate'])

Unnamed: 0,reporting_month,ISRC,spotify,release_type,continent,quantity,revenue
0,2022/09/01,CA-5KR-00-21353,Other,Music Release,Europe,1,0.000005
1,2022/09/01,GX-5MX-22-31727,Other,Music Release,Europe,1,0.000005
2,2022/09/01,GX-5MX-22-31730,Other,Music Release,LATAM,1,0.000008
3,2022/09/01,CA-5KR-21-13899,Other,Music Release,Europe,5,0.000008
4,2022/09/01,CA-5KR-21-77573,Other,Music Release,Asia,1,0.000011
...,...,...,...,...,...,...,...
896238,2025/10/01,DG-A0M-23-83965,Spotify,Music Release,Europe,98444,295.409068
896239,2025/10/01,FR-X20-25-23762,Spotify,Music Release,North America,121046,348.794584
896240,2025/10/01,FR-X20-25-89898,Other,Music Release,Europe,119656,352.407716
896241,2025/10/01,FR-X20-25-89898,Other,Music Release,North America,113698,358.880249


# Claude ai

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Modelos
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# 1. FEATURE ENGINEERING
# ============================================================================

def create_features(df):
    """
    Crea features temporales y agregadas
    """
    df = df.copy()
    
    # Asegurar que reporting_month es datetime
    df['reporting_month'] = pd.to_datetime(df['reporting_month'])
    
    # Features temporales básicas
    df['year'] = df['reporting_month'].dt.year
    df['month'] = df['reporting_month'].dt.month
    df['quarter'] = df['reporting_month'].dt.quarter
    df['day_of_year'] = df['reporting_month'].dt.dayofyear
    
    # Encoding cíclico para mes (captura estacionalidad)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    # Timestamp normalizado (días desde inicio)
    min_date = df['reporting_month'].min()
    df['days_since_start'] = (df['reporting_month'] - min_date).dt.days
    df['days_since_start_norm'] = df['days_since_start'] / df['days_since_start'].max()
    
    # Features agregadas por ISRC
    df['isrc_avg_revenue'] = df.groupby('ISRC')['revenue'].transform('mean')
    df['isrc_std_revenue'] = df.groupby('ISRC')['revenue'].transform('std').fillna(0)
    df['isrc_total_quantity'] = df.groupby('ISRC')['quantity'].transform('sum')
    df['isrc_appearance_count'] = df.groupby('ISRC')['ISRC'].transform('count')
    
    # Features agregadas por continente
    df['continent_avg_revenue'] = df.groupby('continent')['revenue'].transform('mean')
    df['continent_avg_quantity'] = df.groupby('continent')['quantity'].transform('mean')
    
    # Features de combinaciones
    df['quantity_per_appearance'] = df['quantity'] / (df['isrc_appearance_count'] + 1)
    
    # Revenue por unidad (puede ser útil)
    df['revenue_per_quantity'] = df['revenue'] / (df['quantity'] + 1)
    
    return df

# ============================================================================
# 2. SPLIT TEMPORAL
# ============================================================================

def temporal_split(df, train_ratio=0.7, val_ratio=0.15):
    """
    Split temporal: train (70%), validation (15%), test (15%)
    """
    df = df.sort_values('reporting_month').reset_index(drop=True)
    
    n = len(df)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))
    
    train = df.iloc[:train_end]
    val = df.iloc[train_end:val_end]
    test = df.iloc[val_end:]
    
    print(f"Train: {len(train)} rows ({train['reporting_month'].min()} to {train['reporting_month'].max()})")
    print(f"Val:   {len(val)} rows ({val['reporting_month'].min()} to {val['reporting_month'].max()})")
    print(f"Test:  {len(test)} rows ({test['reporting_month'].min()} to {test['reporting_month'].max()})")
    
    return train, val, test

# ============================================================================
# 3. ENCODING DE CATEGORÍAS
# ============================================================================

def encode_features(train, val, test, categorical_cols):
    """
    Label encoding de variables categóricas
    """
    encoders = {}
    
    for col in categorical_cols:
        le = LabelEncoder()
        # Fit solo en train
        le.fit(train[col].astype(str))
        
        # Transform en todos los sets
        train[f'{col}_encoded'] = le.transform(train[col].astype(str))
        
        # Para val y test, manejar categorías no vistas
        val[f'{col}_encoded'] = val[col].astype(str).map(
            lambda x: le.transform([x])[0] if x in le.classes_ else -1
        )
        test[f'{col}_encoded'] = test[col].astype(str).map(
            lambda x: le.transform([x])[0] if x in le.classes_ else -1
        )
        
        encoders[col] = le
    
    return train, val, test, encoders

# ============================================================================
# 4. MODELO 1: LIGHTGBM
# ============================================================================

def train_lightgbm(X_train, y_train, X_val, y_val):
    """
    Entrena LightGBM - muy eficiente para datos grandes
    """
    model = LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=8,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50, verbose=0)
        ] if hasattr(tf.keras.callbacks, 'EarlyStopping') else None
    )
    
    return model

# ============================================================================
# 5. MODELO 2: XGBOOST
# ============================================================================

def train_xgboost(X_train, y_train, X_val, y_val):
    """
    Entrena XGBoost
    """
    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    return model

# ============================================================================
# 6. MODELO 3: NEURAL NETWORK (PyTorch Lightning)
# ============================================================================

class MusicRevenueDataset(Dataset):
    """
    Dataset personalizado para PyTorch
    """
    def __init__(self, X_numeric, X_categorical, y):
        self.X_numeric = torch.FloatTensor(X_numeric)
        self.X_categorical = [torch.LongTensor(cat) for cat in X_categorical]
        self.y = torch.FloatTensor(y.values).reshape(-1, 1)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return {
            'numeric': self.X_numeric[idx],
            'categorical': [cat[idx] for cat in self.X_categorical],
            'target': self.y[idx]
        }


class MusicRevenueModel(pl.LightningModule):
    """
    Red neuronal con embeddings para variables categóricas
    """
    def __init__(self, numeric_dim, categorical_dims, embedding_dim=50, 
                 hidden_dims=[256, 128, 64], dropout=0.3, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        
        self.lr = lr
        
        # Embeddings para variables categóricas
        self.embeddings = nn.ModuleList([
            nn.Embedding(vocab_size + 1, min(embedding_dim, (vocab_size + 1) // 2))
            for vocab_size in categorical_dims
        ])
        
        # Calcular dimensión total después de embeddings
        embedding_total_dim = sum([min(embedding_dim, (vs + 1) // 2) for vs in categorical_dims])
        total_input_dim = numeric_dim + embedding_total_dim
        
        # Capas densas
        layers = []
        prev_dim = total_input_dim
        
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.Dropout(dropout if i == 0 else dropout * 0.7))
            prev_dim = hidden_dim
        
        self.fc_layers = nn.Sequential(*layers)
        self.output_layer = nn.Linear(prev_dim, 1)
        
        # Métricas
        self.train_mae = []
        self.val_mae = []
    
    def forward(self, numeric, categorical):
        # Embeddings
        embedded = [emb(cat.squeeze()) for emb, cat in zip(self.embeddings, categorical)]
        
        # Concatenar numeric + embeddings
        x = torch.cat([numeric] + embedded, dim=1)
        
        # Forward pass
        x = self.fc_layers(x)
        x = self.output_layer(x)
        
        return x
    
    def training_step(self, batch, batch_idx):
        numeric = batch['numeric']
        categorical = batch['categorical']
        target = batch['target']
        
        pred = self(numeric, categorical)
        loss = nn.functional.mse_loss(pred, target)
        mae = nn.functional.l1_loss(pred, target)
        
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_mae', mae, prog_bar=True)
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        numeric = batch['numeric']
        categorical = batch['categorical']
        target = batch['target']
        
        pred = self(numeric, categorical)
        loss = nn.functional.mse_loss(pred, target)
        mae = nn.functional.l1_loss(pred, target)
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_mae', mae, prog_bar=True)
        
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=10, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss'
            }
        }


def train_neural_network(X_train, y_train, X_val, y_val, categorical_cols=None):
    """
    Entrena red neuronal con PyTorch Lightning
    """
    # Escalar features numéricas
    scaler = StandardScaler()
    
    # Separar categóricas y numéricas
    if categorical_cols:
        numeric_cols = [col for col in X_train.columns if col not in categorical_cols]
        X_train_num = scaler.fit_transform(X_train[numeric_cols])
        X_val_num = scaler.transform(X_val[numeric_cols])
        
        # Preparar categóricas
        categorical_dims = [int(X_train[col].max()) + 1 for col in categorical_cols]
        X_train_cat = [X_train[col].values for col in categorical_cols]
        X_val_cat = [X_val[col].values for col in categorical_cols]
    else:
        X_train_num = scaler.fit_transform(X_train)
        X_val_num = scaler.transform(X_val)
        categorical_dims = []
        X_train_cat = []
        X_val_cat = []
    
    # Crear datasets
    train_dataset = MusicRevenueDataset(X_train_num, X_train_cat, y_train)
    val_dataset = MusicRevenueDataset(X_val_num, X_val_cat, y_val)
    
    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False, num_workers=4)
    
    # Crear modelo
    model = MusicRevenueModel(
        numeric_dim=X_train_num.shape[1],
        categorical_dims=categorical_dims,
        embedding_dim=50,
        hidden_dims=[256, 128, 64],
        dropout=0.3,
        lr=0.001
    )
    
    # Callbacks
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=20,
        mode='min',
        verbose=True
    )
    
    checkpoint = ModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=1,
        filename='best-model-{epoch:02d}-{val_loss:.6f}'
    )
    
    # Trainer
    trainer = pl.Trainer(
        max_epochs=100,
        callbacks=[early_stop, checkpoint],
        accelerator='auto',  # Usa GPU si está disponible
        devices=1,
        enable_progress_bar=True,
        log_every_n_steps=50
    )
    
    # Entrenar
    trainer.fit(model, train_loader, val_loader)
    
    # Cargar mejor modelo
    best_model = MusicRevenueModel.load_from_checkpoint(
        checkpoint.best_model_path,
        numeric_dim=X_train_num.shape[1],
        categorical_dims=categorical_dims
    )
    
    return best_model, scaler, trainer


def predict_with_nn(model, X_test, scaler, categorical_cols):
    """
    Realiza predicciones con el modelo de PyTorch
    """
    model.eval()
    
    # Preparar datos
    numeric_cols = [col for col in X_test.columns if col not in categorical_cols]
    X_test_num = scaler.transform(X_test[numeric_cols])
    X_test_cat = [X_test[col].values for col in categorical_cols]
    
    # Crear dataset y dataloader
    y_dummy = pd.Series([0] * len(X_test))  # Dummy target
    test_dataset = MusicRevenueDataset(X_test_num, X_test_cat, y_dummy)
    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)
    
    # Predicciones
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            numeric = batch['numeric']
            categorical = batch['categorical']
            pred = model(numeric, categorical)
            predictions.append(pred.cpu().numpy())
    
    return np.concatenate(predictions).flatten()

# ============================================================================
# 7. EVALUACIÓN
# ============================================================================

def evaluate_model(y_true, y_pred, model_name):
    """
    Evalúa modelo y muestra métricas
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-10))) * 100
    
    print(f"\n{'='*50}")
    print(f"{model_name} - Resultados:")
    print(f"{'='*50}")
    print(f"MAE:  ${mae:.6f}")
    print(f"RMSE: ${rmse:.6f}")
    print(f"R²:   {r2:.4f}")
    print(f"MAPE: {mape:.2f}%")
    
    return {'mae': mae, 'rmse': rmse, 'r2': r2, 'mape': mape}

# ============================================================================
# 8. PIPELINE COMPLETO
# ============================================================================

def run_complete_pipeline(df):
    """
    Pipeline completo de entrenamiento y evaluación
    """
    print("Iniciando pipeline...")
    print(f"Dataset: {len(df)} filas\n")
    
    # 1. Feature Engineering
    print("1. Creando features...")
    df = create_features(df)
    
    # 2. Split temporal
    print("\n2. Split temporal...")
    train, val, test = temporal_split(df)
    
    # 3. Encoding
    print("\n3. Encoding de variables categóricas...")
    categorical_cols = ['ISRC', 'spotify', 'release_type', 'continent']
    train, val, test, encoders = encode_features(train, val, test, categorical_cols)
    
    # 4. Preparar features
    feature_cols = [
        'year', 'month', 'quarter', 'month_sin', 'month_cos',
        'days_since_start_norm', 'quantity',
        'isrc_avg_revenue', 'isrc_std_revenue', 'isrc_total_quantity',
        'isrc_appearance_count', 'continent_avg_revenue',
        'continent_avg_quantity', 'quantity_per_appearance',
        'ISRC_encoded', 'spotify_encoded', 'release_type_encoded', 'continent_encoded'
    ]
    
    X_train = train[feature_cols]
    y_train = train['revenue']
    X_val = val[feature_cols]
    y_val = val['revenue']
    X_test = test[feature_cols]
    y_test = test['revenue']
    
    results = {}
    
    # 5. Entrenar LightGBM
    print("\n4. Entrenando LightGBM...")
    lgb_model = train_lightgbm(X_train, y_train, X_val, y_val)
    lgb_pred = lgb_model.predict(X_test)
    results['LightGBM'] = evaluate_model(y_test, lgb_pred, "LightGBM")
    
    # 6. Entrenar XGBoost
    print("\n5. Entrenando XGBoost...")
    xgb_model = train_xgboost(X_train, y_train, X_val, y_val)
    xgb_pred = xgb_model.predict(X_test)
    results['XGBoost'] = evaluate_model(y_test, xgb_pred, "XGBoost")
    
    # 7. Entrenar Neural Network
    print("\n6. Entrenando Neural Network...")
    categorical_encoded = ['ISRC_encoded', 'spotify_encoded', 'release_type_encoded', 'continent_encoded']
    nn_model, scaler, history = train_neural_network(
        X_train, y_train, X_val, y_val, 
        categorical_cols=categorical_encoded
    )
    
    # Preparar datos para predicción NN
    numeric_cols = [col for col in feature_cols if col not in categorical_encoded]
    X_test_num = scaler.transform(X_test[numeric_cols])
    X_test_cat = [X_test[col].values.reshape(-1, 1) for col in categorical_encoded]
    nn_pred = nn_model.predict([X_test_num] + X_test_cat, verbose=0).flatten()
    results['Neural Network'] = evaluate_model(y_test, nn_pred, "Neural Network")
    
    # 8. Comparación final
    print("\n" + "="*50)
    print("COMPARACIÓN FINAL")
    print("="*50)
    comparison_df = pd.DataFrame(results).T
    print(comparison_df.to_string())
    
    # Feature importance de LightGBM
    print("\n\nTop 10 Features más importantes (LightGBM):")
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': lgb_model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)
    print(feature_importance.to_string(index=False))
    
    return {
        'models': {'lgb': lgb_model, 'xgb': xgb_model, 'nn': nn_model},
        'results': results,
        'encoders': encoders,
        'scaler': scaler,
        'feature_cols': feature_cols
    }

# ============================================================================
# EJEMPLO DE USO
# ============================================================================

ModuleNotFoundError: No module named 'torch'

In [None]:
if __name__ == "__main__":
    # Cargar tus datos
    # df = pd.read_csv('tu_archivo.csv')
    
    # Ejecutar pipeline completo
    # pipeline_output = run_complete_pipeline(df)
    
    print("Pipeline listo para usar.")
    print("\nPara ejecutar:")
    print("  pipeline_output = run_complete_pipeline(df)")