# Producción Modelo

In [14]:
## Imports
import numpy as np
import pandas as pd
import pickle

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

import sys
sys.path.append('../')

In [15]:
## Load dataset
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../data/dataCardsPCAded.csv")

Empezamos con el `MultiModelRegressor`, un estimador que agrupa varios modelos: uno por cada rango de precio (barato, medio y caro).

## Modelo

In [16]:
# ---------------------
# 1. Configuration

# Column the models are trained to predict.
target = "log_price"

# Columns the preprocessing step scales and passes to the models;
# every other column is dropped by the ColumnTransformer.
numeric_features = [
    "cmc",
    "edhrec_rank",
    "released_year",
    "keyword_count",
    "mana_produced_count",
    "power_num",
    "toughness_num",
    "rarity_labeled",
    "legal_standard",
    "legal_pioneer",
    "legal_modern",
    "legal_legacy",
    "legal_vintage",
    "legal_commander",
    "legal_pauper",
]


In [17]:
# ---------------------
# 2. MultiModelRegressor class

class MultiModelRegressor(BaseEstimator, RegressorMixin):
    """Bundle of up to three regressors, each fitted on one band of the target.

    During ``fit`` the samples are split by the value of ``y``:

    * ``y < 2``       -> ``model_cheap``
    * ``2 <= y < 4``  -> ``model_mid``
    * ``y >= 4``      -> ``model_expensive``

    ``predict`` does not route samples: every available sub-model predicts
    every row, and the three results are returned side by side.

    Parameters
    ----------
    model_cheap, model_mid, model_expensive : estimator or None
        Sub-models exposing ``fit(X, y)`` and ``predict(X)``. A slot left as
        ``None`` is skipped in ``fit`` and yields a NaN column in ``predict``.
    """

    def __init__(self, model_cheap=None, model_mid=None, model_expensive=None):
        # scikit-learn's get_params()/clone()/repr read an attribute named
        # exactly like each __init__ parameter, so the third model has to be
        # stored as ``model_expensive`` (storing it under another name makes
        # get_params() raise AttributeError).
        self.model_cheap = model_cheap
        self.model_mid = model_mid
        self.model_expensive = model_expensive

    @property
    def model_exp(self):
        """Backward-compatible alias of ``model_expensive``."""
        return self.model_expensive

    @model_exp.setter
    def model_exp(self, value):
        self.model_expensive = value

    def __setstate__(self, state):
        """Restore pickled state, accepting pickles that stored ``model_exp``."""
        if isinstance(state, dict) and "model_exp" in state and "model_expensive" not in state:
            # Pickle written before the attribute was renamed: migrate the key.
            state = dict(state)
            state["model_expensive"] = state.pop("model_exp")
        parent_setstate = getattr(super(), "__setstate__", None)
        if parent_setstate is not None:
            parent_setstate(state)
        else:
            self.__dict__.update(state)

    def fit(self, X, y):
        """Fit each available sub-model on the rows of its own target band.

        ``X`` and ``y`` must support boolean-mask indexing (``X[mask]``,
        ``y[mask]``) and ``y`` must support element-wise comparison.

        Returns
        -------
        self
        """
        mask_cheap = y < 2
        mask_mid = (y >= 2) & (y < 4)
        mask_exp = y >= 4

        bands = (
            (self.model_cheap, mask_cheap),
            (self.model_mid, mask_mid),
            (self.model_expensive, mask_exp),
        )
        for model, mask in bands:
            # Explicit None check: the truth value of an estimator object is
            # not a reliable "was a model supplied?" signal.
            if model is not None:
                model.fit(X[mask], y[mask])
        return self

    def predict(self, X):
        """Predict with every sub-model for every row of ``X``.

        Returns
        -------
        numpy.ndarray
            The per-model predictions stacked as columns, in the order
            cheap, mid, expensive. A missing model contributes a column of
            NaN with ``X.shape[0]`` entries.
        """
        n_rows = X.shape[0]
        columns = [
            model.predict(X) if model is not None else np.full(n_rows, np.nan)
            for model in (self.model_cheap, self.model_mid, self.model_expensive)
        ]
        return np.column_stack(columns)

In [18]:
# ---------------------
# 3. Preprocessing

def build_pipeline():
    """Assemble the unfitted preprocessing + multi-model pipeline.

    The pipeline has two steps:

    * ``'preprocessing'``: a ColumnTransformer that min-max scales the
      columns listed in ``numeric_features`` and drops all other columns.
    * ``'model'``: a MultiModelRegressor wrapping three XGBRegressor
      instances (cheap, mid, expensive).

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # Hyper-parameters: cheap and mid share one configuration, expensive
    # uses a lower learning rate and fewer trees; depth is common to all.
    shared = {'max_depth': 6}
    cheap_mid_cfg = dict(learning_rate=0.5, n_estimators=200, **shared)
    expensive_cfg = dict(learning_rate=0.1, n_estimators=100, **shared)

    regressor = MultiModelRegressor(
        XGBRegressor(**cheap_mid_cfg),
        XGBRegressor(**cheap_mid_cfg),
        XGBRegressor(**expensive_cfg),
    )

    # Numeric scaling; columns outside numeric_features never reach the models.
    column_scaler = ColumnTransformer(
        transformers=[('num', MinMaxScaler(), numeric_features)],
        remainder='drop',
    )

    return Pipeline(steps=[
        ('preprocessing', column_scaler),
        ('model', regressor),
    ])

In [19]:

# ---------------------
# 4. Prepare data

def prepare_data(df, split_year=2025, target_col=None):
    """Split the cards into a "past" set and a "future" set by release year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``released_year``, the target column and the columns
        ``final_price_eur``, ``log_price``, ``name`` and ``oracle_text``.
    split_year : int, default 2025
        Rows with ``released_year < split_year`` form the past set and rows
        with ``released_year == split_year`` form the future set. Rows with
        a later year belong to neither set.
    target_col : str or None, default None
        Name of the target column. ``None`` falls back to the module-level
        ``target`` setting.

    Returns
    -------
    tuple
        ``(X_past, y_past, X_future, y_future)``. The input frame is not
        modified; both subsets are copies.
    """
    if target_col is None:
        target_col = target

    # Price columns (the targets) and free-text columns are not model inputs.
    non_feature_cols = ['final_price_eur', 'log_price', 'name', 'oracle_text']

    df_past = df[df['released_year'] < split_year].copy()
    df_future = df[df['released_year'] == split_year].copy()

    X_past = df_past.drop(columns=non_feature_cols)
    y_past = df_past[target_col]

    X_future = df_future.drop(columns=non_feature_cols)
    y_future = df_future[target_col]

    return X_past, y_past, X_future, y_future


In [20]:

# ---------------------
# 5. Training

def train_model(df):
    """Fit a fresh pipeline on 80% of the past data.

    The data is first split into past/future sets with ``prepare_data``;
    the past set is then split 80/20 (``random_state=42``) and the pipeline
    from ``build_pipeline`` is fitted on the 80% part.

    Returns
    -------
    tuple
        ``(pipeline, X_test, y_test, X_future, y_future)`` where the test
        pair is the held-out 20% of the past set and the future pair is
        returned unchanged from ``prepare_data``.
    """
    past_features, past_target, X_future, y_future = prepare_data(df)

    # Internal hold-out split of the past data
    split_parts = train_test_split(
        past_features, past_target, test_size=0.2, random_state=42
    )
    X_train, X_test, y_train, y_test = split_parts

    # Pipeline.fit returns the pipeline itself
    fitted_pipeline = build_pipeline().fit(X_train, y_train)

    return fitted_pipeline, X_test, y_test, X_future, y_future

In [21]:
# ---------------------
# 6. Save / load

def save_model(model, path="../model/production/mediofinal.pkl"):
    """Pickle ``model`` to ``path``, creating missing parent directories.

    Parameters
    ----------
    model : object
        Any picklable object (here, the fitted pipeline).
    path : str or os.PathLike
        Destination file. An existing file is overwritten.
    """
    # Local import keeps this cell self-contained.
    from pathlib import Path

    destination = Path(path)
    # Without this, open() raises FileNotFoundError on a fresh checkout
    # where the output directory has not been created yet.
    destination.parent.mkdir(parents=True, exist_ok=True)

    with open(destination, "wb") as f:
        pickle.dump(model, f)

def load_model(path="../model/production/mediofinal.pkl"):
    """Read back and return the object pickled at ``path``.

    WARNING: unpickling can execute arbitrary code, so only load files
    that come from a trusted source.
    """
    with open(path, "rb") as stream:
        restored = pickle.load(stream)
    return restored


## Ejecución

In [22]:
# df = ... (your dataframe with all the columns)
pipeline, X_test, y_test, X_future, y_future = train_model(df)

# Predictions: each row holds 3 values, one per sub-model
# (column 0 = cheap, 1 = mid, 2 = expensive)
y_pred_test, y_pred_future = (
    pipeline.predict(features) for features in (X_test, X_future)
)

# Save for production
save_model(pipeline)

# Metrics: MAE after mapping targets and predictions through expm1.
# NOTE(review): every sub-model is scored against the WHOLE set, including
# rows outside the target band it was trained on — confirm this is intended.
mae_cheap, mae_mid, mae_exp = (
    mean_absolute_error(np.expm1(y_test), np.expm1(y_pred_test[:, col]))
    for col in range(3)
)

mae_cheap_future, mae_mid_future, mae_exp_future = (
    mean_absolute_error(np.expm1(y_future), np.expm1(y_pred_future[:, col]))
    for col in range(3)
)

print(f"MAE modelo cheap:     {mae_cheap:.4f} -- Future {mae_cheap_future:.4f}")
print(f"MAE modelo mid:       {mae_mid:.4f} -- Future {mae_mid_future:.4f}")
print(f"MAE modelo expensive: {mae_exp:.4f} -- Future {mae_exp_future:.4f}")


MAE modelo cheap:     13.1111 -- Future 7.1159
MAE modelo mid:       22.7508 -- Future 18.7115
MAE modelo expensive: 120.4352 -- Future 153.4918




In [23]:
## NOTE: these MAE values look far too high, and the cause is still unknown: during the
## earlier tuning session (Tuesday afternoon) the cheap model scored around 0.5.
## Investigation paused for now.
## Hypothesis to check: the exponential may have been applied to a result that was already exponentiated.

In [24]:
import numpy as np

# Absolute error of each sub-model (one column per model) against the true target
errors = np.abs(y_pred_test - y_test.to_numpy()[:, None])

# For every sample, the column index of the sub-model with the smallest error
best_model_idx = errors.argmin(axis=1)

# Count how many times each sub-model was the closest one;
# models that never win do not appear in the report.
labels = np.array(['cheap', 'mid', 'expensive'])
unique, counts = np.unique(best_model_idx, return_counts=True)

for idx, count in zip(unique, counts):
    print(f"Modelo {labels[idx]} fue el más preciso en {count} muestras ({100*count/len(y_test):.1f}%)")

Modelo cheap fue el más preciso en 14316 muestras (86.9%)
Modelo mid fue el más preciso en 1799 muestras (10.9%)
Modelo expensive fue el más preciso en 353 muestras (2.1%)


In [25]:
# Retrain so that the pipeline saved below is a freshly fitted one
pipeline, X_test, y_test, X_future, y_future = train_model(df)

# Sanity check. This only confirms that each sub-model object is present
# (not None); it does not verify that the sub-models were actually fitted.
print(
    f"Modelo entrenado: {pipeline.named_steps['model'].model_cheap is not None}",
    f"Modelo de medio: {pipeline.named_steps['model'].model_mid is not None}",
    f"Modelo caro: {pipeline.named_steps['model'].model_exp is not None}",
    sep="\n",
)

# Then save the trained pipeline
save_model(pipeline)


Modelo entrenado: True
Modelo de medio: True
Modelo caro: True


In [26]:
# Load the model through the shared helper so the file path is defined in
# one place only (load_model's default is the path save_model writes to).
# WARNING: unpickling can execute arbitrary code; only load trusted files.
pipeline = load_model()

# Sanity check on the loaded pipeline. This only confirms that each
# sub-model object is present (not None), not that it was fitted.
print(f"Modelo entrenado: {pipeline.named_steps['model'].model_cheap is not None}")
print(f"Modelo de medio: {pipeline.named_steps['model'].model_mid is not None}")
print(f"Modelo caro: {pipeline.named_steps['model'].model_exp is not None}")


Modelo entrenado: True
Modelo de medio: True
Modelo caro: True
