In [1]:
# ===================================================================
# CELDA 1: PREPARACIÓN DE DATOS (MISE EN PLACE)
# ===================================================================

# --- Dependencias principales ---
import numpy
import surprise
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import os
import mlflow
import mlflow.sklearn
from pathlib import Path

# --- Environment information ---
# One print call; sep="\n" puts each item on its own output line.
print(
    "--- Fase de Preparación ---",
    f"Numpy version: {numpy.__version__}",
    f"Surprise version: {surprise.__version__}",
    sep="\n",
)

# --- Project root resolution ---
def resolve_project_root(cwd):
    """Infer the project root from a working directory.

    Args:
        cwd: Directory the kernel is running in.

    Returns:
        The parent of ``cwd`` when ``cwd`` itself is a directory named
        ``notebooks``; otherwise ``cwd`` (normalized) unchanged.
    """
    cwd = os.path.normpath(cwd)
    # Compare only the last path component: a substring test on the whole
    # path would also match ancestors or look-alike names such as
    # ".../notebooks_old/project" and climb out of the real root.
    if os.path.basename(cwd) == "notebooks":
        return os.path.dirname(cwd)
    return cwd


try:
    # Script execution: the root is taken as two levels above this file.
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    # __file__ is not defined in an interactive kernel; use the cwd instead.
    BASE_DIR = resolve_project_root(os.getcwd())
print(f"Raíz del proyecto detectada en: {BASE_DIR}")

# --- MLflow configuration ---
# Tracking data goes to an 'mlruns' directory under the project root,
# addressed through its file:// URI.
mlruns_path = Path(BASE_DIR).joinpath('mlruns')
tracking_uri = mlruns_path.as_uri()
mlflow.set_tracking_uri(uri=tracking_uri)
print(f"MLflow configurado para guardar en: {tracking_uri}")
mlflow.set_experiment(experiment_name="LatentLens-SVD-Evaluation")

# --- Load the MovieLens ratings and sample a dense subset ---
print("Cargando y procesando datos... (Esto puede tardar un poco)")
ratings_path = Path(BASE_DIR, 'data', 'ml-25m', 'ratings.csv')
ratings_df = pd.read_csv(ratings_path)

# Sample size: number of users / movies kept, ranked by rating count.
n_users = 40000
n_movies = 20000

# Ids of the users and movies with the most rating rows.
user_ids = ratings_df['userId'].value_counts().nlargest(n_users).index
movie_ids = ratings_df['movieId'].value_counts().nlargest(n_movies).index

# Keep only rows whose user AND movie are both in the selected sets.
from_selected_user = ratings_df['userId'].isin(user_ids)
of_selected_movie = ratings_df['movieId'].isin(movie_ids)
sampled_df = ratings_df[from_selected_user & of_selected_movie]

# --- Convert to Surprise format and split ---
# Surprise takes the columns in (user, item, rating) order.
rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(sampled_df[rating_columns], reader)

# 80/20 split with a fixed seed so the split is repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

print("\n¡Mise en Place completado! Las variables 'trainset' y 'testset' están listas en memoria.")

2025/08/10 11:04:37 INFO mlflow.tracking.fluent: Experiment with name 'LatentLens-SVD-Evaluation' does not exist. Creating a new experiment.


--- Fase de Preparación ---
Numpy version: 1.26.4
Surprise version: 1.1.4
Raíz del proyecto detectada en: c:\Users\Gat\Documents\GitHub\LatentLens
MLflow configurado para guardar en: file:///c:/Users/Gat/Documents/GitHub/LatentLens/mlruns
Cargando y procesando datos... (Esto puede tardar un poco)

¡Mise en Place completado! Las variables 'trainset' y 'testset' están listas en memoria.


In [None]:
# ===================================================================
# CELDA 2: EXPERIMENTO Y ENTRENAMIENTO (VERSIÓN CON FIRMA MANUAL)
# ===================================================================
import pickle
# --- PASO CLAVE 1: Importamos las herramientas para construir el "Blueprint" ---
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec
import pandas as pd

class SurpriseWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc adapter around a pickled Surprise model.

    The wrapped model is read from the ``surprise_model_path`` artifact and
    queried one (user, item) pair at a time.
    """

    def load_context(self, context):
        """Unpickle the model stored under the ``surprise_model_path`` artifact.

        Args:
            context: MLflow model context; ``context.artifacts`` maps the
                artifact name to a local file path.
        """
        # NOTE(security): pickle.load runs arbitrary code from the file.
        # Only load model artifacts produced by a trusted training run.
        with open(context.artifacts["surprise_model_path"], 'rb') as f:
            self.model = pickle.load(f)

    def predict(self, context, model_input: pd.DataFrame) -> pd.Series:
        """Estimate a rating for every row of ``model_input``.

        Args:
            context: MLflow model context (unused here).
            model_input: Frame with ``userId`` and ``movieId`` columns.

        Returns:
            A float64 Series with one estimate per input row, in row order,
            indexed 0..n-1.

        Raises:
            KeyError: If ``userId`` or ``movieId`` is missing.
        """
        users = model_input["userId"]
        items = model_input["movieId"]
        estimates = [
            self.model.predict(uid=user, iid=item).est
            for user, item in zip(users, items)
        ]
        # Pin the dtype: an empty list would otherwise produce an object
        # Series instead of the floating-point output callers expect.
        return pd.Series(estimates, dtype="float64")

# ----------------------------------------------------
run_name = "SVD_prod_MANUAL_SIGNATURE_n150"  # final run name
print(f"--- Iniciando experimento: '{run_name}' ---")

with mlflow.start_run(run_name=run_name) as run:
    # Hyperparameters, recorded on the run before training starts.
    n_factors = 150
    n_epochs = 20
    mlflow.log_params({
        "model_type": "SVD",
        "n_factors": n_factors,
        "n_epochs": n_epochs,
    })

    # Train on the training split with a fixed seed.
    model = SVD(n_factors=n_factors, n_epochs=n_epochs, random_state=42)
    model.fit(trainset)

    # Evaluate on the held-out split and record the RMSE.
    predictions = model.test(testset)
    rmse = accuracy.rmse(predictions)
    mlflow.log_metric("rmse", rmse)

    # --- KEY STEP 2: explicit model signature ---
    # Input contract: a DataFrame with two 64-bit integer id columns.
    # "long" (int64) is used instead of "integer" (int32) because pandas
    # stores these ids as int64 and MLflow schema enforcement does not
    # downcast int64 to int32; int32 input is still accepted via upcast.
    input_schema = Schema([
        ColSpec("long", "userId"),
        ColSpec("long", "movieId"),
    ])
    # Output contract: one unnamed double-precision column.
    output_schema = Schema([ColSpec("double")])

    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    # Example input whose dtypes are pinned to match the signature.
    input_example = pd.DataFrame({
        "userId": [1],
        "movieId": [10]
    }).astype("int64")

    # Pickle the trained model so it can be attached as a pyfunc artifact.
    # The file is written to the current working directory and left there.
    model_path = "svd_model.pkl"
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # --- Log the wrapped model with the explicit signature ---
    mlflow.pyfunc.log_model(
        artifact_path="surprise_svd_model",
        python_model=SurpriseWrapper(),
        artifacts={"surprise_model_path": model_path},
        input_example=input_example,
        signature=signature  # hand-written signature, not an inferred one
    )
    print("Modelo SVD (artefacto) y métricas registradas con firma de producción MANUAL.")

print(f"\n¡Experimento '{run_name}' finalizado!")

--- Iniciando experimento: 'SVD_prod_signed_n150' ---
RMSE: 0.7433


TypeError: AlgoBase.predict() missing 1 required positional argument: 'iid'