In [1]:
# carga de dataframes
import pandas as pd

# Directory that holds the MovieLens 20M CSV files.
PATH = 'movielens-20m-dataset/'

# Read every table once, keyed by name; each file is named '<table>.csv'.
dataframes = {
    table: pd.read_csv(PATH + table + '.csv')
    for table in ('genome_scores', 'genome_tags', 'link', 'movie', 'rating', 'tag')
}

# Also expose each table under its own module-level name.
genome_scores = dataframes['genome_scores']
genome_tags = dataframes['genome_tags']
link = dataframes['link']
movie = dataframes['movie']
rating = dataframes['rating']
tag = dataframes['tag']

## 1) Perfil del dataset y decisiones automáticas

Este bloque inspecciona tamaños y sparsity, y define umbrales automáticos para entrenar un modelo user-based con Pearson usando `surprise`.

In [2]:
import numpy as np

# General profile: one record (name, row count, column count) per loaded dataframe
summary_rows = [
    {'dataframe': table_name, 'rows': len(table), 'cols': table.shape[1]}
    for table_name, table in dataframes.items()
]

# Largest tables first, re-indexed 0..n-1 for display
summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values('rows', ascending=False).reset_index(drop=True)

print('Tamaño de dataframes:')
display(summary_df)

# Ratings-specific profile: independent copy of the four columns with narrower
# numeric dtypes (timestamp is carried over unchanged)
ratings = rating[['userId', 'movieId', 'rating', 'timestamp']].astype({
    'userId': np.int32,
    'movieId': np.int32,
    'rating': np.float32,
})

# Global counts
n_ratings = len(ratings)
n_users = ratings['userId'].nunique()
n_items = ratings['movieId'].nunique()

# Number of ratings per user and per item
user_counts = ratings.groupby('userId').size()
item_counts = ratings.groupby('movieId').size()

# Fraction of the user x item grid that has no rating
sparsity = 1 - (n_ratings / (n_users * n_items))

# Quantiles reported for both activity distributions
profile_quantiles = [0.25, 0.5, 0.75, 0.90, 0.95]

print('\nPerfil de ratings:')
print(f'ratings: {n_ratings:,}')
print(f'users:   {n_users:,}')
print(f'items:   {n_items:,}')
print(f'sparsity: {sparsity:.6f}')
print('\nDistribución ratings por usuario (quantiles):')
print(user_counts.quantile(profile_quantiles).to_string())
print('\nDistribución ratings por item (quantiles):')
print(item_counts.quantile(profile_quantiles).to_string())

# Automatic decisions by dataset scale.
# Each tier is (minimum n_ratings, activity floor, activity quantile, k neighbours).
# Tiers are checked from largest to smallest; the last one matches any size.
scale_tiers = (
    (10_000_000, 40, 0.60, 80),
    (2_000_000, 20, 0.50, 60),
    (0, 10, 0.40, 40),
)
activity_floor, activity_quantile, k_neighbors = next(
    (floor, quantile, k)
    for threshold, floor, quantile, k in scale_tiers
    if n_ratings >= threshold
)

# Threshold = the larger of the tier floor and the tier quantile of the activity counts
min_user_ratings = int(max(activity_floor, user_counts.quantile(activity_quantile)))
min_item_ratings = int(max(activity_floor, item_counts.quantile(activity_quantile)))

# Absolute lower bound, independent of the tier
min_user_ratings = max(5, min_user_ratings)
min_item_ratings = max(5, min_item_ratings)

# Cap on users to avoid crashing User-User KNN (U x U similarity matrix);
# the budget is converted to a user count assuming 8 bytes per matrix entry
TARGET_SIM_GB = 0.6
sim_budget_bytes = TARGET_SIM_GB * (1024 ** 3)
MAX_USERS_BY_MEMORY = int(np.sqrt(sim_budget_bytes / 8))
MAX_USERS_HARD = 8000
max_users = min(MAX_USERS_BY_MEMORY, MAX_USERS_HARD)

# Maximum number of test rows to evaluate, to keep run times stable
max_test_eval = 20000

config = dict(
    min_user_ratings=min_user_ratings,
    min_item_ratings=min_item_ratings,
    max_users=max_users,
    k_neighbors=k_neighbors,
    min_k=3,
    max_test_eval=max_test_eval,
    random_state=42,
)

# Size of the similarity matrix implied by the chosen user cap, in GiB
est_sim_gb = (config['max_users'] ** 2 * 8) / (1024 ** 3)

print('\nConfiguración automática elegida:')
for option, setting in config.items():
    print(f'- {option}: {setting}')
print(f"- approx_similarity_matrix_gb: {est_sim_gb:.2f}")

Tamaño de dataframes:


Unnamed: 0,dataframe,rows,cols
0,rating,20000263,4
1,genome_scores,11709768,3
2,tag,465564,4
3,link,27278,3
4,movie,27278,3
5,genome_tags,1128,2



Perfil de ratings:
ratings: 20,000,263
users:   138,493
items:   26,744
sparsity: 0.994600

Distribución ratings por usuario (quantiles):
0.25     35.0
0.50     68.0
0.75    155.0
0.90    334.0
0.95    520.0

Distribución ratings por item (quantiles):
0.25       3.00
0.50      18.00
0.75     205.00
0.90    1305.70
0.95    3612.95

Configuración automática elegida:
- min_user_ratings: 93
- min_item_ratings: 47
- max_users: 8000
- k_neighbors: 80
- min_k: 3
- max_test_eval: 20000
- random_state: 42
- approx_similarity_matrix_gb: 0.48


## 2) Filtrado y split temporal (leave-one-out)

Se filtran usuarios/ítems poco activos, se conservan como máximo los `max_users` usuarios más activos y se toma la última interacción de cada usuario como test.

In [3]:
# Activity filter: keep only users and items that reach the configured minimum counts
user_counts = ratings['userId'].value_counts()
item_counts = ratings['movieId'].value_counts()

active_users = user_counts[user_counts >= config['min_user_ratings']].index
popular_items = item_counts[item_counts >= config['min_item_ratings']].index

keep_rows = ratings['userId'].isin(active_users) & ratings['movieId'].isin(popular_items)
filtered = ratings.loc[keep_rows].copy()

# Cap the number of users (the most active ones) so user-user similarity stays trainable
if config['max_users'] is not None:
    top_users = filtered['userId'].value_counts().head(config['max_users']).index
    filtered = filtered.loc[filtered['userId'].isin(top_users)].copy()

# Order each user's ratings by timestamp
filtered = filtered.sort_values(['userId', 'timestamp'])

# Temporal leave-one-out: the last row of every user goes to test, the rest to train
test_idx = filtered.groupby('userId').tail(1).index
is_test = filtered.index.isin(test_idx)
split_columns = ['userId', 'movieId', 'rating']
test_df = filtered.loc[is_test, split_columns].copy()
train_df = filtered.loc[~is_test, split_columns].copy()

# Keep only test rows whose user and item both appear in train
train_users = set(train_df['userId'].unique())
train_items = set(train_df['movieId'].unique())
known_in_train = test_df['userId'].isin(train_users) & test_df['movieId'].isin(train_items)
test_df = test_df.loc[known_in_train].copy()

# Subsample test (seeded) when it exceeds the evaluation budget
if len(test_df) > config['max_test_eval']:
    test_df = test_df.sample(n=config['max_test_eval'], random_state=config['random_state'])

print('Después de filtrado/split:')
print(f"train ratings: {len(train_df):,}")
print(f"test ratings:  {len(test_df):,}")
print(f"train users:   {train_df['userId'].nunique():,}")
print(f"train items:   {train_df['movieId'].nunique():,}")

Después de filtrado/split:
train ratings: 6,713,420
test ratings:  8,000
train users:   8,000
train items:   10,724


## 3) Modelo User-User Pearson con `surprise`

Entrenamiento, evaluación y recomendaciones Top-N.

In [4]:
import sys
import subprocess
import importlib
# `import importlib` alone does not guarantee that the `util` submodule is loaded;
# import it explicitly so `importlib.util.find_spec` cannot raise AttributeError.
import importlib.util

# Install scikit-surprise into the interpreter running this kernel, only when the
# `surprise` package cannot be found. NOTE(review): the install is unpinned —
# consider pinning a version for reproducibility.
if importlib.util.find_spec('surprise') is None:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'scikit-surprise'])

from surprise import Dataset, Reader, KNNBasic

# Build a Surprise trainset from the training split; ratings are declared on a 0.5-5.0 scale.
reader = Reader(rating_scale=(0.5, 5.0))
train_data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = train_data.build_full_trainset()

# User-based neighbourhood model with Pearson similarity.
sim_options = {
    'name': 'pearson',
    'user_based': True
}

algo = KNNBasic(
    k=config['k_neighbors'],
    min_k=config['min_k'],
    sim_options=sim_options,
    verbose=False
)
algo.fit(trainset)

# Evaluation on the held-out test rows.
# Every test row gets an estimate, so RMSE/MAE are computed over all of them.
# Surprise marks estimates it could not compute from neighbours (it falls back to a
# default value) with details['was_impossible']; only the remaining predictions
# count towards coverage. Counting every appended prediction would always give 100%.
y_true, y_pred = [], []
n_covered = 0
for row in test_df.itertuples(index=False):
    pred = algo.predict(uid=row.userId, iid=row.movieId, r_ui=row.rating)
    y_true.append(float(row.rating))
    y_pred.append(float(pred.est))
    if not pred.details.get('was_impossible', False):
        n_covered += 1

y_true = np.array(y_true, dtype=np.float32)
y_pred = np.array(y_pred, dtype=np.float32)

# Metrics are NaN / 0.0 when the test set is empty
rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2))) if len(y_true) else float('nan')
mae = float(np.mean(np.abs(y_true - y_pred))) if len(y_true) else float('nan')
coverage = float(n_covered / len(test_df)) if len(test_df) else 0.0

print('Resultado modelo Surprise (User-User Pearson):')
print(f"train users: {train_df['userId'].nunique():,} | train ratings: {len(train_df):,}")
print(f"k_neighbors: {config['k_neighbors']} | min_k: {config['min_k']}")
print(f"Evaluados: {len(y_true):,} de {len(test_df):,}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"Coverage: {coverage:.2%}")

Resultado modelo Surprise (User-User Pearson):
train users: 8,000 | train ratings: 6,713,420
k_neighbors: 80 | min_k: 3
Evaluados: 8,000 de 8,000
RMSE: 0.8826
MAE:  0.6853
Coverage: 100.00%


In [5]:
# Top-N recommendations for an example user: the one with the most training ratings
sample_user = int(train_df['userId'].value_counts().idxmax())

# Candidates are the training items this user has not rated in train
seen = set(train_df.loc[train_df['userId'] == sample_user, 'movieId'])
all_items = set(train_df['movieId'].unique())
candidates = list(all_items - seen)

# Estimated rating for every candidate item
scored = [
    (int(movie_id), float(algo.predict(uid=sample_user, iid=int(movie_id)).est))
    for movie_id in candidates
]

# Keep the top_n highest estimates
top_n = 10
top_items = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

# Attach titles; the left join keeps items that have no row in `movie`
rec_df = pd.DataFrame(top_items, columns=['movieId', 'pred_rating']).merge(
    movie[['movieId', 'title']], on='movieId', how='left'
)

print(f'Top-{top_n} recomendaciones para userId={sample_user}')
display(rec_df[['movieId', 'title', 'pred_rating']])

Top-10 recomendaciones para userId=118205


Unnamed: 0,movieId,title,pred_rating
0,6985,"Passion of Joan of Arc, The (Passion de Jeanne...",4.455542
1,93040,"Civil War, The (1990)",4.427976
2,26073,"Human Condition III, The (Ningen no joken III)...",4.402057
3,77658,Cosmos (1980),4.397748
4,102217,Bill Hicks: Revelations (1993),4.36063
5,40697,Babylon 5,4.351508
6,5690,Grave of the Fireflies (Hotaru no haka) (1988),4.351008
7,60904,Heart of a Dog (Sobachye serdtse) (1988),4.34493
8,668,Song of the Little Road (Pather Panchali) (1955),4.277301
9,5498,Red Beard (Akahige) (1965),4.276867


## 4) Exportar y cargar el modelo

Guardar el modelo entrenado para reutilizarlo sin reentrenar.

In [6]:
from pathlib import Path
from surprise import dump
import json

# Output directory for exported artifacts (created if missing)
model_dir = Path('artifacts')
model_dir.mkdir(parents=True, exist_ok=True)

model_path = model_dir / 'surprise_user_user_pearson.pkl'
metadata_path = model_dir / 'surprise_user_user_pearson_metadata.json'

# Persist the trained Surprise algorithm
dump.dump(str(model_path), algo=algo)

# Training setup and evaluation metrics, cast to plain Python types for JSON
metadata = dict(
    algorithm='KNNBasic',
    similarity='pearson',
    user_based=True,
    k_neighbors=int(config['k_neighbors']),
    min_k=int(config['min_k']),
    train_users=int(train_df['userId'].nunique()),
    train_items=int(train_df['movieId'].nunique()),
    train_ratings=int(len(train_df)),
    rmse=float(rmse),
    mae=float(mae),
    coverage=float(coverage),
)

# Write the metadata as UTF-8 JSON next to the model
metadata_path.write_text(
    json.dumps(metadata, ensure_ascii=False, indent=2),
    encoding='utf-8',
)

print(f'Modelo guardado en: {model_path}')
print(f'Metadata guardada en: {metadata_path}')

Modelo guardado en: artifacts/surprise_user_user_pearson.pkl
Metadata guardada en: artifacts/surprise_user_user_pearson_metadata.json


In [7]:
# Cargar modelo exportado y probar predicción
from surprise import dump

# Reload from the same path the export cell wrote to (`model_path`), instead of
# repeating the literal, so the two cells cannot drift apart.
# SECURITY: surprise.dump is pickle-based — only load artifacts you created or trust.
_, loaded_algo = dump.load(str(model_path))
print('Modelo cargado correctamente.')

# Smoke test on the first test row; skipped when the test split is empty,
# which the evaluation cell also tolerates.
if len(test_df):
    sample_row = test_df.iloc[0]
    loaded_pred = loaded_algo.predict(
        uid=int(sample_row['userId']),
        iid=int(sample_row['movieId']),
        r_ui=float(sample_row['rating'])
    )

    print(f"Predicción de prueba -> userId={int(sample_row['userId'])}, movieId={int(sample_row['movieId'])}")
    print(f"real={float(sample_row['rating']):.2f}, estimado={loaded_pred.est:.2f}")
else:
    print('test_df está vacío: se omite la predicción de prueba.')

Modelo cargado correctamente.
Predicción de prueba -> userId=11, movieId=5971
real=5.00, estimado=4.03
