In [35]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random
from datetime import datetime, timedelta
import math

from surprise import Dataset as Dataset_surprise
from surprise import Reader, SVD, KNNBasic, NMF
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import accuracy
import pickle

from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
from sklearn.metrics import precision_score, recall_score

In [36]:
def extract_director(crew_data):
    try:
        crew = ast.literal_eval(crew_data)
        for person in crew:
            if person['job'] == 'Director':
                return person['name']
        return None
    except:
        return None

def extract_keywords(text):
    try:
        keywords = ast.literal_eval(text)
        return [kw['name'] for kw in keywords]
    except:
        return []

# Paso 2: Convertir a diccionario solo si es string
def safe_literal_eval(val):
    if isinstance(val, dict):
        return val
    try:
        return ast.literal_eval(val)
    except:
        return {'id': None, 'name': 'No Collection', 'poster_path': None, 'backdrop_path': None}

# 2. production_companies: rellenamos listas vacías o NaNs con estructura estándar
def fix_production_companies(val):
    if pd.isna(val) or val == '[]':
        return [{'name': 'No Company', 'id': None}]
    try:
        return ast.literal_eval(val)
    except:
        return [{'name': 'No Company', 'id': None}]

# 4. Generos
def fix_genres(val):
    try:
        genres = ast.literal_eval(val) if isinstance(val, str) else val
        if isinstance(genres, list) and len(genres) > 0:
            return genres
        else:
            return [{'id': None, 'name': 'No Genre'}]
    except:
        return [{'id': None, 'name': 'No Genre'}]

def random_date(start, end):
    """Genera una fecha aleatoria entre start y end"""
    delta = end - start
    random_days = random.randint(0, delta.days)
    return start + timedelta(days=random_days)

def random_unknown_director():
    return 'unknown_' + str(random.randint(10000, 99999))

In [37]:
# Configuraciones generales
pd.set_option('display.max_columns', None)
sns.set_theme(style="whitegrid")

path = '../datos/'

# Archivos principales
movies = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
credits = pd.read_csv(path + 'credits.csv')
keywords = pd.read_csv(path + 'keywords.csv')
#ratings = pd.read_csv(path + 'ratings_not_so_small.csv')
ratings = pd.read_csv(path + 'ratings_small.csv')
#ratings = pd.read_csv(path + 'ratings.csv')

datasets = {
    "movies_metadata": movies,
    "credits": credits,
    "keywords": keywords,
}

# Aplicar transformación
keywords['keywords'] = keywords['keywords'].apply(extract_keywords)

# Renombramos 'id' para evitar conflictos al hacer merge
keywords.rename(columns={'id': 'id_keywords'}, inplace=True)

# Convertimos a string el id de películas en 'movies' para emparejarlo correctamente
movies['id'] = movies['id'].astype(str)
keywords['id_keywords'] = keywords['id_keywords'].astype(str)

# Merge
movies = movies.merge(keywords, how='left', left_on='id', right_on='id_keywords')
movies.drop(columns=['id_keywords'], inplace=True)

# Aplicamos la función
credits['director'] = credits['crew'].apply(extract_director)

# Renombramos id para evitar conflicto
credits.rename(columns={'id': 'id_credits'}, inplace=True)

# Emparejamos tipos
credits['id_credits'] = credits['id_credits'].astype(str)

# Merge
movies = movies.merge(credits[['id_credits', 'director']], how='left', left_on='id', right_on='id_credits')
movies.drop(columns=['id_credits'], inplace=True)

columns_to_drop = [
    'homepage', 'original_language', 'original_title', 'spoken_languages',
    'poster_path', 'production_countries', 'status', 'video', 'tagline'
]

movies.drop(columns=columns_to_drop, inplace=True)

movies = movies[movies['adult'].isin(['True','False'])]
movies = movies.dropna(subset=['title'])
# Paso 1: Imputar si es NaN
movies['belongs_to_collection'] = movies['belongs_to_collection'].fillna(
    '{"id": none, "name": "No Collection", "poster_path": null, "backdrop_path": null}'
)
movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(safe_literal_eval)
movies['production_companies'] = movies['production_companies'].apply(fix_production_companies)
movies['genres'] = movies['genres'].apply(fix_genres)
movies['director'] = movies['director'].apply(lambda x: x if pd.notnull(x) else random_unknown_director())
movies['overview'] = movies['overview'].fillna('Unknown')
movies['runtime'] = movies['runtime'].fillna(movies['runtime'].mean())
movies['popularity'] = movies['popularity'].fillna(0)
movies['vote_count'] = movies['vote_count'].fillna(0)
movies['vote_average'] = movies['vote_average'].fillna(0)
movies['revenue'] = movies['revenue'].fillna(0)
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
# Paso 3: rango de fechas
start_date = datetime(1900, 1, 1)
end_date = datetime(2015, 12, 31)
# Paso 4: imputar fechas nulas
movies['release_date'] = movies['release_date'].apply(
    lambda x: random_date(start_date, end_date) if pd.isnull(x) else x
)
movies['keywords'] = movies['keywords'].apply(lambda x: x if isinstance(x, list) else ['Film'])

movies['id'] = pd.to_numeric(movies['id'], errors='coerce')

valid_movie_ids = set(movies['id'].unique())
ratings = ratings[ratings['movieId'].isin(valid_movie_ids)]

ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
10,1,1371,2.5,1260759135
11,1,1405,1.0,1260759203
13,1,2105,4.0,1260759139
15,1,2193,2.0,1260759198
16,1,2294,2.0,1260759108
17,1,2455,2.5,1260759113
21,2,17,5.0,835355681
26,2,62,3.0,835355749
27,2,110,4.0,835355532
28,2,144,3.0,835356016


## Filtrado colaborativo con sesgo de contenido

In [None]:
# -----------------------------------------
# Preparar dataset para surprise
# -----------------------------------------

print("=== Paso 1: Preparando datos ===")

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset_surprise.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
movies['id'] = movies['id'].astype(int)

# -----------------------------------------
# Cross-validation automática
# -----------------------------------------

print("\n=== Paso 2: Cross-validation automática ===")

algorithms = {
    'SVD': SVD(random_state=42),
    'KNNBasic': KNNBasic(sim_options={'name': 'cosine', 'user_based': False}),
    'NMF': NMF(random_state=42)
}

cv_results = {}
for name, algo in algorithms.items():
    print(f"\nEvaluando algoritmo: {name}")
    results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    cv_results[name] = results

print("\nResumen de resultados (media de RMSE):")
for name, results in cv_results.items():
    print(f"{name}: RMSE = {np.mean(results['test_rmse']):.4f}")

# -----------------------------------------
# Grid Search para SVD
# -----------------------------------------

print("\n=== Paso 3: Grid Search en SVD ===")

param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [10, 20],
    'lr_all': [0.005],
    'reg_all': [0.02, 0.1]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=2)
gs.fit(data)

print("\nMejores parámetros encontrados (SVD):")
print(gs.best_params['rmse'])
print(f"Mejor RMSE obtenido: {gs.best_score['rmse']:.4f}")

# -----------------------------------------
# Entrenar SVD final con mejores parámetros
# -----------------------------------------

print("\n=== Paso 4: Entrenando modelo final SVD optimizado ===")

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

best_svd = SVD(
    n_factors=gs.best_params['rmse']['n_factors'],
    n_epochs=gs.best_params['rmse']['n_epochs'],
    lr_all=gs.best_params['rmse']['lr_all'],
    reg_all=gs.best_params['rmse']['reg_all'],
    random_state=42
)

best_svd.fit(trainset)

predictions = best_svd.test(testset)
print("\nEvaluación final en testset (SVD optimizado):")
accuracy.rmse(predictions)

# -----------------------------------------
# Guardar modelo optimizado
# -----------------------------------------

print("\n=== Paso 5: Guardando modelo optimizado ===")

with open('modelo_recomendacion_optimizado.pkl', 'wb') as f:
    pickle.dump(best_svd, f)

print("Modelo guardado como 'modelo_recomendacion_optimizado.pkl'")

# -----------------------------------------
# Demo para un usuario
# -----------------------------------------

print("\n=== Paso 6: Demo para un usuario ===")

user_id = 1
all_movie_ids = ratings['movieId'].unique()
movies_watched = ratings[ratings['userId'] == user_id]['movieId'].tolist()
movies_not_watched = [mid for mid in all_movie_ids if mid not in movies_watched]

# Películas reseñadas
watched_movies_df = ratings[ratings['userId'] == user_id][['movieId', 'rating']]
watched_movies_df = watched_movies_df.merge(
    movies[['id', 'title']], left_on='movieId', right_on='id'
).sort_values(by='rating', ascending=False)

print(f"\nPelículas que el usuario {user_id} ya ha reseñado:")
print(watched_movies_df[['title', 'rating']])

# Recomendaciones
predictions = [best_svd.predict(user_id, mid) for mid in movies_not_watched]
predictions.sort(key=lambda x: x.est, reverse=True)

top_n = 10
top_n_movies_ids = [int(pred.iid) for pred in predictions[:top_n]]

print(f"\nTop {top_n} películas recomendadas para el usuario {user_id}:")
for pred in predictions[:top_n]:
    movie_row = movies[movies['id'] == int(pred.iid)]
    if len(movie_row) > 0:
        movie_title = movie_row['title'].values[0]
    else:
        movie_title = f"(movieId={pred.iid}) — no metadata"
    print(f"{movie_title} → Predicted rating: {pred.est:.2f}")

# -----------------------------------------
# Demo con 5 usuarios diferentes
# -----------------------------------------

print("\n=== Paso 7: Demo con 5 usuarios diferentes ===")

user_ids = ratings['userId'].unique()
selected_user_ids = np.random.choice(user_ids, size=5, replace=False)

num_recommendations_per_user = {
    selected_user_ids[0]: 5,
    selected_user_ids[1]: 8,
    selected_user_ids[2]: 10,
    selected_user_ids[3]: 7,
    selected_user_ids[4]: 12
}

for user_id in selected_user_ids:
    print("\n" + "="*60)
    print(f"Usuario: {user_id}")

    watched_movies_df = ratings[ratings['userId'] == user_id][['movieId', 'rating']]
    watched_movies_df = watched_movies_df.merge(
        movies[['id', 'title']], left_on='movieId', right_on='id'
    ).sort_values(by='rating', ascending=False)

    print(f"\nPelículas que el usuario {user_id} ya ha reseñado:")
    print(watched_movies_df[['title', 'rating']])

    movies_watched = watched_movies_df['movieId'].tolist()
    movies_not_watched = [mid for mid in all_movie_ids if mid not in movies_watched]

    predictions = [best_svd.predict(user_id, mid) for mid in movies_not_watched]
    predictions.sort(key=lambda x: x.est, reverse=True)

    top_n = num_recommendations_per_user[user_id]

    print(f"\nTop {top_n} películas recomendadas para el usuario {user_id}:")
    for pred in predictions[:top_n]:
        movie_row = movies[movies['id'] == int(pred.iid)]
        if len(movie_row) > 0:
            movie_title = movie_row['title'].values[0]
        else:
            movie_title = f"(movieId={pred.iid}) — no metadata"
        print(f"{movie_title} → Predicted rating: {pred.est:.2f}")


# -----------------------------------------
# Demo con diversificación
# -----------------------------------------

print("\n=== Paso 8: Demo con diversificación de recomendaciones ===")

for user_id in selected_user_ids:
    print("\n" + "="*60)
    print(f"Usuario: {user_id} (con diversificación)")

    watched_movies_df = ratings[ratings['userId'] == user_id][['movieId', 'rating']]
    watched_movies_df = watched_movies_df.merge(
        movies[['id', 'title']], left_on='movieId', right_on='id'
    ).sort_values(by='rating', ascending=False)

    print(f"\nPelículas que el usuario {user_id} ya ha reseñado:")
    print(watched_movies_df[['title', 'rating']])

    movies_watched = watched_movies_df['movieId'].tolist()
    movies_not_watched = [mid for mid in all_movie_ids if mid not in movies_watched]

    predictions = [best_svd.predict(user_id, mid) for mid in movies_not_watched]

    penalized_predictions = []
    for pred in predictions:
        movie_id = int(pred.iid)
        vote_count = movies[movies['id'] == movie_id]['vote_count'].values
        vote_count = vote_count[0] if len(vote_count) > 0 else 0
        penalty = math.log(1 + vote_count)
        final_score = pred.est / penalty if penalty > 0 else pred.est
        penalized_predictions.append((pred, final_score))

    penalized_predictions.sort(key=lambda x: x[1], reverse=True)

    top_n = num_recommendations_per_user[user_id]

    print(f"\nTop {top_n} películas recomendadas para el usuario {user_id} (diversificadas):")
    for pred, score in penalized_predictions[:top_n]:
        movie_row = movies[movies['id'] == int(pred.iid)]
        if len(movie_row) > 0:
            movie_title = movie_row['title'].values[0]
        else:
            movie_title = f"(movieId={pred.iid}) — no metadata"
        print(f"{movie_title} → Predicted rating: {pred.est:.2f}, Final score: {score:.4f}")


# -----------------------------------------
# Preparación: crear genre_vector + director_id
# -----------------------------------------

print("\n=== Preparación para sistema híbrido ===")

def extract_genre_names(genres_list):
    import ast
    genres_list = ast.literal_eval(genres_list) if isinstance(genres_list, str) else genres_list
    return [genre['name'] for genre in genres_list if 'name' in genre]

movies['genre_names'] = movies['genres'].apply(extract_genre_names)

# MultiLabelBinarizer para géneros
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genre_names'])

# LabelEncoder para director
le_director = LabelEncoder()
movies['director_encoded'] = le_director.fit_transform(movies['director'])

# Construir content_vector: [géneros one-hot + director_encoded (normalizado)]
content_vectors = []
for idx, row in enumerate(movies.itertuples(index=False)):
    genre_vector = genre_matrix[idx]
    director_vector = np.array([row.director_encoded / movies['director_encoded'].max()])
    content_vector = np.concatenate([genre_vector, director_vector])
    content_vectors.append(content_vector)

movies['content_vector'] = content_vectors

# -----------------------------------------
# Demo de sistema híbrido
# -----------------------------------------

print("\n=== Paso 9: Demo sistema híbrido ===")

for user_id in selected_user_ids:
    print("\n" + "="*60)
    print(f"Usuario: {user_id} (sistema híbrido)")

    # Películas reseñadas
    watched_movies_df = ratings[ratings['userId'] == user_id][['movieId', 'rating']]
    watched_movies_df = watched_movies_df.merge(
        movies[['id', 'title', 'content_vector']], left_on='movieId', right_on='id'
    ).sort_values(by='rating', ascending=False)

    print(f"\nPelículas que el usuario {user_id} ya ha reseñado:")
    print(watched_movies_df[['title', 'rating']])

    # Perfil de contenido del usuario
    liked_movies_vectors = watched_movies_df[watched_movies_df['rating'] >= 4]['content_vector'].tolist()

    if len(liked_movies_vectors) == 0:
        liked_movies_vectors = watched_movies_df['content_vector'].tolist()

    user_content_profile = np.mean(np.vstack(liked_movies_vectors), axis=0).reshape(1, -1)

    # Películas no vistas
    movies_watched = watched_movies_df['movieId'].tolist()
    movies_not_watched_df = movies[~movies['id'].isin(movies_watched)]

    # Similaridad de contenido
    content_vectors_not_watched = np.vstack(movies_not_watched_df['content_vector'].tolist())
    content_similarities = cosine_similarity(user_content_profile, content_vectors_not_watched)[0]

    # Predicciones de SVD
    predictions = [best_svd.predict(user_id, mid) for mid in movies_not_watched_df['id'].tolist()]

    # Híbrido: alpha * predicted_rating + (1-alpha) * content_sim * 5
    alpha = 0.7
    hybrid_predictions = []

    for pred, content_sim in zip(predictions, content_similarities):
        final_score = alpha * pred.est + (1 - alpha) * content_sim * 5
        hybrid_predictions.append((pred, content_sim, final_score))

    hybrid_predictions.sort(key=lambda x: x[2], reverse=True)

    top_n = num_recommendations_per_user[user_id]

    print(f"\nTop {top_n} películas recomendadas para el usuario {user_id} (sistema híbrido):")
    for pred, content_sim, final_score in hybrid_predictions[:top_n]:
        movie_row = movies[movies['id'] == int(pred.iid)]
        if len(movie_row) > 0:
            movie_title = movie_row['title'].values[0]
        else:
            movie_title = f"(movieId={pred.iid}) — no metadata"
        print(f"{movie_title} → Predicted rating: {pred.est:.2f}, Content sim: {content_sim:.3f}, Final score: {final_score:.3f}")

    print("="*60)


=== Paso 1: Preparando datos ===

=== Paso 2: Cross-validation automática ===

Evaluando algoritmo: SVD
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9016  0.8974  0.8931  0.9035  0.9111  0.9014  0.0061  
MAE (testset)     0.6922  0.6922  0.6898  0.6968  0.6998  0.6942  0.0036  
Fit time          1.31    1.56    1.50    1.42    1.52    1.46    0.09    
Test time         0.05    0.05    0.06    0.05    0.05    0.05    0.00    

Evaluando algoritmo: KNNBasic
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  24 | elapsed:    2.7s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    3.9s finished



Mejores parámetros encontrados (SVD):
{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}
Mejor RMSE obtenido: 0.8962

=== Paso 4: Entrenando modelo final SVD optimizado ===

Evaluación final en testset (SVD optimizado):
RMSE: 0.8965

=== Paso 5: Guardando modelo optimizado ===
Modelo guardado como 'modelo_recomendacion_optimizado.pkl'

=== Paso 6: Demo para un usuario ===

Películas que el usuario 1 ya ha reseñado:
                            title  rating
2                    American Pie     4.0
0                       Rocky III     2.5
5            Confidentially Yours     2.5
3                        My Tutor     2.0
4  Jay and Silent Bob Strike Back     2.0
1                           Greed     1.0

Top 10 películas recomendadas para el usuario 1:
The Million Dollar Hotel → Predicted rating: 3.94
Sleepless in Seattle → Predicted rating: 3.91
Dead Man → Predicted rating: 3.91
The Good Thief → Predicted rating: 3.86
Galaxy Quest → Predicted rating: 3.85
Space Jam → 

## Sistema de recomendación Pytorch

In [None]:
# Preparar mappings para embeddings
user_id_mapping = {uid: idx for idx, uid in enumerate(ratings['userId'].unique())}
movie_id_mapping = {mid: idx for idx, mid in enumerate(ratings['movieId'].unique())}
director_id_mapping = {d: idx for idx, d in enumerate(movies['director'].unique())}



def extract_genre_names(genres_list):
    import ast
    genres_list = ast.literal_eval(genres_list) if isinstance(genres_list, str) else genres_list
    return [genre['name'] for genre in genres_list if 'name' in genre]

movies['genre_names'] = movies['genres'].apply(extract_genre_names)

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genre_names'])
# Mapping movieId → índice en genre_matrix
movieid_to_index = {mid: idx for idx, mid in enumerate(movies['id'].values)}

# Construir tu Dataset personalizado
class MovieRatingsDataset(Dataset):
    def __init__(self, ratings, movies, genre_matrix, movieid_to_index):
        self.ratings = ratings
        self.movies = movies.set_index('id')
        self.genre_matrix = genre_matrix
        self.movieid_to_index = movieid_to_index
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        row = self.ratings.iloc[idx]
        
        # Mapping userId → user_idx
        user_idx = user_id_mapping[row['userId']]
        # Mapping movieId → movie_idx
        movie_idx = movie_id_mapping[row['movieId']]
        
        # Obtener fila de movie (para director)
        movie_row = self.movies.loc[row['movieId']]

        # Forzar que sea UNA fila (Series), no DataFrame
        if isinstance(movie_row, pd.DataFrame):
            movie_row = movie_row.iloc[0]

        # Ahora sí seguro
        director_idx = director_id_mapping[movie_row['director']]

        
        # Obtener genre_vector
        genre_vector = self.genre_matrix[ self.movieid_to_index[row['movieId']] ]
        
        # Obtener director_idx
        director_idx = director_id_mapping[movie_row['director']]
        
        # Rating target
        rating = row['rating']
        
        return {
            'user_idx': torch.tensor(user_idx, dtype=torch.long),
            'movie_idx': torch.tensor(movie_idx, dtype=torch.long),
            'genre_vector': torch.tensor(genre_vector, dtype=torch.float32),
            'director_idx': torch.tensor(director_idx, dtype=torch.long),
            'rating': torch.tensor(rating, dtype=torch.float32)
        }

# Definir el MLP
class MovieRecommenderMLP(nn.Module):
    def __init__(self, n_users, n_movies, n_directors, n_genres, embedding_dim=32):
        super(MovieRecommenderMLP, self).__init__()
        
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)
        self.director_embedding = nn.Embedding(n_directors, embedding_dim)
        
        # Tamaño total del input vector
        input_dim = embedding_dim * 3 + n_genres
        
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Salida = rating esperado
        )
        
    def forward(self, user_idx, movie_idx, genre_vector, director_idx):
        user_emb = self.user_embedding(user_idx)
        movie_emb = self.movie_embedding(movie_idx)
        director_emb = self.director_embedding(director_idx)
        
        x = torch.cat([user_emb, movie_emb, genre_vector, director_emb], dim=-1)
        out = self.mlp(x)
        return out.squeeze()

# Entrenamiento
# Dataset y DataLoader
dataset = MovieRatingsDataset(ratings, movies, genre_matrix, movieid_to_index)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Modelo
torch_model = MovieRecommenderMLP(
    n_users=len(user_id_mapping),
    n_movies=len(movie_id_mapping),
    n_directors=len(director_id_mapping),
    n_genres=genre_matrix.shape[1],
    embedding_dim=32,
)

# Optimizer y Loss
optimizer = optim.Adam(torch_model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Configurar dispositivo (GPU si disponible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Mover modelo a device
torch_model = torch_model.to(device)

# Training loop con .to(device) en los batches
n_epochs = 50

for epoch in range(n_epochs):
    torch_model.train()
    epoch_loss = 0.0
    
    for batch in dataloader:
        optimizer.zero_grad()
        
        # Mover el batch a device
        user_idx = batch['user_idx'].to(device)
        movie_idx = batch['movie_idx'].to(device)
        genre_vector = batch['genre_vector'].to(device)
        director_idx = batch['director_idx'].to(device)
        rating = batch['rating'].to(device)
        
        # Forward pass
        preds = torch_model(
            user_idx,
            movie_idx,
            genre_vector,
            director_idx
        )
        
        # Calcular loss
        loss = criterion(preds, rating)
        
        # Backward pass y optimizer
        loss.backward()
        optimizer.step()
        
        # Acumular loss para logging
        epoch_loss += loss.item()
    
    # Logging por epoch
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss/len(dataloader):.4f}")

Usando dispositivo: cuda
Epoch 1/50, Loss: 1.2685
Epoch 2/50, Loss: 0.8648
Epoch 3/50, Loss: 0.7951
Epoch 4/50, Loss: 0.7511
Epoch 5/50, Loss: 0.7189
Epoch 6/50, Loss: 0.6904
Epoch 7/50, Loss: 0.6643
Epoch 8/50, Loss: 0.6386
Epoch 9/50, Loss: 0.6105
Epoch 10/50, Loss: 0.5815
Epoch 11/50, Loss: 0.5514
Epoch 12/50, Loss: 0.5185
Epoch 13/50, Loss: 0.4891
Epoch 14/50, Loss: 0.4585
Epoch 15/50, Loss: 0.4312
Epoch 16/50, Loss: 0.4033
Epoch 17/50, Loss: 0.3773
Epoch 18/50, Loss: 0.3573
Epoch 19/50, Loss: 0.3361
Epoch 20/50, Loss: 0.3141
Epoch 21/50, Loss: 0.2988
Epoch 22/50, Loss: 0.2801
Epoch 23/50, Loss: 0.2663
Epoch 24/50, Loss: 0.2525
Epoch 25/50, Loss: 0.2385
Epoch 26/50, Loss: 0.2274
Epoch 27/50, Loss: 0.2137
Epoch 28/50, Loss: 0.2041
Epoch 29/50, Loss: 0.1968
Epoch 30/50, Loss: 0.1872
Epoch 31/50, Loss: 0.1780
Epoch 32/50, Loss: 0.1704
Epoch 33/50, Loss: 0.1633
Epoch 34/50, Loss: 0.1578
Epoch 35/50, Loss: 0.1504
Epoch 36/50, Loss: 0.1441
Epoch 37/50, Loss: 0.1378
Epoch 38/50, Loss: 0.1

In [None]:
# Demo con 5 usuarios aleatorios con el MLP

print("\n=== Demo con 5 usuarios aleatorios con MLP ===")

torch_model.eval()  # poner en modo evaluación

user_ids = ratings['userId'].unique()
selected_user_ids = np.random.choice(user_ids, size=5, replace=False)

num_recommendations_per_user = {
    selected_user_ids[0]: 5,
    selected_user_ids[1]: 8,
    selected_user_ids[2]: 10,
    selected_user_ids[3]: 7,
    selected_user_ids[4]: 12
}

for user_id in selected_user_ids:
    print("\n" + "="*60)
    print(f"Usuario: {user_id} (MLP)")

    # Películas que ya ha visto
    watched_movies_df = ratings[ratings['userId'] == user_id][['movieId', 'rating']]
    watched_movies_df = watched_movies_df.merge(
        movies[['id', 'title']], left_on='movieId', right_on='id'
    ).sort_values(by='rating', ascending=False)

    print(f"\nPelículas que el usuario {user_id} ya ha reseñado:")
    print(watched_movies_df[['title', 'rating']])

    # Películas no vistas
    movies_watched = watched_movies_df['movieId'].tolist()
    movies_not_watched_df = movies[~movies['id'].isin(movies_watched)]
    # Filtrar para que solo queden pelis que están en movie_id_mapping (es decir, pelis conocidas por el modelo)
    movies_not_watched_df = movies_not_watched_df[movies_not_watched_df['id'].isin(movie_id_mapping.keys())]

    # Generamos input para el MLP
    # Generamos input para el MLP
    user_idx = torch.tensor([user_id_mapping[user_id]] * len(movies_not_watched_df), dtype=torch.long).to(device)
    movie_idx = torch.tensor([movie_id_mapping[mid] for mid in movies_not_watched_df['id']], dtype=torch.long).to(device)
    genre_vector = torch.tensor(np.vstack([genre_matrix[ movieid_to_index[mid] ] for mid in movies_not_watched_df['id']]), dtype=torch.float32).to(device)
    director_idx = torch.tensor([director_id_mapping[dir_name] for dir_name in movies_not_watched_df['director']], dtype=torch.long).to(device)


    # Forward pass → predicted ratings
    with torch.no_grad():
        predicted_ratings = torch_model(user_idx, movie_idx, genre_vector, director_idx).cpu().numpy()

    # Ordenar predicciones
    top_n = num_recommendations_per_user[user_id]
    top_indices = np.argsort(-predicted_ratings)[:top_n]

    print(f"\nTop {top_n} películas recomendadas para el usuario {user_id} (MLP):")
    for idx in top_indices:
        movie_id = movies_not_watched_df.iloc[idx]['id']
        movie_row = movies[movies['id'] == movie_id]
        if len(movie_row) > 0:
            movie_title = movie_row['title'].values[0]
        else:
            movie_title = f"(movieId={movie_id}) — no metadata"
        print(f"{movie_title} → Predicted rating (MLP): {predicted_ratings[idx]:.2f}")

    print("="*60)


=== Demo con 5 usuarios aleatorios con MLP ===

Usuario: 131 (MLP)

Películas que el usuario 131 ya ha reseñado:
                    title  rating
0   Beverly Hills Cop III     5.0
1         Rome, Open City     5.0
2                 Solaris     5.0
4                    Hulk     5.0
6         Say Anything...     5.0
7             Local Color     5.0
10     Young and Innocent     5.0
3    Bridge to Terabithia     4.0
5      Swept from the Sea     4.0
8        10 Items or Less     4.0
13      That Man from Rio     4.0
9           The Red Elvis     3.0
11              The Prize     3.0
12           Going Places     2.0

Top 5 películas recomendadas para el usuario 131 (MLP):
Design of Death → Predicted rating (MLP): 6.47
Dasepo Naughty Girls → Predicted rating (MLP): 6.43
You, Me and Dupree → Predicted rating (MLP): 6.24
End of the World → Predicted rating (MLP): 5.96
B. Monkey → Predicted rating (MLP): 5.89

Usuario: 533 (MLP)

Películas que el usuario 533 ya ha reseñado:
               

## Sistema de recomendación Lightfm

In [None]:
# ===============================
# Preparar encoding de IDs
# ===============================

user_enc = LabelEncoder()
item_enc = LabelEncoder()

ratings['user_id_enc'] = user_enc.fit_transform(ratings['userId'])
ratings['movie_id_enc'] = item_enc.fit_transform(ratings['movieId'])

user_id_map = dict(zip(user_enc.classes_, user_enc.transform(user_enc.classes_)))
movie_id_map = dict(zip(item_enc.classes_, item_enc.transform(item_enc.classes_)))
movie_id_inv_map = {v: k for k, v in movie_id_map.items()}

n_users = ratings['user_id_enc'].nunique()
n_items = ratings['movie_id_enc'].nunique()

print(f'Nº de usuarios: {n_users}, Nº de películas: {n_items}')

# ===============================
# Preparar matriz de interacciones
# ===============================

ratings['interaction'] = (ratings['rating'] >= 4).astype(float)

interactions = coo_matrix(
    (ratings['interaction'],
     (ratings['user_id_enc'], ratings['movie_id_enc'])),
    shape=(n_users, n_items)
)

# ===============================
# Preparar features de películas
# ===============================

movies['genres_str'] = movies['genres'].astype(str)
movies['keywords_str'] = movies['keywords'].astype(str)
movies['director_str'] = movies['director'].astype(str)

movies['item_features'] = (
    movies['genres_str'] + ' ' +
    movies['keywords_str'] + ' ' +
    movies['director_str']
)

vectorizer = TfidfVectorizer(max_features=1000)
item_features_matrix = vectorizer.fit_transform(movies['item_features'].fillna(''))

print(f'Matriz de item features: {item_features_matrix.shape}')

# ===============================
# Entrenar LightFM
# ===============================

lightfm_model = LightFM(loss='warp', learning_rate=0.05, no_components=64)
lightfm_model.fit(interactions, item_features=item_features_matrix, epochs=50, num_threads=4)

# ===============================
# Evaluar el modelo (métricas globales)
# ===============================

precision = precision_at_k(lightfm_model, interactions, item_features=item_features_matrix, k=10).mean()
recall = recall_at_k(lightfm_model, interactions, item_features=item_features_matrix, k=10).mean()
auc = auc_score(lightfm_model, interactions, item_features=item_features_matrix).mean()

print(f'Precision@10: {precision:.4f}')
print(f'Recall@10: {recall:.4f}')
print(f'AUC: {auc:.4f}')

# ===============================
# Mostrar historial + recomendaciones por usuario
# ===============================

def show_user_history(user_id_enc):
    user_ratings = ratings[(ratings['user_id_enc'] == user_id_enc) & (ratings['interaction'] == 1)]
    
    print(f"\nPelículas VISTAS por el usuario {user_id_enc}:\n")
    for _, row in user_ratings.iterrows():
        movie_id_original = movie_id_inv_map[row['movie_id_enc']]
        movie_info = movies[movies['id'] == movie_id_original]
        if not movie_info.empty:
            title = movie_info['title'].values[0]
            genres = movie_info['genres_str'].values[0]
            director = movie_info['director_str'].values[0]
            print(f"{title} | Géneros: {genres} | Director: {director}")

def show_user_recommendations(user_id_enc, n_recommendations=10):
    scores = lightfm_model.predict(np.repeat(user_id_enc, n_items), np.arange(n_items), item_features=item_features_matrix)
    top_items = np.argsort(-scores)

    print(f"\nPelículas RECOMENDADAS para el usuario {user_id_enc}:\n")
    for idx in top_items[:n_recommendations]:
        movie_id_original = movie_id_inv_map[idx]
        movie_info = movies[movies['id'] == movie_id_original]
        if not movie_info.empty:
            title = movie_info['title'].values[0]
            genres = movie_info['genres_str'].values[0]
            director = movie_info['director_str'].values[0]
            print(f"{title} | Géneros: {genres} | Director: {director}")

# ===============================
# Ejemplo con 5 usuarios aleatorios
# ===============================

random_users = random.sample(list(ratings['user_id_enc'].unique()), 5)

for user_id_enc in random_users:
    print('='*60)
    print(f"USUARIO {user_id_enc}")
    show_user_history(user_id_enc)
    show_user_recommendations(user_id_enc, n_recommendations=5)
    print('='*60)

Nº de usuarios: 671, Nº de películas: 2830
Matriz de item features: (46625, 1000)
Precision@10: 0.3368
Recall@10: 0.1123
AUC: 0.8503
USUARIO 177

Películas VISTAS por el usuario 177:

Scarface | Géneros: [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}] | Director: Brian De Palma
The 39 Steps | Géneros: [{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}, {'id': 9648, 'name': 'Mystery'}] | Director: Alfred Hitchcock
Terminator 3: Rise of the Machines | Géneros: [{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}, {'id': 878, 'name': 'Science Fiction'}] | Director: Jonathan Mostow
The Million Dollar Hotel | Géneros: [{'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}] | Director: Wim Wenders
Batman Returns | Géneros: [{'id': 28, 'name': 'Action'}, {'id': 14, 'name': 'Fantasy'}] | Director: Tim Burton
Solaris | Géneros: [{'id': 18, 'name': 'Drama'}, {'id': 878, 'name': 'Science Fiction'}, 

## Evuluación de lo sistemas

In [None]:

# Comparación de modelos → evaluación común

test_user_ids = selected_user_ids if 'selected_user_ids' in globals() else random_users
print(f"Comparación en usuarios: {test_user_ids}")

# --- Funciones get_top_n para cada modelo ---

def get_top_n_surprise(model, user_id, all_movie_ids, n=10):
    watched_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    movies_not_watched = [mid for mid in all_movie_ids if mid not in watched_movies]
    
    predictions = [model.predict(user_id, mid) for mid in movies_not_watched]
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    top_n = [int(pred.iid) for pred in predictions[:n]]
    return top_n

def get_top_n_lightfm(model, user_id_enc, n=10):
    scores = model.predict(np.repeat(user_id_enc, n_items), np.arange(n_items), item_features=item_features_matrix)
    top_items = np.argsort(-scores)[:n]
    # Convertir de encoded_id a movieId original
    top_n = [movie_id_inv_map[idx] for idx in top_items]
    return top_n

def get_top_n_pytorch(model, user_id, all_movie_ids, n=10):
    model.eval()
    user_idx = torch.tensor([user_id_mapping[user_id]], dtype=torch.long).to(device)
    
    # Preparamos todo en batch
    movie_idxs = torch.tensor([movie_id_mapping[mid] for mid in all_movie_ids], dtype=torch.long).to(device)
    genre_vectors_np = np.stack([genre_matrix[movieid_to_index[mid]] for mid in all_movie_ids])
    genre_vectors = torch.tensor(genre_vectors_np, dtype=torch.float32).to(device)
    director_idxs = torch.tensor(
        [director_id_mapping[movies[movies['id'] == mid]['director'].values[0]] for mid in all_movie_ids],
        dtype=torch.long
    ).to(device)
    
    user_idxs = user_idx.repeat(len(all_movie_ids))
    
    with torch.no_grad():
        preds = model(user_idxs, movie_idxs, genre_vectors, director_idxs).cpu().numpy()
    
    top_indices = np.argsort(-preds)[:n]
    top_n = [all_movie_ids[i] for i in top_indices]
    return top_n

# --- Métricas de evaluación ---

def precision_at_k(recommended, ground_truth, k=10):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(ground_truth))
    return hits / k

def recall_at_k(recommended, ground_truth, k=10):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(ground_truth))
    return hits / len(ground_truth) if ground_truth else 0

# --- Loop de evaluación ---

results = {'surprise': [], 'lightfm': [], 'pytorch': []}

# All movie ids para Surprise y LightFM
all_movie_ids = movies['id'].tolist()

# All movie ids para PyTorch → solo las que el modelo conoce
all_movie_ids_pytorch = list(movie_id_mapping.keys())

for user_id in tqdm(test_user_ids):
    # --- Ground truth ---
    # Definimos como ground truth las películas que el usuario valoró >= 4
    gt_movies = ratings[(ratings['userId'] == user_id) & (ratings['rating'] >= 4)]['movieId'].tolist()
    if len(gt_movies) == 0:
        continue  # Skip si el usuario no tiene películas con rating >= 4
    
    # --- Surprise ---
    top_surprise = get_top_n_surprise(best_svd, user_id, all_movie_ids)
    
    # --- LightFM ---
    user_id_enc = user_id_map[user_id]
    top_lightfm = get_top_n_lightfm(lightfm_model, user_id_enc)
    
    # --- PyTorch ---
    top_pytorch = get_top_n_pytorch(torch_model, user_id, all_movie_ids_pytorch)
    
    # --- Guardar métricas ---
    for model_name, top_n in [('surprise', top_surprise), ('lightfm', top_lightfm), ('pytorch', top_pytorch)]:
        prec = precision_at_k(top_n, gt_movies)
        rec = recall_at_k(top_n, gt_movies)
        results[model_name].append((prec, rec))

# --- Mostrar resultados ---

print("\n==== Resultados promedio ====")
for model_name, metrics in results.items():
    precisions = [x[0] for x in metrics]
    recalls = [x[1] for x in metrics]
    print(f"Modelo: {model_name}")
    print(f"  Precision@10: {np.mean(precisions):.4f}")
    print(f"  Recall@10:    {np.mean(recalls):.4f}")
    print("")


from sklearn.metrics import mean_squared_error
from math import sqrt

# --- RMSE para Surprise ---
print("Calculando RMSE para Surprise (best_svd)...")

# Usamos todo el set de ratings
y_true_surprise = []
y_pred_surprise = []

for row in ratings.itertuples():
    user_id = row.userId
    movie_id = row.movieId
    true_rating = row.rating
    
    pred = best_svd.predict(user_id, movie_id).est
    
    y_true_surprise.append(true_rating)
    y_pred_surprise.append(pred)

rmse_surprise = sqrt(mean_squared_error(y_true_surprise, y_pred_surprise))
print(f"RMSE (Surprise): {rmse_surprise:.4f}")

# --- RMSE para PyTorch ---
print("\nCalculando RMSE para PyTorch...")

y_true_pytorch = []
y_pred_pytorch = []

torch_model.eval()

batch_size_eval = 512  # tamaño de batch para acelerar

all_rows = ratings.reset_index(drop=True)

for start_idx in tqdm(range(0, len(all_rows), batch_size_eval)):
    end_idx = min(start_idx + batch_size_eval, len(all_rows))
    batch = all_rows.iloc[start_idx:end_idx]
    
    user_idxs = torch.tensor([user_id_mapping[uid] for uid in batch['userId']], dtype=torch.long).to(device)
    movie_idxs = torch.tensor([movie_id_mapping[mid] for mid in batch['movieId']], dtype=torch.long).to(device)
    genre_vectors_np = np.stack([genre_matrix[movieid_to_index[mid]] for mid in batch['movieId']])
    genre_vectors = torch.tensor(genre_vectors_np, dtype=torch.float32).to(device)
    director_idxs = torch.tensor(
        [director_id_mapping[movies[movies['id'] == mid]['director'].values[0]] for mid in batch['movieId']],
        dtype=torch.long
    ).to(device)
    
    with torch.no_grad():
        preds = torch_model(user_idxs, movie_idxs, genre_vectors, director_idxs).cpu().numpy()
    
    y_true_pytorch.extend(batch['rating'].tolist())
    y_pred_pytorch.extend(preds.tolist())

rmse_pytorch = sqrt(mean_squared_error(y_true_pytorch, y_pred_pytorch))
print(f"RMSE (PyTorch): {rmse_pytorch:.4f}")


Comparación en usuarios: [131 533 431  43 257]


100%|██████████| 5/5 [00:10<00:00,  2.20s/it]



==== Resultados promedio ====
Modelo: surprise
  Precision@10: 0.0000
  Recall@10:    0.0000

Modelo: lightfm
  Precision@10: 0.2600
  Recall@10:    0.1121

Modelo: pytorch
  Precision@10: 0.0000
  Recall@10:    0.0000

Calculando RMSE para Surprise (best_svd)...
RMSE (Surprise): 0.8291

Calculando RMSE para PyTorch...


100%|██████████| 88/88 [00:25<00:00,  3.43it/s]

RMSE (PyTorch): 0.2883



