In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score

In [2]:
# Load the raw ratings table (one row per rating event) from the working directory.
# NOTE(review): the recorded preview shows an `Unnamed: 0` column, which suggests
# the CSV was written together with its index -- consider `index_col=0`; confirm.
ratings = pd.read_csv('ratings.csv')
# Preview the first five rows as the cell's rich output.
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,52,133,4,2025-01-01 00:00:00
1,93,146,3,2025-01-01 06:00:00
2,15,107,2,2025-01-01 12:00:00
3,72,139,4,2025-01-01 18:00:00
4,61,148,4,2025-01-02 00:00:00


In [3]:
# Reshape the long ratings table into a user x movie grid: one row per user_id,
# one column per movie_id, cells holding the rating. Pairs with no rating are NaN;
# repeated (user, movie) pairs are combined by pivot_table's default aggregation.
user_movie_matrix = pd.pivot_table(
    ratings,
    index='user_id',
    columns='movie_id',
    values='rating',
)

# Dense copy in which "not rated" is encoded as 0, so it can be used for
# vector arithmetic. The NaN version above is kept to tell rated from unrated.
user_movie_matrix_filled = user_movie_matrix.fillna(value=0)

print("Matriz usuario-película creada con dimensiones:", user_movie_matrix_filled.shape)
user_movie_matrix_filled.head(n=5)

Matriz usuario-película creada con dimensiones: (100, 60)


movie_id,100,101,102,103,104,105,106,107,108,109,...,150,151,152,153,154,155,156,157,158,159
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,1.5,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
2,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,3.0,0.0,0.0,3.0,0.0,4.0,0.0
3,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
5,0.0,0.0,3.0,0.0,3.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0


In [4]:
# Pairwise cosine similarity between the users' (zero-filled) rating rows.
user_similarity = cosine_similarity(user_movie_matrix_filled)

# Same values, labelled with user_id on both axes for lookup by user.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_movie_matrix_filled.index,
    columns=user_movie_matrix_filled.index,
)


In [5]:

# Raw ndarray view of the zero-filled user x movie ratings.
ratings_matrix = user_movie_matrix_filled.to_numpy()

# Similarity-weighted average of all users' ratings for every (user, movie):
# numerator   = similarity-weighted sum of ratings,
# denominator = per-user sum of absolute similarities, kept as a column vector
#               so the division broadcasts across the movie axis.
# NOTE(review): unrated entries take part as 0 and each user's own row is part
# of the weighted sum, so scores are not on the original rating scale.
predicted_ratings = np.dot(user_similarity, ratings_matrix) / np.abs(user_similarity).sum(
    axis=1, keepdims=True
)

# Label the predictions with the same user_id rows / movie_id columns as the input.
predicted_ratings_df = pd.DataFrame(
    data=predicted_ratings,
    index=user_movie_matrix_filled.index,
    columns=user_movie_matrix_filled.columns,
)


In [6]:
# User for whom the recommendations are produced.
user_id = 52

# Observed ratings (NaN where the user has not rated the movie) and the
# model's scores for the same user.
user_real_ratings = user_movie_matrix.loc[user_id]
user_pred_ratings = predicted_ratings_df.loc[user_id]

# Movies this user has no rating for -- the only candidates to recommend.
unwatched = user_real_ratings.loc[user_real_ratings.isna()]

# Score the candidates, order from best to worst, keep the best five.
candidate_scores = user_pred_ratings.loc[unwatched.index]
top_recommendations = candidate_scores.sort_values(ascending=False).iloc[:5]
del candidate_scores  # temporary only; do not leave an extra name in the kernel

print(f"\nRecomendaciones para el usuario {user_id}:\n")
for movie_id, pred_rating in top_recommendations.items():
    print(f"    Película {movie_id} (rating predicho = {pred_rating:.2f})")



Recomendaciones para el usuario 52:

    Película 125 (rating predicho = 0.74)
    Película 122 (rating predicho = 0.69)
    Película 153 (rating predicho = 0.68)
    Película 150 (rating predicho = 0.68)
    Película 158 (rating predicho = 0.59)


In [21]:
def precision_recall_at_k(actual, predicted, k=5, threshold=3.5):
    """Compute precision@k and recall@k for a single user.

    Items are identified by their position in the two aligned sequences.
    An item is *relevant* when its actual rating is >= ``threshold``; the
    *recommended* items are the ``k`` positions with the highest predicted
    score.

    Parameters
    ----------
    actual : array-like
        Observed ratings, one entry per item.
    predicted : array-like
        Predicted scores, aligned position-by-position with ``actual``.
    k : int, default 5
        Number of top-scored items treated as recommended. Values <= 0 mean
        that nothing is recommended.
    threshold : float, default 3.5
        Minimum actual rating for an item to count as relevant.

    Returns
    -------
    tuple
        ``(precision, recall)``. Precision is 0.0 when nothing is
        recommended; recall is 0.0 when no item is relevant.
    """
    # Accept plain lists / Series as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Slicing with [-k:] is wrong for k == 0 (``[-0:]`` selects *every* item)
    # and for negative k, so rank in descending order and take the first k.
    k = max(int(k), 0)
    ranked_items = np.argsort(predicted)[::-1]
    recommended = set(ranked_items[:k].tolist())

    relevant = set(np.where(actual >= threshold)[0].tolist())
    hits = relevant & recommended

    precision = len(hits) / len(recommended) if recommended else 0.0
    recall = len(hits) / len(relevant) if relevant else 0.0
    return precision, recall


# Per-user precision@5 / recall@5, collected in user order.
# NOTE(review): `actual` comes from the same zero-filled matrix that was used to
# build `predicted_ratings_df`, so these are in-sample figures (no held-out split).
precisions, recalls = [], []

for user_id in user_movie_matrix.index:
    # Aligned 1-D arrays over the movie columns for this user.
    actual = user_movie_matrix_filled.loc[user_id].to_numpy()
    predicted = predicted_ratings_df.loc[user_id].to_numpy()

    precision, recall = precision_recall_at_k(actual, predicted, k=5)
    precisions.append(precision)
    recalls.append(recall)

# Macro-average across users (every user weighs the same).
precision = np.mean(precisions)
recall = np.mean(recalls)

print(f"Precision@5 del modelo: {precision:.3f}")
print(f"Recall@5 del modelo: {recall:.3f}")


Precision@5 del modelo: 0.584
Recall@5 del modelo: 0.834


- Si se usara factorización matricial (como ALS o SVD), los resultados probablemente mejorarían porque estos métodos descomponen la matriz usuario–película en factores que capturan patrones ocultos de preferencia. Esto permite predecir mejor los ratings incluso cuando hay pocos datos, reduciendo el impacto de la dispersión y mejorando la precisión y el recall. Además, los modelos de factorización generalizan mejor que los basados en similitud directa, aunque requieren ajustar parámetros como el número de factores y la regularización, y no resuelven del todo el problema, visto en clase, de los usuarios o ítems nuevos (cold-start).

- Una alternativa de uso para la información obtenida y las recomendaciones sería integrarlas en un sistema de personalización dentro de una plataforma (por ejemplo, un servicio de streaming o una tienda online) para mejorar la experiencia del usuario mostrando contenido relevante según sus gustos. Además, los datos pueden servir para analizar patrones de comportamiento, segmentar usuarios por tipo de preferencia y diseñar estrategias de marketing más efectivas, como promociones o sugerencias específicas para cada grupo.