In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle

# Paths
PROCESSED_DIR = "../data/processed"
ratings_file = os.path.join(PROCESSED_DIR, "ratings_processed.csv")
movies_file = os.path.join(PROCESSED_DIR, "movies_processed.csv")
sparse_file = os.path.join(PROCESSED_DIR, "user_item_matrix.pkl")


In [4]:
# Load preprocessed data
ratings = pd.read_csv(ratings_file)
movies = pd.read_csv(movies_file)

# Load sparse user-item matrix
with open(sparse_file, "rb") as f:
    user_item_matrix = pickle.load(f)

print("Sparse matrix shape:", user_item_matrix.shape)


Sparse matrix shape: (943, 1682)


ML-100K is small, so dense is fine. For large datasets, use sparse similarity or approximate nearest neighbors.

In [5]:
# Compute cosine similarity between users
# Convert sparse matrix to dense temporarily for similarity (small dataset)
user_sim_matrix = cosine_similarity(user_item_matrix)

print("User similarity matrix shape:", user_sim_matrix.shape)


User similarity matrix shape: (943, 943)


In [6]:
def predict_ratings_user_cf(user_idx, top_n=10, k=5):
    """
    Predict ratings for a user based on top-K similar users.
    """
    # Get similarity scores for this user
    sim_scores = user_sim_matrix[user_idx]
    
    # Find top-K similar users
    top_k_users = np.argsort(sim_scores)[-k-1:-1][::-1]  # exclude self
    
    # Ratings of top-K users
    top_ratings = user_item_matrix[top_k_users].toarray()
    
    # Similarity weights
    top_sim = sim_scores[top_k_users].reshape(-1,1)
    
    # Weighted average
    pred_ratings = (top_ratings * top_sim).sum(axis=0) / (top_sim.sum() + 1e-8)
    
    # Mask already rated movies
    user_rated = user_item_matrix[user_idx].toarray().flatten() > 0
    pred_ratings[user_rated] = 0
    
    # Get top-N recommendations
    top_movie_idx = np.argsort(pred_ratings)[-top_n:][::-1]
    recommendations = movies[movies["movie_idx"].isin(top_movie_idx)]
    
    return recommendations[["movie_idx", "title"]].head(top_n)


In [7]:
# Example: Recommend top 10 movies for user_idx 5
recommendations = predict_ratings_user_cf(user_idx=5, top_n=10, k=5)
recommendations


Unnamed: 0,movie_idx,title
63,239,"Shawshank Redemption, The (1994)"
116,140,"Rock, The (1996)"
190,254,Amadeus (1984)
215,166,When Harry Met Sally... (1989)
233,29,Jaws (1975)
514,58,"Boot, Das (1981)"
519,230,"Great Escape, The (1963)"
565,476,Clear and Present Danger (1994)
658,488,Arsenic and Old Lace (1944)
662,550,Being There (1979)
