In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
import os

# Location of the preprocessed artefacts (a relative path, so it is resolved
# against the current working directory).
PROCESSED_DIR = "../data/processed"

# Full path of the preprocessed ratings CSV inside that directory.
ratings_file = os.path.join(
    PROCESSED_DIR,
    "ratings_processed.csv",
)


In [3]:
ratings = pd.read_csv(ratings_file)

# Fail fast if the columns used to build the user-item matrix are absent,
# instead of hitting a KeyError further down the notebook.
required_columns = {"rating", "user_idx", "movie_idx"}
missing_columns = required_columns - set(ratings.columns)
if missing_columns:
    raise ValueError(
        f"{ratings_file} is missing required columns: {sorted(missing_columns)}"
    )

# Show every available column (extra columns beyond the required ones,
# e.g. 'userId', 'movieId', 'timestamp', 'datetime', are fine), then preview
# the first rows.
print(ratings.columns)
ratings.head()


Index(['userId', 'movieId', 'rating', 'timestamp', 'user_idx', 'movie_idx',
       'datetime'],
      dtype='object')


Unnamed: 0,userId,movieId,rating,timestamp,user_idx,movie_idx,datetime
0,196,242,3.0,881250949,0,0,1997-12-04 15:55:49
1,186,302,3.0,891717742,1,1,1998-04-04 19:22:22
2,22,377,1.0,878887116,2,2,1997-11-07 07:18:36
3,244,51,2.0,880606923,3,3,1997-11-27 05:02:03
4,166,346,1.0,886397596,4,4,1998-02-02 05:33:16


In [4]:
# csr_matrix sums the values of repeated (row, col) coordinates, so a duplicated
# (user_idx, movie_idx) pair would silently turn into an inflated "rating".
# Reject such input explicitly instead of building a corrupted matrix.
n_duplicate_pairs = int(ratings.duplicated(subset=["user_idx", "movie_idx"]).sum())
if n_duplicate_pairs:
    raise ValueError(
        f"Found {n_duplicate_pairs} duplicate (user_idx, movie_idx) pairs; "
        "deduplicate the ratings before building the matrix."
    )

# Matrix dimensions. The indices are used directly as row/column positions, so
# each dimension must be (largest index + 1). For contiguous zero-based indices
# this equals the number of distinct users/movies; unlike nunique(), it also
# stays valid when some index values are missing from the file.
n_users = int(ratings["user_idx"].max()) + 1
n_movies = int(ratings["movie_idx"].max()) + 1

# Build CSR sparse matrix: rows are users, columns are movies, values are ratings
user_item_matrix = csr_matrix(
    (ratings["rating"], (ratings["user_idx"], ratings["movie_idx"])),
    shape=(n_users, n_movies)
)

print("Sparse matrix shape:", user_item_matrix.shape)


Sparse matrix shape: (943, 1682)


In [5]:
# Share of the user x movie grid that holds a stored entry
total_cells = n_users * n_movies
density = user_item_matrix.nnz / total_cells

# Sparsity is the complementary share, expressed as a percentage
sparsity = 100 * (1 - density)
print(f"Sparsity of the matrix: {sparsity:.2f}%")


Sparsity of the matrix: 93.70%


In [6]:
import pickle

# Serialize the sparse user-item matrix into the processed-data directory.
# NOTE: pickle files should only ever be loaded back from trusted locations.
sparse_file = os.path.join(PROCESSED_DIR, "user_item_matrix.pkl")
with open(sparse_file, mode="wb") as handle:
    pickle.dump(user_item_matrix, handle)

print(f"Sparse matrix saved at {sparse_file}")


Sparse matrix saved at ../data/processed/user_item_matrix.pkl
