# In [None]:
# Exercise 4

import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
import shelve
from data_util import load_movielens_tf  # Import from provided GitHub repo

def create_data_structures(ratings=None, shelve_path="movielens_data.shelve"):
    """Build the ``rated_by`` and ``user_col`` lookup structures from ratings.

    Args:
        ratings: Optional iterable of ``(user_id, movie_id, rating)`` triples.
            When ``None`` (the default) the triples are read from
            ``load_movielens_tf()``.
            NOTE(review): movie ids are used directly as column indices, so
            they are assumed to be non-negative integers -- confirm against
            the loader.
        shelve_path: File name handed to ``shelve.open`` to persist both
            structures under the keys ``'rated_by'`` and ``'user_col'``.
            Pass ``None`` to skip writing to disk.

    Returns:
        A ``(rated_by, user_col)`` tuple where

        * ``rated_by`` is a ``defaultdict(list)`` mapping each movie id to the
          user ids that rated it, in the order the triples were seen;
        * ``user_col`` is a dict mapping each user id to a
          ``1 x num_movies`` ``csr_matrix`` holding that user's ratings at
          column ``movie_id``, with ``num_movies = max(movie_id) + 1``.
    """
    if ratings is None:
        ratings = load_movielens_tf()

    rated_by = defaultdict(list)           # movie_id -> [user_id, ...]
    user_ratings_dict = defaultdict(list)  # user_id -> [(movie_id, rating), ...]

    # Single pass over the (possibly lazy) rating stream.
    for user_id, movie_id, rating in ratings:
        rated_by[movie_id].append(user_id)
        user_ratings_dict[user_id].append((movie_id, rating))

    # Every movie id seen is a key of rated_by, so the vector width is the
    # largest key plus one. ``default=-1`` keeps empty input from raising
    # (the width becomes 0 and no vectors are built).
    num_movies = max(rated_by, default=-1) + 1

    # Convert each user's ratings into a single-row sparse vector.
    user_col = {}
    for user_id, user_ratings in user_ratings_dict.items():
        movie_ids, values = zip(*user_ratings)
        # All entries live in row 0; use an integer dtype for the row indices.
        row_idx = np.zeros(len(movie_ids), dtype=int)
        # NOTE: scipy sums duplicate (row, col) entries, so a repeated
        # (user, movie) pair in the input accumulates its ratings.
        user_col[user_id] = csr_matrix(
            (values, (row_idx, movie_ids)), shape=(1, num_movies)
        )

    # Optionally save data structures to disk with shelve to manage memory.
    if shelve_path is not None:
        with shelve.open(shelve_path) as db:
            db['rated_by'] = rated_by
            db['user_col'] = user_col

    return rated_by, user_col

# Build both lookup structures when this cell/module runs and bind them at
# module level as `rated_by` and `user_col`.
# NOTE: create_data_structures() reads the ratings via load_movielens_tf() and
# writes both structures to the "movielens_data.shelve" shelf as a side effect.
rated_by, user_col = create_data_structures()
print("Data structures created and stored.")