In [1]:
pip install memory-profiler

Note: you may need to restart the kernel to use updated packages.


In [2]:
from scipy.sparse import csr_matrix
from collections import defaultdict
import shelve
from data_util import load_movielens_tf  # Ensure this is accessible

def create_data_structures(ratings=None, shelve_path="movielens_data.shelve"):
    """Build the item -> raters index and one sparse rating vector per user.

    Parameters:
        ratings (iterable, optional): Iterable of (user_id, movie_id, rating)
            triples. When None (the default) the triples are read from
            load_movielens_tf().
        shelve_path (str, optional): Shelf file that receives the 'rated_by'
            and 'user_col' entries. Pass None to skip writing to disk.

    Returns:
        tuple: (rated_by, user_col) where
            rated_by maps movie_id -> list of user_ids that rated the movie,
            user_col maps user_id -> csr_matrix of shape (1, num_movies) whose
            column `movie_id` holds that user's rating.
    """
    if ratings is None:
        ratings = load_movielens_tf()

    rated_by = defaultdict(list)
    user_ratings_dict = defaultdict(list)

    # Single pass over the triples; both indexes are filled incrementally.
    for user_id, movie_id, rating in ratings:
        rated_by[movie_id].append(user_id)
        user_ratings_dict[user_id].append((movie_id, rating))

    # Every rated movie is a key of rated_by, so the largest key fixes the
    # vector width. default=-1 yields a width of 0 for an empty data set
    # instead of raising on max() of an empty sequence.
    num_movies = int(max(rated_by, default=-1)) + 1

    # Convert each user's ratings to a 1 x num_movies sparse row.
    user_col = {}
    for user_id, user_ratings in user_ratings_dict.items():
        movie_ids, values = zip(*user_ratings)
        # All entries live in row 0. Repeated (user, movie) pairs are summed
        # by the COO-style constructor.
        row_ids = [0] * len(movie_ids)
        user_col[user_id] = csr_matrix(
            (values, (row_ids, movie_ids)), shape=(1, num_movies)
        )

    # Optionally save data to disk
    if shelve_path is not None:
        with shelve.open(shelve_path) as db:
            db['rated_by'] = rated_by
            db['user_col'] = user_col

    return rated_by, user_col

# Build the two lookup structures once at notebook level so later cells can
# reuse them: `rated_by` (movie -> users who rated it) and `user_col`
# (user -> sparse rating vector). The call also writes both to the shelf file.
rated_by, user_col = create_data_structures()



ModuleNotFoundError: No module named 'data_util'

In [5]:
import numpy as np

from memory_profiler import memory_usage

# Assuming user_col and rated_by are loaded from disk or created as in Exercise 4

def estimate_rating(user_id, item_id, user_col, rated_by, similarity_func):
    """
    Estimate the rating of a user for a given item using collaborative filtering.

    The estimate is the similarity-weighted average of the ratings that the
    other users gave to the item:
        sum(sim * rating) / sum(|sim|)

    Parameters:
        user_id (int): ID of the user.
        item_id (int): ID of the item.
        user_col (dict): Sparse user ratings (1 x n rows), keyed by user ID.
        rated_by (dict): Dictionary mapping each item to the users who rated it.
        similarity_func (function): Called as similarity_func(vec_a, vec_b) on
            two entries of user_col; returns their similarity.

    Returns:
        float: Estimated rating for the user on the item. NaN when the item
        has no ratings or the user has no rating vector; 0.0 when no other
        user contributes a non-zero similarity.
    """
    # No basis for an estimate: unknown item, or a user without a rating vector.
    if item_id not in rated_by or user_id not in user_col:
        return np.nan

    # Loop-invariant: the target user's vector is looked up once.
    target_vector = user_col[user_id]

    similarities = []
    weighted_ratings = []

    # Compare the target user with every *other* user who rated the item.
    for other_user in rated_by[item_id]:
        if other_user == user_id:
            continue
        other_vector = user_col[other_user]
        sim = similarity_func(target_vector, other_vector)
        rating = other_vector[0, item_id]
        similarities.append(sim)
        weighted_ratings.append(sim * rating)

    # Normalise by the absolute similarities so negative similarities do not
    # cancel out the weight mass.
    sum_similarities = np.sum(np.abs(similarities))
    if sum_similarities == 0:
        return 0.0

    return np.sum(weighted_ratings) / sum_similarities

# Provided (user_id, item_id) pairs for testing. run_tests() estimates a rating
# for every pair; calculate_memory_usage() profiles only the first six.
test_pairs = [
    (828, 11), (2400, 4725), (3765, 1270), (4299, 4020), (5526, 2432),
    (6063, 4525), (7045, 4100), (8160, 6300), (9682, 1212), (10277, 7355)
]

# Running the function on provided pairs and reporting results
def run_tests(user_col, rated_by):
    """Estimate and print a rating for every pair in `test_pairs`.

    Uses the module-level `centered_cosine_sim` as the similarity function
    (it must be defined elsewhere in the notebook before this is called).

    Parameters:
        user_col (dict): Sparse user ratings, keyed by user ID.
        rated_by (dict): Dictionary mapping each item to the users who rated it.

    Returns:
        list: One (user_id, item_id, rating) tuple per test pair, in order.
    """
    def _estimate_and_report(user_id, item_id):
        # One pair: compute the estimate, echo it, hand back the result row.
        rating = estimate_rating(user_id, item_id, user_col, rated_by, centered_cosine_sim)
        print(f"Estimated rating for user {user_id} on item {item_id}: {rating:.2f}")
        return (user_id, item_id, rating)

    return [_estimate_and_report(user_id, item_id) for user_id, item_id in test_pairs]

# Calculate memory usage for the first six pairs
def calculate_memory_usage(user_col, rated_by):
    """Print the peak sampled memory of estimate_rating for the first six test pairs.

    Each pair is profiled with memory_profiler's `memory_usage`, passing the
    call as a (function, args) tuple; the maximum of the returned samples is
    reported. Uses the module-level `centered_cosine_sim` as the similarity
    function.

    Parameters:
        user_col (dict): Sparse user ratings, keyed by user ID.
        rated_by (dict): Dictionary mapping each item to the users who rated it.
    """
    profiled_pairs = test_pairs[:6]
    for idx in range(len(profiled_pairs)):
        user_id, item_id = profiled_pairs[idx]
        call_args = (user_id, item_id, user_col, rated_by, centered_cosine_sim)
        mem_usage = memory_usage((estimate_rating, call_args))
        print(f"Memory usage for pair {idx+1} (user {user_id}, item {item_id}): {max(mem_usage):.2f} MB")

# Run the tests and calculate memory usage.
# Both calls need `user_col` and `rated_by` to already exist in the kernel
# (they are assigned from create_data_structures() earlier in the notebook).
run_tests(user_col, rated_by)
calculate_memory_usage(user_col, rated_by)

NameError: name 'user_col' is not defined