# MMD 2024, Collaborative Filtering on Google Colab
This notebook sets up the environment and runs collaborative filtering (CF) experiments on Google Colab.





In [1]:
# Select which repository the clone step will fetch.
# `pat` (access token) and `project` (repository name) are interpolated
# into the `git clone` command in the following cell.

private = False
if not private:
    # Public repository: no token needed.
    pat, project = '', 'Mining-Massive-Datasets'
else:
    # Private repository: read the token named 'github_pat' through
    # google.colab.userdata (only importable inside a Colab runtime).
    from google.colab import userdata
    pat, project = userdata.get('github_pat'), '24WS-mmd-code-priv'

In [2]:
!git clone https://{pat}@github.com/aip-hd-tea/{project}.git

Cloning into 'Mining-Massive-Datasets'...
Password for 'https://github.com': 

In [1]:
# Import the repository code
# NOTE(review): both imports below are plain top-level module names, so they
# presumably require the cloned repository's source directory to be the working
# directory or on sys.path — no such step is visible in this notebook; confirm.

import data_util as cfd

# After edits of cf_algorithms_to_complete.py:
# 1. Rename the file rec_sys.cf_algorithms_to_complete.py to rec_sys.cf_algorithms.py
#    NOTE(review): the active imports in this cell do not use a `rec_sys.` prefix,
#    while these instructions and the commented-out import do — verify which
#    module layout is current before following steps 1 and 3.
# 2. Restart the runtime (Runtime -> Restart the session); possibly not needed
# 3. Swap the comments in the next two lines, so that cf_algorithms is imported as cfa
import cf_algorithms_to_complete as cfa
#import rec_sys.cf_algorithms as cfa
# 4. Re-run all cells
# 5. If your changes are correct, you will see a long
#    printout of recommendations for MovieLens dataset
#    (printed by the cell that calls cfa.rate_all_items)

In [2]:
# Load or set the configuration
#from rec_sys.cf_config import config

import dataclasses
import os


@dataclasses.dataclass
class config:
    """Settings for downloading and loading the MovieLens ratings data.

    All fields have defaults, so the values can be read either from the class
    itself (``config.file_path``) or from an instance (``config().file_path``).

    The data directory defaults to the original hard-coded location, but can be
    redirected without editing this cell by setting the ``MMD_DATA_DIR``
    environment variable before the cell runs (e.g. to a directory that exists
    on a Colab runtime).
    """

    # Upper bound on the number of rating rows to use (100000).
    max_rows: int = int(1e5)
    # URL of the zipped MovieLens 25M dataset.
    # NOTE(review): the field name is misspelled ("dowload"); it is kept as-is
    # because code outside this notebook may read it under this name — confirm
    # before renaming.
    dowload_url: str = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    # Base data directory. os.path.join(..., "") guarantees a trailing path
    # separator so the plain string concatenations below stay valid even when
    # MMD_DATA_DIR is given without one. An empty MMD_DATA_DIR counts as unset.
    download_dir: str = os.path.join(
        os.environ.get("MMD_DATA_DIR")
        or "/Users/jan/Documents/code/Mining-Massive-Datasets/",
        "",
    )
    # Directory of the unpacked archive and the ratings file inside it; both
    # are derived from the default value of download_dir at class-definition time.
    unzipped_dir: str = download_dir + "ml-25m/"
    file_path: str = download_dir + "ml-25m/ratings.csv"


In [3]:
# Arguments passed positionally to cfa.rate_all_items below; the function's log
# output labels them "user_index" and "neighborhood_size".
NEIGHBORHOOD_SIZE = 2
LECTURE_USER_INDEX = 4
MOVIELENS_USER_INDEX = 0

# Build the utility matrices: MovieLens first, then the small lecture example.
um_movielens = cfd.get_um_by_name(config, "movielens")
um_lecture = cfd.get_um_by_name(config, "lecture_1")

# Lecture toy dataset: rate every item for the chosen user and show the result.
all_ratings = cfa.rate_all_items(
    um_lecture, LECTURE_USER_INDEX, NEIGHBORHOOD_SIZE
)
print("all_ratings lecture toy dataset:", all_ratings)

# MovieLens: same computation on the larger utility matrix.
all_ratings_movielens = cfa.rate_all_items(
    um_movielens, MOVIELENS_USER_INDEX, NEIGHBORHOOD_SIZE
)
print("all_ratings_movielens:", all_ratings_movielens)

Dir '/Users/jan/Documents/code/Mining-Massive-Datasets/ml-25m/' already exists, skipping download

### Start reading data from '/Users/jan/Documents/code/Mining-Massive-Datasets/ml-25m/ratings.csv'
Loaded data from '/Users/jan/Documents/code/Mining-Massive-Datasets/ml-25m/ratings.csv', df shape: (100000, 3), size in MB: 1.1444091796875 
Pivoting the data
Utility matrix, df shape: (9786, 757), size in MB: 29.142929077148438 
Final utility matrix (numpy array as np.float32), df shape: (9786, 757), size in MB: 28.25928497314453 

>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

item_idx: 0, neighbors: [5 2], rating: -0.4010036592543246
item_idx: 1, neighbors: [2 3], rating: 4.27917451131852
all_ratings lecture toy dataset: [np.float64(-0.4010036592543246), np.float64(4.27917451131852), np.float64(2.0), np.float64(5.0), np.float64(4.0), np.float64(3.0)]

>>> CF computation for UM w/ shape: (9786, 757), user_index: 0, neighborhood_size: 2

item_idx: 70, neigh

  um_normalized = utility_matrix / norms
  rating_of_item = np.dot(sim_vals, rating_neighbors) / np.sum(


item_idx: 2371, neighbors: [30], rating: 2.5
item_idx: 2372, neighbors: [30], rating: 2.5
item_idx: 2373, neighbors: [101  30], rating: 1.3445560932159424
item_idx: 2374, neighbors: [30], rating: 2.5
item_idx: 2375, neighbors: [101  30], rating: 1.1080639362335205
item_idx: 2376, neighbors: [408 547], rating: 2.6943817138671875
item_idx: 2377, neighbors: [547 631], rating: 3.5613479614257812
item_idx: 2378, neighbors: [30], rating: 3.500000238418579
item_idx: 2379, neighbors: [30], rating: 3.0
item_idx: 2380, neighbors: [ 30 607], rating: 3.3807270526885986
item_idx: 2381, neighbors: [30], rating: 3.0
item_idx: 2382, neighbors: [ 37 678], rating: 2.4837138652801514
item_idx: 2383, neighbors: [ 30 317], rating: 2.5
item_idx: 2384, neighbors: [ 56 317], rating: 3.6716763973236084
item_idx: 2385, neighbors: [30], rating: 2.0
item_idx: 2386, neighbors: [476 317], rating: 2.866605520248413
item_idx: 2387, neighbors: [206  30], rating: 1.1856337785720825
item_idx: 2388, neighbors: [713 317],

In [None]:
# Load the two lookup structures returned by the MovieLens preparation helper.
# NOTE(review): load_and_prepare_movielens_data is not defined or imported in
# the visible cells — confirm where it comes from before running this cell.
rated_by, user_col = load_and_prepare_movielens_data()

# Preview only the first five (key, value) entries of each structure; both are
# expected to expose .items() (dict-like).
for label, mapping in (("rated_by:", rated_by), ("user_col:", user_col)):
    print(label, list(mapping.items())[:5])

In [None]:
# Estimate ratings for a few hand-picked index pairs.
# NOTE(review): this cell is a template and presumably not runnable as written:
#   - `csr_matrix(...)` passes a literal Ellipsis as a placeholder; replace it
#     with the real sparse utility matrix before running.
#   - `csr_matrix` and `estimate_ratings_for_pairs` are not imported or defined
#     in the visible cells — confirm their source.
#   - the meaning/order of the two numbers in each pair is not shown here
#     (presumably user and item identifiers — TODO confirm).
pairs = [(828, 11), (2400, 4725), (3765, 1270)]
utility_matrix = csr_matrix(...)  # Placeholder: load the sparse utility matrix here
print("Estimated Ratings:", estimate_ratings_for_pairs(pairs, utility_matrix, neighborhood_size=5))