# Import preprocessed data

In [1]:
from tqdm import tqdm
import numpy as np
import mmh3
from src.structures import User, Movie
from src.data_methods import read_movies,read_viewers
import kagglehub
import os

In [2]:
# Download latest version
#please ikke offentliggør min api key. 
os.environ['KAGGLE_USERNAME'] = "marcusgaleajacobsen"
os.environ['KAGGLE_KEY'] = "32a3003f52c97053841ea46c492128dc"
datapath = kagglehub.dataset_download("netflix-inc/netflix-prize-data")

In [4]:
n_lines = 10000 #number of reviews to read
datafiles = ["combined_data_1.txt"]#, "combined_data_2.txt", "combined_data_3.txt", "combined_data_4.txt"]
with_tqdm = False #set to True to see progress bar (reduce speed)

movies = read_movies(datapath)
users = read_viewers(datapath, movies, datafiles = datafiles, with_tqdm= with_tqdm, n_lines=n_lines) #read only 100000 

RuntimeError: dictionary changed size during iteration

# Compute Signature matrix

In [None]:

def compute_minhashes(
                    object: Movie | User, #movie or user to compute minhashes for
                    n_hash = 100 #number of hashes
                    ):
    
    bag = object.bag_ratings() #bag of ratings
    hashes = np.array([[mmh3.hash(id, seed) for id in bag] for seed in range(n_hash)]) #hashes for each seed
    minhashes = np.min(hashes, axis = 1) #minhashes for each seed
    return minhashes

def compute_signatures(
                    objets: dict[str, Movie | User], #objects to compute signatures for
                    n_hash = 100, #number of hashes
                    with_tqdm = True #whether to show progress bar
                    ):
    iterator = tqdm(objets.items()) if with_tqdm else objets.items() #iterator
    signatures = {id: compute_minhashes(obj, n_hash) for id, obj in iterator}
    return signatures

def bucket_hash(signatures, n_buckets = 100):
    bucket = 0
    for signature in signatures:
        bucket = bucket ^ hash(signature) #xor of all signatures
    return bucket % n_buckets #modulo n_buckets

cartesian_product_exclude_same = lambda A,B : set((a,b) for a in A for b in B if (a != b and b > a))

In [None]:
n_hash = 100    
user = users["1488844"]
minhashes = compute_minhashes(user, n_hash)
minhashes

array([-1810453357, -1570063170,  -184002522,  -126235597,  -172315920,
        -346150140, -2131240906,  -793765815, -1571785578,  -977649233,
        -535678046, -1457417976, -1562019656,   101162235, -1560298715,
        -956861631,   168904549,   790285535, -2070529337,   625771924,
        -249096934,   166839407, -1447133418, -1508133614,  -721831573,
       -1056278211, -2114645946,  -381543989,     2653644,  -243094001,
       -1200734897, -1428275679,  1687665648,  -677418915, -1473849242,
        -248996816,  -893062816,  -576892624,  1653519337, -1799033939,
       -1549378093, -1625405734, -1868994069, -1367157416,  1017076005,
       -1384113491,  -310537956, -1527923987, -1384278664,   427621935,
         369715913, -1778312296,   436922359,  -868451072,  -383727986,
        -532758649, -2127377514,  -701845473,   559389694, -1995129985,
         517183813,  1120606309,  -552741755,  -943591880, -1871239471,
       -1987538483,  -818748851, -1542113122, -1668779179,   290

In [None]:

def create_bucket_matrix(signatures: dict[str, np.ndarray], n_buckets = 100, bands = 10):
    r = len(signatures) // bands
    buckets = np.zeros((len(signatures), bands), dtype = int)
    i = 0
    for _, signature in signatures.items(): #for each object
        for j in range(bands): #for each band
            buckets[i,j] = bucket_hash(signature[j*r:(j+1)*r], n_buckets=n_buckets)
        i += 1
    return buckets

In [None]:
n_hashes = 1000
b = 20
r = n_hashes // b
n_buckets = 10000

SIG = compute_signatures(users, n_hashes, with_tqdm = True)
buckets = create_bucket_matrix(SIG, n_buckets, bands = b)

  1%|▏         | 136/9619 [00:00<00:19, 475.26it/s]

100%|██████████| 9619/9619 [00:17<00:00, 565.80it/s]


In [None]:
def find_candidates(buckets, users, max_size = len(users)):
    #buckets is a matrix of size (n_users, bands)
    ids = np.array(list(users.keys()))
    candidates = set()
    for band in buckets.T:
        #find collisions
        unique = np.unique(band)
        for bucket_value in tqdm(unique):
            idx = np.where(bucket_value == band)[0]
            if len(idx) < max_size:
                new_candidates = cartesian_product_exclude_same(ids[idx], ids[idx])
                candidates = candidates.union(new_candidates)
    return candidates


In [None]:
#find_candidates(buckets, users)

100%|██████████| 55/55 [00:59<00:00,  1.07s/it] 
100%|██████████| 55/55 [02:55<00:00,  3.20s/it]
 51%|█████     | 28/55 [01:38<01:34,  3.50s/it]


KeyboardInterrupt: 

In [None]:
band = buckets.T[0]
unique = np.unique(band)
unique

array([ 213,  918, 1331, 1336, 1445, 1513, 1591, 1729, 1766, 2029, 2221,
       2290, 2395, 2491, 2532, 2622, 2626, 3197, 4340, 4381, 4620, 4960,
       5216, 5418, 6044, 6248, 6328, 6361, 6425, 6474, 6806, 6823, 6910,
       6986, 6996, 7183, 7479, 7513, 7544, 7627, 7711, 8133, 8265, 8290,
       8358, 8688, 8750, 8957, 9132, 9499, 9570, 9677, 9801, 9861, 9971])