# Import preprocessed data

In [31]:
from tqdm import tqdm
import numpy as np
import mmh3
from src.structures import User, Movie
from src.data_methods import read_movies,read_viewers
import kagglehub
import os

In [32]:
# Download the latest version of the Netflix Prize dataset.
# SECURITY: Kaggle credentials must never be written into the notebook.
# Provide them outside the source instead: export KAGGLE_USERNAME and
# KAGGLE_KEY before starting the kernel, place a kaggle.json file in
# ~/.kaggle/, or call kagglehub.login() interactively.
# NOTE(review): a real API key used to be hardcoded in this cell; it must be
# revoked and regenerated on Kaggle, and scrubbed from version-control history.
datapath = kagglehub.dataset_download("netflix-inc/netflix-prize-data")

In [65]:
# --- Loading configuration -------------------------------------------------
# Number of reviews to read (np.inf presumably means "no limit" -- confirm in read_viewers).
n_lines = np.inf
# Rating files to load; the remaining parts of the dataset are disabled for now.
datafiles = [
    "combined_data_1.txt",
    # "combined_data_2.txt",
    # "combined_data_3.txt",
    # "combined_data_4.txt",
]
# Show a progress bar while reading (slows reading down).
with_tqdm = True
# Number of reviews per user to read; currently NOT passed to read_viewers.
reviews_pr_user = 10

# --- Load movies and users --------------------------------------------------
movies = read_movies(datapath)
users = read_viewers(
    datapath,
    movies,
    datafiles=datafiles,
    with_tqdm=with_tqdm,
    n_lines=n_lines,
)

100%|██████████| 24058263/24058263 [02:29<00:00, 160948.52it/s]


In [70]:
# Drop every user who has rated fewer than MIN_MOVIES_PER_USER movies.
MIN_MOVIES_PER_USER = 100
# Collect the ids first: entries cannot be deleted from the dict while iterating over it.
not_enough_movies = [
    uid for uid, profile in users.items() if profile.n_watched < MIN_MOVIES_PER_USER
]
for user_id in not_enough_movies:
    del users[user_id]

In [71]:
# Number of users currently held in `users`.
len(users)

71241

In [69]:
# Display the whole `users` mapping (the repr is very long for the full dataset).
users

defaultdict(src.structures.User,
            {'1488844': User(1488844) has rated 529 movies,
             '822109': User(822109) has rated 36 movies,
             '885013': User(885013) has rated 87 movies,
             '30878': User(30878) has rated 309 movies,
             '823519': User(823519) has rated 157 movies,
             '893988': User(893988) has rated 142 movies,
             '124105': User(124105) has rated 15 movies,
             '1248029': User(1248029) has rated 338 movies,
             '1842128': User(1842128) has rated 38 movies,
             '2238063': User(2238063) has rated 10 movies,
             '1503895': User(1503895) has rated 26 movies,
             '2207774': User(2207774) has rated 149 movies,
             '2590061': User(2590061) has rated 173 movies,
             '2442': User(2442) has rated 118 movies,
             '543865': User(543865) has rated 199 movies,
             '1209119': User(1209119) has rated 86 movies,
             '804919': User(804919) 

# Compute Signature matrix

In [36]:

def compute_minhashes(
                    object: Movie | User,
                    n_hash = 100
                    ):
    """Compute one min-hash value per seed for a movie or a user.

    Parameters
    ----------
    object : Movie | User
        Object whose ``bag_ratings()`` result is hashed.
    n_hash : int
        Number of hash functions; seeds ``0 .. n_hash - 1`` are used.

    Returns
    -------
    np.ndarray
        For each seed, the smallest ``mmh3.hash`` value over the bag.
    """
    items = object.bag_ratings()
    # One row per seed, one column per item in the bag.
    hash_rows = []
    for seed in range(n_hash):
        hash_rows.append([mmh3.hash(item, seed) for item in items])
    # The min-hash for a seed is the minimum of its row.
    return np.asarray(hash_rows).min(axis = 1)

def compute_signatures(
                    objets: dict[str, Movie | User],
                    n_hash = 100,
                    with_tqdm = True
                    ):
    """Compute the min-hash signature of every object in a mapping.

    Parameters
    ----------
    objets : dict[str, Movie | User]
        Mapping from id to the object to compute a signature for.
    n_hash : int
        Number of hash functions, forwarded to ``compute_minhashes``.
    with_tqdm : bool
        Whether to wrap the iteration in a tqdm progress bar.

    Returns
    -------
    dict
        Mapping from id to the array returned by ``compute_minhashes``.
    """
    entries = objets.items()
    if with_tqdm:
        entries = tqdm(entries)
    signatures = {}
    for key, obj in entries:
        signatures[key] = compute_minhashes(obj, n_hash)
    return signatures

def bucket_hash(signatures, n_buckets = 100):
    """Map a sequence of hash values to a bucket index.

    All values are folded together with bitwise XOR and the result is reduced
    modulo ``n_buckets``. An empty sequence yields ``0``.
    """
    folded = 0
    for value in signatures:
        folded ^= value
    return folded % n_buckets

def cartesian_product_exclude_greater(A, B):
    """Return the set of pairs ``(a, b)`` from ``A x B`` for which ``b > a``.

    Calling it with the same collection twice yields every unordered pair of
    distinct values exactly once, ordered by the elements' own ``>`` comparison.
    """
    pairs = set()
    for a in A:
        for b in B:
            if b > a:
                pairs.add((a, b))
    return pairs

In [37]:
from collections import defaultdict

def create_buckets(signatures: dict[str, np.ndarray], n_buckets = 100, bands = 10):
    """Distribute signatures into LSH buckets, one bucket table per band.

    Each signature is cut into ``bands`` consecutive slices of
    ``r = len(signature) // bands`` rows. Every slice is hashed with
    ``bucket_hash`` and the object's id is appended to that bucket of the
    corresponding band.

    Parameters
    ----------
    signatures : dict[str, np.ndarray]
        Mapping from id to signature. The band width is derived from the first
        signature, so all signatures are assumed to have the same length.
    n_buckets : int
        Number of buckets per band (modulus passed to ``bucket_hash``).
    bands : int
        Number of bands.

    Returns
    -------
    list[defaultdict[list]]
        One ``bucket index -> list of ids`` mapping per band. For empty
        ``signatures`` every mapping is empty.

    Notes
    -----
    If the signature length is not a multiple of ``bands``, the trailing
    ``len(signature) % bands`` rows are not used.
    """
    buckets = [defaultdict(list) for _ in range(bands)]
    if not signatures:
        # Nothing to bucket; avoid indexing into an empty mapping.
        return buckets

    # Peek at one signature without materializing the whole key list.
    first_signature = next(iter(signatures.values()))
    r = len(first_signature) // bands  # number of rows in each band

    for key, signature in signatures.items():
        for band, band_buckets in enumerate(buckets):
            start = band * r
            bucket_index = bucket_hash(signature[start:start + r], n_buckets)
            band_buckets[bucket_index].append(key)
    return buckets

In [38]:
# Signature length: number of seeded hash functions applied to each user.
n_hashes = 20
# Provisional bucket count; the bucketing cell reassigns n_buckets before create_buckets is called.
n_buckets = len(users) ** 2

# Min-hash signature for every user, keyed by user id.
SIG = compute_signatures(users, n_hash=n_hashes, with_tqdm=True)

100%|██████████| 9619/9619 [00:00<00:00, 33062.82it/s]


In [39]:
# Number of bands the signature is split into.
b = 5
# Number of buckets per band (2**16 = 65536).
n_buckets = 1 << 16
buckets = create_buckets(SIG, n_buckets=n_buckets, bands=b)

In [47]:
def get_candidates(buckets):
    """Collect candidate pairs: ids that share a bucket in at least one band.

    Parameters
    ----------
    buckets : iterable of dict
        One ``bucket index -> list of ids`` mapping per band.

    Returns
    -------
    set
        Pairs ``(a, b)`` with ``b > a`` of ids that landed in the same bucket.
        The set removes pairs that collide in more than one band.
    """
    pairs = set()
    for band_buckets in tqdm(buckets):
        for members in tqdm(band_buckets.values()):
            # Every pair of distinct ids within one bucket is a candidate.
            pairs |= cartesian_product_exclude_greater(members, members)
    return pairs

In [None]:
# Candidate pairs of user ids that share at least one bucket.
candidates = get_candidates(buckets)

100%|██████████| 35/35 [00:05<00:00,  5.99it/s]
100%|██████████| 39/39 [00:07<00:00,  5.41it/s]
100%|██████████| 41/41 [00:08<00:00,  5.11it/s]
100%|██████████| 46/46 [00:07<00:00,  5.90it/s]
100%|██████████| 30/30 [00:07<00:00,  3.80it/s]
100%|██████████| 5/5 [00:36<00:00,  7.36s/it]


In [54]:
# Display the full candidate set (pairs of user-id strings).
candidates

{('349743', '926783'),
 ('1346990', '519221'),
 ('2188965', '900245'),
 ('1389413', '2508922'),
 ('123384', '174868'),
 ('1916951', '22663'),
 ('1911545', '2400083'),
 ('2535468', '2604492'),
 ('1799139', '2215820'),
 ('157631', '1826063'),
 ('283935', '618288'),
 ('1222552', '2490359'),
 ('115485', '2435631'),
 ('2023397', '2153864'),
 ('148951', '1747893'),
 ('2009440', '2381457'),
 ('1538468', '2218535'),
 ('1322061', '494279'),
 ('1988873', '2216429'),
 ('1577913', '829060'),
 ('1035499', '1150238'),
 ('1506080', '186848'),
 ('1988095', '513210'),
 ('1436857', '285703'),
 ('1668623', '410537'),
 ('1002367', '1432984'),
 ('2373537', '277762'),
 ('1291962', '625428'),
 ('1911746', '750136'),
 ('2268573', '443193'),
 ('1911161', '409948'),
 ('1127447', '280585'),
 ('2600762', '464029'),
 ('1136807', '540794'),
 ('1063873', '1242697'),
 ('335931', '837693'),
 ('1547173', '684876'),
 ('1713599', '198309'),
 ('1313592', '1785248'),
 ('169906', '2426587'),
 ('1794722', '2375712'),
 ('7816

In [55]:
# Count the candidate pairs and the total number of unordered user pairs,
# the binomial coefficient C(n_users, 2).
from math import comb
n_candidates = len(candidates)  # number of candidate pairs
n_users = len(users)
n_pairs = comb(n_users, 2)  # all possible unordered pairs of users
n_pairs

46257771

In [56]:
# Band layout: b bands of r rows each.
r = n_hashes // b  # rows (hash values) per band
# Similarity threshold (1/b)^(1/r) derived from the band layout; it is the
# cut-off passed to trim_candidates.
t = (1/b)**(1/r)
def trim_candidates(users: dict[str, User | Movie], candidates: set, threshold = 0.5):
    """Verify candidate pairs with the Jaccard similarity and record neighbors.

    For every pair in ``candidates`` the similarity of the two objects is
    computed with ``similarity(..., method = "jaccard")``. Pairs scoring below
    ``threshold`` are only counted; every other pair is stored symmetrically
    in both objects' ``neighbors`` mapping. ``candidates`` itself is not
    modified.

    Parameters
    ----------
    users : dict[str, User | Movie]
        Mapping from id to object.
    candidates : set
        Pairs of ids to verify.
    threshold : float
        Minimum similarity for a pair to be kept as neighbors.

    Returns
    -------
    int
        Number of pairs whose similarity was below ``threshold``.
    """
    rejected = 0
    for first_id, second_id in tqdm(candidates):
        first = users[first_id]
        second = users[second_id]
        score = first.similarity(second, method = "jaccard")
        if score < threshold:
            rejected += 1
            continue
        # Record the similarity in both directions.
        first.neighbors[second_id] = score
        second.neighbors[first_id] = score
    return rejected

In [57]:
# Verify all candidate pairs against threshold t; pairs at or above it are
# recorded as neighbors on the users, and the number of rejected pairs is reported.
n_removed = trim_candidates(users, candidates, threshold = t)
print(f"Removed {n_removed} from {len(candidates)} candidates")

100%|██████████| 13967112/13967112 [02:04<00:00, 112619.78it/s]

Removed 4886906 from 13967112 candidates





In [59]:
# Display the whole `users` mapping again.
users

defaultdict(src.structures.User,
            {'1488844': User(1488844) has rated 2 movies,
             '822109': User(822109) has rated 1 movies,
             '885013': User(885013) has rated 2 movies,
             '30878': User(30878) has rated 2 movies,
             '823519': User(823519) has rated 2 movies,
             '893988': User(893988) has rated 1 movies,
             '124105': User(124105) has rated 1 movies,
             '1248029': User(1248029) has rated 1 movies,
             '1842128': User(1842128) has rated 1 movies,
             '2238063': User(2238063) has rated 1 movies,
             '1503895': User(1503895) has rated 1 movies,
             '2207774': User(2207774) has rated 1 movies,
             '2590061': User(2590061) has rated 1 movies,
             '2442': User(2442) has rated 1 movies,
             '543865': User(543865) has rated 1 movies,
             '1209119': User(1209119) has rated 1 movies,
             '804919': User(804919) has rated 1 movies,
     

In [None]:
# Print the neighbors of the first 10 users only.
for i, user in enumerate(users.values()):
    if i >= 10:
        break  # stop here instead of iterating over every remaining user
    print(f"User {user.id} has neighbors {user.neighbors}")

User 1488844 has neighborsdefaultdict(<class 'float'>, {'595778': 0.9230769230769231, '530789': 0.8235294117647058, '769643': 0.7692307692307693, '1116080': 0.8235294117647058, '1227322': 0.875, '401047': 0.7142857142857143, '68959': 0.9333333333333333, '320540': 0.9230769230769231, '93986': 0.8, '823519': 0.7272727272727273})
User 822109 has neighborsdefaultdict(<class 'float'>, {'1779903': 0.8888888888888888, '1524343': 1.0, '1878798': 0.8888888888888888, '269524': 0.75, '2354740': 1.0, '1824586': 0.8888888888888888, '2529547': 1.0, '121456': 0.8888888888888888, '2380806': 1.0, '2268101': 0.8888888888888888, '1273630': 0.8888888888888888, '1537427': 0.8888888888888888, '317050': 1.0, '255383': 1.0, '659505': 0.8888888888888888, '2207774': 1.0, '124105': 0.8888888888888888, '358776': 0.8888888888888888, '2054145': 0.75, '1603525': 0.75, '2413320': 0.8888888888888888, '2609436': 0.8888888888888888, '642036': 0.75, '1647618': 0.8888888888888888, '352635': 1.0, '493945': 1.0, '2380848': 