# Import preprocessed data

In [1]:
from tqdm import tqdm
import numpy as np
import mmh3
from src.structures import User, Movie

In [2]:
import import_ipynb
from loading_and_preprocessing import movies, users

importing Jupyter notebook from loading_and_preprocessing.ipynb
User(1488844)
ratings on the format movie_id : rating  defaultdict(<class 'float'>, {'1': 3, '8': 4})
Dinosaur Planet, 2003
ratings on the format user_id : rating:  {'1488844': 3, '822109': 5, '885013': 4, '30878': 4, '823519': 3, '893988': 3, '124105': 4, '1248029': 3, '1842128': 4, '2238063': 3, '1503895': 4, '2207774': 5, '2590061': 3, '2442': 3, '543865': 4, '1209119': 4, '804919': 4, '1086807': 3, '1711859': 4, '372233': 5, '1080361': 3, '1245640': 3, '558634': 4, '2165002': 4, '1181550': 3, '1227322': 4, '427928': 4, '814701': 5, '808731': 4, '662870': 5, '337541': 5, '786312': 3, '1133214': 4, '1537427': 4, '1209954': 5, '2381599': 3, '525356': 2, '1910569': 4, '2263586': 4, '2421815': 2, '1009622': 1, '1481961': 2, '401047': 4, '2179073': 3, '1434636': 3, '93986': 5, '1308744': 5, '2647871': 4, '1905581': 5, '2508819': 3, '1578279': 1, '1159695': 4, '2588432': 3, '2423091': 3, '470232': 4, '2148699': 2, '1342007': 

# Compute Signature matrix

In [3]:

def compute_minhashes(
                    object: Movie | User, #movie or user to compute minhashes for
                    n_hash = 100 #number of hashes
                    ):
    
    bag = object.bag_ratings() #bag of ratings
    hashes = np.array([[mmh3.hash(id, seed) for id in bag] for seed in range(n_hash)]) #hashes for each seed
    minhashes = np.min(hashes, axis = 1) #minhashes for each seed
    return minhashes

def compute_signatures(
                    objets: dict[str, Movie | User], #objects to compute signatures for
                    n_hash = 100, #number of hashes
                    with_tqdm = True #whether to show progress bar
                    ):
    iterator = tqdm(objets.items()) if with_tqdm else objets.items() #iterator
    signatures = {id: compute_minhashes(obj, n_hash) for id, obj in iterator}
    return signatures

def bucket_hash(signatures, n_buckets = 100):
    bucket = 0
    for signature in signatures:
        bucket = bucket ^ hash(signature) #xor of all signatures
    return bucket % n_buckets

In [4]:
n_hash = 100    
user = users["1488844"]
minhashes = compute_minhashes(user, n_hash)
minhashes

array([-1810453357, -1570063170,  -184002522,  -126235597,  -172315920,
        -346150140, -2131240906,  -793765815, -1571785578,  -977649233,
        -535678046, -1457417976, -1562019656,   101162235, -1560298715,
        -956861631,   168904549,   790285535, -2070529337,   625771924,
        -249096934,   166839407, -1447133418, -1508133614,  -721831573,
       -1056278211, -2114645946,  -381543989,     2653644,  -243094001,
       -1200734897, -1428275679,  1687665648,  -677418915, -1473849242,
        -248996816,  -893062816,  -576892624,  1653519337, -1799033939,
       -1549378093, -1625405734, -1868994069, -1367157416,  1017076005,
       -1384113491,  -310537956, -1527923987, -1384278664,   427621935,
         369715913, -1778312296,   436922359,  -868451072,  -383727986,
        -532758649, -2127377514,  -701845473,   559389694, -1995129985,
         517183813,  1120606309,  -552741755,  -943591880, -1871239471,
       -1987538483,  -818748851, -1542113122, -1668779179,   290

# Exercise 3.4.4 : 
Suppose we wish to implement LSH by MapReduce. Specifically, assume chunks of the signature matrix consist of columns, and elements
are key-value pairs where the key is the column number and the value is the
signature itself (i.e., a vector of values).

    a) Show how to produce the buckets for all the bands as output of a single
    MapReduce process. Hint: Remember that a Map function can produce
    several key-value pairs from a single element.

    b) Show how another MapReduce process can convert the output of (a) to
    a list of pairs that need to be compared. Specifically, for each column i,
    there should be a list of those columns j > i with which i needs to be
    compared.

In [12]:

def create_bucket_matrix(signatures: dict[str, np.ndarray], n_buckets = 100, bands = 10):
    """Creates a matrix of size (n_objects, bands) where each entry corresponds to a bucket number for a band of the minhash signatures.

    Args:
        signatures (dict[str, np.ndarray]): dictionary of on the form {object_id: [hash1, hash2, ...]} where the object is a user or movie.
        n_buckets (int, optional): number of buckets. Defaults to 100.
        bands (int, optional): number of bands. Defaults to 10.

    Returns:
        buckets: matrix of size (n_objects, bands) where each entry corresponds to a bucket number for a band
    """
    
    r = len(signatures) // bands
    buckets = np.zeros((len(signatures), bands), dtype = int)
    i = 0
    for _, signature in signatures.items(): #for each object
        for j in range(bands): #for each band
            buckets[i,j] = bucket_hash(signature[j*r:(j+1)*r], n_buckets=n_buckets)
        i += 1
    return buckets

In [6]:
n_hashes = 1000
b = 20
r = n_hashes // b
n_buckets = 10000

SIG = compute_signatures(users, n_hashes, with_tqdm = True)
buckets = create_bucket_matrix(SIG, n_buckets, bands = b)

  0%|          | 0/9619 [00:00<?, ?it/s]

100%|██████████| 9619/9619 [00:07<00:00, 1300.84it/s]


In [7]:
hash(2)

2

In [11]:
np.unique(buckets)

array([ 0,  2,  5,  6,  7,  8,  9, 10, 12, 14, 15, 16, 17, 18, 20, 21, 22,
       24, 29, 30, 32, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 48, 50,
       51, 53, 54, 56, 57, 60, 62, 63, 65, 67, 70, 71, 74, 75, 76, 77, 78,
       79, 80, 81, 83, 84, 86, 87, 89, 91, 95, 96, 99])

In [9]:
buckets
#find indexes of users with same bucket

candidates = find_candidates(buckets, 0)
candidates

NameError: name 'find_candidates' is not defined

In [37]:
signatures = SIG["1488844"]
len(signatures)

100

In [29]:
users

defaultdict(src.structures.User,
            {'1488844': User(1488844) has rated 2 movies,
             '822109': User(822109) has rated 1 movies,
             '885013': User(885013) has rated 2 movies,
             '30878': User(30878) has rated 2 movies,
             '823519': User(823519) has rated 2 movies,
             '893988': User(893988) has rated 1 movies,
             '124105': User(124105) has rated 1 movies,
             '1248029': User(1248029) has rated 1 movies,
             '1842128': User(1842128) has rated 1 movies,
             '2238063': User(2238063) has rated 1 movies,
             '1503895': User(1503895) has rated 1 movies,
             '2207774': User(2207774) has rated 1 movies,
             '2590061': User(2590061) has rated 1 movies,
             '2442': User(2442) has rated 1 movies,
             '543865': User(543865) has rated 1 movies,
             '1209119': User(1209119) has rated 1 movies,
             '804919': User(804919) has rated 1 movies,
     

In [8]:
n_hashes = 100
def locality_sensitive_hashing(SIG, n_hashes = 100, bands = 10, n_buckets = 10000):
    r = n_hashes // bands
    similar = defaultdict(set)
    for i in tqdm(range(0, n_hashes, r)):
        buckets = defaultdict(list)
        for user_id, sig in SIG.items():
            bucket = bucket_hash(sig[i:i+r], n_buckets=n_buckets)

            if buckets[bucket]:
                for prev_user_id in buckets[bucket]:
                    similar[user_id].add(prev_user_id)
                    similar[prev_user_id].add(user_id)
            buckets[bucket].append(user_id)
        
    return similar

In [9]:
# similar = locality_sensitive_hashing(SIG, n_hashes = n_hashes, bands = 20, n_buckets = 10000000)

In [20]:
help(map)

Help on class map in module builtins:

class map(object)
 |  map(func, *iterables) --> map object
 |  
 |  Make an iterator that computes the function using arguments from
 |  each of the iterables.  Stops when the shortest iterable is exhausted.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs)
 |      Create and return a new object.  See help(type) for accurate signature.



In [10]:
# Sim_matrix = np.zeros((len(users), len(users)))
# keys = list(users.keys())
# for i in tqdm(range(len(users))):
#     for j in range(i, len(users)):
#         if i > j:
#             Sim_matrix[i][j] = users[keys[i]].similarity(users[keys[j]])

100%|██████████| 9619/9619 [00:04<00:00, 2240.17it/s]
