In [35]:
import numpy as np
import scipy.sparse as sparse
import random
import hashlib

# Load in Data

In [36]:
data = np.load('Data/user_movie_rating.npy')[:,0:2]

In [37]:
col = data[:, 0]
row = data[:, 1]

n_col = col.max() + 1
n_row = row.max() + 1 

M = sparse.coo_matrix((np.ones(len(data)), (row, col)), shape=(n_row, n_col)).tocsr()
small_M  = M[:,0:1000]

In [38]:
perm = 100 
perm_seeds = []
row_len,_ = small_M.get_shape()
seed = 1000
for i in range(perm):
    perm_seeds.append(int(hashlib.sha256(str(seed+i).encode()).hexdigest(),16)%4294967295)


In [39]:
from tqdm import tqdm


def minhashing(M):
    signature = []
    index_perm = np.arange(M.shape[0])#reshuffeling the same index movies_uay
    sig_len = M.shape[1]-1
    for seed in tqdm(perm_seeds):
        random.seed(seed)
        np.random.shuffle(index_perm)
        perm_M = M[index_perm[:1000],1:] #Just take the top 1000 because it is very likely for each user to be in the first thousand movies at least once
        non_zero = perm_M.nonzero()
        sig = non_zero[0][np.unique(non_zero[1], return_index=True)[1]]
        if len(sig)==sig_len:
            signature.append(sig)        
    return signature
        

        


In [40]:
a = minhashing(M)

100%|██████████| 100/100 [01:12<00:00,  1.39it/s]


In [41]:
u = np.array(a).T.tolist()

In [42]:
len(u)

103703

In [43]:
len(u[0])

100

In [44]:
def LSH(M,b):
    #Create Bands and Hashes
    n_users = len(M)
    sign_len = len(M[0])
    n_rows = int(sign_len/b)
    
    buckets = [ ]
    candidates = []
    for band in tqdm(range(b)):
        bucket = {}
        candidate_in_band = []
        for i in range(n_users):
        
            hash_num = (int((hashlib.sha256(f"{u[i][n_rows*band:(n_rows*(band + 1) - 1)]}".encode())).hexdigest(), 16))
            if  hash_num not in bucket:
                bucket[hash_num] = []
            bucket[hash_num].append(i)
            if len(bucket[hash_num])==2:
                candidate_in_band.append(hash_num)
        buckets.append(bucket)
        candidates.append(candidate_in_band)
    return buckets, candidates     
# implement hash table to check for identity in linear time

        


In [45]:
resultado, candidates = LSH(u,20)

100%|██████████| 20/20 [00:15<00:00,  1.33it/s]


In [46]:
len(candidates[4])

9480

In [47]:
MT = M.T

In [48]:
def Jaccard_similarity(pair,M):
    u1 = int(pair[0])
    u2 = int(pair[1])
    movies_u1 = M[u1]
    movies_u2 = M[u2]
    intersection = movies_u1.dot(movies_u2.T).sum()
    union = (movies_u1 + movies_u2).count_nonzero()
    similarity = intersection/union
    return float(similarity)

In [49]:
def get_pairs(candidates, buckets,M):
    pairs = {}
    for i in tqdm(range(len(candidates))):
        for j in range(len(candidates[i])):
            if len(buckets[i][candidates[i][j]]) <= 2:
                for z in range(len(buckets[i][candidates[i][j]]) - 1):
                    for p in range(z + 1, len(buckets[i][candidates[i][j]])):
                        u1 = buckets[i][candidates[i][j]][z]
                        u2 = buckets[i][candidates[i][j]][p]
                        if f"{u1},{u2}" not in pairs and f"{u2},{u1}" not in pairs:
                        #    similarity = Jaccard_similarity(u1,u2,MT)
                        #    if similarity > 0.5:
                            pairs[f"{u1},{u2}"] = 1
                        elif f"{u1},{u2}" in pairs:
                            pairs[f"{u1},{u2}"] = pairs[f"{u1},{u2}"] + 1
    return pairs   

In [50]:
pairs = get_pairs(candidates, resultado, MT)
len(pairs)

100%|██████████| 20/20 [00:00<00:00, 32.46it/s]


84400

In [30]:
#Not optimal yet
#for pair in tqdm(over_1_band):
#    over_1_band[pair] = Jaccard_similarity(pair.split(","),MT)

100%|██████████| 136/136 [00:32<00:00,  4.15it/s]


In [None]:
print(pairs['15039,87449'])