# AiDM Assignment 2 Group 81
### Meng Yao (yao@strw.leidenuniv.nl) and Michael Keim (keim@strw.leidenuniv.nl)
##  Locality Sensitive Hashing for Netflix Data

In [49]:
import numpy as np
import time
import os
start = time.time()
np.random.seed(seed=17)

Later on in our code we will need to be able to calculate similarities. Here is where we define functions to do so

In [55]:
def generate_list(data,n):
    '''
    The function is to generate a list that contains sets for users. Each set is the watched movies for one user.
    data :
    n : persent of dataset will be considered
    '''
    data_n_persent = data[0:int(len(data_origin)*n),:] # Drag out n persent from the entire dataset.
    user_movie_list = [[] for _ in range(len(set(data_n_persent[:,0])))] 
    # Prepare a list of sets.
    # Every set represent a user  
    for i in range(len(data_n_persent)):
        user_movie_list[data_n_persent[i,0]].append(data_n_persent[i,1])
    for j in range(len(user_movie_list)):
        user_movie_list[j] = set(user_movie_list[j]) # convert every user list to a set
    return user_movie_list  # will return the whole watched movies set for every user in only one list.
def simi(pair,set_list):
    a = set_list[pair[0]]
    b = set_list[pair[1]]
    intersection = a & b # Use logical and to achieve the intersection
    n_intersection = len(intersection)
    similarity = n_intersection / (len(a) + len(b) - n_intersection) # Calculate sim
    return similarity

In [56]:
set_list_total = generate_list(data_origin,1)

###  Load Data
First we read in the data of users (column 1) and the movies they rated (column 2).

In [50]:
data_origin = np.load('../user_movie.npy')

### Shingling
In the interest of memory conservation (only stoing non-zero elements) and efficient rearrangement, we store the data as ones in indicies according to their user ID (column) and rated movies (rows, 1 if rated 0 if not). This is so that it will be easy to tell which users rated the same movies using LSH.

In [51]:
from scipy.sparse import csc_matrix
users_movies = csc_matrix((np.ones(len(data_origin)), (data_origin[:,1], data_origin[:,0])),\
                          shape=(np.max(data_origin[:,1])+1, np.max(data_origin[:,0])+1))

In the next step we will need to find the first nonzero entry for every column in this sparse matrix (re-arranged). Let's see what this is for the first user in the original arrangement.

In [52]:
first_nonzero = np.array((users_movies!=0).argmax(axis=0))[0]
# Now test for the first user
for i in range(first_nonzero[0]+1):
    print(int(users_movies[i,0]), end=' ')

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 

### Signature Matrix
Now we create a random permutations of movie indicies and rearrange our matrix to match. The first non-zero element in the new arrangement will be an element in the user's signature. We will start using 50 permutations, so that our signatures for each user are of length 50.

In [None]:
tic = time.time()
signature_length = 50
signature_matrix = np.zeros((signature_length, np.max(data_origin[:,0])+1))
permutation_indicies = np.arange(np.max(data_origin[:,1])+1)
# We create a copy of the sparse matrix which we will re-arrange
users_movies_copy = np.copy(users_movies)
for i in range(signature_length):
    # We shuffle the permutations, thereby creating new hash functions
    np.random.shuffle(permutation_indicies)
    # We re-arrange the copy
    users_movies_copy = users_movies[permutation_indicies,:]
    # We find the first non-zero entry
    signature_matrix[i,:] = (users_movies_copy!=0).argmax(axis=0)
print('It took', int(time.time() - tic), 'seconds to make the signature matrix.')

Now let's test to make sure we made our signture matrix correctly. For the last signature in the matrix for the first user, is the index actually the first non-zero entry in the permutation_indicies arrangement?


In [54]:
for j in permutation_indicies[1+np.arange(int(signature_matrix[signature_length-1,0]))]:
    print(int(users_movies[j,0]), end=' ')

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 

Yep, that is indeed the first non-zero entry in this ordering!

###  Partition Into Bands
Next we cut the signature matrix into 5 bands (to start),

In [57]:
def hash_to_bucket(bands,sig_M):
    rows = int(len(sig_M) / bands)
    possible_sig_max = 17749
    weight = np.arange(1,rows+1,1)
    bucket_index_max = (1+rows)*rows/2 * possible_sig_max
    candidate_list = [[[] for _ in range(int(bucket_index_max))] for _ in range(bands)]
    candidates = []
    for i in range(bands):
        for j in range(len(sig_M[0])):
            hash_wsum = sum(np.array(sig_M[int(i * rows):int((i+1)*rows),j]) * weight)
            candidate_list[i][int(hash_wsum)].append(j)
        candidates.append([x for x in candidate_list[i] if x])
    return candidates

def bucket_to_simi(candidates, Threshold ,start_time):
    pairs = []
    similarities = []
    similar_users = []
    for band in candidates:
        for bucket in band:
            for i in range(len(bucket)):
                for j in np.arange(start=i+1, stop = len(bucket), step=1):
                    pair = [bucket[i],bucket[int(j)]]
                    if pair not in pairs:
                        similarity = simi(pair,set_list_total)
                        pairs.append(pair)
                        if similarity > Threshold :
                            similar_users.append(pair)
                            similarities.append(similarity)
                    if time.time() - start_time > 300:
                        break
                else:
                    continue
                break
            else:
                continue
            break
        else:
            continue
        break
    return similar_users , similarities, pairs

In [59]:
       
candidates = hash_to_bucket(bands = 25, sig_M = signature_matrix)
similar_users , similarities , pairs = bucket_to_simi(candidates,0.5, time.time())

        
if os.path.exists('results.txt'):
    os.remove('results.txt')
with open('results.txt','w') as f:
    for item in similar_users:
        f.write('%s,%s\n' % (item[0],item[1]))
f.close()

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 10)

In [None]:
len(pairs)

In [None]:
"""
# Meng's testing

#candidates = set().union(candidate_list[0],candidate_list[1])
#candidate_list[0] | candidate_list[1]
#candidates[0] == candidates[1]
'''
candidates_unique = candidates[0]
for i in range(bands-1):
    candidates_unique += [e for e in candidates[i+1] if e not in candidates_unique]
candidates_unique
''''


"""

In [None]:
'''
#Michael's testing

row = np.array([0, 2, 2, 0, 1, 2])
col = np.array([0, 0, 1, 2, 2, 2])
data = np.array([1, 2, 3, 4, 5, 6])
A = csc_matrix((data, (row, col)), shape=(len(row), len(col))).toarray()
print(A)
x = np.arange(len(row))
np.random.shuffle(x)
print(x)
B = A[x,:]
print(B)
A[0,:] = (B!=0).argmax(axis=0)
print(A)
'''