In [37]:
import os
import numpy as np
import scipy.sparse as sps
from sys import getsizeof
import time

In [2]:
def load_data_into_matrix(filename, n_users, n_movies):
    data = np.load(filename)
    data = sps.coo_matrix((data[:, 2], (data[:, 0] - 1, data[:, 1] - 1)), 
                          shape=(n_users, n_movies),
                          dtype=np.int8)
    return data


def load_discrete_data_into_matrix(filename, n_users, n_movies, n_ratings):
    data = np.load(filename)
    data = sps.coo_matrix((np.ones((n_ratings)), (data[:, 0] - 1, data[:, 1] - 1)), 
                          shape=(n_users, n_movies),
                          dtype=np.int8)
    return data


def load_random_data_into_matrix(n_u, n_m):
    matrix = np.random.randint(0, 5, (n_u, n_m))
    return matrix


def cosine_sim(u, v):
    return 1 - np.arccos(np.dot(u, v) / np.sqrt(np.sum(np.square(u)) * np.sum(np.square(v)))) / np.pi


def make_random_projections(n_projections):
    return np.random.choice([-1, 1], size=(n_projections, 17770))

In [3]:
np.random.seed(42)

discrete = False

username = os.getcwd().split('/')[2]
print('username:', username)

if username == 'mvgroeningen':
    n_users = 100
    n_movies = 50
    rating_matrix = load_random_data_into_matrix(n_users, n_movies) 
else:
    filename = 'user_movie_rating.npy'
    n_users = 103703
    n_movies = 17770
    n_ratings = 65225506
    if discrete:
        rating_matrix = load_discrete_data_into_matrix(filename, n_users, n_movies, n_ratings)
    else:
        rating_matrix = load_data_into_matrix(filename, n_users, n_movies)

username: home


In [45]:
n_projections = 10
v = make_random_projections(n_projections)
signatures = np.zeros((n_users, n_projections))
binary_array = np.array([2**i for i in np.arange(n_projections)[::-1]])

In [46]:
t0 = time.time()
for i in np.arange(n_users)[:20]:
    row = rating_matrix.getrow(i)
    indices = row.indices
    signatures[i, :] = (np.dot(v[:, indices], row.toarray()[0][indices]) >=0).astype(int)

signatures = np.dot(signatures, binary_array)
u, c = np.unique(signatures, return_counts=True)
dup = u[c > 1]

print(u)
print(c)
print(dup)

print(time.time() - t0)

[  0. 136. 293. 388. 392. 400. 412. 416. 428. 431. 437. 523. 677. 685.
 692. 896. 901. 911. 920. 932. 940.]
[103683      1      1      1      1      1      1      1      1      1
      1      1      1      1      1      1      1      1      1      1
      1]
[0.]
8.057084798812866
