In [1]:
import os
import numpy as np
import scipy.sparse as sps
from sys import getsizeof
import time

In [2]:
def load_data_into_matrix(filename, n_users, n_movies):
    data = np.load(filename)
    ratings = data[:, 2]
    users = data[:, 0] - 1
    movies = data[:, 1] - 1
    data = sps.coo_matrix((ratings, (users, movies)), shape=(n_users, n_movies), dtype=np.int8)
    return data


def load_discrete_data_into_matrix(filename, n_users, n_movies, n_ratings):
    data = np.load(filename)
    ratings = np.ones((n_ratings))
    users = data[:, 0] - 1
    movies = data[:, 1] - 1
    data = sps.coo_matrix((ratings, (users, movies)), shape=(n_users, n_movies), dtype=np.int8)
    return data


def load_random_data_into_matrix(n_users, n_movies, n_ratings):
    ratings = np.random.randint(0, 5, n_ratings)
    users = np.random.randint(1, n_users, n_ratings) - 1
    movies = np.random.randint(1, n_movies, n_ratings) - 1
    data = sps.coo_matrix((ratings, (users, movies)), shape=(n_users, n_movies), dtype=np.int8)
    return data


def cosine_sim(u, v):
    return 1 - np.arccos(np.dot(u, v) / np.sqrt(np.sum(np.square(u)) * np.sum(np.square(v)))) / np.pi


def make_random_projections(n_projections, n_users):
    return np.random.choice([-1, 1], size=(n_projections, n_users))


def unique_pairs_from_array(a):
    pairs = np.empty((0, 2))
    n = len(a)
    for idx, val in enumerate(a):
        pairs_for_single_value = np.stack((np.tile(val, n - idx), a[idx:]), axis=1)
        pairs = np.append(pairs, pairs_for_single_value, axis=0)
    return pairs

In [3]:
np.random.seed(42)

discrete = False

username = os.getcwd().split('/')[2]
print('username:', username)

if username == 'mvgroeningen':
    n_users = 1000
    n_movies = 500
    n_ratings = 10000
    rating_matrix = load_random_data_into_matrix(n_users, n_movies, n_ratings) 
else:
    filename = 'user_movie_rating.npy'
    n_users = 103703
    n_movies = 17770
    n_ratings = 65225506
    if discrete:
        rating_matrix = load_discrete_data_into_matrix(filename, n_users, n_movies, n_ratings)
    else:
        rating_matrix = load_data_into_matrix(filename, n_users, n_movies)

username: mvgroeningen


In [4]:
t0 = time.time()

n_projections = 12
projections_per_band = 3

v = make_random_projections(n_projections, n_users)
signatures = np.zeros((n_projections, n_users))
n_bands = n_projections // projections_per_band
binary_array = np.array([2**i for i in np.arange(projections_per_band)[::-1]])
pairs = np.empty((0, 2))

for i in np.arange(n_users):
    row = rating_matrix.getrow(i)
    indices = row.indices
    signatures[:, i] = (np.dot(v[:, indices], row.toarray()[0][indices]) >=0).astype(int)

for b in np.arange(n_bands):
    start_of_band = b * projections_per_band
    end_of_band = start_of_band + projections_per_band
    band = signatures[start_of_band:end_of_band, :]
    band = np.dot(binary_array, band).T
    unique_values, _ = np.unique(band, axis=0, return_counts=True)
    for u in unique_values:
        indices_of_unique_value = np.where(band == u)[0]
        pairs_of_unique_value = unique_pairs_from_array(indices_of_unique_value)
        pairs = np.append(pairs, pairs_of_unique_value, axis=0)

print(time.time() - t0)
print(pairs.shape)

4.737015247344971
(263266, 2)
