In [1]:
import os
import numpy as np
import scipy.sparse as sps
from sys import getsizeof
import time
import itertools

In [2]:
def load_data_into_matrix(filename, n_users, n_movies):
    data = np.load(filename)
    ratings = data[:, 2]
    users = data[:, 0] - 1
    movies = data[:, 1] - 1
    data = sps.coo_matrix((ratings, (users, movies)), shape=(n_users, n_movies), dtype=np.int8)
    return data


def load_discrete_data_into_matrix(filename, n_users, n_movies, n_ratings):
    data = np.load(filename)
    ratings = np.ones((n_ratings))
    users = data[:, 0] - 1
    movies = data[:, 1] - 1
    data = sps.coo_matrix((ratings, (users, movies)), shape=(n_users, n_movies), dtype=np.int8)
    return data


def load_random_data_into_matrix(n_users, n_movies, n_ratings):
    ratings = np.random.randint(0, 5, n_ratings)
    users = np.random.randint(1, n_users, n_ratings) - 1
    movies = np.random.randint(1, n_movies, n_ratings) - 1
    data = sps.coo_matrix((ratings, (users, movies)), shape=(n_users, n_movies), dtype=np.int8)
    return data


def cosine_sim(u, v):
    return 1 - np.arccos(np.dot(u, v) / np.sqrt(np.sum(np.square(u)) * np.sum(np.square(v)))) / np.pi


def cosine_sim_sparse(u, v):
    return 1 - np.arccos((u * v.T).toarray()[0, 0] / 
                         (np.sqrt(u.power(2).sum()) * np.sqrt(v.power(2).sum()))) / np.pi


def make_random_projections(n_projections, n_users):
    flat_v = np.random.choice([-1, 1], size=(n_projections * n_users))
    mesh = np.array(np.meshgrid(np.arange(n_projections), np.arange(n_users))).T.reshape(-1,2)
    projection_indices, user_indices = mesh[:, 0], mesh[:, 1]
    return sps.coo_matrix((flat_v, (projection_indices, user_indices)), 
                          shape=(n_projections, n_users), dtype=np.int8)

def unique_pairs_from_array(arr):
    pairs = []
    n = len(arr)
    for idx, val in enumerate(arr[:-1]):
        pairs_for_single_value = np.stack((np.tile(val, n - idx - 1), arr[idx + 1:]), axis=1)
        pairs.append(pairs_for_single_value)
    return np.concatenate(pairs, axis=0)

def itertools_unique_pairs_from_array(arr):
    return np.array(list(itertools.combinations(arr,2)))

In [3]:
np.random.seed(42)

random = False
discrete = False

if random:
    n_users = 1000
    n_movies = 10000
    n_ratings = 1000000
    rating_matrix = load_random_data_into_matrix(n_users, n_movies, n_ratings) 
else:
    filename = 'user_movie_rating.npy'
    n_users = 103703
    n_movies = 17770
    n_ratings = 65225506
    if discrete:
        rating_matrix = load_discrete_data_into_matrix(filename, n_users, n_movies, n_ratings)
    else:
        rating_matrix = load_data_into_matrix(filename, n_users, n_movies)

In [4]:
n_bands = 6
projections_per_band = 30
n_projections = n_bands * projections_per_band

In [5]:
# Calculate signatures

t0 = time.time()

v = make_random_projections(n_projections, n_movies)
signatures = ((rating_matrix * v.T).toarray().T >= 0).astype(int)

print('Signatures shape:', signatures.shape)

signature_time = time.time()
print('Calculate signatures time:', signature_time - t0)

Signatures shape: (180, 103703)
Calculate signatures time: 28.338826417922974


In [6]:
# Find pairs

t0 = time.time()

binary_array = np.array([2**i for i in np.arange(projections_per_band)[::-1]])
pairs = []

for b in np.arange(n_bands):
    start_of_band = b * projections_per_band
    end_of_band = min(start_of_band + projections_per_band, n_projections)
    band = signatures[start_of_band:end_of_band, :]
    band = np.dot(binary_array, band).T
    unique_values = np.unique(band, axis=0)
    for u in unique_values:
        indices_of_unique_value = np.where(band == u)[0]
        if indices_of_unique_value.shape[0] > 1:
            pairs_of_unique_value = unique_pairs_from_array(indices_of_unique_value)
            pairs.append(pairs_of_unique_value)

pairs = np.unique(np.concatenate(pairs, axis=0), axis=0)

print('Find pairs time:', time.time() - t0)
print('Number of pairs:', pairs.shape[0])

Find pairs time: 27.776695013046265
Number of pairs: 106


In [13]:
# Calculate cosine similarities

t0 = time.time()

sim = []
for u1, u2 in pairs[:10]:
    nz1 = np.nonzero(rating_matrix.getrow(u1).toarray()[0])[0]
    nz2 = np.nonzero(rating_matrix.getrow(u2).toarray()[0])[0]
    _, c = np.unique(np.append(nz1, nz2), return_counts=True)
    n_same_movies = np.sum((c > 1).astype(int))
    print(n_same_movies, nz1.shape[0], n_same_movies, nz2.shape[0])
    sim.append(cosine_sim_sparse(rating_matrix.getrow(u1), rating_matrix.getrow(u2)))

similar_pairs = pairs[:10][np.array(sim) > 0.73]
print('Calculate similarity time:', time.time() - t0)

17770
178 399 178 572
17770
97 353 97 341
17770
58 560 58 336
17770
147 981 147 451
17770
288 823 288 649
17770
157 330 157 534
17770
106 337 106 370
17770
123 353 123 324
17770
60 317 60 324
17770
121 372 121 479
Calculate similarity time: 15.514370679855347


In [8]:
## Print result

print('Pair similarities:', sim)
print('Number of pairs with cos_sim > 0.73:', similar_pairs.shape[0])

Pair similarities: [0.49436827113731396, 0.4955488409627161, 0.4951506559156964, 0.5000432733334395, 0.5018078256017717, 0.49707702212119564, 0.4958590983123359, 0.5008311640889037, 0.49752883305432305, 0.5062154194337252]
Number of pairs with cos_sim > 0.73: 0


In [12]:
a = np.array(17000*[0] + 200*[5] + 200*[0] + 100*[5])
b = np.array(17000*[0] + 200*[0] + 200*[5] + 100*[1])

nz1 = np.nonzero(a)[0]
nz2 = np.nonzero(b)[0]
_, c = np.unique(np.append(nz1, nz2), return_counts=True)
n_same_movies = np.sum((c > 1).astype(int))
print(n_same_movies, nz1.shape[0], n_same_movies, nz2.shape[0])

print(cosine_sim(a, b))

100 300 100 300
0.5257619443311856
