In [1]:
import numpy as np
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked
from scipy.spatial.distance import cosine
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import sklearn
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

In [None]:
# Read 'all_ratings.parquet' with pyarrow and convert the table to a pandas DataFrame.
df = pq.read_table('all_ratings.parquet').to_pandas()

In [None]:
# Preview the first rows of the loaded ratings.
df.head()

### Add indices to users and movies

In [None]:
# Count the distinct movie ids and user ids present in the ratings.
num_movies, num_users = (
    len(df[column].unique()) for column in ('movie_id', 'user_id')
)

In [None]:
# Display both counts as a (movies, users) tuple.
num_movies, num_users

In [None]:
# Compute each column's unique ids once and reuse them for both directions of
# the mapping; every unique() call is a full pass over the ratings column.
unique_movie_ids = df.movie_id.unique()
unique_user_ids = df.user_id.unique()

# Ids are numbered in the order unique() returns them, so the forward and
# inverse dictionaries built from the same array are consistent by construction.
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}
idx_to_movie = dict(enumerate(unique_movie_ids))

user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
idx_to_user = dict(enumerate(unique_user_ids))

In [None]:
# Attach the contiguous integer index of each movie / user as new columns by
# looking every id up in the corresponding id -> index dictionary.
df['movie_idx'] = df['movie_id'].map(movie_to_idx)
df['user_idx'] = df['user_id'].map(user_to_idx)

In [None]:
# (rows, columns) of the ratings frame.
df.shape

In [None]:
# Preview the first rows of the ratings frame.
df.head()

In [None]:
# Convert the DataFrame to an Arrow table and write it to
# 'all_ratings_with_indices.parquet' so later cells can reload it from disk.
table = pa.Table.from_pandas(df)
pq.write_table(table, 'all_ratings_with_indices.parquet')

## Loading ratings with indices and converting to sparse format

In [62]:
# Reload only the rating and the two integer index columns; note that this
# rebinds `df`, replacing whatever frame it held before.
df = pq.read_table('all_ratings_with_indices.parquet',columns=['rating','user_idx','movie_idx']).to_pandas()

In [63]:
# (rows, columns) of the reloaded frame.
df.shape

(100480507, 3)

In [64]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,rating,user_idx,movie_idx
0,3,0,0
1,5,1,0
2,4,2,0
3,4,3,0
4,3,4,0


In [65]:
# The index columns are zero-based, so the largest index plus one gives the
# number of rows (users) and columns (movies) the rating matrix needs.
num_users, num_movies = (
    df[column].max() + 1 for column in ('user_idx', 'movie_idx')
)

In [66]:
# Display both dimensions as a (movies, users) tuple.
num_movies, num_users

(17770, 480189)

In [67]:
# Build the user x movie rating matrix in CSR format: each rating is stored at
# (row = user_idx, column = movie_idx); pairs without a rating stay implicit.
All = sparse.csr_matrix(
    (df['rating'], (df['user_idx'], df['movie_idx'])),
    shape=(num_users, num_movies),
)
All

<480189x17770 sparse matrix of type '<class 'numpy.longlong'>'
	with 100480507 stored elements in Compressed Sparse Row format>

### Selecting only the first 30,000 users out of 480,189

In [74]:
# Number of users kept for the neighbour search; named so the subset size is
# set in one place instead of being a bare literal in the slice.
N_SAMPLE_USERS = 30000

# Keep the first N_SAMPLE_USERS rows (users) and every column (movie).
A = All[:N_SAMPLE_USERS, :]

In [75]:
# Full (unchunked) pairwise cosine computation, intentionally left disabled:
# it ran out of memory even on a machine with 256 GB of RAM.
# user_sim = pairwise_distances(A, metric="cosine",n_jobs=-1)

### Computing the full pairwise distance matrix ran out of memory even with 256 GB of RAM, so I now use **pairwise_distances_chunked**, which is meant for limited memory

In [76]:
# Produce the pairwise result for A in chunks rather than as one big matrix;
# the chunks are consumed by iterating over `user_sim` later on.
# NOTE(review): despite the name `user_sim`, metric="cosine" should yield cosine
# *distances* (smaller = more similar), and the returned object is presumably a
# one-shot generator, so this cell must be re-run before iterating a second
# time — confirm against the scikit-learn documentation.
user_sim = pairwise_distances_chunked(A, metric="cosine",n_jobs=-1)

In [77]:
def get_neighbors(user_sum,k=5):
    """Return, for every row, the column indices of its nearest neighbours.

    Parameters
    ----------
    user_sum : iterable of 2-D arrays
        Chunks of a pairwise distance matrix, each of shape
        (rows_in_chunk, n_columns), where smaller values mean "closer".
        The iterable is consumed; a one-shot generator cannot be reused.
    k : int, default 5
        Number of smallest entries ranked per row. The first ranked entry is
        dropped, so ``k - 1`` neighbour indices are returned per row.

    Returns
    -------
    numpy.ndarray of shape (total_rows, k - 1)
        Column indices ordered from closest to farthest.

    Raises
    ------
    ValueError
        If ``user_sum`` yields no chunks (for example an already exhausted
        generator), or if ``k`` exceeds the number of columns in a chunk.
    """
    neighbors = []
    # Iterate the argument that was passed in, not a notebook-level global,
    # so the function works on whatever chunks the caller supplies.
    for chunk in tqdm(user_sum):
        # Fully order the k smallest entries of every row; the rest of the row
        # is left in arbitrary order, which is cheaper than a full sort.
        ranked = np.argpartition(chunk, np.arange(k), axis=1)
        # Drop position 0: this assumes the closest entry of each row is the
        # row itself (distance 0). NOTE(review): with exact duplicates or ties
        # the self index may land elsewhere — confirm this is acceptable.
        neighbors.append(ranked[:, 1:k])
    if not neighbors:
        raise ValueError(
            "get_neighbors received no distance chunks; if a generator was "
            "passed, it may already have been consumed"
        )
    return np.concatenate(neighbors)

In [78]:
# Rank neighbours chunk by chunk; with the default k=5 the function keeps
# k-1 = 4 neighbour indices per row.
recc = get_neighbors(user_sim)

7it [00:55,  7.94s/it]


### The following table shows the row indices (`user_idx`, not the original `user_id`) of the 4 closest users for each of the 30,000 users we selected above

In [80]:
# Display the neighbour index array returned by get_neighbors.
recc

array([[27297, 23786, 10971, 25156],
       [25160, 14999, 28160, 23102],
       [ 9020,  8985,  3808, 18610],
       ...,
       [10692, 29407, 11071, 11932],
       [27297, 20543, 22322, 21797],
       [ 2012,  1048,  2186,  6695]])

In [81]:
# (rows, neighbour columns) of the result.
recc.shape

(30000, 4)