# Collaborative filtering notebook

Collaborative filtering is an umbrella of methods. These filtering methods only use ratings (implicit or
explicit) as the source for creating recommendations. 

Neighborhood-based filtering is a branch of collaborative filtering.
User-user and item-item systems are two possible implementations.

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn import neighbors
from sklearn.cluster import KMeans
import dask.dataframe as dd

## Calculating the suggestions for the "cold-starters"

In [2]:
# importing movies dataframe
movies = pd.read_csv("data/movies.csv")

# extracting release year
# (raw string: "\(" and "\d" are regex escapes, not valid Python string escapes)
movies['release_year'] = movies.title.str.extract(r"\((\d{4})\)", expand=True)
movies.release_year = pd.to_datetime(movies.release_year, format='%Y')

# as there are some NaN years, we remove those
movies.dropna(inplace=True)
movies.release_year = movies.release_year.dt.year
# strip the trailing " (YYYY)" (7 characters) from the title
movies.title = movies.title.str[:-7]
# info() prints its report itself and returns None, so it is not wrapped in print()
movies.info()
movies.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 27256 entries, 0 to 27277
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movieId       27256 non-null  int64 
 1   title         27256 non-null  object
 2   genres        27256 non-null  object
 3   release_year  27256 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.0+ MB
None


Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [3]:
# selecting only "recently" released movies: release_year strictly greater than
# start_date and up to and including end_date (i.e. 2014 through 2016)
start_date = 2013
end_date = 2016
mask = (movies['release_year'] > start_date) & (movies['release_year'] <= end_date)
latest_movies = movies.loc[mask]
latest_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 860 entries, 22304 to 27277
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movieId       860 non-null    int64 
 1   title         860 non-null    object
 2   genres        860 non-null    object
 3   release_year  860 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 33.6+ KB


In [4]:
# load the raw ratings table from CSV
ratings = pd.read_csv("data/ratings.csv")

In [5]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
# using Dask library, convert the timestamps to year, month and date
dask_ratings = dd.from_pandas(ratings, npartitions=40)

# Convert the *Dask* timestamp column (not the in-memory pandas one) so the
# conversion is partitioned, and do it once instead of once per derived column.
rating_date = dd.to_datetime(dask_ratings.timestamp, unit='s')
dask_ratings['rating_year'] = rating_date.dt.year
dask_ratings['rating_month'] = rating_date.dt.month
dask_ratings['rating_date'] = rating_date

In [7]:
# compute back the pandas ratings dataframe
ratings = dask_ratings.compute()

In [8]:
# join all ratings with the recently released movies on movieId;
# with pd.merge's default join only ratings of those movies are kept
latest_ratings = pd.merge(ratings, latest_movies, on="movieId")
latest_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42349 entries, 0 to 42348
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   userId        42349 non-null  int64         
 1   movieId       42349 non-null  int64         
 2   rating        42349 non-null  float64       
 3   timestamp     42349 non-null  int64         
 4   rating_year   42349 non-null  int64         
 5   rating_month  42349 non-null  int64         
 6   rating_date   42349 non-null  datetime64[ns]
 7   title         42349 non-null  object        
 8   genres        42349 non-null  object        
 9   release_year  42349 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(6), object(2)
memory usage: 3.6+ MB


In [9]:
# number of ratings and mean rating per release year of the recent movies
tempdf = latest_ratings.groupby(['release_year']).agg({'release_year': 'count', 'rating' : ['mean']})
tempdf.head(100)

Unnamed: 0_level_0,release_year,rating
Unnamed: 0_level_1,count,mean
release_year,Unnamed: 1_level_2,Unnamed: 2_level_2
2014,41353,3.524484
2015,996,2.920181


In [11]:
# Per-movie summary of the recent movies: number of ratings, mean rating,
# title and release year.
# title / release_year come from the movies table joined on movieId, so 'first'
# is used to pick the group's value (pd.Series.mode returns an array when a
# group has several modes and is much slower).
tmpdf = (
    latest_ratings
    .groupby('movieId')
    .agg(
        rating_amount=('rating', 'count'),
        avg_rating=('rating', 'mean'),
        title=('title', 'first'),
        release_year=('release_year', 'first'),
    )
)
tmpdf.head()

Unnamed: 0_level_0,rating_amount,avg_rating,title,release_year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107209,3,2.666667,The Outsider,2014
107516,2,3.25,Punk's Dead: SLC Punk! 2,2014
107769,33,2.530303,Paranormal Activity: The Marked Ones,2014
107916,17,3.176471,Yves Saint Laurent,2014
107962,4,2.75,Freezer,2014


In [10]:
# select only the movies that have been rated more than 10 times and with the average rating higher than 3.0
# NOTE(review): this comment previously said 3.5 while the code filters on 3.0 — confirm the intended threshold
mask = (tmpdf['rating_amount'] > 10) & (tmpdf['avg_rating'] > 3.0)
tmpdf2 = tmpdf.loc[mask]
tmpdf2.sort_values(by="avg_rating", ascending=False).head()

NameError: name 'tmpdf' is not defined

In [None]:
tmpdf2.groupby("release_year").count().head()

In [None]:
#Best movies of 2015 so far:
tmpdf2[tmpdf2.release_year == 2015].sort_values(by="avg_rating", ascending=False).head(10)

In [15]:
#Best movies of 2014 so far:
tmpdf2[tmpdf2.release_year == 2014].sort_values(by="avg_rating", ascending=False).head(10)

Unnamed: 0_level_0,rating_amount,avg_rating,title,release_year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
113315,11,4.5,Zero Motivation (Efes beyahasei enosh),2014
112552,602,4.074751,Whiplash,2014
109487,1739,4.023864,Interstellar,2014
118880,16,4.0,A Girl Walks Home Alone at Night,2014
109374,2230,3.992152,"Grand Budapest Hotel, The",2014
116797,759,3.980237,The Imitation Game,2014
118896,18,3.972222,Mommy,2014
111529,40,3.9625,"Normal Heart, The",2014
113240,26,3.942308,Winter Sleep (Kis Uykusu),2014
112556,1479,3.936444,Gone Girl,2014


### Testing cold starter function

In [None]:
def cold_starters(df, amount = 10):
    """
    Rank movies for users without a rating history.

    The score of a movie is the sum of the min-max normalised ``avg_rating``,
    ``rating_amount`` and ``release_year`` columns (each in [0, 1]).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``avg_rating``, ``rating_amount`` and
        ``release_year``. It is not modified.
    amount : int, default 10
        Number of top-scoring rows to return.

    Returns
    -------
    pandas.DataFrame
        A copy of the ``amount`` best rows of ``df`` with an extra ``score``
        column, sorted by ``score`` in descending order.
    """
    def Min_Max(obj):
        value_range = obj.max() - obj.min()
        if value_range == 0:
            # constant column (e.g. a single release year): it carries no
            # ranking information, so contribute 0 instead of 0/0 = NaN
            return obj * 0.0
        return (obj - obj.min()) / value_range

    score = Min_Max(df.avg_rating) + Min_Max(df.rating_amount) + Min_Max(df.release_year)
    # assign() works on a copy, so the caller's frame (often a slice) is left untouched
    return df.assign(score=score).sort_values(by='score', ascending = False).head(amount)

In [None]:
cold_starters(tmpdf2, 20)

I checked the formula for cold starters. It works quite well for the year 2014, but it will not recommend any movie from 2015 given their low average ratings and low number of ratings. If we want to recommend "new releases", we should apply this formula to the films of 2015 only.

Maybe instead of "trending", the cold starters can be recommended the most popular "recent" movies (2014), mixed with 1 or 2 best  movies of 2015. Or just say that our recommendation system is for the beginning of 2015, and ignore the 2015 releases.

## Implementing the pipeline

* making a pivot matrix
* removing the user bias by normalizing, i.e. subtracting each user's mean rating
* choosing an active user: add myself to the database
* calculating the similarity between the active user and all the other users
* order users by similarity
* select a neighborhood
* predict the rating
* evaluate results (accuracy)

### Getting the utility matrix

In [2]:
# load utility matrix (rows indexed by userId)
util_mat = pd.read_csv('matrices/pivot.csv', index_col='userId')

# load sparse utility matrix
csr_util_mat = load_npz("matrices/sparse_ratings.npz")

In [34]:
# load utility matrix with NAs
util_mat_na = pd.read_csv('matrices/pivot_na.csv', index_col='userId')

In [54]:
util_mat_na

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,116797,116823,116977,117176,117590,118696,118900,118997,119141,119145
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
116,3.0,2.0,2.0,,,1.5,,1.0,1.5,2.0,...,,,,,,,,,,
156,5.0,5.0,2.0,3.0,3.0,4.0,4.0,,3.0,4.0,...,,,,,,,,,,
208,4.0,,,,,,,,,,...,,,,,,,,,,
298,4.0,3.0,3.0,,3.0,5.0,,,,4.0,...,,,,,,,,,,
359,5.0,,,,,5.0,,,,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138134,4.5,2.0,4.0,,2.0,,,,,3.0,...,,,,,,,,,,
138162,4.0,3.0,,,4.0,4.0,3.0,,,,...,,,,,,,,,,
138208,3.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,,2.0,...,,,,,,,,,,
138254,4.0,3.5,2.0,,,4.5,3.0,,,4.0,...,,,,,,,,,,


### Adding myself to the database

In [39]:
sveta_ratings = pd.read_csv('matrices/sveta_ratings.csv')
sveta_ratings.head()

Unnamed: 0,movie_id,imdb_id,tmdb_id,rating,average_rating,title
0,50,114814,629,5.0,4.26568,The Usual Suspects (1995)
1,111,75314,103,5.0,4.08056,Taxi Driver (1976)
2,293,110413,101,4.5,4.09683,Léon: The Professional (1994)
3,296,110912,680,5.0,4.19163,Pulp Fiction (1994)
4,307,108394,108,4.0,3.96434,Three Colors: Blue (1993)


{44555, 2571, 1036, 55820, 527, 2575, 69134, 2076, 541, 26150, 1584, 4144, 50, 1089, 88129, 3147, 7759, 593, 3676, 608, 111, 1136, 4720, 3703, 32892, 4226, 63113, 7327, 6818, 60069, 1704, 1193, 1196, 1199, 1200, 1206, 108727, 1208, 1211, 1214, 3262, 58559, 7361, 1221, 7371, 1232, 1240, 6874, 6365, 5349, 6377, 1258, 4848, 1270, 8961, 48385, 2313, 778, 49932, 4878, 7438, 2324, 1305, 79132, 293, 72998, 296, 5418, 99114, 307, 2360, 73017, 318, 109374, 5952, 57669, 3910, 95058, 3415, 344, 858, 68954, 356, 3949, 4973, 105844, 6016, 4993, 53125, 59784, 6539, 2959, 2455, 923, 924, 112552, 4011, 3503, 109487, 108981, 1464, 39869, 8644, 26578, 7123, 2019, 3569, 5618, 7153, 46578, 2557}


In [69]:
def add_user(ml_user_ratings, pivot_mat, user_id=0):
    """
    Prepend a new user's ratings as the first row of a user x movie pivot.

    Parameters
    ----------
    ml_user_ratings : pandas.DataFrame
        The new user's ratings; must contain ``movie_id`` and ``rating``.
        If a movie appears more than once, its first rating is used.
    pivot_mat : pandas.DataFrame
        Pivot whose column labels are movie ids convertible with
        ``astype(int)``. It is not modified.
    user_id : hashable, default 0
        Index label of the new row.

    Returns
    -------
    pandas.DataFrame
        New frame with the user's row first, followed by the rows of
        ``pivot_mat``. Movies the user did not rate, or that are not columns
        of the pivot, are NaN / ignored respectively.
    """
    rating_by_movie = (
        ml_user_ratings
        .drop_duplicates(subset='movie_id', keep='first')
        .set_index('movie_id')['rating']
    )
    # align the ratings with the pivot's columns in one step instead of
    # writing the cells one by one
    column_ids = pivot_mat.columns.astype(int)
    new_row = pd.DataFrame(
        [rating_by_movie.reindex(column_ids).to_numpy(dtype=float)],
        columns=pivot_mat.columns,
        index=[user_id],
    )
    # the new row is float, so concatenation keeps the pivot's numeric dtype
    return pd.concat([new_row, pivot_mat])

In [73]:
util_mat_na_new = add_user(sveta_ratings, util_mat_na)

### Normalization of the rating matrix

In [82]:
# normalize the item ratings based on the users’ average rating
def csr_mean_normalization(csr_mat):
    """
    Return every user's mean rating computed from the sparse utility matrix.

    The mean of a row is the sum of its entries divided by its number of
    stored entries. Rows without stored entries get NaN.

    NOTE: despite the name, the matrix itself is not normalised yet.
    TODO: subtract the row means from the stored entries.

    Returns
    -------
    numpy.ndarray
        1-D float array with one mean per row of ``csr_mat``.
    """
    # ravel() makes this work for both sparse matrices (2-D sums) and sparse arrays (1-D sums)
    row_sums = np.asarray(csr_mat.sum(axis=1), dtype=float).ravel()
    stored_counts = np.asarray(csr_mat.getnnz(axis=1)).ravel()
    mean_user_vector = np.full(row_sums.shape, np.nan)
    # divide only where something is stored, avoiding 0/0 warnings for empty rows
    np.divide(row_sums, stored_counts, out=mean_user_vector, where=stored_counts > 0)
    return mean_user_vector

def mean_normalization(pivot):
    """
    Centre every user's ratings on that user's mean rating.

    The mean of a row ignores NaN and zero entries. The mean is then
    subtracted from every entry of the row (NaN entries stay NaN).

    Parameters
    ----------
    pivot : pandas.DataFrame
        User x movie rating matrix, one row per user. It is not modified.

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        The per-row means and the centred copy of ``pivot``.
    """
    averages = pivot[pivot != 0].mean(axis = 1).values
    # subtract each row's own mean in one vectorised step (axis=0 aligns the
    # means with the rows) instead of looping over the rows
    mat = pivot.sub(averages, axis=0)
    return averages, mat

In [84]:
means, pivot_norm_na = mean_normalization(util_mat_na_new)
print(means)
pivot_norm_na.head()

[4.15315315 2.14447464 3.64135742 ... 2.66099773 3.55260978 4.33077994]


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,116797,116823,116977,117176,117590,118696,118900,118997,119141,119145
0,,,,,,,,,,,...,,,,,,,,,,
116,0.855525,-0.144475,-0.144475,,,-0.644475,,-1.144475,-0.644475,-0.144475,...,,,,,,,,,,
156,1.358643,1.358643,-1.641357,-0.641357,-0.641357,0.358643,0.358643,,-0.641357,0.358643,...,,,,,,,,,,
208,0.325794,,,,,,,,,,...,,,,,,,,,,
298,1.123423,0.123423,0.123423,,0.123423,2.123423,,,,1.123423,...,,,,,,,,,,


In [85]:
# fillna(0) should come after scaling
pivot_norm = pivot_norm_na.fillna(0)
pivot_norm.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,116797,116823,116977,117176,117590,118696,118900,118997,119141,119145
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116,0.855525,-0.144475,-0.144475,0.0,0.0,-0.644475,0.0,-1.144475,-0.644475,-0.144475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,1.358643,1.358643,-1.641357,-0.641357,-0.641357,0.358643,0.358643,0.0,-0.641357,0.358643,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.325794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298,1.123423,0.123423,0.123423,0.0,0.123423,2.123423,0.0,0.0,0.0,1.123423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelling
Below we execute the following steps:
* Randomly choosing a target user for whom recommendations will be generated
* Pre-clustering of users using KMeans model
* Creating a new matrix corresponding with users in the target user cluster
* Defining the Mean User Vector for the working data matrix
* Training the Nearest Neighbors model on the Mean User Vector
* Processing the results of the NN modeling: collecting, aggregating, filtering results and presenting recommendations

## Pre-clustering with KMeans

In [None]:
# Pre-clustering users with K-means
# 20 clusters; random_state is fixed so the cluster labels are reproducible across runs
kmeans = KMeans(n_clusters=20, algorithm='lloyd', n_init='auto', random_state=42).fit(csr_util_mat) # Demands lots of resources
labels = kmeans.labels_
unique, counts = np.unique(labels, return_counts=True)

print("The number of users per class:\n")
for u, c in zip(unique, counts):
    print(u, c)

In [None]:
# Let's randomly choose an existing user
user_index = np.random.choice(csr_util_mat.shape[0])
print('The target user ', user_index, ' belongs to the cluster number', labels[user_index])

# get indices of the users in the cluster of the target user
cluster_user_indices = np.flatnonzero(labels == labels[user_index]).tolist()

# boolean row mask selecting the users of that cluster
mask = np.zeros(csr_util_mat.shape[0], dtype=bool)
mask[cluster_user_indices] = True

# The restriction to the target user's cluster is currently switched off, so the
# working matrix is the full utility matrix. The target user's row index must
# match whichever matrix is used: the position inside the cluster only applies
# to the shrunken matrix, otherwise the original index stays valid.
restrict_to_cluster = False
if restrict_to_cluster:
    csr_util_mat_cluster = csr_util_mat[mask]
    # When shrinking the matrix, the index of the target user changes to the new_user_index
    new_user_index = cluster_user_indices.index(int(user_index))
else:
    csr_util_mat_cluster = csr_util_mat
    new_user_index = user_index
print('The new index of the target user is ', new_user_index)

# Calculating the mean users' ratings
#def calculate_mean_user_vector(csr_util_mat):
#    csr_mean = csr_util_mat.sum(axis=1)/csr_util_mat.getnnz(axis=1)
#    mean_user_vector = np.asarray(csr_mean)
#    flattened_reshaped = mean_user_vector.mean(axis=1).flatten().reshape(-1, 1)
#    return mean_user_vector.reshape(-1), flattened_reshaped

#mean_user_vector, model_mean_user_vector = calculate_mean_user_vector(csr_util_mat_cluster)

## Nearest Neighbors model

In [None]:
# Create the NN objects (brute-force search, cosine metric) and fit them on the
# rows of csr_util_mat_cluster

number_of_closest_users = 150
nn = neighbors.NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=number_of_closest_users)
nn.fit(csr_util_mat_cluster) 
number_of_most_closest_users = 20 # for visualization purpose only
nn_sub = neighbors.NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=number_of_most_closest_users)
nn_sub.fit(csr_util_mat_cluster)

# Find the nearest neighbors for the target user (e.g., User1)

# boolean mask that is True only at the target user's row
mask = np.zeros(csr_util_mat_cluster.shape[0], dtype=bool)
mask[new_user_index] = True
target_user_row = csr_util_mat_cluster[mask]

# query both models with the target user's row
distances, indices = nn.kneighbors(target_user_row)
distances_sub, indices_sub = nn_sub.kneighbors(target_user_row)

In [None]:
# Exercise: visualize the results as graph
import networkx as nx

# start the graph with the target user as a node
G = nx.Graph()
G.add_node(new_user_index)

# all returned neighbour indices, flattened
closest_users = indices.reshape(-1)
# entries 1 .. number_of_most_closest_users of the flattened result (entry 0 is skipped)
closest_users_sub = indices.reshape(-1)[1:number_of_most_closest_users +1]
# every entry after those
closest_users_remaining = indices.reshape(-1)[number_of_most_closest_users+1 :]

In [None]:
print(len(closest_users), closest_users)
print(len(closest_users_sub), closest_users_sub)

In [None]:
G.add_nodes_from(indices.reshape(-1)[1:])

# Edge weight = inverse cosine distance. A neighbour at distance 0 would give an
# infinite weight and break the layout, so distances are clipped to a tiny
# positive value first.
# NOTE(review): entry 0 is skipped on the assumption that it is the target user itself — confirm
neighbor_distances = np.maximum(distances.reshape(-1)[1:], 1e-12)
weights = 1/neighbor_distances

G.add_weighted_edges_from(list((new_user_index, n, w) for n,w in zip(closest_users[1:],weights)))

pos = nx.fruchterman_reingold_layout(G)

plt.figure(figsize = (10, 10))
ax = plt.axes()
ax.set_facecolor("white")
plt.title("Visualization of the results of the NN algo")

# remaining neighbours, the closest neighbours and the target user are drawn in different colours
nx.draw_networkx_nodes(G, pos, node_size = 100, nodelist = closest_users_remaining, node_color = "#ff5e33")
nx.draw_networkx_nodes(G, pos, node_size = 100, nodelist = closest_users_sub, node_color = "#33ff83", label= str(number_of_most_closest_users) + " closest users")
nx.draw_networkx_nodes(G, pos, node_size = 200, nodelist = [new_user_index], node_color = "#ffffff")
nxlabels = nx.get_edge_attributes(G,'weight')
nx.draw_networkx_edges(G, pos, alpha = 0.3, edge_color = "#48dbc8")
#nx.draw_networkx_edge_labels(G,pos,edge_labels=nxlabels)
#nx.draw_networkx_labels(G,pos,font_size=10, labels={user_index: user_index})
nx.draw_networkx_labels(G,pos,font_size=10)
plt.grid(visible=False)
plt.legend()
plt.show()

## Postprocessing

In [None]:
# Collecting ratings from the closest users
def keep_rows_csr(mat, indices):
    """
    Keep the rows denoted by ``indices`` from the CSR sparse matrix ``mat``.

    The rows are selected with a boolean mask, so they come back in ascending
    row order (not in the order of ``indices``) and duplicates collapse.
    """
    indices = np.asarray(indices).flatten()
    mask = np.zeros(mat.shape[0], dtype=bool)
    mask[indices] = True
    return mat[mask]


def closest_users_average_ratings(mat, movie_ids=None):
    """
    Average rating of every movie (column) over the users (rows) of ``mat``.

    Zero entries count as "not rated" and are ignored. A movie that none of
    the users rated gets NaN.

    Parameters
    ----------
    mat : sparse matrix
        Ratings of the selected users.
    movie_ids : array-like, optional
        Labels for the columns of ``mat``. Defaults to the column positions.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``rating``, one row per column of ``mat``.
    """
    mat_array = mat.toarray().astype(float)
    rating_counts = (mat_array != 0).sum(axis=0)
    rating_sums = mat_array.sum(axis=0)
    av_ratings = np.full(mat_array.shape[1], np.nan)
    # divide only where at least one user rated the movie (no 0/0 warnings)
    np.divide(rating_sums, rating_counts, out=av_ratings, where=rating_counts > 0)
    if movie_ids is None:
        movie_ids = np.arange(mat_array.shape[1])
    return pd.DataFrame(data ={'movieId': movie_ids, 'rating': av_ratings })


def get_movie_titles_by_indices(indices):
    """
    Map column positions of the utility matrix to the column labels of ``util_mat``.

    NOTE(review): assumes the columns of ``util_mat`` are in the same order as
    the columns of the sparse utility matrix — confirm. The labels are whatever
    ``util_mat`` uses as column names.
    """
    return [util_mat.columns[i] for i in indices]


# a csr matrix with the closest users only
csr_util_mat_closests_users = keep_rows_csr(csr_util_mat_cluster, indices)

# Aggregating the ratings from the closest users
# calculate the averaged rating of the movies given by the neighbors
# ('movieId' holds column positions here, which is what the filtering below works with)
rating_aggregation = closest_users_average_ratings(csr_util_mat_closests_users)

# rank them by sorting on the average rating; movies that no neighbor rated
# (NaN) are dropped so they cannot end up at the top of the ranking
movies_indices_sorted_desc = (
    rating_aggregation
    .dropna(subset=['rating'])
    .sort_values(by='rating', ascending=False, kind='stable')['movieId']
    .to_numpy()
)
# top 20 of the raw ranking (may contain movies the user already watched)
best_20_movies = movies_indices_sorted_desc[:20]

# Filtering:
# get the user movies row
target_user_row = csr_util_mat_cluster[[new_user_index]].toarray().flatten()

# get indices of the unwatched movies
unwatched_indices = np.nonzero(target_user_row == 0)

# get indices of the watched movies
watched_indices = np.nonzero(target_user_row != 0)

# keep only unwatched movie indices
filter_arr = np.isin(movies_indices_sorted_desc, unwatched_indices[0])

movies_to_watch_unfiltered = movies_indices_sorted_desc
movies_to_watch = movies_indices_sorted_desc[filter_arr]

# Provide personalised recommendations
# get intersection between watched_indices and the raw top-20 indices
intersection_indices = np.intersect1d(watched_indices[0], best_20_movies)

print('User watched ', len(watched_indices[0]), ' movies')
print('System recommends ', len(best_20_movies), ' movies')
print(len(intersection_indices), ' movies are in common')

print("User " + str(user_index) + " watched and rated the following movies:\n")

for index, movie in enumerate(get_movie_titles_by_indices(watched_indices[0][:20])):
    print(str(index+1), ":", str(movie))

print("\n")
print("Recommendation for User " + str(user_index) + ":\n")
# recommend the best-ranked movies the user has not watched yet
for index, movie in enumerate(get_movie_titles_by_indices(movies_to_watch[:20])):
    print(str(index+1), ":", str(movie))

In [None]:
var = 5

In [None]:
def text_function(x):
    """Return ``x`` plus the notebook-level variable ``var``."""
    return x + var

In [None]:
print(text_function(6))

## Surprise

In [None]:
import os
from surprise import BaselineOnly, Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import cross_validate

In [None]:
final_ratings = pd.read_csv('matrices/final_ratings.csv')
final_ratings.head()

In [None]:
final_ratings.info()

In [None]:
# The ratings are loaded from the final_ratings DataFrame rather than from a file.
# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(0.5, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(final_ratings[["userId", "movieId", "rating"]], reader)

In [None]:
# 5-fold cross-validation of SVD with its default parameters, reporting RMSE and MAE
algo = SVD()
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

In [None]:
# same evaluation for KNNBasic with its default parameters (note: rebinds `algo`)
algo = KNNBasic()
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

## Comparing Surprise with sklearn

### Neighborhood methods study

In [None]:
# Let's randomly choose an existing user
user_index = np.random.choice(csr_util_mat.shape[0])

In [None]:
# Create NN object (brute-force search, cosine metric) and fit it on the full utility matrix

number_of_closest_users = 50
nn = neighbors.NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=number_of_closest_users)
nn.fit(csr_util_mat)

# Find the nearest neighbors for the target user

# user_index was drawn from csr_util_mat, so the query row is taken from that
# same matrix (not from csr_util_mat_cluster, which may hold a different set of rows)
target_user_row = csr_util_mat[[user_index]]

distances, indices = nn.kneighbors(target_user_row)

In [None]:
csr_util_mat
