In [48]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
import numpy as np

In [2]:
ratings = pd.read_csv(
    "data/ratings.csv",
    sep=',',
    names=["userid", "itemid", "rating", "timestamp"],
    skiprows=1
)

In [4]:
ratings.head()

Unnamed: 0,userid,itemid,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [10]:
ratings.shape

(25000095, 4)

In [5]:
ratings["userid"].nunique()

162541

In [7]:
ratings["itemid"].nunique()

59047

In [11]:
ratings["itemid"].min()

1

In [12]:
ratings["itemid"].max()

209171

In [13]:
ratings["userid"].min(), ratings["userid"].max()

(1, 162541)

In [18]:
item_encoder = LabelEncoder()
items = item_encoder.fit_transform(ratings["itemid"])

In [19]:
user_encoder = LabelEncoder()
users = user_encoder.fit_transform(ratings["userid"])

In [21]:
rating_matrix = csr_matrix((ratings["rating"], (users, items)))

In [22]:
rating_matrix

<162541x59047 sparse matrix of type '<class 'numpy.float64'>'
	with 25000095 stored elements in Compressed Sparse Row format>

In [23]:
import implicit

In [24]:
model = implicit.als.AlternatingLeastSquares()

In [25]:
model.fit(rating_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

In [26]:
model.similar_users(2)

(array([     2, 126881,   3746,   9807, 124560,  51327, 124567,   8504,
         72310,  37959], dtype=int32),
 array([1.        , 0.7489628 , 0.74526894, 0.7296327 , 0.72887444,
        0.69245577, 0.691985  , 0.6898754 , 0.67966044, 0.6785391 ],
       dtype=float32))

In [27]:
model.similar_items(2)

(array([  2,   4,   6, 627, 133, 786, 704, 693,  78, 772], dtype=int32),
 array([1.0000001 , 0.8865746 , 0.8227142 , 0.79436594, 0.7769356 ,
        0.7735333 , 0.76941115, 0.755799  , 0.74906534, 0.7463218 ],
       dtype=float32))

In [53]:
ratings_u_user = rating_matrix[2].toarray().ravel()
mean_u_rating = ratings_u_user[ratings_u_user != 0].mean()
std_u_rating = ratings_u_user[ratings_u_user != 0].std()

In [54]:
mean_u_rating, std_u_rating

(3.6974085365853657, 0.5993970477928102)

In [55]:
similar_users, similiraty = model.similar_users(2, N=11)

In [56]:
similar_users = similar_users[1:]
similiraty = similiraty[1:]

In [57]:
total_rating = 0
for user, similiraty in zip(similar_users, similiraty):
    ratings_v_user = rating_matrix[user].toarray().ravel()
    mean_v_rating = ratings_v_user[ratings_v_user != 0].mean()
    std_v_rating = ratings_v_user[ratings_v_user != 0].std()
    total_rating += similiraty * (ratings_v_user - mean_v_rating) / std_v_rating

In [58]:
np.argsort(total_rating.ravel())[:5]

array([29523, 39318, 39319, 39320, 39321])

In [63]:
ratings_u_user[39321]

0.0