Удалим из выборки часть оценок каждого пользователя (последние 20 % по времени) и спрогнозируем их, а затем проверим, насколько прогноз близок к факту, рассчитав RMSE для двух векторов — прогнозного и фактического.

In [1]:
import pandas as pd
import scipy
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import svds
from sklearn.model_selection import cross_val_score
from scipy.linalg import sqrtm
from sklearn.metrics import mean_squared_error
import random

In [2]:
# Location of the ratings table inside the Kaggle input directory.
path_ratings = "/kaggle/input/ratings.csv"
# Read the whole CSV into memory and preview the first five rows.
ratings = pd.read_csv(filepath_or_buffer=path_ratings)
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Show the dtype of every column of the loaded ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [4]:
# Store both identifier columns as strings so they are handled as labels
# rather than as numbers.
for id_column in ('userId', 'movieId'):
    ratings[id_column] = ratings[id_column].astype('str')

In [5]:
# Distinct user ids, in order of first appearance in the ratings table.
users = ratings.loc[:, 'userId'].unique()

In [6]:
# Chronological per-user hold-out split.
#
# For every user the ratings are ordered by timestamp; the most recent
# `test_ratio` share (rounded down) goes to `test`, everything earlier goes
# to `train`. A user with fewer than 1 / test_ratio ratings contributes
# nothing to `test`.
test_ratio = 0.2
train_parts = []
test_parts = []
# One groupby pass replaces a full boolean-mask scan of `ratings` per user;
# sort=False keeps users in order of first appearance.
for _, user_history in ratings.groupby('userId', sort=False):
    user_history = user_history.sort_values('timestamp').reset_index(drop=True)
    n = len(user_history)
    test_size = int(test_ratio * n)
    train_parts.append(user_history.iloc[:n - test_size])
    test_parts.append(user_history.iloc[n - test_size:])

# Concatenate once: growing a frame inside the loop copies all accumulated
# rows on every iteration, and starting from an empty frame can degrade the
# column dtypes. pd.concat rejects an empty list, hence the fallback.
train = pd.concat(train_parts) if train_parts else ratings.iloc[0:0].copy()
test = pd.concat(test_parts) if test_parts else ratings.iloc[0:0].copy()

In [7]:
# (rows, columns) of the held-out split.
test.shape

(19940, 4)

In [8]:
# (rows, columns) of the training split.
train.shape

(80896, 4)

In [9]:
# (rows, columns) of the full ratings table, for comparison with the splits.
ratings.shape

(100836, 4)

In [10]:
# Dense user x movie matrix of training ratings; unrated cells become 0.
# Keyword arguments are used because recent pandas versions no longer accept
# positional arguments to DataFrame.pivot.
train_matrix_df = train.pivot(
    index='userId', columns='movieId', values='rating'
).fillna(0)
# Explicit copy so the in-place shuffle below cannot touch the data held by
# train_matrix_df.
train_matrix = train_matrix_df.to_numpy(dtype=float, copy=True)
# random.shuffle swaps elements with `a[i], a[j] = a[j], a[i]`; on a 2-D
# ndarray the rows are views, so the swap overwrites one row with the other
# and the result has duplicated and missing rows. np.random.shuffle permutes
# the rows along the first axis without corrupting them.
# NOTE: after shuffling, row order no longer matches train_matrix_df.index.
np.random.shuffle(train_matrix)
train_matrix

array([[4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Build the user x item utility table X from the training split.
# The first three columns of `train` are read positionally as
# (user, item, rating).
user_list = train.iloc[:, 0].tolist()
item_list = train.iloc[:, 1].tolist()
value_list = train.iloc[:, 2].tolist()

# Distinct users / items and the row position assigned to each user.
users = list(set(user_list))
items = list(set(item_list))
users_index = {user_id: row_pos for row_pos, user_id in enumerate(users)}

# One column per item, pre-filled with NaN for "no rating in train".
pd_dict = {item_id: [np.nan] * len(users) for item_id in items}

# Drop every known rating into its (user row, item column) cell; if a pair
# occurs more than once, the last occurrence wins.
for user, item, value in zip(user_list, item_list, value_list):
    pd_dict[item][users_index[user]] = value

X = pd.DataFrame(pd_dict, index=users)

# Column position of every item in X.
itemcols = X.columns.tolist()
items_index = {item_id: col_pos for col_pos, item_id in enumerate(itemcols)}

In [12]:
# Impute missing ratings with the item (column) mean, then subtract that
# mean from every row so each column is centred on zero.
util_mat = X.to_numpy()
mask = np.isnan(util_mat)
masked_arr = np.ma.array(util_mat, mask=mask)
# Column means computed over the observed (unmasked) ratings only.
item_means = masked_arr.mean(axis=0)
util_mat = masked_arr.filled(item_means)
# Item means repeated for every user row; added back after the SVD step.
n_rows = util_mat.shape[0]
x = np.tile(item_means, (n_rows, 1))
util_mat = util_mat - x

In [14]:
# Rank-k reconstruction of the centred utility matrix via SVD.
k = 10
U, s, V = np.linalg.svd(util_mat, full_matrices=False)
# Keep only the k leading singular values / vectors.
s = np.diag(s[:k])
U = U[:, :k]
V = V[:k, :]
# Split the singular values evenly between the two factors.
s_root = sqrtm(s)
Usk = np.dot(U, s_root)
skV = np.dot(s_root, V)
# Multiply the factors back together and restore the item means.
UsV = np.dot(Usk, skV) + x

In [16]:
# Second name for the reconstructed rating matrix UsV (same object, no copy).
svdout = UsV

In [17]:
# Look up a predicted rating for every held-out (user, movie) pair.
# Movies that have no column in the training matrix fall back to the mean
# of that user's reconstructed row.
pred = []
for test_user, test_item in zip(test['userId'], test['movieId']):
    user_row = svdout[users_index[test_user]]
    item_pos = items_index.get(test_item)
    if item_pos is None:
        pred.append(np.mean(user_row))
    else:
        pred.append(user_row[item_pos])

In [22]:
# Root-mean-squared error between the held-out ratings and the predictions.
np.sqrt(mean_squared_error(y_true=test['rating'], y_pred=pred))

1.0055113644296951