In [4]:
import rs_datasets
import pandas as pd
import numpy as np
from scipy.linalg import sqrtm
from scipy.sparse import linalg as sparse_linalg
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error

In [5]:
# Load the MovieLens 100k ratings and work on a copy of the ratings frame
data = rs_datasets.MovieLens('100k').ratings.copy()

4.94MB [00:14, 349kB/s]                             


In [6]:
# Preview the first rows of the ratings table
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
# Represent user and item identifiers as strings
data = data.astype({'user_id': 'str', 'item_id': 'str'})

In [8]:
users = data['user_id'].unique()  # distinct user IDs
movies = data['item_id'].unique()  # distinct movie (item) IDs

In [9]:
# Empty train/test frames that share the column layout of `data`
train, test = (pd.DataFrame(columns=data.columns) for _ in range(2))

In [11]:
test_ratio = 0.2  # share of each user's most recent ratings held out for testing

# Per-user pieces are collected in lists and concatenated once at the end.
# This keeps the cell idempotent (re-running it never duplicates rows) and
# avoids the quadratic cost of growing a DataFrame inside the loop.
train_parts = []
test_parts = []
for u in users:
    # One user's ratings in chronological order, re-labelled 0..n-1
    user_ratings = (
        data[data['user_id'] == u]
        .sort_values('timestamp')
        .reset_index(drop=True)
    )
    n = len(user_ratings)
    test_size = int(test_ratio * n)
    # The newest test_size + 1 rows form the test part, so every user
    # contributes at least one test rating; everything older is training data.
    split_at = max(n - 1 - test_size, 0)
    train_parts.append(user_ratings.iloc[:split_at])
    test_parts.append(user_ratings.iloc[split_at:])

# Fall back to an empty frame with the right columns when there are no users
train = pd.concat(train_parts) if train_parts else data.iloc[:0].copy()
test = pd.concat(test_parts) if test_parts else data.iloc[:0].copy()

In [14]:
def create_utility_matrix(data, formatizer=None):
    '''
    Build a dense user x item matrix of values from a long-format table.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per (user, item, value) observation.
    formatizer : dict, optional
        Positional column numbers under the keys 'user', 'item' and 'value'.
        Defaults to {'user': 0, 'item': 1, 'value': 2}.

    Returns
    -------
    X : pandas.DataFrame
        Rows are labelled by user, columns by item; cells without an
        observation hold NaN. Values are stored as floats, so the value
        column is expected to be numeric.
    users_index : dict
        Maps each user label to its row position in X.
    items_index : dict
        Maps each item label to its column position in X.

    Notes
    -----
    Users and items are ordered by first appearance in ``data``, which makes
    the layout reproducible between runs. If a (user, item) pair occurs more
    than once, the last value wins.
    '''
    if formatizer is None:
        formatizer = {'user': 0, 'item': 1, 'value': 2}

    user_list = data.iloc[:, formatizer['user']].tolist()
    item_list = data.iloc[:, formatizer['item']].tolist()
    value_list = data.iloc[:, formatizer['value']].tolist()

    # dict.fromkeys removes duplicates while keeping first-appearance order
    users = list(dict.fromkeys(user_list))
    items = list(dict.fromkeys(item_list))
    users_index = {user: pos for pos, user in enumerate(users)}
    items_index = {item: pos for pos, item in enumerate(items)}

    # Start from an all-NaN grid and write the observed values into it
    matrix = np.full((len(users), len(items)), np.nan)
    for user, item, value in zip(user_list, item_list, value_list):
        matrix[users_index[user], items_index[item]] = value

    X = pd.DataFrame(matrix, index=users, columns=items)
    return X, users_index, items_index

In [59]:
# Candidate numbers of latent factors (the k passed to the svd_* functions)
no_of_features = [8, 10, 12, 14, 17]
# Rating matrix built from the training split, plus label -> position lookups
utilMat, users_index, items_index = create_utility_matrix(train)

In [17]:
def svd_simple(train, k):
    '''
    svd без понтов
    '''
    utilMat = csr_matrix(train.fillna(0).values)
    U, s, V = sparse_linalg.svds(utilMat, k)
    s = np.diag(s)
    s_root = sqrtm(s)
    Usk = np.dot(U, s_root)
    skV = np.dot(s_root, V)
    UsV = np.dot(Usk, skV)
    return UsV

In [18]:
# Evaluation: test-set RMSE of svd_simple for every candidate rank
for f in no_of_features:
    svdout = svd_simple(utilMat, k=f)
    pred = []
    for user, item in zip(test['user_id'], test['item_id']):
        user_row = svdout[users_index[user]]
        if item not in items_index:
            # Item never seen in training: use the user's mean predicted rating
            pred.append(np.mean(user_row))
            continue
        pred.append(user_row[items_index[item]])
    print(mean_squared_error(test['rating'], pred)**0.5)

2.8107698931135627
2.8095067363779505
2.8101565972243154
2.8175902121890806
2.830510650366513


In [35]:
def svd_fillna(train, k):
    '''
    Rank-k SVD reconstruction with gaps filled by column (item) means.

    Missing entries (NaN) of ``train`` are replaced by the mean of the
    observed values in the same column. The column means are subtracted,
    the result is decomposed with a dense SVD truncated to the ``k``
    leading singular values, and the column means are added back.

    Parameters
    ----------
    train : rating matrix with NaN for missing entries
    k : int, number of singular values kept

    Returns
    -------
    2-D array with the same shape as ``train`` holding the reconstruction.
    '''
    ratings = train.copy()
    # Hide the gaps so that they do not contribute to the column means
    observed = np.ma.masked_array(ratings, np.isnan(ratings))
    item_means = np.mean(observed, axis=0)
    # Every gap receives the mean of its column
    filled = observed.filled(item_means)
    # Column means repeated for each row; used to centre and later to restore
    x = np.tile(item_means, (filled.shape[0], 1))
    centred = filled - x
    U, s, V = np.linalg.svd(centred, full_matrices=False)
    # Keep the k leading singular triplets, splitting sigma between the factors
    s_root = sqrtm(np.diag(s[:k]))
    Usk = np.dot(U[:, :k], s_root)
    skV = np.dot(s_root, V[:k, :])
    # Undo the centring
    return np.dot(Usk, skV) + x

In [37]:
# Evaluation: test-set RMSE of svd_fillna for every candidate rank
for f in no_of_features:
    svdout = svd_fillna(utilMat, k=f)
    pred = []
    for user, item in zip(test['user_id'], test['item_id']):
        user_row = svdout[users_index[user]]
        if item not in items_index:
            # Item never seen in training: use the user's mean predicted rating
            pred.append(np.mean(user_row))
            continue
        pred.append(user_row[items_index[item]])
    print(mean_squared_error(test['rating'], pred)**0.5)

1.036695184072289
1.0376126313011744
1.036940958610357
1.037315110588204
1.0386615165258344


In [60]:
# Step-by-step walk-through of the preprocessing done inside the svd_* functions:
# mask the missing ratings and average each column over the observed ones only
mask = np.isnan(utilMat)
masked_arr = np.ma.masked_array(utilMat, mask)
item_means = np.mean(masked_arr, axis=0)

In [61]:
# Replace the missing ratings with the mean of their column.
# NOTE(review): this rebinds the global utilMat to the filled array, so the
# NaN markers are gone from here on; rebuild the matrix with
# create_utility_matrix(train) before handing utilMat to code that expects NaN gaps.
utilMat = masked_arr.filled(item_means)
# Column means repeated for every row of the matrix
x = np.tile(item_means, (utilMat.shape[0], 1))

In [62]:
# Mean over all entries of the mean-filled matrix
total_mean = np.mean(utilMat)
total_mean

3.112882049259111

In [63]:
# Subtract the column means, then remove the remaining global, column and row offsets.
# NOTE(review): not idempotent — utilMat is overwritten in place, so running this
# cell a second time subtracts x (and the offsets) again.
utilMat = utilMat - x
total_mean = np.mean(utilMat)
col_means = np.mean(utilMat, axis=0) - total_mean
row_means = np.mean(utilMat, axis=1) - total_mean
utilMat = utilMat - row_means[:, np.newaxis] - col_means - total_mean

In [76]:
# Inspect the per-column offsets left after subtracting the column means
col_means

masked_array(data=[-5.847980092387419e-17, 3.8674332984566793e-19,
                   -2.9097831483626442e-18, ..., 1.7995403919349447e-18,
                   -5.7353772725411976e-18, -8.418902418409097e-20],
             mask=[False, False, False, ..., False, False, False],
       fill_value=1e+20)

In [40]:
def svd_fillna_shift(train, k):
    '''
    Rank-k SVD reconstruction with mean filling and additional centring.

    Missing entries (NaN) are replaced by the mean of the observed values in
    their column and the column means are subtracted. The remaining global,
    per-column and per-row means are then removed as well before a dense SVD
    truncated to ``k`` singular values is taken. All shifts are added back to
    the reconstruction.

    Parameters
    ----------
    train : rating matrix with NaN for missing entries
    k : int, number of singular values kept

    Returns
    -------
    2-D array with the same shape as ``train`` holding the reconstruction.
    '''
    ratings = train.copy()
    # Hide the gaps so that they do not contribute to the column means
    observed = np.ma.masked_array(ratings, np.isnan(ratings))
    item_means = np.mean(observed, axis=0)
    # Every gap receives the mean of its column
    filled = observed.filled(item_means)
    x = np.tile(item_means, (filled.shape[0], 1))
    centred = filled - x
    # Offsets that are still present after the column means were removed
    total_mean = np.mean(centred)
    col_means = np.mean(centred, axis=0) - total_mean
    row_means = np.mean(centred, axis=1) - total_mean
    row_shift = row_means[:, np.newaxis]
    centred = centred - row_shift - col_means - total_mean
    U, s, V = np.linalg.svd(centred, full_matrices=False)
    # Keep the k leading singular triplets, splitting sigma between the factors
    s_root = sqrtm(np.diag(s[:k]))
    Usk = np.dot(U[:, :k], s_root)
    skV = np.dot(s_root, V[:k, :])
    UsV = np.dot(Usk, skV)
    # Restore the shifts in the reverse order of their removal
    UsV = UsV + row_shift + col_means + total_mean
    return UsV + x

In [41]:
# Evaluation: test-set RMSE of svd_fillna_shift for every candidate rank.
# The rating matrix is rebuilt from `train` here instead of reusing the global
# `utilMat`: the exploratory cells above overwrite `utilMat` with a mean-filled,
# centred array, which would silently corrupt the result when the notebook is
# run from top to bottom.
train_matrix, train_users_index, train_items_index = create_utility_matrix(train)
for f in no_of_features:
    svdout = svd_fillna_shift(train_matrix, k=f)
    pred = []
    for user, item in zip(test['user_id'], test['item_id']):
        user_row = svdout[train_users_index[user]]
        if item in train_items_index:
            pred.append(user_row[train_items_index[item]])
        else:
            # Item never seen in training: use the user's mean predicted rating
            pred.append(np.mean(user_row))
    print(mean_squared_error(test['rating'], pred)**0.5)

1.036062527500169
1.0374378272460585
1.0367533284622752
1.037352936452513
1.0385584659382243


In [42]:
def svd_fillna_shift_norm(train, k):
    '''
    Rank-k SVD reconstruction with mean filling, centring and rescaling.

    Missing entries (NaN) are replaced by the mean of the observed values in
    their column and the column means are subtracted; the remaining global,
    per-column and per-row means are removed as well. Each entry is then
    divided by the square root of the Euclidean norm of its row and by the
    square root of the Euclidean norm of its column. A dense SVD truncated to
    ``k`` singular values is taken, after which the scaling and all shifts
    are undone.

    Parameters
    ----------
    train : rating matrix with NaN for missing entries
    k : int, number of singular values kept

    Returns
    -------
    2-D array with the same shape as ``train`` holding the reconstruction.
    '''
    ratings = train.copy()
    # Hide the gaps so that they do not contribute to the column means
    observed = np.ma.masked_array(ratings, np.isnan(ratings))
    item_means = np.mean(observed, axis=0)
    # Every gap receives the mean of its column
    filled = observed.filled(item_means)
    x = np.tile(item_means, (filled.shape[0], 1))
    centred = filled - x
    # Offsets that are still present after the column means were removed
    total_mean = np.mean(centred)
    col_means = np.mean(centred, axis=0) - total_mean
    row_means = np.mean(centred, axis=1) - total_mean
    row_shift = row_means[:, np.newaxis]
    centred = centred - row_shift - col_means - total_mean
    # Euclidean norms of the rows (as a column vector) and columns (as a row vector)
    row_norms = np.sqrt(np.sum(centred * centred, axis=1))[:, np.newaxis]
    col_norms = np.sqrt(np.sum(centred * centred, axis=0))[np.newaxis, :]
    row_scale = np.sqrt(row_norms)
    col_scale = np.sqrt(col_norms)
    scaled = centred / row_scale / col_scale
    U, s, V = np.linalg.svd(scaled, full_matrices=False)
    # Keep the k leading singular triplets, splitting sigma between the factors
    s_root = sqrtm(np.diag(s[:k]))
    Usk = np.dot(U[:, :k], s_root)
    skV = np.dot(s_root, V[:k, :])
    UsV = np.dot(Usk, skV)
    # Undo the scaling first, then the shifts
    UsV = UsV * row_scale * col_scale
    UsV = UsV + row_shift + col_means + total_mean
    return UsV + x

In [43]:
# Evaluation: test-set RMSE of svd_fillna_shift_norm for every candidate rank.
# The rating matrix is rebuilt from `train` here instead of reusing the global
# `utilMat`: the exploratory cells above overwrite `utilMat` with a mean-filled,
# centred array, which would silently corrupt the result when the notebook is
# run from top to bottom.
train_matrix, train_users_index, train_items_index = create_utility_matrix(train)
for f in no_of_features:
    svdout = svd_fillna_shift_norm(train_matrix, k=f)
    pred = []
    for user, item in zip(test['user_id'], test['item_id']):
        user_row = svdout[train_users_index[user]]
        if item in train_items_index:
            pred.append(user_row[train_items_index[item]])
        else:
            # Item never seen in training: use the user's mean predicted rating
            pred.append(np.mean(user_row))
    print(mean_squared_error(test['rating'], pred)**0.5)

1.035402964598958
1.0353785630013288
1.0347903758778334
1.0357938453084006
1.035679638469686
