# user-item матрица

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# The dataset directory can be overridden with the ML20M_DATA_DIR environment
# variable; the default keeps the path this notebook was originally run with.
data_dir = os.environ.get("ML20M_DATA_DIR", "/Users/tural/Datasets/ml-20m")
r_df = pd.read_csv(os.path.join(data_dir, "ratings.csv"))
r_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
from scipy.sparse import csr_matrix

def load_data(df):
    """Build a sparse user-item rating matrix from a ratings table.

    User and movie ids are mapped to consecutive row / column indices in the
    order in which they first appear in ``df``.

    :param df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns;
        the id columns must not contain missing values
    :return: tuple ``(ui_m, uid_to_row, iid_to_col)`` where ``ui_m`` is a
        ``csr_matrix`` of shape (number of users, number of items) holding
        the ratings, and the two dicts map raw ids to row / column indices
    """
    # factorize() numbers the distinct ids in order of first appearance,
    # replacing a Python-level loop over every rating row.
    rows, user_ids = pd.factorize(df["userId"])
    cols, item_ids = pd.factorize(df["movieId"])

    # tolist() yields plain Python scalars to use as dictionary keys.
    uid_to_row = {uid: row for row, uid in enumerate(user_ids.tolist())}
    iid_to_col = {iid: col for col, iid in enumerate(item_ids.tolist())}

    # The explicit shape keeps construction valid even when df has no rows.
    ui_m = csr_matrix(
        (df["rating"].values, (rows, cols)),
        shape=(len(uid_to_row), len(iid_to_col)),
    )
    return ui_m, uid_to_row, iid_to_col

In [4]:
# Build the sparse user-item matrix together with the id -> index lookups.
ui_m, uid_to_row, iid_to_col = load_data(r_df)
# Drop the ratings DataFrame now that its contents have been copied into ui_m.
# After this, re-running the cell raises NameError until the loading cell is
# executed again.
del r_df

In [5]:
# Summary of the user-item matrix: fraction of stored entries, range of the
# stored ratings, and (rows, columns) dimensions.
print("Density", ui_m.nnz / (ui_m.shape[0] * ui_m.shape[1]))
print("Max rating", ui_m.data.max())
print("Min rating", ui_m.data.min())
print("Shape", ui_m.shape)

Density 0.0053998478135544505
Max rating 5.0
Min rating 0.5
Shape (138493, 26744)


## Простейшая матрица схожести

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
# Item-item cosine similarity: ui_m is transposed so that each row is an item.
# dense_output=False asks for a sparse result instead of a dense array.
ii_sim_m = cosine_similarity(ui_m.T.tocsr(), dense_output=False)

In [7]:
# Summary of the full item-item similarity matrix: fraction of stored entries,
# range of the stored similarities, and dimensions.
print("Density", ii_sim_m.nnz / (ii_sim_m.shape[0] * ii_sim_m.shape[1]))
print("Max sim", ii_sim_m.data.max())
print("Min sim", ii_sim_m.data.min())
print("Shape", ii_sim_m.shape)

Density 0.4010638638301901
Max sim 1.0
Min sim 1.64155673741e-05
Shape (26744, 26744)


In [8]:
# вспомогательные функции, которые могут пригодиться при построении Item-based CF
def nullify_main_diagonal(m):
    """Return a new sparse matrix equal to ``m`` with its main diagonal zeroed.

    Rectangular matrices are supported: the main diagonal then has
    ``min(m.shape)`` entries.

    :param m: scipy sparse matrix
    :return: sparse matrix with the same off-diagonal values as ``m`` and
        zeros on the main diagonal; ``m`` itself is not modified
    """
    # Using min(shape) keeps the mask indices in bounds when the matrix has
    # more rows than columns.
    diag_len = min(m.shape)
    positions = np.arange(diag_len)
    diag_mask = csr_matrix((np.ones(diag_len), (positions, positions)), shape=m.shape)
    # m.multiply(diag_mask) isolates the diagonal; subtracting it clears it.
    return m - m.multiply(diag_mask)


def get_topk(matrix, top, axis=1):
    """Converts source matrix to Top-K matrix
    where each row or column contains only top K values

    Rows (or columns) that hold at most ``top`` stored entries are kept
    unchanged.

    :param matrix: source scipy sparse matrix
    :param top: number of top items to be stored; ``None`` keeps every stored
        entry, ``0`` yields a matrix with no stored entries
    :param axis: 0 - top by column, 1 - top by row
    :return: csr_matrix with the shape of ``matrix`` that contains only the
        selected entries
    :raises ValueError: if ``top`` is negative
    """
    if top is not None and top < 0:
        raise ValueError("top must be a non-negative integer or None")

    # Always work row-wise on CSR data; for axis=0 the columns become rows.
    if axis == 0:
        matrix = matrix.T
    matrix = matrix.tocsr()

    rows = []
    cols = []
    data = []

    # top == 0 keeps nothing. Without this guard the slice [-0:] below would
    # select the whole row instead of an empty one.
    if top != 0:
        indptr = matrix.indptr
        for row_id in range(matrix.shape[0]):
            # Slice the row straight out of the CSR arrays instead of
            # building a one-row sparse matrix per iteration.
            start, stop = indptr[row_id], indptr[row_id + 1]
            row_indices = matrix.indices[start:stop]
            row_data = matrix.data[start:stop]

            if top is not None and len(row_data) > top:
                # argsort is ascending, so the last `top` positions are the largest.
                top_args = np.argsort(row_data)[-top:]
                row_indices = row_indices[top_args]
                row_data = row_data[top_args]

            rows += [row_id] * len(row_data)
            cols += row_indices.tolist()
            data += row_data.tolist()

    topk_m = csr_matrix((data, (rows, cols)), (matrix.shape[0], matrix.shape[1]))

    # Undo the transposition applied for column-wise selection.
    if axis == 0:
        topk_m = topk_m.T.tocsr()

    return topk_m

In [9]:
# Clear the diagonal (each item's similarity with itself) first, so that the
# top-30 selection per row is made among other items only.
ii_sim_m = nullify_main_diagonal(ii_sim_m)
# Keep at most the 30 largest similarities in every row.
ii_sim_m = get_topk(ii_sim_m, top=30)

In [10]:
# Same summary as before, now for the pruned (zero-diagonal, top-30 per row)
# similarity matrix.
print("Density", ii_sim_m.nnz / (ii_sim_m.shape[0] * ii_sim_m.shape[1]))
print("Max sim", ii_sim_m.data.max())
print("Min sim", ii_sim_m.data.min())
print("Shape", ii_sim_m.shape)

Density 0.0011216197041442515
Max sim 1.0
Min sim 0.000939213518196
Shape (26744, 26744)


## Item-based rating prediction

In [12]:
from sklearn.preprocessing import binarize

# NOTE(review): user_id and item_id are used below directly as row / column
# positions of ui_m and ii_sim_m; they are not translated through
# uid_to_row / iid_to_col. Confirm they are meant to be matrix indices rather
# than raw dataset ids.
user_id = 99072
item_id = 297

# Numerator: the item's stored similarities multiplied by this user's ratings.
up = ii_sim_m[item_id].dot(ui_m[user_id].T)
# Denominator: absolute similarities multiplied by a 0/1 indicator of which
# items the user has rated (binarize maps positive ratings to 1).
down = np.abs(ii_sim_m[item_id]).dot(binarize(ui_m[user_id]).T)

# Stored rating for this pair; 0.0 means no rating is stored.
print(user_id, item_id, ":", ui_m[user_id, item_id])
# NOTE(review): down.sum() is 0 when the user has rated none of the item's
# stored neighbours; the division presumably yields nan then. Confirm whether
# a fallback is wanted.
print("Prediction:", up.sum() / down.sum())

99072 297 : 0.0
Prediction: 3.18107435751
