In [2]:
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
import scipy.sparse.csr
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import scipy.sparse as sp
from sklearn.metrics import ndcg_score
from tqdm.notebook import tqdm
import json
%config InlineBackend.figure_format='retina'

### Список датасетов
 - Gowalla
 - Yelp2018
 - Amazon-book

In [7]:
class Dataset(ABC):
    def __init__(self, ratings_filename, metadata_filename=None):
        self.ratings_filename = ratings_filename
        self.metadata_filename = metadata_filename
        self.df = None

    @staticmethod
    def _parse_json(filename, nrows):
        # pd.read_json('Musical_Instruments_5.json', nrows=10, chunksize=None, lines=True)
        data = []
        with open(filename) as json_file:
            while len(data) < nrows:
                data.append(json.loads(next(json_file)))
        return data

    @abstractmethod
    def load_dataset(self, nrows):
        pass

    # @abstractmethod
    def cold_start_type_split(self):
        pass

    # @abstractmethod
    def train_test_split(self):
        pass

    # @abstractmethod
    def build_sparse_interaction_matrix(self) -> sp.csr_matrix:
        pass


class AmazonDataset(Dataset):
    def load_rating_dataset(self, nrows):
        df = pd.read_csv(self.ratings_filename, nrows=nrows, header=None, dtype={3: int})
        df[3] = pd.to_datetime(df[3], unit='s')
        df.columns = ["user", "item", "rating", "timestamp"]
        self.df = df
        return df

    def load_dataset(self, nrows):
        return self._parse_json(self.ratings_filename, nrows)

    def load_metadata(self, nrows):
        return self._parse_json(self.metadata_filename, nrows)

In [8]:
amazon = AmazonDataset('Musical_Instruments_5.json', 'meta_Appliances.json')
df = amazon.load_dataset(3)

In [None]:
amazon.load_metadata(3)[2]

## Amazon Movies

In [None]:
amazon = AmazonDataset('ratings_Movies_and_TV.csv')
df = amazon.load_rating_dataset(500_000)

### Матрица интеракций

In [None]:
# отфильтруем по позитивным оценкам
df_positive = df[df['rating'] >= 4]
df_positive.shape

In [None]:
encoder = LabelEncoder()

df_positive.loc[:, 'user'] = encoder.fit_transform(df_positive['user'])
df_positive.loc[:, 'item'] = encoder.fit_transform(df_positive['item'])

In [None]:
df_positive.nunique()

In [None]:
# csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
sparse_interactions = sp.csr_matrix((df_positive['rating'], (df_positive['user'], df_positive['item'])))

In [None]:
sparse_interactions

In [None]:
sparse_interactions = sp.csr_matrix((df_positive['rating'], (df_positive['user'], df_positive['item'])))

### Собираем граф

In [None]:
adj_mat = sp.csr_matrix((4, 4)).tolil()
adj_mat[2:, 2:] = [[1, 1], [1, 1]]
adj_mat.todense()

In [None]:
del adj_mat

### GF-CF

In [6]:
from sparsesvd import sparsesvd

In [71]:
class GF_CF(object):
    def __init__(self, train_matrix):
        self.train_matrix = train_matrix

    def fit(self, dim=16):
        R = self.train_matrix
        rowsum = np.array(R.sum(axis=1))
        d_inv = np.power(rowsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        R_norm = d_mat @ R

        colsum = np.array(R.sum(axis=0))
        d_inv = np.power(colsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        self.d_mat_i = d_mat
        self.d_mat_i_inv = sp.diags(1 / d_inv)
        R_norm = R_norm @ d_mat
        self.R_norm = R_norm.tocsc()
        ut, s, self.vt = sparsesvd(self.R_norm, dim)
        print('Train finished')

    def predict(self, new_ratings: sp.coo_matrix):
        R_norm = self.R_norm
        # new_ratings = new_ratings.todense()
        U_2 = new_ratings @ R_norm.T @ R_norm
        U_1 = new_ratings @  self.d_mat_i @ self.vt.T @ self.vt @ self.d_mat_i_inv
        predict = U_2 + U_1
        predict[np.isnan(predict)] = 0

        # if ds_name == 'amazon-book':
        #     ret = U_2
        # else:
        #     U_1 = batch_test @  self.d_mat_i @ self.vt.T @ self.vt @ self.d_mat_i_inv
        #     ret = U_2 + 0.3 * U_1
        # return ret
        return predict

    def recommend_top_k(self, interactions: sp.coo_matrix, k=20):
        """
        Предполагается нумерация айтемов с 0, иначе не будет работать argsort
        """
        ranks = self.predict(interactions)
        return np.asarray(np.argsort(-ranks, axis=1)[:, :k])

In [None]:
gf_cf = GF_CF(sparse_interactions[:10_000])
gf_cf.fit(1)

In [None]:
sparse_interactions.shape

In [None]:
gf_cf.R_norm.shape

In [None]:
%%time
preds = gf_cf.predict(sparse_interactions[10_000:20_000])
# preds = np.nanargmax(preds, axis=1)
preds[np.isnan(preds)] = 0

In [None]:
%%time
ndcg_score(np.array(preds),
           np.array(sparse_interactions[10_000:20_000].todense()))

In [None]:
result = []
for i in tqdm(range(sparse_interactions.shape[0])):
    if preds[i] in sparse_interactions[i].nonzero()[1]:
        result.append(1)
    else:
        result.append(0)
result = np.array(result)
result.mean()

In [None]:
np.where(result == 0)

In [None]:
def normalizing(R):
    rowsum = np.array(R.sum(axis=1))
    d_inv = np.power(rowsum, -0.5).flatten()
    d_inv[np.isinf(d_inv)] = 0.
    d_mat = sp.diags(d_inv)
    norm_R = d_mat @ R

    colsum = np.array(R.sum(axis=0))
    d_inv = np.power(colsum, -0.5).flatten()
    d_inv[np.isinf(d_inv)] = 0.
    d_mat = sp.diags(d_inv)
    d_mat_i = d_mat.copy()
    d_mat_i_inv = sp.diags(1 / d_inv)
    norm_R = norm_R @ d_mat
    return norm_R

In [None]:
normalizing(np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]))

### LGCN-IDE

In [None]:
class LGCN_IDE(object):
    def __init__(self, adj_mat):
        self.adj_mat = adj_mat

    def train(self):
        adj_mat = self.adj_mat[:threshold]
        rowsum = np.array(adj_mat.sum(axis=1))
        d_inv = np.power(rowsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        d_mat_i = d_mat
        norm_adj = d_mat.dot(adj_mat)

        colsum = np.array(adj_mat.sum(axis=0))
        d_inv = np.power(colsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        d_mat_u = d_mat
        d_mat_u_inv = sp.diags(1/d_inv)
        norm_adj = norm_adj.dot(d_mat)
        self.norm_adj = norm_adj.tocsr()

    def getUsersRating(self, batch_users, ds_name):
        norm_adj = self.norm_adj
        batch_test = np.array(norm_adj[batch_users,:].todense())
        U_1 = batch_test @ norm_adj.T @ norm_adj
        if(ds_name == 'gowalla'):
            U_2 = U_1 @ norm_adj.T @ norm_adj
            return U_2
        else:
            return U_1

### Метрики

In [152]:
def recall_at_k(pred_items: np.array, true_items: np.array):
    assert pred_items.shape[0] == true_items.shape[0]
    recall_list = []
    for i_pred, i_true in zip(pred_items, true_items):
        try:
            hits = len(np.intersect1d(i_pred, i_true))
            recall = hits / len(i_true)
            recall_list.append(recall)
        except ZeroDivisionError:
            print(len(recall_list), i_true)
    return np.mean(recall_list)

def precision_at_k(pred_items: np.array, true_items: np.array):
    assert pred_items.shape[0] == true_items.shape[0]
    precision_list = []
    for i_pred, i_true in zip(pred_items, true_items):
        try:
            hits = len(np.intersect1d(i_pred, i_true))
            prec = hits / len(i_pred)
            precision_list.append(prec)
        except ZeroDivisionError:
            print(len(precision_list), i_true)
    return np.mean(precision_list)

In [None]:
y_true = [
    [1, 1, 0, 0, 0],
    [1, 1, 1, 0, 0]
]

y_pred = [
    [2, 3, 0, 0, 0],
    [100, 200, 300, 0, 0]
]

# y_true : ndarray of shape (n_samples, n_labels)
# True targets of multilabel classification, or true scores of entities to be ranked.
#
# y_score: ndarray of shape (n_samples, n_labels)
# Target scores, can either be probability estimates, confidence values, or non-thresholded measure of decisions (as returned by “decision_function” on some classifiers).

ndcg_score(y_true, y_pred)

In [None]:
preds = gf_cf.predict(sparse_interactions)

In [None]:
%%time
ndcg_score(np.array(preds), np.array(sparse_interactions.todense()))

## MovieLens 100k

In [4]:
class MovieLens(Dataset):
    def load_rating_dataset(self, nrows=None):
        df = pd.read_table(self.ratings_filename,  header=None, nrows=nrows)
        df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
        df['user_id'] -= 1
        df['item_id'] -= 1
        self.df = df
        return df

    def load_dataset(self, nrows):
        pass

    def load_metadata(self, nrows):
        pass

In [23]:
mov = MovieLens('ml-100k/u.data')
df = mov.load_rating_dataset()
df = df[df['rating'] >= 4]
df['rating'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'] = 1


In [24]:
sparse_interactions = sp.csr_matrix((df['rating'], (df['user_id'], df['item_id'])))

In [25]:
sparse_interactions

<943x1674 sparse matrix of type '<class 'numpy.int64'>'
	with 55375 stored elements in Compressed Sparse Row format>

In [156]:
SPLIT_THRESHOLD = 650
K = 10

gf_cf = GF_CF(sparse_interactions[:SPLIT_THRESHOLD])
gf_cf.fit(32)
ranks = gf_cf.predict(sparse_interactions[SPLIT_THRESHOLD:])
recs = gf_cf.recommend_top_k(sparse_interactions[SPLIT_THRESHOLD:, :], K)

# valuation
y_true = np.array([sparse_interactions[i].nonzero()[1] for i in range(SPLIT_THRESHOLD, sparse_interactions.shape[0])], dtype='object')
print(f'NDGC score = {ndcg_score(np.array(ranks), np.array(sparse_interactions[SPLIT_THRESHOLD:].todense())).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')

  d_inv = np.power(colsum, -0.5).flatten()
  self.d_mat_i_inv = sp.diags(1 / d_inv)


Train finished
NDGC score = 0.77876
34 []
Recall = 0.21262
Precision = 0.62867


In [157]:
sparse_interactions[684]

<1x1674 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [127]:
df[df['user_id'] == 684]

Unnamed: 0,user_id,item_id,rating,timestamp
