In [173]:
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import scipy.sparse as sp
from tqdm.notebook import tqdm
import json
%config InlineBackend.figure_format='retina'

### Список датасетов
 - Gowalla
 - Yelp2018
 - Amazon-book

In [2]:
class Dataset(ABC):
    def __init__(self, ratings_filename, metadata_filename=None):
        self.ratings_filename = ratings_filename
        self.metadata_filename = metadata_filename
        self.df = None

    @staticmethod
    def _parse_json(filename, nrows):
        # pd.read_json('Musical_Instruments_5.json', nrows=10, chunksize=None, lines=True)
        data = []
        with open(filename) as json_file:
            while len(data) < nrows:
                data.append(json.loads(next(json_file)))
        return data

    @abstractmethod
    def load_dataset(self, nrows):
        pass

    # @abstractmethod
    def cold_start_type_split(self):
        pass

    # @abstractmethod
    def train_test_split(self):
        pass

    # @abstractmethod
    def build_sparse_interaction_matrix(self) -> sp.csr_matrix:
        pass


class AmazonDataset(Dataset):
    def load_rating_dataset(self, nrows):
        df = pd.read_csv('ratings_Movies_and_TV.csv', nrows=nrows, header=None, dtype={3:int})
        df[3] = pd.to_datetime(df[3], unit='s')
        df.columns = ["user", "item", "rating", "timestamp"]
        self.df = df
        return df

    def load_dataset(self, nrows):
        return self._parse_json(self.ratings_filename, nrows)

    def load_metadata(self, nrows):
        return self._parse_json(self.metadata_filename, nrows)

In [3]:
amazon = AmazonDataset('Musical_Instruments_5.json', 'meta_Appliances.json')
df = amazon.load_dataset(3)

In [4]:
amazon.load_metadata(3)[2]

{'category': ['Appliances', 'Parts &amp; Accessories'],
 'tech1': '',
 'description': [],
 'fit': '',
 'title': 'The Cigar - Moments of Pleasure',
 'also_buy': [],
 'tech2': '',
 'brand': 'The Cigar Book',
 'feature': [],
 'rank': ['>#1,861,816 in Home &amp; Kitchen (See Top 100 in Home &amp; Kitchen)',
  '>#79,974 in Tools &amp; Home Improvement &gt; Appliances &gt; Large Appliance Accessories'],
 'also_view': ['B01HCAVSLK', '1632206579'],
 'details': {},
 'main_cat': 'Amazon Home',
 'similar_item': '',
 'date': '',
 'price': '$150.26',
 'asin': '8792559360',
 'imageURL': ['https://images-na.ssl-images-amazon.com/images/I/41zzM-k9bdL._SS40_.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/51IEoSRcvyL._SS40_.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/41WubC4aJbL._SS40_.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/41fOInFBIjL._SS40_.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/41NF4UI2KGL._SS40_.jpg',
  'https://images-na.ssl-images-amaz

## Amazon Movies

In [230]:
amazon = AmazonDataset('ratings_Movies_and_TV.csv')
df = amazon.load_rating_dataset(100_000)

### Матрица интеракций

In [98]:
# отфильтруем по позитивным оценкам
df_positive = df[df['rating'] >= 4]
df_positive.shape

(82797, 4)

In [99]:
encoder = LabelEncoder()

df_positive.loc[:, 'user'] = encoder.fit_transform(df_positive['user'])
df_positive.loc[:, 'item'] = encoder.fit_transform(df_positive['item'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [100]:
df_positive.nunique()

user         66108
item          1825
rating           2
timestamp     5776
dtype: int64

In [101]:
# csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
sparse_interactions = sp.csr_matrix((df_positive['rating'], (df_positive['user'], df_positive['item'])))

In [102]:
sparse_interactions

<66108x1825 sparse matrix of type '<class 'numpy.float64'>'
	with 82797 stored elements in Compressed Sparse Row format>

### Собираем граф

In [12]:
adj_mat = sp.csr_matrix((4, 4)).tolil()
adj_mat[2:, 2:] = [[1, 1], [1, 1]]
adj_mat.todense()

matrix([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 1., 1.],
        [0., 0., 1., 1.]])

### GF-CF

In [13]:
from sparsesvd import sparsesvd

In [221]:
class GF_CF(object):
    def __init__(self, adj_mat):
        self.adj_mat = adj_mat

    def train(self, train_threshold, dim=16):
        adj_mat = self.adj_mat[:train_threshold]
        rowsum = np.array(adj_mat.sum(axis=1))
        d_inv = np.power(rowsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        norm_adj = d_mat.dot(adj_mat)

        colsum = np.array(adj_mat.sum(axis=0))
        d_inv = np.power(colsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        self.d_mat_i = d_mat
        self.d_mat_i_inv = sp.diags(1 / d_inv)
        norm_adj = norm_adj.dot(d_mat)
        self.norm_adj = norm_adj.tocsc()
        ut, s, self.vt = sparsesvd(self.norm_adj, dim)
        print('Train finished')

    def getUsersRating(self, batch_users, ds_name=None):
        norm_adj = self.norm_adj
        adj_mat = self.adj_mat
        batch_test = np.array(adj_mat[batch_users,:].todense())
        U_2 = batch_test @ norm_adj.T @ norm_adj
        U_1 = batch_test @  self.d_mat_i @ self.vt.T @ self.vt @ self.d_mat_i_inv
        #
        # if ds_name == 'amazon-book':
        #     ret = U_2
        # else:
        #     U_1 = batch_test @  self.d_mat_i @ self.vt.T @ self.vt @ self.d_mat_i_inv
        #     ret = U_2 + 0.3 * U_1
        # return ret
        return U_2 + U_1

gf_cf = GF_CF(sparse_interactions)
gf_cf.train(1_000, 3)

Train finished


  d_inv = np.power(colsum, -0.5).flatten()
  self.d_mat_i_inv = sp.diags(1 / d_inv)


In [222]:
gf_cf.norm_adj.shape

(1000, 1825)

In [223]:
sparse_interactions.shape

(66108, 1825)

In [224]:
%%time
preds = gf_cf.getUsersRating(np.arange(sparse_interactions.shape[0]))
preds = np.nanargmax(preds, axis=1)

CPU times: user 4.54 s, sys: 4.46 s, total: 9 s
Wall time: 7.06 s


In [225]:
result = []
for i in tqdm(range(sparse_interactions.shape[0])):
    if preds[i] in sparse_interactions[i].nonzero()[1]:
        result.append(1)
    else:
        result.append(0)
result = np.array(result)
result.mean()

  0%|          | 0/66108 [00:00<?, ?it/s]

0.8410328553276457

In [229]:
np.where(result == 0)[0]

array([ 1012,  1016,  1019, ..., 66092, 66094, 66101])

In [220]:
i = 66101
pred = gf_cf.getUsersRating([i])
display(pd.Series(pred[0]).sort_values(ascending=False).head())
sparse_interactions[i].nonzero()

14      0.0
1418    0.0
1457    0.0
1456    0.0
1455    0.0
dtype: float64

(array([0], dtype=int32), array([1461], dtype=int32))