In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import diags
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

In [147]:
# code copied from recpack
def rescale_id_space(ids, id_mapping=None):
    """
    Assign a consecutive integer index to every id in ``ids``.

    A non-empty ``id_mapping`` is extended in place: its entries are kept and
    unseen ids are numbered from one past its largest value. A missing or
    empty ``id_mapping`` is replaced by a fresh dict numbered from 0.

    :param ids: Iterable of hashable ids; a repeated id keeps its first index.
    :param id_mapping: Optional existing ``{id: index}`` dict to extend.
    :return: The ``{id: index}`` mapping.
    """
    if id_mapping is None or len(id_mapping) == 0:
        id_mapping = {}
        next_index = 0
    else:
        next_index = max(id_mapping.values()) + 1

    for key in ids:
        if key in id_mapping:
            # Already numbered (either pre-existing or seen earlier in ids).
            continue
        id_mapping[key] = next_index
        next_index += 1

    return id_mapping

# code copied from recpack
def get_top_K_ranks(X: csr_matrix, K = None) -> csr_matrix:
    """Rank the K largest stored values in every row of X.

    For each row the K largest stored entries are located and replaced by
    their rank: 1 for the largest, 2 for the next one, and so on. Rows with
    fewer than K stored entries have all of them ranked; rows without stored
    entries stay empty.

    :param X: Matrix whose rows are ranked.
    :type X: csr_matrix
    :param K: Number of entries to rank per row; ``None`` ranks every stored entry.
    :type K: int, optional
    :return: Matrix of the same shape as X holding the ranks.
    :rtype: csr_matrix
    """
    row_ids, col_ids, ranks = [], [], []

    for row in range(len(X.indptr) - 1):
        start, stop = X.indptr[row], X.indptr[row + 1]
        n_stored = stop - start
        n_pick = n_stored if K is None else min(K, n_stored)
        if n_pick == 0:
            continue

        # Partitioning on each of the last n_pick positions leaves exactly
        # those positions in ascending order.
        order = np.argpartition(X.data[start:stop], list(range(-n_pick, 0)))
        # Column indices of the picked entries, largest value first.
        best_first = X.indices[start + order[-n_pick:]][::-1]

        row_ids.extend([row] * len(best_first))
        col_ids.extend(best_first)
        ranks.extend(range(1, len(best_first) + 1))

    return csr_matrix((ranks, (row_ids, col_ids)), shape=X.shape)

# code copied from recpack
def invert(x):
    """Element-wise reciprocal of the non-zero entries of ``x``.

    Zero entries stay zero, so no division by zero takes place.

    :param x: Dense array or sparse CSR matrix to invert.
    :type x: np.ndarray or csr_matrix
    :return: A new float64 object of the same kind and shape as ``x`` holding
        ``1 / value`` wherever ``x`` is non-zero and 0 elsewhere. ``x`` itself
        is left unmodified.
    :rtype: np.ndarray or csr_matrix
    :raises TypeError: If ``x`` is neither an ``np.ndarray`` nor a ``csr_matrix``.
    """
    if isinstance(x, np.ndarray):
        ret = np.zeros(x.shape)
        nz = x.nonzero()
        ret[nz] = 1 / x[nz]
        return ret

    if isinstance(x, csr_matrix):
        # Invert the stored values directly. Assigning into an empty CSR
        # matrix through fancy indexing changes its sparsity structure, which
        # is slow and raises SparseEfficiencyWarning.
        ret = x.astype(np.float64, copy=True)
        ret.sum_duplicates()
        # Drop explicitly stored zeros so the division below never hits 0.
        ret.eliminate_zeros()
        ret.data = 1.0 / ret.data
        return ret

    raise TypeError("Unsupported type for argument x.")


In [50]:
# Relative location of the data directory and of the parquet files inside it.
BASE_PATH = '../../../data/'
# DATA_PATH = BASE_PATH + 'sample_0.05/'
DATA_PATH = BASE_PATH + 'parquet/'

# Read the three input tables, in this order, from DATA_PATH.
transactions, customers, articles = (
    pd.read_parquet(DATA_PATH + file_name)
    for file_name in (
        'transactions_train.parquet',
        'customers.parquet',
        'articles.parquet',
    )
)

In [54]:
# code copied from recpack
# Build the sparse user x item interaction matrix from the transactions.

# Keep only articles that occur in more than 100 transactions.
a = transactions.article_id.value_counts()
a = set(a[a > 100].index)
# Filter first and copy only the surviving rows: this avoids duplicating the
# whole transactions table and gives an independent frame, so the column
# assignments below cannot trigger pandas' SettingWithCopyWarning.
df = transactions[transactions.article_id.isin(a)].copy()

item_ids = list(df.article_id.unique())
user_ids = list(df.customer_id.unique())

# Consecutive 0-based indices for users (rows) and items (columns).
_user_id_mapping = rescale_id_space(user_ids)
_item_id_mapping = rescale_id_space(item_ids)

# Map with the dicts directly instead of one Python lambda call per row.
df['uid'] = df.customer_id.map(_user_id_mapping)
df['iid'] = df.article_id.map(_item_id_mapping)
values = np.ones(df.shape[0])
indices = df['uid'].values, df['iid'].values
shape = (len(user_ids), len(item_ids))
# Repeated (uid, iid) pairs are summed by the constructor, so an entry holds
# the number of transactions of that user for that item.
matrix = csr_matrix((values, indices), shape=shape, dtype=np.int32)

In [None]:
# code copied from recpack
# matrix is (users, items), so its transpose has one row per item and the
# result is an item x item similarity matrix, kept sparse.
item_cosine_similarities = cosine_similarity(matrix.transpose(), dense_output=False)
# Zero the diagonal in place so an item is not reported as similar to itself.
item_cosine_similarities.setdiag(values=0)

In [148]:
# code copied from recpack
# Binarise the interactions (any non-zero count becomes 1), keeping the dtype.
X_binary = matrix.astype(bool).astype(matrix.dtype)
# co_mat[i, j]: summed item-j counts over the users that interacted with item i.
co_mat = X_binary.transpose().dot(matrix)
# Diagonal matrix holding 1 / (number of users that interacted with each item).
A = invert(diags(np.asarray(X_binary.sum(axis=0))[0]).tocsr())
# Scale row i of the co-occurrence counts by 1 / (#users of item i).
item_cond_prob_similarities = A.dot(co_mat)
# Zero the diagonal in place so an item is not reported as similar to itself.
item_cond_prob_similarities.setdiag(values=0)

  self._set_arrayXarray(i, j, x)


In [153]:
# code copied from recpack
# L1-normalise every row of the conditional-probability similarities.
# NOTE(review): copy=False asks scikit-learn to avoid a copy, so
# item_cond_prob_similarities may be modified in place and item_similarities
# may be the same object — confirm before reusing the un-normalised matrix.
transformer = Normalizer(norm="l1", copy=False)
item_similarities = transformer.transform(item_cond_prob_similarities)

# Rank the 50 largest similarities of every row (1 = largest); the assignment
# below then overwrites every positive rank with 1, turning the ranks into a mask.
top_K_ranks = get_top_K_ranks(item_similarities, 50)
top_K_ranks[top_K_ranks > 0] = 1  # ranks to binary

# Keep the similarity scores only at the masked (top-50 per row) positions.
item_similarities2 = top_K_ranks.multiply(item_similarities)  # elementwise multiplication

In [154]:
# Flatten the sparse top-K similarity matrix into (row, column, score) triples.
# Reading the COO arrays once replaces a Python loop that performed one scalar
# sparse lookup per stored entry.
_coo = item_similarities2.tocoo()
# Same filter and ordering as .nonzero(): explicitly stored zeros are skipped.
_keep = _coo.data != 0
sims = list(zip(_coo.row[_keep], _coo.col[_keep], _coo.data[_keep]))

a = pd.DataFrame(sims, columns=['article_id', 'similar_article_id', 'score'])

# Translate the matrix indices back to the original article ids.
_item_id_mapping_rev = {index: article for article, index in _item_id_mapping.items()}
a['article_id'] = a.article_id.map(_item_id_mapping_rev)
a['similar_article_id'] = a.similar_article_id.map(_item_id_mapping_rev)

# Per article, list the most similar articles first; rebinding instead of
# inplace=True keeps the cell free of hidden in-place mutation.
a = a.sort_values(['article_id', 'score'], ascending=[True, False]).reset_index(drop=True)

In [156]:
# Persist the similarity table (article_id, similar_article_id, score); the
# relative path resolves against the kernel's current working directory.
a.to_parquet('sim_condprob.parquet')