In [1]:
import numpy as np
import pandas as pd

from scipy.sparse.linalg import svds
from typing import Optional, Union
from scipy.sparse import csr_matrix

from polara.tools.display import print_frames

from tqdm.auto import tqdm
import gc

# from dataprep import transform_indices, leave_last_out, verify_time_split, reindex_data, generate_interactions_matrix
# from evaluation import topn_recommendations, model_evaluate, downvote_seen_items

## Source code

#### From dataprep file

In [2]:
def leave_last_out(data, userid='userid', timeid='timestamp'):
    """Split off every user's most recent interaction into a holdout set.

    Parameters
    ----------
    data : pd.DataFrame
        Interaction log containing the `userid` and `timeid` columns.
    userid : str
        Name of the column identifying a user.
    timeid : str
        Name of the column used to order interactions.

    Returns
    -------
    tuple of pd.DataFrame
        `(remaining, holdout)`: `holdout` has one row per user (the last one
        after sorting by `timeid`), `remaining` is `data` without the rows
        whose index labels appear in `holdout`.
    """
    chronological = data.sort_values(by=timeid)
    # flag the final occurrence of each user in chronological order
    is_last_event = ~chronological.duplicated(subset=[userid], keep='last')
    holdout = chronological[is_last_event]
    # whatever was not held out becomes the training part
    remaining = data.drop(holdout.index)
    return remaining, holdout


def to_numeric_id(data, field):
    """Encode a column as integer category codes.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the column to encode.
    field : str
        Name of the column to encode.

    Returns
    -------
    tuple
        `(codes, id_map)`: `codes` is a Series of category codes aligned with
        `data`, `id_map` is the index of categories (position == code),
        renamed to `field`.
    """
    categorical = data[field].astype("category").cat
    id_map = categorical.categories.rename(field)
    return categorical.codes, id_map


def transform_indices(data: pd.DataFrame, users: str, items:str, inplace: bool=False):
    """Replace user and item identifiers with integer codes.

    Parameters
    ----------
    data : pd.DataFrame
        Interaction data containing the `users` and `items` columns.
    users, items : str
        Names of the user and item columns.
    inplace : bool
        If True, the codes are written into `data` itself; otherwise a copy
        with the encoded columns is returned.

    Returns
    -------
    tuple
        `(data, data_index)` where `data_index` maps `'users'` / `'items'`
        to the index of original identifiers produced by `to_numeric_id`.
    """
    index_by_entity = {}
    encoded_columns = {}
    for entity, field in (('users', users), ('items', items)):
        codes, index_by_entity[entity] = to_numeric_id(data, field)
        if not inplace:
            # collect now, assign in one go after the loop
            encoded_columns[field] = codes
            continue
        data.loc[:, field] = codes

    if encoded_columns:
        data = data.assign(**encoded_columns)  # produces a copy of data
    return data, index_by_entity


def reindex_data(
        data: pd.DataFrame,
        data_index: dict,
        entities: Optional[Union[str, list[str]]] = None,
        filter_invalid: bool = True,
        inplace: bool = False
    ):
    """Map entity columns of `data` onto the integer codes of an existing index.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose entity columns hold original identifiers.
    data_index : dict
        Maps an entity key (e.g. `'users'`, `'items'`) to a pandas Index whose
        `name` is the column to convert and whose positions are the codes.
    entities : str or list of str, optional
        Entity keys to convert; all keys of `data_index` when omitted.
    filter_invalid : bool
        If True, rows holding an identifier absent from the index (code -1)
        are dropped from the returned frame.
    inplace : bool
        If True, codes are written into `data` itself; otherwise a copy is made.

    Returns
    -------
    pd.DataFrame
        The re-indexed (and optionally filtered) frame.
    """
    if entities is None:
        entities = list(data_index.keys())
    elif isinstance(entities, str): # handle single entity provided as a string
        entities = [entities]

    fields = []
    data_codes = {}
    for entity in entities:
        entity_index = data_index[entity]
        field = entity_index.name # extract the field name
        new_index = entity_index.get_indexer(data[field])
        if inplace:
            data.loc[:, field] = new_index # assign new values inplace
        else:
            data_codes[field] = new_index # store new values
        fields.append(field)
    if data_codes:
        data = data.assign(**data_codes) # assign new values by making a copy
    if filter_invalid and fields: # discard unrecognized entity index
        # A boolean mask (rather than a query string) keeps this working for
        # column names that are not valid Python identifiers.
        is_known = (data[fields] >= 0).all(axis=1)
        data = data[is_known]
    return data


def generate_interactions_matrix(data, data_description, rebase_users=False):
    """Build a sparse user-by-item matrix from an interactions frame.

    Parameters
    ----------
    data : pd.DataFrame
        Interactions with integer-coded user and item columns.
    data_description : dict
        Provides the column names under `'users'`, `'items'`, `'feedback'`
        and the matrix dimensions under `'n_users'`, `'n_items'`.
    rebase_users : bool
        If True, user ids are re-coded to 0..k-1 in sorted order and the
        number of rows becomes the number of distinct users in `data`.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape `(n_users, n_items)` filled with the feedback values.
    """
    n_rows = data_description['n_users']
    n_cols = data_description['n_items']
    rows = data[data_description['users']].values
    if rebase_users:
        # test users may form a non-contiguous id set; compress them so that
        # row positions line up with the sorted unique users
        rows, distinct_users = pd.factorize(rows, sort=True)
        n_rows = len(distinct_users)
    cols = data[data_description['items']].values
    values = data[data_description['feedback']].values
    coordinates = (rows, cols)
    return csr_matrix((values, coordinates), shape=(n_rows, n_cols))


def verify_time_split(before, after, target_field='userid', timeid='timestamp'):
    """Assert that a split respects time order within every group.

    For each value of `target_field` present in `after`, the latest `timeid`
    in `before` must not exceed the earliest `timeid` in `after`. Groups that
    do not occur in `before` are skipped.

    Parameters
    ----------
    before, after : pd.DataFrame
        The earlier and the later part of the split.
    target_field : str
        Column to group by.
    timeid : str
        Column holding the time values.

    Raises
    ------
    AssertionError
        If at least one group has a `before` event later than its first
        `after` event.
    """
    before_ts = before.groupby(target_field)[timeid].max()
    after_ts = after.groupby(target_field)[timeid].min()
    # align the "before" maxima with the groups found in "after"
    aligned_before = before_ts.reindex(after_ts.index)
    # groups without any "before" record have nothing to violate
    has_history = aligned_before.notna()
    in_order = aligned_before[has_history] <= after_ts[has_history]
    assert in_order.all(), (
        f'{int((~in_order).sum())} "{target_field}" group(s) violate the time split'
    )

#### From evaluation file

In [3]:
def downvote_seen_items(scores, data, data_description):
    """Push scores of already observed user-item pairs below every other score.

    `scores` is modified in place; nothing is returned.

    Parameters
    ----------
    scores : np.ndarray
        Dense score array with one row per distinct user in `data`; rows are
        assumed to follow the sorted order of user ids.
    data : pd.DataFrame
        Observed interactions of the scored users.
    data_description : dict
        Provides the column names under `'items'` and `'users'`.
    """
    assert isinstance(scores, np.ndarray), 'Scores must be a dense numpy array!'
    item_field = data_description['items']
    user_field = data_description['users']
    # translate user ids into row positions of the scores array
    seen_rows, scored_users = pd.factorize(data[user_field], sort=True)
    assert len(scored_users) == scores.shape[0]
    seen_cols = data[item_field].values
    # a value strictly below the current minimum can never reach the top-n
    floor_value = scores.min() - 1
    scores[seen_rows, seen_cols] = floor_value


def topn_recommendations(scores, topn=10):
    """Return, for every row of `scores`, the indices of its `topn` best items.

    Each row is processed independently by `topidx`, so the result has one
    row per row of `scores`, ordered from the highest score downwards.
    """
    return np.apply_along_axis(topidx, axis=1, arr=scores, topn=topn)


def topidx(a, topn):
    """Return indices of the `topn` largest values of `a`, best first.

    Parameters
    ----------
    a : np.ndarray
        One-dimensional array of scores.
    topn : int
        Number of indices to return.

    Returns
    -------
    np.ndarray
        Indices ordered by descending score. When `topn` exceeds the length
        of `a`, all indices are returned; when `topn` is not positive, the
        result is empty.
    """
    if topn <= 0:
        # `a[-0:]` would select the whole array, so handle this explicitly
        return np.empty(0, dtype=np.intp)
    if topn > len(a):
        # argpartition rejects an out-of-range kth; a full sort is the answer
        return np.argsort(-a)
    # partial selection first, then order only the selected candidates
    parted = np.argpartition(a, -topn)[-topn:]
    return parted[np.argsort(-a[parted])]


def model_evaluate(recommended_items, holdout, holdout_description, topn=10):
    """Compute hit rate, mean reciprocal rank and coverage at `topn`.

    Parameters
    ----------
    recommended_items : np.ndarray
        Two-dimensional array of recommended item indices, one row per test
        user, best item first.
    holdout : pd.DataFrame
        One held-out row per test user, in the same order as the rows of
        `recommended_items`.
    holdout_description : dict
        Provides the item column name under `'items'` and the catalogue size
        under `'n_items'`.
    topn : int
        Only the first `topn` recommendations of each user are evaluated.

    Returns
    -------
    dict
        Keys `hr@{topn}`, `mrr@{topn}` and `cov@{topn}`.
    """
    itemid = holdout_description['items']
    holdout_items = holdout[itemid].values
    assert recommended_items.shape[0] == len(holdout_items)
    # every metric below is computed on the same top-n slice
    top_items = recommended_items[:, :topn]
    hits_mask = top_items == holdout_items.reshape(-1, 1)
    # HR calculation
    hr = np.mean(hits_mask.any(axis=1))
    # MRR calculation
    n_test_users = recommended_items.shape[0]
    hit_rank = np.where(hits_mask)[1] + 1.0
    mrr = np.sum(1 / hit_rank) / n_test_users
    # coverage calculation: restricted to the top-n slice so that cov@topn
    # is not inflated by columns beyond topn
    n_items = holdout_description['n_items']
    cov = np.unique(top_items).size / n_items
    return {
        f'hr@{topn}' : hr,
        f'mrr@{topn}' : mrr,
        f'cov@{topn}' : cov,
    }

## Data

In [None]:
# Read the clicks, carts and orders training frames from ./data/otto_exploded_dataset.
# NOTE(review): the following cell repeats exactly these three reads, so this
# cell looks redundant.
df_clicks_train = pd.read_parquet("./data/otto_exploded_dataset/clicks/train")
df_carts_train = pd.read_parquet("./data/otto_exploded_dataset/carts/train")
df_orders_train = pd.read_parquet("./data/otto_exploded_dataset/orders/train")

In [5]:
# Read the clicks, carts and orders training frames from parquet.
df_clicks_train = pd.read_parquet("./data/otto_exploded_dataset/clicks/train")
df_carts_train = pd.read_parquet("./data/otto_exploded_dataset/carts/train")
df_orders_train = pd.read_parquet("./data/otto_exploded_dataset/orders/train")

# Overwrite the `type` column of the clicks frame with the constant 1; a later
# cell declares 'type' as the feedback column. The carts and orders frames are
# left untouched.
df_clicks_train['type'] = 1

# Display the first rows of the three frames with polara's print_frames.
print_frames([
    df_clicks_train.head(),
    df_carts_train.head(),
    df_orders_train.head()
])

Unnamed: 0_level_0,session,aid,ts,type
Unnamed: 0_level_1,session,aid,ts,type
Unnamed: 0_level_2,session,aid,ts,type
54,0,1521766,1659729979807.0,1
55,0,1725503,1659774028031.0,1
56,0,528847,1659774232119.0,1
57,0,1816325,1659774337835.0,1
58,0,984597,1659774357892.0,1
290,1,854637,1659990941327.0,carts
293,1,215311,1659990964841.0,carts
296,1,711125,1659991053886.0,carts
299,1,105393,1659991168139.0,carts
390,3,984459,1659818148834.0,carts

Unnamed: 0,session,aid,ts,type
54,0,1521766,1659729979807,1
55,0,1725503,1659774028031,1
56,0,528847,1659774232119,1
57,0,1816325,1659774337835,1
58,0,984597,1659774357892,1

Unnamed: 0,session,aid,ts,type
290,1,854637,1659990941327,carts
293,1,215311,1659990964841,carts
296,1,711125,1659991053886,carts
299,1,105393,1659991168139,carts
390,3,984459,1659818148834,carts

Unnamed: 0,session,aid,ts,type
539,3,1018433,1659999789346,orders
540,3,54857,1659999789346,orders
909,11,1145803,1659902394985,orders
3478,35,1162085,1659788011065,orders
4651,40,223422,1660302982234,orders


In [5]:
# Hold out the last click (ordered by `ts`) of every session; the rest is training data.
training_, holdout_ = leave_last_out(df_clicks_train, userid='session', timeid='ts')
# Sanity check: asserts that no session has a training click later than its held-out click.
verify_time_split(training_, holdout_, target_field='session', timeid='ts')

In [6]:
# Encode `session` and `aid` of the training part as integer category codes;
# `data_index` keeps the mapping from code position back to the original id.
training, data_index = transform_indices(training_, 'session', 'aid')
# Apply the same mapping to the holdout, drop rows whose session or item is
# absent from the training index, and order the rows by session code.
holdout = (
    reindex_data(holdout_, data_index, filter_invalid=True)
    .sort_values('session')
)

In [7]:
# NOTE(review): only the value of the last expression is echoed as the cell
# output; the two expressions before it are evaluated and discarded.
len(data_index['users']), len(data_index['items'])
data_index['users'].name
data_index['items'].name

'aid'

In [8]:
# Column names and matrix dimensions consumed by the matrix-building,
# scoring and evaluation helpers.
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'feedback': 'type',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
}

## PureSVD

In [9]:
def build_svd_model(config, data, data_description):
    """Fit a truncated SVD on the interaction matrix built from `data`.

    Parameters
    ----------
    config : dict
        Must provide `'rank'`, the number of singular triplets to compute.
    data : pd.DataFrame
        Training interactions with integer-coded users and items.
    data_description : dict
        Column names and matrix dimensions for `generate_interactions_matrix`.

    Returns
    -------
    tuple of np.ndarray
        `(item_factors, singular_values)`: the right singular vectors as
        columns of a C-contiguous array, and the singular values, both
        ordered by descending singular value.
    """
    interactions = generate_interactions_matrix(data, data_description, rebase_users=False)
    # only the right singular vectors are requested; the first output is unused
    _, sing_vals, vt = svds(
        interactions.astype('f8'),
        k=config['rank'],
        return_singular_vectors='vh',
    )
    # reorder the components so that the largest singular value comes first
    descending = np.argsort(-sing_vals)
    item_factors = np.ascontiguousarray(vt[descending].T)
    return item_factors, sing_vals[descending]

def svd_model_scoring(params, data, data_description):
    """Score all items for the users found in `data`.

    Parameters
    ----------
    params : tuple
        `(item_factors, singular_values)` as returned by `build_svd_model`;
        only the item factors are used.
    data : pd.DataFrame
        Interactions of the users to score.
    data_description : dict
        Column names and matrix dimensions for `generate_interactions_matrix`.

    Returns
    -------
    Array of scores with one row per distinct user in `data` and one column
    per item.
    """
    item_factors, _ = params  # singular values are not needed for scoring
    user_history = generate_interactions_matrix(data, data_description, rebase_users=True)
    # project the histories into the latent space, then back onto the items
    latent_users = user_history.dot(item_factors)
    return latent_users @ item_factors.T

In [10]:
# `rank` is the number of singular triplets requested from svds.
svd_config = dict(rank=200)

# Keep the parameters both as a tuple and unpacked into V (item factors) and sigma.
svd_params = build_svd_model(svd_config, training, data_description)
V, sigma = svd_params

In [11]:
# Build the sparse interaction matrix with user rows re-coded to 0..k-1.
# NOTE(review): despite its name, `test_matrix` is built from `training`,
# not from the holdout; confirm this is intended.
test_matrix = generate_interactions_matrix(training, data_description, rebase_users=True)
test_matrix

<304164x680757 sparse matrix of type '<class 'numpy.int64'>'
	with 3220087 stored elements in Compressed Sparse Row format>

In [12]:
# NOTE(review): this import sits mid-notebook, and `csr_matrix` is already
# imported in the first cell.
from scipy import sparse

# Wrap the item factors and their transpose in CSR containers.
# NOTE(review): the outputs of the following cells report 136151400 stored
# elements, i.e. all 680757 x 200 entries, so CSR saves no space here.
V_sparse = sparse.csr_matrix(V)
VT_sparse = sparse.csr_matrix(V.T)

In [14]:
# Show shape, dtype and number of stored elements of the CSR-wrapped item factors.
V_sparse

<680757x200 sparse matrix of type '<class 'numpy.float64'>'
	with 136151400 stored elements in Compressed Sparse Row format>

In [18]:
# Show shape, dtype and number of stored elements of the CSR-wrapped transposed factors.
VT_sparse

<200x680757 sparse matrix of type '<class 'numpy.float64'>'
	with 136151400 stored elements in Compressed Sparse Row format>

In [15]:
# Check the (users, items) dimensions of the interaction matrix.
test_matrix.shape

(304164, 680757)

In [16]:
# Matrix product of the interaction matrix with the item factors; the output
# of the next cell shows the result has one row per user and 200 columns.
a = test_matrix * V_sparse

In [17]:
# Show shape, dtype and number of stored elements of the projected matrix.
a

<304164x200 sparse matrix of type '<class 'numpy.float64'>'
	with 60832800 stored elements in Compressed Sparse Row format>

In [None]:
# This multiplication does not finish in a reasonable time.
# NOTE(review): given the shapes shown above, the product is 304164 x 680757
# (about 2.07e11 entries). Both operands store every one of their entries, so
# the result is presumably fully populated too and would need roughly 1.6 TB
# as float64 — TODO confirm; scoring users in batches would avoid this.
b = a * VT_sparse
b

The last multiplication requires $O(60832800 \times 136151400 \times 680757) = O(10^{21})$ elementary operations, which would take approximately $10^{13}$ seconds, or 316,887 years, on this laptop.