In [1]:
import pandas as pd
import numpy as np



## Data completeness

In [109]:
df_clicks_t = pd.read_parquet("data/otto_exploded_dataset/clicks/test")
df_carts_t = pd.read_parquet("data/otto_exploded_dataset/carts/test")
df_orders_t = pd.read_parquet("data/otto_exploded_dataset/orders/test")

In [59]:
df_clicks_t.head()

Unnamed: 0,session,aid,ts,type
79,0,1707783,1660397387901,clicks
80,0,1624436,1660427603773,clicks
81,0,1157411,1660427638022,clicks
82,0,358305,1660427747431,clicks
83,0,1202970,1660487679331,clicks


In [21]:
def set_ranks(df: pd.DataFrame):
    df_c = df.copy()
    df_c["rank"] = df_c.groupby("session")["ts"].rank(method="first", ascending=True)
    return df_c[["session", "aid", "rank"]]

In [22]:
df_clicks_r = set_ranks(df_clicks_t)
df_carts_r = set_ranks(df_carts_t)
df_orders_r = set_ranks(df_orders_t)

In [52]:
def rank_stat(df_r: pd.DataFrame, ks: list):
    df_mr = df_r.groupby("session").agg("max")["rank"]
    for k in ks:
        cnt_me = len(df_mr[df_mr >= k])
        print(f"Users with more than {k} test items: {cnt_me} ({round(cnt_me / len(df_mr) * 100, 2)}%)")

In [53]:
rank_stat(df_clicks_r, [3, 5, 10, 20])

Users with more than 3 test items: 187014 (65.87%)
Users with more than 5 test items: 144337 (50.84%)
Users with more than 10 test items: 91221 (32.13%)
Users with more than 20 test items: 46619 (16.42%)


In [54]:
rank_stat(df_carts_r, [3, 5, 10, 20])

Users with more than 3 test items: 29741 (38.89%)
Users with more than 5 test items: 14984 (19.59%)
Users with more than 10 test items: 4225 (5.52%)
Users with more than 20 test items: 622 (0.81%)


In [55]:
rank_stat(df_orders_r, [3, 5, 10, 20])

Users with more than 3 test items: 7770 (25.77%)
Users with more than 5 test items: 3150 (10.45%)
Users with more than 10 test items: 576 (1.91%)
Users with more than 20 test items: 55 (0.18%)


# Metrics

In [226]:
K_RECS = 200
K_TEST = 5

In [227]:
df_clicks_r = pd.read_parquet("data/clicks/")

In [219]:
df_carts_r = pd.read_parquet("data/carts/")

In [220]:
df_orders_r = pd.read_parquet("data/orders/")

In [228]:
def prepare_data(df_test: pd.DataFrame, df_recs: pd.DataFrame, k_test, k_recs):
    valid_session = np.intersect1d(
        df_test["session"].unique(),
        df_recs["session"].unique()
    )

    df_test_l = set_ranks(df_test)
    df_test_l = df_test_l[df_test_l["session"].isin(valid_session)]
    df_test_l = df_test_l[df_test_l["rank"] <= k_test]
    df_test_l = df_test_l.groupby("session")["aid"].apply(list).reset_index().sort_values("session")

    df_test_l["aid"] = (
        df_test_l["aid"]
        .apply(lambda x: np.pad(
            x[:k_test],
            (0, max(k_test - len(x), 0)),
            mode="constant",
            constant_values=-1
        ))
    )
    test = np.stack(df_test_l["aid"].values, axis=0)

    df_recs_l = df_recs[df_recs["session"].isin(valid_session)]
    df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))
    df_recs_l = df_recs_l.sort_values("session")
    recs = np.stack(df_recs_l["aid"].values, axis=0)

    return test, recs

In [230]:
test_clicks, recs_clicks = prepare_data(df_clicks_t, df_clicks_r, K_TEST, K_RECS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


In [None]:
import numpy as np


class Metrics:
    def __init__(
            self,
            k: int,
            recommendations: np.ndarray,
            holdout: np.ndarray
            ):
        self.k = k
        self.recommendations = recommendations[:, :k]
        self.holdout = holdout

    def report(self):
        print(f"HR@{self.k} = {round(self.hit_rate(), 4)}")
        print(f"MRR@{self.k} = {round(self.mean_reciprocal_rank(), 4)}")
        print(f"Recall@{self.k} = {round(self.recall(), 4)}")

    def hit_rate(self):
        return (
            self._get_hit_mask()
            .any(axis=1)
            .mean()
        )

    def mean_reciprocal_rank(self):
        hits_mask = self._get_hit_mask()

        idx = np.argwhere(hits_mask.argmax(axis=1)).squeeze(axis=1)

        return np.sum(
            1 / (hits_mask[idx].argmax(axis=1) + 1)
        ) / hits_mask.shape[0]

    def recall(self):
        hits_mask = self._get_hit_mask()
        return (
            hits_mask.sum(axis=1) / self.recommendations.shape[1]
        ).sum() / self.recommendations.shape[0]

    def _get_hit_mask(self) -> np.ndarray:
        return (
            self.holdout[..., None] ==
            np.expand_dims(self.recommendations, axis=1)
        ).any(axis=1)

In [231]:
#from rec_base import Metrics
metrics_clicks = Metrics(K_RECS, recs_clicks, test_clicks)
metrics_clicks.report()

HR@200 = 0.0403
MRR@200 = 0.004
Recall@200 = 0.0084
Recall_Otto@200 = 0.0084


In [222]:
test_carts, recs_carts = prepare_data(df_carts_t, df_carts_r, K_TEST, K_RECS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


In [223]:
#from rec_base import Metrics
metrics_carts = Metrics(K_RECS, test_carts, recs_carts)
metrics_carts.report()

HR@200 = 0.0592
MRR@200 = 0.0086
Recall@200 = 0.0004
Recall_Otto@200 = 0.0004


In [224]:
test_orders, recs_orders = prepare_data(df_orders_t, df_orders_r, K_TEST, K_RECS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


In [225]:
#from rec_base import Metrics
metrics_orders = Metrics(K_TEST, test_orders, recs_orders)
metrics_orders.report()

HR@5 = 0.0577
MRR@5 = 0.0059
Recall@5 = 0.0003
Recall_Otto@5 = 0.0132
