In [20]:
import pandas as pd
import numpy as np

## Data completeness

In [21]:
df_clicks_t = pd.read_parquet("data/otto_exploded_dataset/clicks/test")
df_carts_t = pd.read_parquet("data/otto_exploded_dataset/carts/test")
df_orders_t = pd.read_parquet("data/otto_exploded_dataset/orders/test")

In [22]:
def set_ranks(df: pd.DataFrame):
    df_c = df.copy()
    df_c["rank"] = df_c.groupby("session")["ts"].rank(method="first", ascending=True)
    return df_c[["session", "aid", "rank"]]

In [23]:
df_clicks_tr = set_ranks(df_clicks_t)
df_carts_tr = set_ranks(df_carts_t)
df_orders_tr = set_ranks(df_orders_t)

In [24]:
def rank_stat(df_r: pd.DataFrame, ks: list):
    df_mr = df_r.groupby("session").agg("max")["rank"]
    for k in ks:
        cnt_me = len(df_mr[df_mr >= k])
        print(f"Users with more than {k} test items: {cnt_me} ({round(cnt_me / len(df_mr) * 100, 2)}%)")

In [25]:
rank_stat(df_clicks_tr, [3, 5, 10, 20])

Users with more than 3 test items: 187014 (65.87%)
Users with more than 5 test items: 144337 (50.84%)
Users with more than 10 test items: 91221 (32.13%)
Users with more than 20 test items: 46619 (16.42%)


In [26]:
rank_stat(df_carts_tr, [3, 5, 10, 20])

Users with more than 3 test items: 29741 (38.89%)
Users with more than 5 test items: 14984 (19.59%)
Users with more than 10 test items: 4225 (5.52%)
Users with more than 20 test items: 622 (0.81%)


In [27]:
rank_stat(df_orders_tr, [3, 5, 10, 20])

Users with more than 3 test items: 7770 (25.77%)
Users with more than 5 test items: 3150 (10.45%)
Users with more than 10 test items: 576 (1.91%)
Users with more than 20 test items: 55 (0.18%)


# Metrics

In [28]:
K_RECS = 200
K_TEST = 5

In [29]:
df_clicks_r = pd.read_parquet("data/clicks/")

In [30]:
df_carts_r = pd.read_parquet("data/carts/")

In [31]:
df_orders_r = pd.read_parquet("data/orders/")

In [32]:
def prepare_data(df_test: pd.DataFrame, df_recs: pd.DataFrame, k_test, k_recs):
    valid_session = np.intersect1d(
        df_test["session"].unique(),
        df_recs["session"].unique()
    )

    df_test_l = set_ranks(df_test)
    df_test_l = df_test_l[df_test_l["session"].isin(valid_session)]
    df_test_l = df_test_l[df_test_l["rank"] <= k_test]
    df_test_l = df_test_l.groupby("session")["aid"].apply(list).reset_index().sort_values("session")

    df_test_l["aid"] = (
        df_test_l["aid"]
        .apply(lambda x: np.pad(
            x[:k_test],
            (0, max(k_test - len(x), 0)),
            mode="constant",
            constant_values=-1
        ))
    )
    test = np.stack(df_test_l["aid"].values, axis=0)

    df_recs_l = df_recs[df_recs["session"].isin(valid_session)]
    df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))
    df_recs_l = df_recs_l.sort_values("session")
    recs = np.stack(df_recs_l["aid"].values, axis=0)

    return test, recs

In [33]:
test_clicks, recs_clicks = prepare_data(df_clicks_t, df_clicks_r, K_TEST, K_RECS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


In [34]:
from rec_base import Metrics
metrics_clicks = Metrics(K_RECS, recs_clicks, test_clicks)
metrics_clicks.report()

HR@200 = 0.0403
MRR@200 = 0.004
Recall@200 = 0.0002


In [35]:
test_carts, recs_carts = prepare_data(df_carts_t, df_carts_r, K_TEST, K_RECS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


In [36]:
from rec_base import Metrics
metrics_carts = Metrics(K_RECS, test_carts, recs_carts)
metrics_carts.report()

HR@200 = 0.0592
MRR@200 = 0.0086
Recall@200 = 0.0152


In [37]:
test_orders, recs_orders = prepare_data(df_orders_t, df_orders_r, K_TEST, K_RECS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


In [38]:
from rec_base import Metrics
metrics_orders = Metrics(K_RECS, test_orders, recs_orders)
metrics_orders.report()

HR@200 = 0.0577
MRR@200 = 0.0059
Recall@200 = 0.0132


# Merge candidates

In [None]:
def merge_cand(s1, s2, out_size):
    assert out_size <= len(s1)
    s12 = np.intersect1d(s1, s2)
    s1n2 = np.setdiff1d(s1, s2)

    return np.concatenate((s12[:out_size], s1n2[:(max(0, out_size - len(s12)))]))

In [None]:
def prepare_data_merged(
        df_test: pd.DataFrame,
        df_recs_source: pd.DataFrame,
        df_recs_lo: pd.DataFrame,
        k_test,
        k_recs
        ):
    valid_session = np.intersect1d(
        df_test["session"].unique(),
        df_recs_source["session"].unique()
    )

    df_test_l = set_ranks(df_test)
    df_test_l = df_test_l[df_test_l["session"].isin(valid_session)]
    df_test_l = df_test_l[df_test_l["rank"] <= k_test]
    df_test_l = df_test_l.groupby("session")["aid"].apply(list).reset_index().sort_values("session")

    df_test_l["aid"] = (
        df_test_l["aid"]
        .apply(lambda x: np.pad(
            x[:k_test],
            (0, max(k_test - len(x), 0)),
            mode="constant",
            constant_values=-1
        ))
    )
    test = np.stack(df_test_l["aid"].values, axis=0)

    df_recs_source_l = df_recs_source[df_recs_source["session"].isin(valid_session)]
    df_recs_lo_l = df_recs_lo.rename(columns={"aid": "aid_lo"})
    df_recs_final = df_recs_source_l.merge(df_recs_lo_l, how="left", on="session")
    df_recs_final["maid"] = df_recs_final.apply(
        lambda x: merge_cand(x["aid"], x["aid_lo"], k_recs), axis=1
    )

    df_recs_final = df_recs_final.sort_values("session")
    recs = np.stack(df_recs_final["maid"].values, axis=0)

    return test, recs

In [None]:
test_orders_m, recs_orders_m = prepare_data_merged(df_orders_t, df_orders_r, df_carts_r, K_TEST, K_RECS)

In [None]:
from rec_base import Metrics
metrics_orders_m = Metrics(K_TEST, test_orders, recs_orders_m)
metrics_orders_m.report()

HR@5 = 0.0513
MRR@5 = 0.0056
Recall@5 = 0.0117


# Randoms

In [None]:
from rec_base import Metrics

def random_metrics(tp, test, recs_shape):
    items = pd.read_parquet(f"data/otto_exploded_dataset/{tp}/train").aid.unique()
    recs_rand = np.random.choice(items, size=recs_shape)

    metrics_orders_rand = Metrics(K_RECS, test, recs_rand)
    metrics_orders_rand.report()

In [None]:
random_metrics("clicks", test_clicks, recs_clicks.shape)

HR@200 = 0.0008
MRR@200 = 0.0002
Recall@200 = 0.0002


In [None]:
random_metrics("carts", test_carts, recs_carts.shape)

HR@200 = 0.0022
MRR@200 = 0.0004
Recall@200 = 0.0004


In [None]:
random_metrics("orders", test_orders, recs_orders.shape)

HR@200 = 0.0024
MRR@200 = 0.0005
Recall@200 = 0.0005
