In [36]:
import pandas as pd
import numpy as np
from rec_base import Metrics
import gc

## Data completeness

In [18]:
df_clicks_t = pd.read_parquet("data/otto_exploded_dataset/clicks/test")
df_carts_t = pd.read_parquet("data/otto_exploded_dataset/carts/test")
df_orders_t = pd.read_parquet("data/otto_exploded_dataset/orders/test")

In [19]:
df_clicks_t.head()

Unnamed: 0,session,aid,ts,type
79,0,1707783,1660397387901,clicks
80,0,1624436,1660427603773,clicks
81,0,1157411,1660427638022,clicks
82,0,358305,1660427747431,clicks
83,0,1202970,1660487679331,clicks


In [20]:
def set_ranks(df: pd.DataFrame):
    df_c = df.copy()
    df_c["rank"] = df_c.groupby("session")["ts"].rank(method="first", ascending=True)
    return df_c[["session", "aid", "rank"]]

In [21]:
df_clicks_r = set_ranks(df_clicks_t)
df_carts_r = set_ranks(df_carts_t)
df_orders_r = set_ranks(df_orders_t)

In [22]:
def rank_stat(df_r: pd.DataFrame, ks: list):
    df_mr = df_r.groupby("session").agg("max")["rank"]
    for k in ks:
        cnt_me = len(df_mr[df_mr >= k])
        print(f"Users with more than {k} test items: {cnt_me} ({round(cnt_me / len(df_mr) * 100, 2)}%)")

In [23]:
rank_stat(df_clicks_r, [3, 5, 10, 20])

Users with more than 3 test items: 187014 (65.87%)
Users with more than 5 test items: 144337 (50.84%)
Users with more than 10 test items: 91221 (32.13%)
Users with more than 20 test items: 46619 (16.42%)


In [24]:
rank_stat(df_carts_r, [3, 5, 10, 20])

Users with more than 3 test items: 29741 (38.89%)
Users with more than 5 test items: 14984 (19.59%)
Users with more than 10 test items: 4225 (5.52%)
Users with more than 20 test items: 622 (0.81%)


In [25]:
rank_stat(df_orders_r, [3, 5, 10, 20])

Users with more than 3 test items: 7770 (25.77%)
Users with more than 5 test items: 3150 (10.45%)
Users with more than 10 test items: 576 (1.91%)
Users with more than 20 test items: 55 (0.18%)


# Metrics

In [27]:
K_RECS = 200
K_TEST = 5

In [26]:
def prepare_data(df_test: pd.DataFrame, df_recs: pd.DataFrame, k_test, k_recs):
    valid_session = np.intersect1d(
        df_test["session"].unique(),
        df_recs["session"].unique()
    )

    df_test_l = set_ranks(df_test)
    df_test_l = df_test_l[df_test_l["session"].isin(valid_session)]
    df_test_l = df_test_l[df_test_l["rank"] <= k_test]
    df_test_l = df_test_l.groupby("session")["aid"].apply(list).reset_index().sort_values("session")

    df_test_l["aid"] = (
        df_test_l["aid"]
        .apply(lambda x: np.pad(
            x[:k_test],
            (0, max(k_test - len(x), 0)),
            mode="constant",
            constant_values=-1
        ))
    )
    test = np.stack(df_test_l["aid"].values, axis=0)

    df_recs_l = df_recs[df_recs["session"].isin(valid_session)]
    df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))
    df_recs_l = df_recs_l.sort_values("session")
    recs = np.stack(df_recs_l["aid"].values, axis=0)

    return test, recs

## ALS 1

In [28]:
df_clicks_r1 = pd.read_parquet("recs_clicks_als_1/")
df_carts_r1 = pd.read_parquet("recs_carts_als_1/")
df_orders_r1 = pd.read_parquet("recs_orders_als_1/")

In [30]:
test_clicks1, recs_clicks1 = prepare_data(df_clicks_t, df_clicks_r1, K_TEST, K_RECS)
metrics_clicks1 = Metrics(K_RECS, recs_clicks1, test_clicks1)
metrics_clicks1.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@200 = 0.2355
MRR@200 = 0.0202
Recall@200 = 0.0018


In [31]:
test_carts1, recs_carts1 = prepare_data(df_carts_t, df_carts_r1, K_TEST, K_RECS)
metrics_carts1 = Metrics(K_RECS, test_carts1, recs_carts1)
metrics_carts1.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@200 = 0.1049
MRR@200 = 0.0141
Recall@200 = 0.0309


In [32]:
test_orders1, recs_orders1 = prepare_data(df_orders_t, df_orders_r1, K_TEST, K_RECS)
metrics_orders1 = Metrics(K_TEST, test_orders1, recs_orders1)
metrics_orders1.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@5 = 0.0616
MRR@5 = 0.0098
Recall@5 = 0.0155


In [37]:
del df_clicks_r1, df_carts_r1, df_orders_r1, test_orders1, recs_orders1, metrics_orders1
gc.collect()

2088

## ALS 2

In [38]:
df_clicks_r2 = pd.read_parquet("recs_clicks_als_2/")
df_carts_r2 = pd.read_parquet("recs_carts_als_2/")
df_orders_r2 = pd.read_parquet("recs_orders_als_2/")

In [39]:
test_clicks2, recs_clicks2 = prepare_data(df_clicks_t, df_clicks_r2, K_TEST, K_RECS)
metrics_clicks2 = Metrics(K_RECS, recs_clicks2, test_clicks2)
metrics_clicks2.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@200 = 0.302
MRR@200 = 0.0251
Recall@200 = 0.0024


In [40]:
test_carts2, recs_carts2 = prepare_data(df_carts_t, df_carts_r2, K_TEST, K_RECS)
metrics_carts2 = Metrics(K_RECS, test_carts2, recs_carts2)
metrics_carts2.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@200 = 0.1537
MRR@200 = 0.0203
Recall@200 = 0.0469


In [41]:
test_orders2, recs_orders2 = prepare_data(df_orders_t, df_orders_r2, K_TEST, K_RECS)
metrics_orders2 = Metrics(K_TEST, test_orders2, recs_orders2)
metrics_orders2.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@5 = 0.1017
MRR@5 = 0.014
Recall@5 = 0.026


In [42]:
del df_clicks_r2, df_carts_r2, df_orders_r2, test_orders2, recs_orders2, metrics_orders2
gc.collect()

0

## ALS 3

In [43]:
df_clicks_r3 = pd.read_parquet("recs_clicks_als_3/")
df_carts_r3 = pd.read_parquet("recs_carts_als_3/")
df_orders_r3 = pd.read_parquet("recs_orders_als_3/")

In [44]:
test_clicks3, recs_clicks3 = prepare_data(df_clicks_t, df_clicks_r3, K_TEST, K_RECS)
metrics_clicks3 = Metrics(K_RECS, recs_clicks3, test_clicks3)
metrics_clicks3.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@200 = 0.2875
MRR@200 = 0.0236
Recall@200 = 0.0023


In [45]:
test_carts3, recs_carts3 = prepare_data(df_carts_t, df_carts_r3, K_TEST, K_RECS)
metrics_carts3 = Metrics(K_RECS, test_carts3, recs_carts3)
metrics_carts3.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@200 = 0.1501
MRR@200 = 0.0198
Recall@200 = 0.0455


In [46]:
test_orders3, recs_orders3 = prepare_data(df_orders_t, df_orders_r3, K_TEST, K_RECS)
metrics_orders3 = Metrics(K_TEST, test_orders3, recs_orders3)
metrics_orders3.report()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recs_l["aid"] = df_recs_l["aid"].apply(lambda x: np.array(x[:k_recs]))


HR@5 = 0.1017
MRR@5 = 0.014
Recall@5 = 0.0261


In [None]:
del df_clicks_r3, df_carts_r3, df_orders_r3, test_orders3, recs_orders3, metrics_orders3
gc.collect()