In [1]:
import os
import sys
import pickle
import glob
import gc
from collections import Counter
import itertools


import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
import polars as pl

# eval
```
%%time
# COMPUTE METRIC
# Weighted recall@20: per event type, count how many ground-truth items appear
# in the first 20 predicted labels, divide by the ground-truth count (capped at
# 20 per row), then combine the three recalls with the weights below.
score = 0
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
# Read the ground-truth file once; it does not depend on the event type, so
# re-reading it on every loop iteration only repeats the same disk I/O.
all_test_labels = pd.read_parquet('../input/otto-validation/test_labels.parquet')
for t in ['clicks','carts','orders']:
    # Rows of pred_df for this event type; session_type is parsed as
    # "<session>_<type>" and labels as a space-separated list of item ids.
    sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
    sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    test_labels = all_test_labels.loc[all_test_labels['type']==t]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    # NOTE(review): a session with no prediction row gets NaN labels after the
    # left merge, and set(NaN) raises — assumes every session has a prediction.
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    score += weights[t]*recall
    print(f'{t} recall =',recall)

print('=============')
print('Overall Recall =',score)
print('=============')
```

In [2]:
# Root directory of the OTTO dataset. Set the OTTO_DATADIR environment variable
# to point somewhere else; the fallback is the path this notebook was written with.
DATADIR = os.environ.get("OTTO_DATADIR", "/home/search3/lichunyu/dataset/otto-recommender-system")

In [3]:
# Locations of the event tables and of the event-type vocabularies under DATADIR.
test_data_path, train_data_path, type2idx_path, idx2type_path = (
    os.path.join(DATADIR, file_name)
    for file_name in ("test.parquet", "train.parquet", "type2id.pkl", "id2type.pkl")
)

In [4]:
# Load the event-type vocabularies: a name -> id mapping and its inverse
# (see the printed values below this cell).
# NOTE(review): pd.read_pickle unpickles arbitrary objects — only load files
# from a trusted source.
type2idx = pd.read_pickle(type2idx_path)
idx2type = pd.read_pickle(idx2type_path)
# chr(10) is a newline; it is spelled this way because a backslash cannot
# appear inside an f-string expression on older Python versions.
print(f"type2idx is: {chr(10)}    {type2idx}")
print(f"idx2type is: {chr(10)}    {idx2type}")

type2idx is: 
    {'clicks': 0, 'carts': 1, 'orders': 2}
idx2type is: 
    ['clicks', 'carts', 'orders']


In [5]:
# Read the train and test event tables into polars DataFrames.
train_data = pl.read_parquet(train_data_path)
test_data = pl.read_parquet(test_data_path)

# Bare last expression: display the training frame (columns session, aid, ts, type).
train_data

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


In [6]:
%%time
# Weight per event-type id (ids follow type2idx: 0=clicks, 1=carts, 2=orders).
# NOTE(review): not referenced by any cell visible in this part of the notebook.
type_weight = {0:1, 1:6, 2:3}

# Order events by session ascending and, within a session, by timestamp
# descending (newest first).
# NOTE(review): `reverse=` is the older polars keyword for this — confirm it
# matches the installed polars version.
df_train = train_data.sort(["session", "ts"], reverse=[False, True])

CPU times: user 34.9 s, sys: 4.13 s, total: 39 s
Wall time: 5.29 s


In [7]:
# Keep at most 30 rows per session: number the rows inside each session with a
# running count and drop rows numbered 30 and above. Because df_train was sorted
# newest-first within a session in the previous cell, the rows kept are the
# most recent ones.
# NOTE(review): this overwrites df_train, so re-running the cell depends on the
# sort cell having been run first.
df_train = df_train.select([
    pl.col("*"),
    pl.col("session").cumcount().over("session").alias("cumcount")
]).filter(pl.col("cumcount")<30)[["session", "aid", "ts", "type"]]

In [None]:
%%time
def func(x):
    """Build co-visitation pairs from the events of a single session.

    Parameters
    ----------
    x : sequence of mappings
        The session's events. It must support ``len()`` and positional
        indexing, and each element must expose the keys ``"aid"``, ``"ts"``
        and ``"type"``.

    Returns
    -------
    list of [aid, aid_x, type_x]
        One entry for every pair of positions ``i < j`` whose item ids differ
        and whose timestamps are less than 24 hours apart. ``type_x`` is the
        event type of the second element of the pair.
    """
    # 24h window; assumes ts is expressed in seconds — TODO confirm.
    day_seconds = 24 * 60 * 60
    result = []
    length = len(x)
    for idx_i in range(length):
        i = x[idx_i]
        aid = i["aid"]
        ts = i["ts"]
        for idx_j in range(idx_i + 1, length):
            j = x[idx_j]
            aid_x = j["aid"]
            if aid == aid_x:
                # An item is never paired with itself.
                continue
            # The window must be checked on timestamps. Subtracting the event
            # types (as the previous version did) is always below the
            # threshold, so every pair was kept. abs() keeps the check correct
            # even when the events are not sorted by time.
            if abs(ts - j["ts"]) < day_seconds:
                result.append([aid, aid_x, j["type"]])
    return result


# Run func on each session's (aid, ts, type) structs, then split every produced
# [aid, aid_x, type_x] triple into three columns.
# NOTE(review): the expression is not assigned to a name, so its result is only
# displayed; the cell shows no execution count or output in this export.
# NOTE(review): groupby / apply / .arr are older polars names — confirm they
# match the installed polars version.
df_train.groupby("session").agg([
    pl.struct(["aid", "ts", "type"]).apply(func).flatten().alias("result")
]).select([
    pl.col("result").arr.get(0).alias("aid"),
    pl.col("result").arr.get(1).alias("aid_x"),
    pl.col("result").arr.get(2).alias("type_x")
])

In [None]:
def func(x):
    """Build co-visitation pairs from one session's left and right event lists.

    NOTE(review): this redefines ``func`` and shadows the pairwise version
    defined in an earlier cell; the name is kept because the pipeline cell
    that follows calls ``func``.

    Parameters
    ----------
    x : mapping
        Must expose the keys ``"aid"``, ``"ts"``, ``"type"`` (left side) and
        ``"aid_x"``, ``"ts_x"``, ``"type_x"`` (right side), each an iterable
        with one value per event.

    Returns
    -------
    list of [aid, aid_x, type_x]
        One entry for every left/right combination whose item ids differ and
        whose timestamps are less than 24 hours apart, in left-major order.
    """
    # 24h window; assumes ts is expressed in seconds — TODO confirm.
    day_seconds = 24 * 60 * 60
    # The right side is materialised once because it is scanned again for
    # every left event.
    right_events = list(zip(x["aid_x"], x["ts_x"], x["type_x"]))
    result = []
    # "type" stays in the zip so the left side is still truncated to the
    # shortest of its three lists, as before.
    for _id, _t, _ in zip(x["aid"], x["ts"], x["type"]):
        for _id_x, _t_x, _tp_x in right_events:
            if _id == _id_x:
                # An item is never paired with itself.
                continue
            # Use the absolute gap: with a signed difference every pair whose
            # left event is the earlier one passed the test, however far apart
            # the two events were.
            if abs(_t - _t_x) < day_seconds:
                result.append([_id, _id_x, _tp_x])
    return result


# Collect every column into one list per session, run func on the combined
# struct, then split every produced [aid, aid_x, type_x] triple into columns.
# NOTE(review): this needs aid_x / ts_x / type_x columns on df_train, which no
# cell visible above creates (the truncation cell keeps only session, aid, ts,
# type) — presumably a self-join cell is missing from this notebook; confirm.
# NOTE(review): the expression is not assigned to a name, so its result is only
# displayed.
df_train.groupby("session").agg([
    pl.col("*")
]).select([
    pl.struct(["aid", "ts", "type", "aid_x", "ts_x", "type_x"]).apply(func).flatten().alias("result")
]).select([
    pl.col("result").arr.get(0).alias("aid"),
    pl.col("result").arr.get(1).alias("aid_x"),
    pl.col("result").arr.get(2).alias("type_x")
])

In [11]:
df_train

session,aid,ts,type,aid_x,ts_x,type_x
i32,i32,i32,u8,i32,i32,u8
0,161938,1661684983,0,161938,1661684983,0
0,1740927,1661684942,0,1740927,1661684942,0
0,1228848,1661684528,0,1228848,1661684528,0
0,938007,1661684355,0,938007,1661684355,0
0,843110,1661684298,0,843110,1661684298,0
0,219925,1661684258,0,219925,1661684258,0
0,341626,1661684136,0,341626,1661684136,0
0,543308,1661682228,0,543308,1661682228,0
0,1048797,1661673496,0,1048797,1661673496,0
0,1048797,1661634830,0,1048797,1661634830,0
