In [1]:
import os
import time
import glob

import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
import cudf
from sklearn.model_selection import GroupKFold
import joblib
from matplotlib import pyplot as plt

In [2]:
DATASET_PATH = "/home/search2/lichunyu/otto-recommender-system/data/input"

TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

In [3]:
train_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/train_parquet/*.parquet")))
# train_files

In [7]:
test_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-chunk-data-inparquet-format/test_parquet/*.parquet")))

# Candidate Strategery
* top30 最近的行为
* top10 clicks
* top10 carts
* top10 orders
* repeat orders | clicks | carts

# top10

In [4]:
df_carts_count = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_count.pkl"))
df_carts_count = df_carts_count.sort_values("count", ascending=False).reset_index(drop=True)
df_carts_count
df_carts_count.head(50).to_pickle(os.path.join(DATASET_PATH, "feature/carts_top50.pkl"))

Unnamed: 0,aid,count
0,1,0
1,3,0
2,4,0
3,8,0
4,10,0
...,...,...
1102655,1855594,1
1102656,1855595,0
1102657,1855597,0
1102658,1855600,0


In [11]:
df_clicks_count = pd.read_pickle(os.path.join(DATASET_PATH, "feature/clicks_count.pkl"))
df_clicks_count = df_clicks_count.sort_values("count", ascending=False).reset_index(drop=True)
df_clicks_count

Unnamed: 0,aid,count
0,1460571,867
1,108125,850
2,29735,823
3,1733943,765
4,832192,648
...,...,...
1824580,658065,0
1824581,658064,0
1824582,658063,0
1824583,658062,0


In [12]:
df_clicks_count.head(50).to_pickle(os.path.join(DATASET_PATH, "feature/clicks_top50.pkl"))

In [13]:
df_orders_count = pd.read_pickle(os.path.join(DATASET_PATH, "feature/orders_count.pkl"))
df_orders_count = df_orders_count.sort_values("count", ascending=False).reset_index(drop=True)
df_orders_count

Unnamed: 0,aid,count
0,231487,38
1,166037,30
2,1733943,27
3,1629608,24
4,756588,22
...,...,...
570713,623027,0
570714,623026,0
570715,623021,0
570716,623020,0


In [14]:
df_orders_count.head(50).to_pickle(os.path.join(DATASET_PATH, "feature/orders_top50.pkl"))

# TOP10 BY USERS
* train_data 与 test_data session 无重复（验证过）

In [8]:
df_test = None
for f in test_files:
    df_tmp = read_parquet(f)
    df_tmp = df_tmp.sort_values(['session','ts'],ascending=[True,False])
    df_tmp["n"] = df_tmp.groupby('session').cumcount()
    df_tmp = df_tmp.loc[df_tmp["n"]<30].drop("n", axis=1)
    if df_test is None:
        df_test = df_tmp
    else:
        df_test = pd.concat([df_test, df_tmp]).reset_index(drop=True)
        
df_test

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724155,0
2,12899780,736515,1661724136,0
3,12899780,973453,1661724109,0
4,12899780,582732,1661724058,0
...,...,...,...,...
6370316,14571577,1141710,1662328774,0
6370317,14571578,519105,1662328775,0
6370318,14571579,739876,1662328775,0
6370319,14571580,202353,1662328781,0


In [9]:
df_test_session = df_test[["session"]].drop_duplicates(subset="session")
df_test_session["inner_key"] = 1
df_test_session

Unnamed: 0,session,inner_key
0,12899779,1
1,12899780,1
6,12899781,1
17,12899782,1
47,12899783,1
...,...,...
6370316,14571577,1
6370317,14571578,1
6370318,14571579,1
6370319,14571580,1


In [10]:
df_orders_top10 = pd.read_pickle(os.path.join(DATASET_PATH, "feature/orders_top50.pkl"))[["aid"]].head(10)
df_orders_top10["type"] = TYPE_MAP["orders"]
df_orders_top10["inner_key"] = 1
df_orders_top10 = df_orders_top10.merge(df_test_session, on="inner_key", how="inner")
df_orders_top10["ts"] = None
df_orders_top10 = df_orders_top10[["session", "aid", "ts", "type"]]
df_orders_top10

Unnamed: 0,session,aid,ts,type
0,12899779,231487,,2
1,12899780,231487,,2
2,12899781,231487,,2
3,12899782,231487,,2
4,12899783,231487,,2
...,...,...,...,...
16718025,14571577,1083665,,2
16718026,14571578,1083665,,2
16718027,14571579,1083665,,2
16718028,14571580,1083665,,2


In [11]:
df_carts_top10 = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_top50.pkl"))[["aid"]].head(10)
df_carts_top10["type"] = TYPE_MAP["carts"]
df_carts_top10["inner_key"] = 1
df_carts_top10 = df_carts_top10.merge(df_test_session, on="inner_key", how="inner")
df_carts_top10["ts"] = None
df_carts_top10 = df_carts_top10[["session", "aid", "ts", "type"]]
df_carts_top10

Unnamed: 0,session,aid,ts,type
0,12899779,166037,,1
1,12899780,166037,,1
2,12899781,166037,,1
3,12899782,166037,,1
4,12899783,166037,,1
...,...,...,...,...
16718025,14571577,1629608,,1
16718026,14571578,1629608,,1
16718027,14571579,1629608,,1
16718028,14571580,1629608,,1


In [12]:
df_clicks_top10 = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_top50.pkl"))[["aid"]].head(10)
df_clicks_top10["type"] = TYPE_MAP["clicks"]
df_clicks_top10["inner_key"] = 1
df_clicks_top10 = df_clicks_top10.merge(df_test_session, on="inner_key", how="inner")
df_clicks_top10["ts"] = None
df_clicks_top10 = df_clicks_top10[["session", "aid", "ts", "type"]]
df_clicks_top10

Unnamed: 0,session,aid,ts,type
0,12899779,166037,,0
1,12899780,166037,,0
2,12899781,166037,,0
3,12899782,166037,,0
4,12899783,166037,,0
...,...,...,...,...
16718025,14571577,1629608,,0
16718026,14571578,1629608,,0
16718027,14571579,1629608,,0
16718028,14571580,1629608,,0


In [13]:
df = pd.concat([df_test, df_orders_top10, df_carts_top10, df_clicks_top10]).sort_values(['session','ts'],ascending=[True,False]).reset_index(drop=True)
df

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899779,231487,,2
2,12899779,166037,,2
3,12899779,1733943,,2
4,12899779,1629608,,2
...,...,...,...,...
56524406,14571581,832192,,0
56524407,14571581,1022566,,0
56524408,14571581,33343,,0
56524409,14571581,1083665,,0


In [14]:
df.to_pickle(os.path.join(DATASET_PATH, "feature/candidate_test.pkl"))

In [1]:
56524411/1600000

35.327756875

# dump candidate

In [None]:
pd.read_pickle(os.path.join(DATASET_PATH, "feature/candidate_comatrix_exploded_details.pkl"))