In [59]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
import cudf
from sklearn.model_selection import GroupKFold
import joblib
from openfe import openfe, transform

warnings.filterwarnings("ignore")

# PARAMS

In [68]:
TRAIN_SESSION_SAMPLE_SIZE = 5e6
# TRAIN_SESSION_SAMPLE_SIZE = 5e5
DOWNSAMPLE_RATE = 20
VALID_PICKLE_PATH = "/home/search2/lichunyu/otto-recommender-system/data/input/otto-validation/test_pickle"

valid_pickle = sorted(glob.glob(os.path.join(VALID_PICKLE_PATH, "*_candidates.pkl")))
# valid_pickle

In [31]:
DATASET_PATH = "/home/search2/lichunyu/otto-recommender-system/data/input"

TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

In [32]:
test_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/test_parquet/*.parquet")))
# test_files

# Carts

用户最近20个行为商品 + 用户top商品20 + 全部top商品10

In [33]:
train_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/train_parquet/*.parquet")))
# train_files

In [69]:
%%time
def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

df_train = None
for i in train_files:
    df = read_parquet(i)
#     df = cudf.DataFrame(read_parquet(i))
    df = df.sort_values(['session','ts'],ascending=[True,False])
#     df = df.to_pandas()
    if df_train is None:
        df_train = df
    else:
        df_train = pd.concat([df_train, df])
#     break
        
df_train

CPU times: user 55.6 s, sys: 17.2 s, total: 1min 12s
Wall time: 1min 7s


Unnamed: 0,session,aid,ts,type
146,0,1110548,1661103727,0
145,0,724999,1661103701,0
144,0,30373,1661103687,0
143,0,102416,1661019639,0
142,0,504365,1661017998,0
...,...,...,...,...
656387,11098523,175715,1661119197,0
656388,11098524,1088524,1661119198,0
656389,11098525,182927,1661119199,0
656390,11098526,510055,1661119199,0


In [12]:
aid_candidates = df_train["aid"].unique()
aid_candidates

array([1110548,  724999,   30373, ..., 1009870, 1842655,  610759],
      dtype=int32)

## Sample

In [70]:
%%time
df_train = df_train[df_train["session"].isin(pd.DataFrame(np.random.choice(df_train.session.unique(), int(TRAIN_SESSION_SAMPLE_SIZE))).rename({0: "session"}, axis=1)["session"])]
train_ts_maximal = df_train["ts"].max()
train_ts_minimal = df_train["ts"].min()
df_train_label = df_train[(train_ts_maximal-df_train["ts"]) < 24*60*60].reset_index(drop=True)
df_train = df_train[(train_ts_maximal-df_train["ts"]) >= 24*60*60].reset_index(drop=True)
df_train_label["label"] = 1

CPU times: user 3.03 s, sys: 561 ms, total: 3.59 s
Wall time: 3.59 s


## Downsample

1:20

In [71]:
df_train = df_train.merge(df_train_label[["session", "aid", "label"]], on=["session", "aid"], how="outer").fillna({"label": 0, "type": 3, "ts": train_ts_minimal})
df_train["label"] = df_train["label"].astype("int8")
df_train["type"].value_counts()

0.0    51077061
1.0     4421522
3.0     2951806
2.0     1340670
Name: type, dtype: int64

In [72]:
def downsample(df, candidates=None, n=-1):
    if n == -1:
        n = DOWNSAMPLE_RATE
    if candidates is None:
        candidates = []
    df_negative = df[df["label"]==0]
    df_postive = df[df["label"]==1]
    r = len(df_negative)/len(df_postive)
    print(f"current negative size: {len(df_negative)}, postive size: {len(df_postive)}, rate: {r}")
    if r > 20:
        gloden_negative_size = n * len(df_postive)
        df_negative = df_negative.sample(gloden_negative_size)
        df = pd.concat([df_postive, df_negative])
    df["_noise"] = np.random.randn(len(df))
    df = df.sort_values(["session", "_noise"])
    df = df.drop("_noise", axis=1).reset_index(drop=True)
    return df

In [73]:
df_train = downsample(df_train)

current negative size: 55447314, postive size: 4343745, rate: 12.764863959555637


# Feature Engineering

In [74]:
%%time

# FEATURE_COL = [
#     "ts_day",
#     "carts_count",
#     "clicks_count",
#     "orders_count",
#     "session_clicks_count",
# ]

def ts_day(df):
    ts_minimal = df["ts"].min()
    df["ts_day"] = (df["ts"]-ts_minimal) // (24*60*60)
    return df

def carts_count(df):
    df_tmp = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_count.pkl"))
    df = df.merge(df_tmp, on="aid", how="left").fillna({"count": 0}).rename({"count": "carts_count"}, axis=1)
    df["carts_count"] = df["carts_count"].astype("int32")
    return df

def clicks_count(df):
    df_tmp = pd.read_pickle(os.path.join(DATASET_PATH, "feature/clicks_count.pkl"))
    df = df.merge(df_tmp, on="aid", how="left").fillna({"count": 0}).rename({"count": "clicks_count"}, axis=1)
    df["clicks_count"] = df["clicks_count"].astype("int32")
    return df

def orders_count(df):
    df_tmp = pd.read_pickle(os.path.join(DATASET_PATH, "feature/orders_count.pkl"))
    df = df.merge(df_tmp, on="aid", how="left").fillna({"count": 0}).rename({"count": "orders_count"}, axis=1)
    df["orders_count"] = df["orders_count"].astype("int32")
    return df

def session_clicks_count(df):
    df_tmp = df[df["type"]==TYPE_MAP["clicks"]].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_clicks_count"}, axis=1)
    df["session_clicks_count"] = df["session_clicks_count"].astype("int32")
    return df

def session_orders_count(df):
    df_tmp = df[df["type"]==TYPE_MAP["orders"]].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_orders_count"}, axis=1)
    df["session_orders_count"] = df["session_orders_count"].astype("int32")
    return df

def session_carts_count(df):
    df_tmp = df[df["type"]==TYPE_MAP["carts"]].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_carts_count"}, axis=1)
    df["session_carts_count"] = df["session_carts_count"].astype("int32")
    return df

def session_unknow_count(df):
    df_tmp = df[df["type"]==3].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_unknow_count"}, axis=1)
    df["session_unknow_count"] = df["session_unknow_count"].astype("int32")
    return df

# ---- public -----
def add_session_length(df):
    # If not using cuDF, remove .to_pandas()
    df['session_length'] = df.groupby('session')['ts'].transform('count')
    return df

def add_action_num_reverse_chrono(df):
    df['action_num_reverse_chrono'] = df.session_length - df.groupby('session').cumcount() - 1
    return df

def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    df['log_recency_score'] = (2 ** linear_interpolation - 1).fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    type_weights = {0:1, 1:6, 2:3}
    df['type_weighted_log_recency_score'] = df['log_recency_score'] / df['type'].map(type_weights)
    return df


pipeline = [
    ts_day,
    carts_count,
    clicks_count,
    orders_count,
    session_clicks_count,
    session_orders_count,
    session_carts_count,
    session_unknow_count,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score
]

for p in pipeline:
    df_train = p(df_train)

FEATURE_COL = list(set(df_train.columns.tolist()) - set(["session", "aid", "ts", "label"]))
with open("FEATURE_COL.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))

df_train

CPU times: user 50 s, sys: 17.6 s, total: 1min 7s
Wall time: 1min 7s


Unnamed: 0,session,aid,ts,type,label,ts_day,carts_count,clicks_count,orders_count,session_clicks_count,session_orders_count,session_carts_count,session_unknow_count,session_length,action_num_reverse_chrono,log_recency_score,type_weighted_log_recency_score
0,0,789245,1.659710e+09,1.0,0,4.0,1,14,0,139,2,3,2,146,145,0.071773,0.011962
1,0,1190046,1.659385e+09,0.0,0,0.0,2,12,0,139,2,3,2,146,144,0.076394,0.076394
2,0,1650637,1.659458e+09,0.0,0,1.0,1,16,0,139,2,3,2,146,143,0.081035,0.081035
3,0,964169,1.660561e+09,0.0,0,14.0,0,23,0,139,2,3,2,146,142,0.085696,0.085696
4,0,1309446,1.659367e+09,0.0,0,0.0,4,41,0,139,2,3,2,146,141,0.090377,0.090377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59791054,11098518,1039909,1.659305e+09,3.0,1,0.0,1,4,0,0,0,0,1,1,0,1.000000,
59791055,11098519,1515198,1.659305e+09,3.0,1,0.0,0,8,0,0,0,0,1,1,0,1.000000,
59791056,11098521,570499,1.659305e+09,3.0,1,0.0,0,5,0,0,0,0,1,1,0,1.000000,
59791057,11098522,1524949,1.659305e+09,3.0,1,0.0,0,0,0,0,0,0,1,1,0,1.000000,


In [81]:
FEATURE_COL = list(set(df_train.columns.tolist()) - set(["session", "aid", "ts", "label", "ts_day"]))
with open("FEATURE_COL.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))


# Model

In [82]:
ranker = xgb.XGBRanker(
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9,  # 0.9
    eta=0.05, 
    max_depth=6, 
    n_estimators=80, 
    subsample=0.9,
    n_jobs=10
)

feature_cols = FEATURE_COL
label_col = 'label'

ranker.fit(
    X=df_train[feature_cols],
    y=df_train[label_col],
    group=df_train.groupby("session").count()["label"]
)

joblib.dump(ranker, f"carts_xgbranker_1.m")

['carts_xgbranker_1.m']

# Validation

In [None]:
%%time

# ranker = joblib.load("carts_xgbranker_1.m")

def validate(df_valid, scope=None):
    if scope is None:
#         scope = ["clicks", "carts", "orders"]
            scope = ["carts"]
    label = pd.read_parquet(os.path.join(DATASET_PATH, "otto-validation/test_labels.parquet"))
    df_valid = df_valid.merge(label, on=["session", "type"])
    df_valid["hits"] = df_valid.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
    df_valid['gt_count'] = df_valid.ground_truth.str.len().clip(0,20)
    recall = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for s in scope:
        df = df_valid[df_valid["type"]==s]
        recall_s = df["hits"].sum() / df['gt_count'].sum()
        print(f"{s} Recall: {recall_s}")
        recall += recall_s*weights[s]
    print("")
    print(f"CV Recall: {recall}")
    return 


df_valid = None

batch = 1

for i in tqdm(range(0, len(valid_pickle), batch)):
    df = pd.concat([pd.read_pickle(_) for _ in valid_pickle[i:i+batch]]).reset_index(drop=True)
#     df = pd.read_pickle(f)
    for p in pipeline:
        df = p(df)
    df["score"] = ranker.predict(df[FEATURE_COL])
    df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    df["type"] = "carts"
    if df_valid is None:
        df_valid = df
    else:
        df_valid = pd.concat([df_valid, df])

validate(df_valid, ["carts"])

 11%|███████████▊                                                                                                    | 2/19 [00:44<06:21, 22.46s/it]

# Inference

In [5]:
test_data_path = os.path.join(DATASET_PATH, "otto-chunk-data-inparquet-format/test_parquet/*.parquet")
test_data = sorted(glob.glob(test_data_path))
ranker = joblib.load("carts_xgbranker_1.m")

In [62]:
%%time

test_data_path = sorted(glob.glob("/home/search2/lichunyu/otto-recommender-system/data/input/feature/test_chunk_data/*_candidates.pkl"))

# df_test = pd.read_pickle(os.path.join(DATASET_PATH, "feature/candidate_comatrix_exploded_details.pkl"))
df_test = None
for i in tqdm(test_data_path):
    df = pd.read_pickle(i)
    for p in pipeline:
        df = p(df)
    df["score"] = ranker.predict(df[FEATURE_COL])
    df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    if df_test is None:
        df_test = df
    else:
        df_test = pd.concat([df_test, df])
        
        
df_test = df_test.reset_index(drop=True)

# df_test["score"] = ranker.predict(df_test[FEATURE_COL])
# df_test = df_test.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
# df_test = df_test.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
df_test

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [07:41<00:00, 27.15s/it]

CPU times: user 13min 25s, sys: 1min 2s, total: 14min 27s
Wall time: 7min 41s





Unnamed: 0,session,aid
0,12899779,"[59625, 1460571, 94230, 737445, 686765, 164098..."
1,12899780,"[1142000, 736515, 1142000, 1360606, 1032776, 1..."
2,12899781,"[199008, 1460571, 754412, 1853288, 1043508, 51..."
3,12899782,"[834354, 834354, 740494, 889671, 1007613, 1669..."
4,12899783,"[1142000, 1586171, 736515, 1502122, 77422, 607..."
...,...,...
1671798,13899774,"[393555, 481124, 755668, 354395, 108125, 40581..."
1671799,13899775,"[182882, 1146306, 943641, 1217686, 1070309, 30..."
1671800,13899776,"[688174, 1236234, 1236234, 799127, 947126, 619..."
1671801,13899777,"[1382226, 1594025, 615408, 230911, 1855330, 67..."


In [63]:
df_test["session_type"] = df_test["session"].apply(lambda x: str(x)+"_carts")
df_test = df_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
df_test

Unnamed: 0,session_type,labels
0,12899779_carts,"[59625, 1460571, 94230, 737445, 686765, 164098..."
1,12899780_carts,"[1142000, 736515, 1142000, 1360606, 1032776, 1..."
2,12899781_carts,"[199008, 1460571, 754412, 1853288, 1043508, 51..."
3,12899782_carts,"[834354, 834354, 740494, 889671, 1007613, 1669..."
4,12899783_carts,"[1142000, 1586171, 736515, 1502122, 77422, 607..."
...,...,...
1671798,13899774_carts,"[393555, 481124, 755668, 354395, 108125, 40581..."
1671799,13899775_carts,"[182882, 1146306, 943641, 1217686, 1070309, 30..."
1671800,13899776_carts,"[688174, 1236234, 1236234, 799127, 947126, 619..."
1671801,13899777_carts,"[1382226, 1594025, 615408, 230911, 1855330, 67..."


In [64]:
df_carts_top50 = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_top50.pkl"))
carts_top50 = df_carts_top50["aid"].tolist()

def func(x):
    if len(x) < 20:
        n = 20 - len(x)
        x = x + carts_top50[:n]
    return x

df_test["labels"] = df_test["labels"].apply(func)
df_test["labels"] = df_test["labels"].apply(lambda x: " ".join([str(_) for _ in x]))
df_test

Unnamed: 0,session_type,labels
0,12899779_carts,59625 1460571 94230 737445 686765 164098 43819...
1,12899780_carts,1142000 736515 1142000 1360606 1032776 1263108...
2,12899781_carts,199008 1460571 754412 1853288 1043508 518425 5...
3,12899782_carts,834354 834354 740494 889671 1007613 1669402 59...
4,12899783_carts,1142000 1586171 736515 1502122 77422 607638 17...
...,...,...
1671798,13899774_carts,393555 481124 755668 354395 108125 405819 5603...
1671799,13899775_carts,182882 1146306 943641 1217686 1070309 30822 11...
1671800,13899776_carts,688174 1236234 1236234 799127 947126 619885 11...
1671801,13899777_carts,1382226 1594025 615408 230911 1855330 671881 1...


In [65]:
df_submission = pd.read_csv("../data/output/submission.csv")
df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = pd.concat([df_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_carts.csv", index=False)

In [66]:
df_submission

Unnamed: 0,session_type,labels
0,12899779_carts,59625 1460571 94230 737445 686765 164098 43819...
1,12899780_carts,1142000 736515 1142000 1360606 1032776 1263108...
2,12899781_carts,199008 1460571 754412 1853288 1043508 518425 5...
3,12899782_carts,834354 834354 740494 889671 1007613 1669402 59...
4,12899783_carts,1142000 1586171 736515 1502122 77422 607638 17...
...,...,...
3343601,14571577_orders,1141710 1276792 1666114 631085 1004292 367734 ...
3343602,14571578_orders,519105 977826 1811714 822641 1671592 815460 15...
3343603,14571579_orders,739876 1209992 1550479 1750859 785544 51363 21...
3343604,14571580_orders,202353 433425 1314576 925638 1231403 871658 88...


# submission csv must have 5015409 rows

In [107]:
df_submission.shape

(5015409, 2)

In [18]:
5015409/3

1671803.0

# Unused Function

In [None]:
def bool_count(df):
    df_tmp = df[df["label"]==0].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "negative_count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna(0)
    df["negative_count"] = df["negative_count"].astype("int32")
    df_tmp = df[df["label"]==1].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "postive_count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna(0)
    df["postive_count"] = df["postive_count"].astype("int32")
    return df