In [1]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Param

In [2]:
DATA_PATH = "/home/search3/lichunyu/otto-recommender-system/data"
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

## Model Train

In [5]:
%%time

# valid_a_candidates_path = sorted(glob.glob(
#     os.path.join(DATA_PATH, "input/otto-candidates/valid_a_candidates/valid_a_candidates_buys_top50.parquet ")
# ))

# df_train_candidates = pd.concat([pd.read_parquet(i) for i in valid_a_candidates_path])

valid_a_candidates_path = os.path.join(DATA_PATH, "input/otto-candidates/valid_a_candidates/valid_a_candidates_buys_top50.parquet")
df_train_candidates = pd.read_parquet(valid_a_candidates_path)


df_train_label = pd.read_parquet(
    os.path.join(DATA_PATH, "input/otto-validation/test_labels.parquet")
)
df_train_label = df_train_label[df_train_label["type"]=="orders"][["session", "ground_truth"]]
df_train_label = df_train_label.explode("ground_truth").reset_index(drop=True)
df_train_label = df_train_label.rename({"ground_truth": "aid"}, axis=1)
df_train_label["label"] = 1

df_train_candidates = df_train_candidates.merge(df_train_label, on=["session", "aid"], how="left").fillna({"label": 0})
df_train_candidates["label"] = df_train_candidates["label"].astype("int")
df_train_candidates.shape

CPU times: user 44.9 s, sys: 7.33 s, total: 52.2 s
Wall time: 51 s


(89862970, 3)

In [6]:
def downsample(df, n=-1):
    if n == -1:
        n = DOWNSAMPLE_RATE
    df_negative = df[df["label"]==0]
    df_postive = df[df["label"]==1]
    r = len(df_negative)//len(df_postive)
    print(f"current negative size: {len(df_negative)}, postive size: {len(df_postive)}, rate: {r}")
    if r > n:
        gloden_negative_size = n * len(df_postive)
        df_negative = df_negative.sample(gloden_negative_size)
        df = pd.concat([df_postive, df_negative])
    df["_noise"] = np.random.randn(len(df))
    df = df.sort_values(["session", "_noise"])
    df = df.drop("_noise", axis=1).reset_index(drop=True)
    return df


def train_valid_split(df):
    print(f"origin all data size: {len(df)}")
    valid_session = np.random.choice(df_train_candidates.session.unique(), int(len(df_train_candidates.session.unique())*0.2))
    df_train = df[~df["session"].isin(valid_session)]
    df_valid = df[df["session"].isin(valid_session)]
    df_train = downsample(df_train)
    return df_train, df_valid


df_train_candidates, df_valid_candidates = train_valid_split(df_train_candidates)
df_train_candidates["label"] = df_train_candidates["label"].astype("int")
df_valid_candidates["label"] = df_valid_candidates["label"].astype("int")
print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")

# storage
# df_train_candidates.to_pickle("df_train_candidates.pkl")
# df_valid_candidates.to_pickle("df_valid_candidates.pkl")

origin all data size: 89862970
current negative size: 73396016, postive size: 175530, rate: 418
df_train_candidates size: 3686130, df_valid_candidates size: 16291424


In [6]:
# %%time

# for p in pipeline:
#     df_train_candidates = p(df_train_candidates)
#     df_valid_candidates = p(df_valid_candidates)

CPU times: user 7.41 s, sys: 1.16 s, total: 8.56 s
Wall time: 8.56 s


In [7]:
%%time

train_item_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/item_feature_*.parquet")
))

for p in tqdm(train_item_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on="aid", how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on="aid", how="left")

print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")
df_train_candidates.head(5)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.96s/it]

df_train_candidates size: 3686130, df_valid_candidates size: 16291424
CPU times: user 9.64 s, sys: 548 ms, total: 10.2 s
Wall time: 9.96 s





Unnamed: 0,session,aid,label,item_user_count,item_item_count,item_buy_ratio
0,11098528,542780,0,10772,16655,0.07139
1,11098528,205357,0,5105,8528,0.08478
2,11098528,822934,0,14015,23154,0.192451
3,11098528,487136,0,18408,29704,0.155232
4,11098528,11830,1,19211,33776,0.17397


In [8]:
%%time

train_user_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/user_feature_*.parquet")
))

for p in tqdm(train_user_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on="session", how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on="session", how="left")

print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")
df_train_candidates.head(5)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.86s/it]

df_train_candidates size: 3686130, df_valid_candidates size: 16291424
CPU times: user 3.57 s, sys: 497 ms, total: 4.07 s
Wall time: 3.87 s





Unnamed: 0,session,aid,label,item_user_count,item_item_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio
0,11098528,542780,0,10772,16655,0.07139,1,1,0.0
1,11098528,205357,0,5105,8528,0.08478,1,1,0.0
2,11098528,822934,0,14015,23154,0.192451,1,1,0.0
3,11098528,487136,0,18408,29704,0.155232,1,1,0.0
4,11098528,11830,1,19211,33776,0.17397,1,1,0.0


## Storage

In [9]:
df_train_candidates.to_pickle("df_train_candidates.pkl")
df_valid_candidates.to_pickle("df_valid_candidates.pkl")

## Feature Name

In [10]:
FEATURE_COL = list(set(df_train_candidates.columns.tolist()) - set(["session", "aid", "ts", "label"]))
with open("FEATURE_COL.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))
    
FEATURE_COL

['user_user_count',
 'item_item_count',
 'item_buy_ratio',
 'item_user_count',
 'user_buy_ratio',
 'user_item_count']

## Training

In [11]:
%%time
n_estimators_candidates = list(range(10,51,10))

for n_estimators in tqdm(n_estimators_candidates):
    ranker = xgb.XGBRanker(
#         tree_method='gpu_hist',
        tree_method="hist",
        booster='gbtree',
        objective='rank:pairwise',
        random_state=42, 
        learning_rate=0.1,
        colsample_bytree=0.9,  # 0.9
        eta=0.05, 
        max_depth=6, 
        n_estimators=n_estimators,
        subsample=0.8,
        n_jobs=11
    )

    feature_cols = FEATURE_COL
    label_col = 'label'

    ranker.fit(
        X=df_train_candidates[feature_cols],
        y=df_train_candidates[label_col],
        group=df_train_candidates.groupby("session").count()["label"]
    )

    joblib.dump(ranker, f"models/orders_xgbranker_{str(n_estimators)}.m")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.39s/it]

CPU times: user 4min 46s, sys: 3.26 s, total: 4min 50s
Wall time: 26.9 s





# Valid

In [12]:
%%time

# ranker = joblib.load("carts_xgbranker_1.m")

# FEATURE_COL = list(ranker.feature_names_in_)
with open("FEATURE_COL.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
    
print(FEATURE_COL)

for n_estimators in tqdm(n_estimators_candidates[::-1]):
    df_valid_candidates = pd.read_pickle("df_valid_candidates.pkl")
    ranker = joblib.load(f"models/orders_xgbranker_{n_estimators}.m")
    df_valid_candidates["score"] = ranker.predict(df_valid_candidates[FEATURE_COL])
    df_valid_candidates = df_valid_candidates.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid', "label", "score"]].reset_index(drop=True)
    recall = pl.DataFrame(df_valid_candidates).groupby("session").agg([
        (pl.col("label").head(20).sum()/pl.col("label").sum()).alias("gt_ratio")
    ]).select(pl.col("gt_ratio").drop_nans()).mean()["gt_ratio"][0]
    print(f"n_estimators={n_estimators} get Recall@20: {recall}")

['user_user_count', 'item_item_count', 'item_buy_ratio', 'item_user_count', 'user_buy_ratio', 'user_item_count']


 20%|████████████████████                                                                                | 1/5 [00:11<00:45, 11.36s/it]

n_estimators=50 get Recall@20: 0.5787934275097294


 40%|████████████████████████████████████████                                                            | 2/5 [00:22<00:33, 11.03s/it]

n_estimators=40 get Recall@20: 0.5797784249789684


 60%|████████████████████████████████████████████████████████████                                        | 3/5 [00:32<00:21, 10.90s/it]

n_estimators=30 get Recall@20: 0.579733063855297


 80%|████████████████████████████████████████████████████████████████████████████████                    | 4/5 [00:43<00:10, 10.76s/it]

n_estimators=20 get Recall@20: 0.5803773727008771


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:52<00:00, 10.59s/it]

n_estimators=10 get Recall@20: 0.5922325112274903
CPU times: user 2min 11s, sys: 7.93 s, total: 2min 19s
Wall time: 53 s





In [54]:
(0.8097062407307981 - 0.8072849174254828)*0.6

0.0014527939831891822

# Inference

In [26]:
kaggle_test_candidates_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-test-chunk-candidates-top100/test_chunk_data/test_chunk_data/*_candidates.pkl")
))

df_kaggle_test = pd.concat([pd.read_pickle(i) for i in kaggle_test_candidates_path])
df_kaggle_test

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899779,469285,1661724000,3
2,12899779,1493965,1661724000,3
3,12899779,438191,1661724000,3
4,12899779,731692,1661724000,3
...,...,...,...,...
101041929,13899778,880834,1661724000,3
101041930,13899778,894123,1661724000,3
101041931,13899778,907634,1661724000,3
101041932,13899778,937690,1661724000,3


In [28]:
%%time

# for p in tqdm(pipeline):
#     df_kaggle_test = p(df_kaggle_test)


test_user_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/test/user_feature_*.parquet")
))
for p in tqdm(train_user_feature_path):
    df_kaggle_test = df_kaggle_test.merge(pd.read_parquet(p), on="session", how="left")


test_item_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/test/item_feature_*.parquet")
))
for p in tqdm(train_item_feature_path):
    df_kaggle_test = df_kaggle_test.merge(pd.read_parquet(p), on="aid", how="left")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:27<00:00, 27.03s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:29<00:00, 89.14s/it]

CPU times: user 1min 22s, sys: 29.3 s, total: 1min 51s
Wall time: 1min 56s





In [29]:
df_kaggle_test

Unnamed: 0,session,aid,ts,type,user_user_count,user_item_count,user_buy_ratio,item_user_count,item_item_count,item_buy_ratio
0,12899779,59625,1661724000,0,,,,9.0,10.0,0.000000
1,12899779,469285,1661724000,3,,,,26.0,42.0,0.071429
2,12899779,1493965,1661724000,3,,,,108.0,139.0,0.064748
3,12899779,438191,1661724000,3,,,,1869.0,2847.0,0.028802
4,12899779,731692,1661724000,3,,,,45.0,89.0,0.191011
...,...,...,...,...,...,...,...,...,...,...
168843432,13899778,880834,1661724000,3,,,,25.0,35.0,0.171429
168843433,13899778,894123,1661724000,3,,,,266.0,348.0,0.020115
168843434,13899778,907634,1661724000,3,,,,630.0,1023.0,0.037146
168843435,13899778,937690,1661724000,3,,,,176.0,268.0,0.014925


In [25]:
%%time
best_n_estimators = "20"
ranker = joblib.load(f"models/orders_xgbranker_{best_n_estimators}.m")
df_kaggle_test["score"] = ranker.predict(df_kaggle_test[FEATURE_COL])
df_kaggle_test = df_kaggle_test.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
df_kaggle_test = df_kaggle_test.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
df_kaggle_test

KeyError: "None of [Index(['user_user_count', 'item_item_count', 'item_buy_ratio',\n       'item_user_count', 'user_buy_ratio', 'user_item_count'],\n      dtype='object')] are in the [columns]"

In [17]:
%%time

df_kaggle_test["session_type"] = df_kaggle_test["session"].apply(lambda x: str(x)+"_orders")
# df_test["session_type"] = df_test["session"].apply(lambda x: str(x)+"_carts")
df_kaggle_test = df_kaggle_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
df_kaggle_test["labels"] = df_kaggle_test["labels"].apply(lambda x: " ".join([str(_) for _ in x]))
df_submission = pd.read_csv("../data/output/submission_578.csv")
# df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = df_submission[~df_submission.session_type.str.contains("_orders$")]
df_submission = pd.concat([df_kaggle_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_orders.csv", index=False)
df_submission.shape

CPU times: user 30.8 s, sys: 1.24 s, total: 32.1 s
Wall time: 35.6 s


(5015409, 2)

In [30]:
df_submission = pd.read_csv("../data/output/submission_578.csv")
df_submission

Unnamed: 0,session_type,labels
0,13913534_clicks,278972 1089813 1638602 667905 1076015 967342 1...
1,14289599_clicks,928810 814150 620490 90121 435196 1390232 1305...
2,12997553_carts,1837490 1066416 57315 127864 1582125 959548 10...
3,14090373_carts,1400630 63531 629603 337471 1497245 199409 168...
4,12919826_orders,138748 907262 223388 1655327 525721 1052093 87...
...,...,...
5015404,13686109_orders,823175 1254185 1424310 147427 635613 1440236 1...
5015405,13894288_carts,442211 176650 1155739 1639021 480831 1354998 1...
5015406,13871060_carts,528019 795130 439795 341050 1270528 1658239 10...
5015407,12965861_carts,1851407 623811 1667230 990186 1178396 1695413 ...
