In [None]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Param

In [None]:
# Root directory of the OTTO data. The original absolute path stays the default;
# set the OTTO_DATA_PATH environment variable to run on another machine.
DATA_PATH = os.environ.get("OTTO_DATA_PATH", "/home/search3/lichunyu/otto-recommender-system/data")
# Integer code for each event type (applied by read_parquet).
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
# Candidates-per-session suffix used in the candidate file names (``..._top{TOPN}``).
TOPN = 60
# Upper bound on the negative:positive ratio kept by downsample().
DOWNSAMPLE_RATE = 20
# Fraction of sessions drawn for the validation split.
VALID_DATA_RATIO = 0.2
# Number of chunk files the Kaggle test candidates are split into. Defined here so
# the chunking and inference cells work on a fresh kernel without running the
# expensive cell that used to be the only place this name was assigned.
INFERENCE_CHUNK_NUM = 10
# Seed for numpy's global RNG, which drives the validation split and downsampling.
SEED = 42
np.random.seed(SEED)

def read_parquet(f):
    """Load a parquet file and compact its ``ts`` and ``type`` columns.

    ``ts`` is divided by 1000 and stored as int32 (presumably milliseconds to
    seconds -- confirm against the data); ``type`` is translated through
    ``TYPE_MAP`` and stored as int8.

    Args:
        f: Path (or anything ``pd.read_parquet`` accepts) of the file to load.

    Returns:
        The loaded DataFrame with the two columns converted.
    """
    frame = pd.read_parquet(f)
    frame["ts"] = frame["ts"].div(1000).astype("int32")
    frame["type"] = frame["type"].map(TYPE_MAP).astype("int8")
    return frame

# Fill NA

In [None]:
def fillna_default(df):
    """Fill missing feature values with fixed per-feature defaults.

    Defaults: 8.0 for the user first/last event-time columns, 30 for the item
    first/last timestamp columns, and 0 for ``interaction_type_core`` and every
    interaction ratio column. Columns not listed here are left untouched.

    Args:
        df: Candidate DataFrame after the feature merges.

    Returns:
        A new DataFrame with the defaults applied.
    """
    kinds = ("clicks", "orders", "carts")
    edges = ("first", "last")

    defaults = {}
    # user_{first,last}_{kind}_time -> 8.0
    for kind in kinds:
        for edge in edges:
            defaults[f"user_{edge}_{kind}_time"] = 8.0
    # item_{first,last}_{kind}_ts -> 30
    for kind in kinds:
        for edge in edges:
            defaults[f"item_{edge}_{kind}_ts"] = 30
    # interaction score and every ordered pair of distinct kinds -> 0
    defaults["interaction_type_core"] = 0
    for numerator in kinds:
        for denominator in kinds:
            if numerator != denominator:
                defaults[f"interaction_{numerator}_{denominator}_ratio"] = 0

    return df.fillna(defaults, axis=0)

## Model Train

In [None]:
%%time

# valid_a_candidates_path = sorted(glob.glob(
#     os.path.join(DATA_PATH, "input/otto-candidates/valid_a_candidates/valid_a_candidates_buys_top50.parquet ")
# ))

# df_train_candidates = pd.concat([pd.read_parquet(i) for i in valid_a_candidates_path])

# valid_a_candidates_path = os.path.join(DATA_PATH, "input/otto-candidates/valid_a_candidates/valid_a_orders_candidates_top50.parquet")
# Candidate (session, aid) pairs for the "orders" target, top-TOPN per session.
valid_a_candidates_path = os.path.join(
    DATA_PATH,
    f"input/otto-candidates/valid_a_candidates/valid_a_orders_candidates_top{TOPN}.parquet",
)
df_train_candidates = pd.read_parquet(valid_a_candidates_path)

# Ground truth: keep only the "orders" rows and turn each ground-truth list into
# one (session, aid, label=1) row per item.
# TODO (from the original author): test also including the "carts" rows.
df_train_label = pd.read_parquet(
    os.path.join(DATA_PATH, "input/otto-validation/test_labels.parquet")
)
is_orders_row = df_train_label["type"] == "orders"
df_train_label = (
    df_train_label.loc[is_orders_row, ["session", "ground_truth"]]
    .explode("ground_truth")
    .reset_index(drop=True)
    .rename(columns={"ground_truth": "aid"})
)
df_train_label["label"] = 1

# Left join: only existing candidates are labelled; candidates without a matching
# ground-truth row get label 0. (An outer join was tried and left disabled.)
df_train_candidates = df_train_candidates.merge(
    df_train_label, on=["session", "aid"], how="left"
)
df_train_candidates["label"] = df_train_candidates["label"].fillna(0).astype("int")
# Position of each candidate within its session, in file order.
df_train_candidates["recall_order"] = df_train_candidates.groupby("session").cumcount()
df_train_candidates.shape

## Downsample

In [None]:
%%time

def downsample(df, n=-1, random_state=None):
    """Cap the negative:positive ratio and shuffle rows within each session.

    Rows with ``label == 0`` are negatives and rows with ``label == 1`` are
    positives. When the integer ratio negatives // positives exceeds ``n``,
    ``n * positives`` negatives are sampled and the rest are dropped. The result
    is sorted by ``session`` with a random order inside each session.

    The input frame is never modified. A frame without positives is returned
    unsampled (only shuffled) instead of raising ``ZeroDivisionError``.

    Args:
        df: DataFrame with at least ``session`` and ``label`` columns.
        n: Maximum negative:positive ratio; ``-1`` means ``DOWNSAMPLE_RATE``.
        random_state: Optional integer seed. ``None`` keeps the previous
            behaviour of drawing from numpy's global random state.

    Returns:
        A new DataFrame with a fresh 0..len-1 index.
    """
    if n == -1:
        n = DOWNSAMPLE_RATE
    rng = np.random.RandomState(random_state) if random_state is not None else None

    df_negative = df[df["label"] == 0]
    df_postive = df[df["label"] == 1]
    negative_size, postive_size = len(df_negative), len(df_postive)
    # No positives: the ratio is undefined, so there is nothing to balance against.
    r = negative_size // postive_size if postive_size else None
    print(f"current negative size: {negative_size}, postive size: {postive_size}, rate: {r}")

    if r is not None and r > n:
        gloden_negative_size = n * postive_size
        df_negative = df_negative.sample(gloden_negative_size, random_state=rng)
        df = pd.concat([df_postive, df_negative])

    # Random tie-breaker gives a shuffled order inside each session; assign()
    # builds a new frame so the caller's DataFrame never receives the column.
    noise = rng.randn(len(df)) if rng is not None else np.random.randn(len(df))
    df = df.assign(_noise=noise).sort_values(["session", "_noise"])
    df = df.drop("_noise", axis=1).reset_index(drop=True)
    return df


def train_valid_split(df, valid_ratio=None, random_state=None):
    """Split candidates into train and validation sets by session.

    A fraction of the distinct sessions of ``df`` is drawn, without replacement,
    for validation; all rows of those sessions form the validation frame. The
    remaining rows are passed through ``downsample`` to form the training frame.
    The validation frame is not downsampled.

    Args:
        df: Candidate DataFrame with ``session`` and ``label`` columns.
        valid_ratio: Fraction of sessions used for validation; ``None`` means
            ``VALID_DATA_RATIO``.
        random_state: Optional integer seed for the session draw only. ``None``
            uses numpy's global random state.

    Returns:
        Tuple ``(df_train, df_valid)``.
    """
    if valid_ratio is None:
        valid_ratio = VALID_DATA_RATIO
    print(f"origin all data size: {len(df)}")

    # Sessions come from the argument itself, not from a notebook-level global.
    sessions = df["session"].unique()
    rng = np.random.RandomState(random_state) if random_state is not None else np.random
    # replace=False: with replacement, repeated picks would shrink the
    # validation set below the requested share of sessions.
    valid_session = rng.choice(sessions, int(len(sessions) * valid_ratio), replace=False)

    in_valid = df["session"].isin(valid_session)
    df_train = df[~in_valid]
    df_valid = df[in_valid]
    df_train = downsample(df_train)
    return df_train, df_valid


# Split by session, then make sure both splits carry an integer label column.
df_train_candidates, df_valid_candidates = train_valid_split(df_train_candidates)
df_train_candidates = df_train_candidates.astype({"label": "int"})
df_valid_candidates = df_valid_candidates.astype({"label": "int"})
print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")

## Merge Feature

In [None]:
%%time

# (file-name prefix, join key(s)) for each feature family, in merge order:
# item features join on aid, user features on session, interaction features on both.
train_feature_specs = [
    ("item_feature", "aid"),
    ("user_feature", "session"),
    ("interaction_feature", ["session", "aid"]),
]

for feature_prefix, join_keys in train_feature_specs:
    feature_paths = sorted(glob.glob(
        os.path.join(DATA_PATH, f"input/feature/train/{feature_prefix}_*.parquet")
    ))
    for p in tqdm(feature_paths):
        # Read each feature file once and reuse it for both splits
        # (it used to be read from disk twice).
        df_feature = pd.read_parquet(p)
        df_train_candidates = df_train_candidates.merge(df_feature, on=join_keys, how="left")
        df_valid_candidates = df_valid_candidates.merge(df_feature, on=join_keys, how="left")

# Drop the reference to the last feature frame so its memory can be reclaimed.
df_feature = None

df_train_candidates = fillna_default(df_train_candidates)
df_valid_candidates = fillna_default(df_valid_candidates)

print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")
df_train_candidates.head(5)

## Storage

In [None]:
# Persist both feature-complete candidate frames for the later cells.
df_train_candidates.to_pickle("df_train_candidates.pkl")
df_valid_candidates.to_pickle("df_valid_candidates.pkl")

# Persist the labels of the validation sessions only, with the item column
# renamed back from "aid" to "ground_truth".
valid_session_ids = df_valid_candidates["session"].unique()
valid_label_rows = df_train_label[df_train_label["session"].isin(valid_session_ids)]
valid_label_rows = valid_label_rows.reset_index(drop=True).rename(columns={"aid": "ground_truth"})
valid_label_rows.to_parquet("df_valid_label.parquet")

## Feature Name

In [None]:
# Columns that are not model inputs: identifiers, the raw timestamp, the target,
# and user_buy_ratio (excluded from the feature list by the original code).
NON_FEATURE_COL = {"session", "aid", "ts", "label", "user_buy_ratio"}

# Keep the DataFrame's own column order. A set difference yields an order that
# changes between kernels (string hashing is randomised per process), which made
# the feature order -- and therefore the trained model -- non-reproducible.
# dict.fromkeys drops duplicate names while preserving first-seen order.
FEATURE_COL = list(dict.fromkeys(
    c for c in df_train_candidates.columns if c not in NON_FEATURE_COL
))
with open("FEATURE_COL.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))

print(f"Count of Feature is: {len(FEATURE_COL)}")
print("")
FEATURE_COL

## Training

In [None]:
%%time
n_estimators_candidates = list(range(100,401,50))

feature_cols = FEATURE_COL
label_col = 'label'

# Group sizes come from a sorted groupby, so they only line up with the rows if
# the frame itself is ordered by session.
assert df_train_candidates["session"].is_monotonic_increasing, \
    "df_train_candidates must be sorted by session for XGBRanker groups"

# Loop-invariant training inputs: build them once, not once per model.
X_train = df_train_candidates[feature_cols]
y_train = df_train_candidates[label_col]
# size() counts rows per session directly instead of counting every column.
group_sizes = df_train_candidates.groupby("session").size().to_numpy()

# Make sure the output directory exists before spending time on training.
os.makedirs("models", exist_ok=True)

for n_estimators in tqdm(n_estimators_candidates):
    ranker = xgb.XGBRanker(
#         tree_method='gpu_hist',
        tree_method="hist",
        booster='gbtree',
        objective='rank:pairwise',
        random_state=42, 
        # NOTE(review): learning_rate and eta name the same XGBoost parameter and
        # are given different values here; both are kept unchanged -- confirm
        # which one is intended and drop the other.
        learning_rate=0.1,
        colsample_bytree=0.9,  # 0.9
        eta=0.05, 
        max_depth=6, 
        n_estimators=n_estimators,
        subsample=0.8,
        n_jobs=15
    )

    ranker.fit(
        X=X_train,
        y=y_train,
        group=group_sizes
    )

    joblib.dump(ranker, f"models/orders_xgbranker_{str(n_estimators)}.m")

# Release the extra copy of the feature matrix.
del X_train, y_train

In [None]:
# Model sizes evaluated by the validation cell: 50, 100, ..., 500.
# NOTE(review): the training cell in this notebook only fits 100..400 -- confirm
# that model files for 50, 450 and 500 exist before validating.
n_estimators_candidates = [50 * step for step in range(1, 11)]

# Valid

In [None]:
%%time

# ranker = joblib.load("carts_xgbranker_1.m")

# FEATURE_COL = list(ranker.feature_names_in_)
with open("FEATURE_COL.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
    
print(FEATURE_COL)

# Loop-invariant inputs: load once instead of once per model. The feature frame
# is no longer overwritten by the per-model results, so it stays usable afterwards.
df_valid_candidates = pd.read_pickle("df_valid_candidates.pkl")
df_valid_label = pd.read_parquet("df_valid_label.parquet")
# One row per session holding the list of ground-truth items.
df_valid_label = df_valid_label[["session", "ground_truth"]].groupby("session").agg(list).reset_index()

for n_estimators in tqdm(n_estimators_candidates[::-1]):
    model_path = f"models/orders_xgbranker_{n_estimators}.m"
    if not os.path.exists(model_path):
        # The candidate list may contain sizes that were never trained; report
        # and continue instead of aborting the whole evaluation.
        print(f"n_estimators={n_estimators}: {model_path} not found, skipped")
        continue
    ranker = joblib.load(model_path)

    df_pred = df_valid_candidates[["session", "aid"]].assign(
        score=ranker.predict(df_valid_candidates[FEATURE_COL])
    )
    # Top 20 items per session by score, collected into one list per session.
    df_pred = df_pred.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df_pred = df_pred.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    df_pred = df_pred.merge(df_valid_label, on="session", how="left")
    # Sessions without any label row come back as NaN after the left join.
    df_pred["ground_truth"] = df_pred["ground_truth"].apply(lambda x: x if isinstance(x, list) else [])
    df_pred["hits"] = [
        len(set(truth).intersection(aids))
        for truth, aids in zip(df_pred["ground_truth"], df_pred["aid"])
    ]
    # At most 20 items can be hit per session, so cap the denominator at 20.
    df_pred['gt_count'] = df_pred.ground_truth.str.len().clip(0,20)
    recall = df_pred["hits"].sum() / df_pred['gt_count'].sum()

    print(f"n_estimators={n_estimators} get Recall@20: {recall}")

# Chunk the Kaggle test dataset to save memory (don't execute it)

In [None]:
# df_kaggle_test = pd.read_parquet(os.path.join(DATA_PATH, "input/otto-candidates/test_candidates/test_orders_candidates_top50.parquet"))
# Number of chunk files to write; must match the value used by the inference
# cells that read the chunks back. Assigned here because this cell previously
# used the name before any cell had defined it.
INFERENCE_CHUNK_NUM = 10

df_kaggle_test = pd.read_parquet(os.path.join(DATA_PATH, f"input/otto-candidates/test_candidates/test_orders_candidates_top{TOPN}.parquet"))

inference_session_list = df_kaggle_test["session"].unique().tolist()
# array_split always yields exactly INFERENCE_CHUNK_NUM pieces (sizes differ by at
# most one), so chunk files 1..INFERENCE_CHUNK_NUM all exist for the reader. The
# former fixed step of len // NUM + 2 could produce fewer files on small inputs.
session_chunks = np.array_split(np.asarray(inference_session_list), INFERENCE_CHUNK_NUM)
for n, chunk_sessions in enumerate(session_chunks, start=1):
    df_kaggle_test[df_kaggle_test["session"].isin(chunk_sessions)].to_parquet(
        os.path.join(DATA_PATH, f"input/otto-candidates/test_candidates/test_orders_candidates_top{TOPN}_chunk_{n}.parquet")
    )
df_kaggle_test.head(10)

# Inference

In [None]:
# INFERENCE_CHUNK_NUM = 10

In [None]:
%%time
INFERENCE_CHUNK_NUM = 10
df_kaggle_test = None

# (glob pattern under DATA_PATH, join key(s)) in merge order: user, item, interaction.
test_feature_merges = (
    ("input/feature/test/user_feature_*.parquet", "session"),
    ("input/feature/test/item_feature_*.parquet", "aid"),
    ("input/feature/test/interaction_feature_*.parquet", ["session", "aid"]),
)

for n in range(1, INFERENCE_CHUNK_NUM+1):
    print(f"chunk {n} start ...")
    chunk_path = os.path.join(
        DATA_PATH,
        f"input/otto-candidates/test_candidates/test_orders_candidates_top{TOPN}_chunk_{n}.parquet",
    )
    df_kaggle_test_chunk = pd.read_parquet(chunk_path)
    # Position of each candidate within its session, in file order.
    df_kaggle_test_chunk["recall_order"] = df_kaggle_test_chunk.groupby('session').cumcount()

    # Feature files are re-read for every chunk, so only one chunk's worth of
    # merged features is held in memory at a time.
    for feature_pattern, join_keys in test_feature_merges:
        feature_paths = sorted(glob.glob(os.path.join(DATA_PATH, feature_pattern)))
        for p in tqdm(feature_paths):
            df_kaggle_test_chunk = df_kaggle_test_chunk.merge(
                pd.read_parquet(p), on=join_keys, how="left"
            )

    df_kaggle_test_chunk = fillna_default(df_kaggle_test_chunk)
    df_kaggle_test_chunk.to_parquet(f"df_kaggle_test_chunk_{n}.parquet")
    
# df_kaggle_test.to_parquet("df_kaggle_test.parquet")
# inference_session_list = df_kaggle_test["session"].unique().tolist()
# INFERENCE_CHUNK_NUM = 10
# INFERENCE_CHUNK_SIZE = len(inference_session_list)//INFERENCE_CHUNK_NUM + 2
# n = 1
# for i in range(0, len(inference_session_list), INFERENCE_CHUNK_SIZE):
#     df_kaggle_test[df_kaggle_test["session"].isin(inference_session_list[i:INFERENCE_CHUNK_SIZE])].to_parquet(f"df_kaggle_test_chunk_{n}.parquet")
#     n += 1
# df_kaggle_test.head(10)

# Please restart kernel to save memory

In [None]:
%%time

with open("FEATURE_COL.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
print(FEATURE_COL)

# The notebook asks for a kernel restart before this cell, which wipes the value
# assigned in the feature-building cell; assign it again here. It must equal the
# number of df_kaggle_test_chunk_{n}.parquet files that were written.
INFERENCE_CHUNK_NUM = 10

best_n_estimators = "200"
ranker = joblib.load(f"models/orders_xgbranker_{best_n_estimators}.m")

# Collect per-chunk results and concatenate once, instead of re-copying the
# accumulated frame on every iteration.
chunk_predictions = []
for n in tqdm(range(1, INFERENCE_CHUNK_NUM+1)):
    df = pd.read_parquet(f"df_kaggle_test_chunk_{n}.parquet")
    df["score"] = ranker.predict(df[FEATURE_COL])
    # Top 20 items per session by score, collected into one list per session.
    df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    chunk_predictions.append(df)

df_kaggle_test = pd.concat(chunk_predictions)
df_kaggle_test

In [None]:
%%time

# Build the "orders" rows of the submission: "<session>_orders" plus the predicted
# item ids joined by single spaces.
df_kaggle_test["session_type"] = df_kaggle_test["session"].map(str) + "_orders"
df_kaggle_test = df_kaggle_test.rename(columns={"aid": "labels"})[["session_type", "labels"]]
df_kaggle_test["labels"] = df_kaggle_test["labels"].apply(
    lambda aids: " ".join(map(str, aids))
)

# Take an existing submission, drop its "_orders" rows and put the new ones first.
df_submission = pd.read_csv("../data/output/submission_578.csv")
is_orders_row = df_submission.session_type.str.contains("_orders$")
df_submission = df_submission[~is_orders_row]
df_submission = pd.concat([df_kaggle_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_orders.csv", index=False)
df_submission.shape