In [1]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Param

In [2]:
DATA_PATH = "/home/search3/lichunyu/otto-recommender-system/data"
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
TOPN = 50
DOWNSAMPLE_RATE = 20
VALID_DATA_RATIO = 0.01
MODEL_TYPE = "clicks"

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

# Fill NA

In [3]:
def fillna_default(df):
    df = df.fillna({
        "user_first_clicks_time": 8.0,
        "user_last_clicks_time": 8.0,
        "user_first_orders_time": 8.0,
        "user_last_orders_time": 8.0,
        "user_first_carts_time": 8.0,
        "user_last_carts_time": 8.0,
        "interaction_type_core": 0,
        "item_last_clicks_ts": 30,
        "item_first_clicks_ts": 30,
        "item_last_carts_ts": 30,
        "item_first_carts_ts": 30,
        "item_last_orders_ts": 30,
        "item_first_orders_ts": 30,
        "interaction_clicks_carts_ratio": 0,
        "interaction_carts_clicks_ratio": 0,
        "interaction_clicks_orders_ratio": 0,
        "interaction_orders_clicks_ratio": 0,
        "interaction_orders_carts_ratio": 0,
        "interaction_carts_orders_ratio": 0,
        "interaction_timing_decay_score": 0,
        "interaction_orders_count": 0,
        "interaction_clicks_count": 0,
        "interaction_carts_count": 0,
        "interaction_behavior_count": 0,
        "interaction_behavor_period": 0
    }, axis=0)
    return df

## Model Train

In [4]:
%%time

# valid_a_candidates_path = sorted(glob.glob(
#     os.path.join(DATA_PATH, "input/otto-candidates/valid_a_candidates/valid_a_candidates_buys_top50.parquet ")
# ))

# df_train_candidates = pd.concat([pd.read_parquet(i) for i in valid_a_candidates_path])

# valid_a_candidates_path = os.path.join(DATA_PATH, "input/otto-candidates/valid_a_candidates/valid_a_orders_candidates_top50.parquet")
valid_a_candidates_path = os.path.join(DATA_PATH, f"input/otto-candidates/valid_a_candidates/valid_a_{MODEL_TYPE}_candidates_top{TOPN}.parquet")
df_train_candidates = pd.read_parquet(valid_a_candidates_path)


df_train_label = pd.read_parquet(
    os.path.join(DATA_PATH, "input/otto-validation/test_labels.parquet")
)
# df_train_label = df_train_label[(df_train_label["type"]=="orders")|(df_train_label["type"]=="carts")][["session", "ground_truth"]] # TODO test it
df_train_label = df_train_label[(df_train_label["type"]==MODEL_TYPE)][["session", "ground_truth"]]
df_train_label = df_train_label.explode("ground_truth").reset_index(drop=True)
df_train_label = df_train_label.rename({"ground_truth": "aid"}, axis=1)
df_train_label["label"] = 1

# df_train_candidates = df_train_candidates.merge(df_train_label, on=["session", "aid"], how="outer").fillna({"label": 0})

df_train_candidates = df_train_candidates.merge(df_train_label, on=["session", "aid"], how="left").fillna({"label": 0})
df_train_candidates["label"] = df_train_candidates["label"].astype("int")
df_train_candidates["recall_order"] = df_train_candidates.groupby('session').cumcount()
df_train_candidates.shape

CPU times: user 50.4 s, sys: 8.64 s, total: 59 s
Wall time: 58.4 s


(90062550, 4)

## Downsample

In [5]:
%%time

def downsample(df, n=-1):
    if n == -1:
        n = DOWNSAMPLE_RATE
    df_negative = df[df["label"]==0]
    df_postive = df[df["label"]==1]
    r = len(df_negative)//len(df_postive)
    print(f"current negative size: {len(df_negative)}, postive size: {len(df_postive)}, rate: {r}")
    if r > n:
        gloden_negative_size = n * len(df_postive)
        df_negative = df_negative.sample(gloden_negative_size)
        df = pd.concat([df_postive, df_negative])
    df["_noise"] = np.random.randn(len(df))
    df = df.sort_values(["session", "_noise"])
    df = df.drop("_noise", axis=1).reset_index(drop=True)
    return df


def train_valid_split(df):
    print(f"origin all data size: {len(df)}")
    valid_session = np.random.choice(df_train_candidates.session.unique(), int(len(df_train_candidates.session.unique())*VALID_DATA_RATIO))
    df_train = df[~df["session"].isin(valid_session)]
    df_valid = df[df["session"].isin(valid_session)]
    df_train = downsample(df_train)
    return df_train, df_valid


df_train_candidates, df_valid_candidates = train_valid_split(df_train_candidates)
df_train_candidates["label"] = df_train_candidates["label"].astype("int")
df_valid_candidates["label"] = df_valid_candidates["label"].astype("int")
print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")

origin all data size: 90062550
current negative size: 88216587, postive size: 950963, rate: 92
df_train_candidates size: 19970223, df_valid_candidates size: 895000
CPU times: user 54.3 s, sys: 2.79 s, total: 57.1 s
Wall time: 57.1 s


## Merge Feature

In [6]:
%%time

train_item_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/item_feature_*.parquet")
))
for p in tqdm(train_item_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on="aid", how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on="aid", how="left")


train_user_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/user_feature_*.parquet")
))
for p in tqdm(train_user_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on="session", how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on="session", how="left")


train_interaction_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/interaction_feature_*.parquet")
))
for p in tqdm(train_interaction_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on=["session", "aid"], how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on=["session", "aid"], how="left")


df_train_candidates = fillna_default(df_train_candidates)
df_valid_candidates = fillna_default(df_valid_candidates)

print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")
df_train_candidates.head(5)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:25<00:00, 12.23s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:31<00:00,  6.25s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:18<00:00, 27.72s/it]


df_train_candidates size: 19970223, df_valid_candidates size: 895000
CPU times: user 4min 23s, sys: 42.9 s, total: 5min 6s
Wall time: 5min 1s


Unnamed: 0,session,aid,label,recall_order,item_user_count,item_item_count,item_buy_ratio,item_clicks_user_count,item_clicks_item_count,item_carts_user_count,...,interaction_clicks_orders_ratio,interaction_orders_clicks_ratio,interaction_orders_carts_ratio,interaction_carts_orders_ratio,interaction_timing_decay_score,interaction_orders_count,interaction_clicks_count,interaction_carts_count,interaction_behavior_count,interaction_behavor_period
0,11098528,801774,0,48,31553,49410,0.182473,31223.0,42502.0,4069.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11098528,634452,0,32,13361,30211,0.175466,13341.0,25890.0,2469.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11098528,307904,0,9,5352,8581,0.121897,5322.0,7774.0,488.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11098528,1633746,0,12,14840,24869,0.183079,14748.0,21380.0,2077.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11098528,1531805,0,29,13832,30129,0.12556,13809.0,27055.0,1782.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Storage

In [7]:
df_train_candidates.to_pickle(f"df_train_{MODEL_TYPE}_candidates.pkl")
df_valid_candidates.to_pickle(f"df_valid_{MODEL_TYPE}_candidates.pkl")
df_train_label[
    df_train_label["session"].isin(df_valid_candidates["session"].unique())
].reset_index(drop=True).rename({"aid": "ground_truth"}, axis=1).to_parquet(f"df_valid_{MODEL_TYPE}_label.parquet")

## Feature Name

In [8]:
FEATURE_COL = list(set(df_train_candidates.columns.tolist()) - set(["session", "aid", "ts", "label", "user_buy_ratio"]))
with open(f"FEATURE_COL_{MODEL_TYPE}.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))

print(f"Count of Feature is: {len(FEATURE_COL)}")
print("")
FEATURE_COL

Count of Feature is: 44



['item_first_orders_ts',
 'item_first_carts_ts',
 'user_last_orders_time',
 'user_carts_ratio',
 'interaction_behavior_count',
 'item_orders_item_count',
 'user_last_carts_time',
 'item_clicks_item_count',
 'user_first_orders_time',
 'item_first_clicks_ts',
 'interaction_type_core',
 'item_item_count',
 'user_clicks_count',
 'interaction_clicks_carts_ratio',
 'user_user_count',
 'interaction_clicks_orders_ratio',
 'user_orders_ratio',
 'user_behavior_count',
 'item_orders_user_count',
 'interaction_clicks_count',
 'item_last_clicks_ts',
 'interaction_orders_count',
 'user_carts_count',
 'user_clicks_ratio',
 'user_orders_count',
 'user_last_clicks_time',
 'interaction_timing_decay_score',
 'recall_order',
 'interaction_orders_clicks_ratio',
 'user_first_carts_time',
 'item_carts_user_count',
 'interaction_carts_orders_ratio',
 'item_carts_item_count',
 'user_item_count',
 'interaction_carts_count',
 'item_last_orders_ts',
 'item_buy_ratio',
 'item_user_count',
 'interaction_orders_cart

## Training

In [9]:
%%time
n_estimators_candidates = list(range(100,401,50))

for n_estimators in tqdm(n_estimators_candidates):
    ranker = xgb.XGBRanker(
#         tree_method='gpu_hist',
        tree_method="hist",
        booster='gbtree',
        objective='rank:pairwise',
        random_state=42, 
        learning_rate=0.1,
        colsample_bytree=0.9,  # 0.9
        eta=0.05, 
        max_depth=6, 
        n_estimators=n_estimators,
        subsample=0.8,
        n_jobs=15
    )

    feature_cols = FEATURE_COL
    label_col = 'label'

    ranker.fit(
        X=df_train_candidates[feature_cols],
        y=df_train_candidates[label_col],
        group=df_train_candidates.groupby("session").count()["label"]
    )

    joblib.dump(ranker, f"models/{MODEL_TYPE}_xgbranker_{str(n_estimators)}.m")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [31:41<00:00, 271.63s/it]

CPU times: user 6h 41min 43s, sys: 5min 8s, total: 6h 46min 52s
Wall time: 31min 41s





In [None]:
n_estimators_candidates = list(range(50,501,50))

# Valid

In [10]:
%%time

# ranker = joblib.load("carts_xgbranker_1.m")

# FEATURE_COL = list(ranker.feature_names_in_)
with open(f"FEATURE_COL_{MODEL_TYPE}.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
    
print(FEATURE_COL)

for n_estimators in tqdm(n_estimators_candidates[::-1]):
    df_valid_candidates = pd.read_pickle(f"df_valid_{MODEL_TYPE}_candidates.pkl")
    df_valid_label = pd.read_parquet(f"df_valid_{MODEL_TYPE}_label.parquet")
    df_valid_label = df_valid_label[["session", "ground_truth"]].groupby("session").agg(list).reset_index()
    ranker = joblib.load(f"models/{MODEL_TYPE}_xgbranker_{n_estimators}.m")
    df_valid_candidates["score"] = ranker.predict(df_valid_candidates[FEATURE_COL])
    df_valid_candidates = df_valid_candidates.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df_valid_candidates = df_valid_candidates.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    df_valid_candidates = df_valid_candidates.merge(df_valid_label, on="session", how="left")
    df_valid_candidates["ground_truth"] = df_valid_candidates["ground_truth"].apply(lambda x: x if isinstance(x, list) else [])
    df_valid_candidates["hits"] = df_valid_candidates.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
    df_valid_candidates['gt_count'] = df_valid_candidates.ground_truth.str.len().clip(0,20)
    recall = df_valid_candidates["hits"].sum() / df_valid_candidates['gt_count'].sum()

    print(f"n_estimators={n_estimators} get Recall@20: {recall}")

['item_first_orders_ts', 'item_first_carts_ts', 'user_last_orders_time', 'user_carts_ratio', 'interaction_behavior_count', 'item_orders_item_count', 'user_last_carts_time', 'item_clicks_item_count', 'user_first_orders_time', 'item_first_clicks_ts', 'interaction_type_core', 'item_item_count', 'user_clicks_count', 'interaction_clicks_carts_ratio', 'user_user_count', 'interaction_clicks_orders_ratio', 'user_orders_ratio', 'user_behavior_count', 'item_orders_user_count', 'interaction_clicks_count', 'item_last_clicks_ts', 'interaction_orders_count', 'user_carts_count', 'user_clicks_ratio', 'user_orders_count', 'user_last_clicks_time', 'interaction_timing_decay_score', 'recall_order', 'interaction_orders_clicks_ratio', 'user_first_carts_time', 'item_carts_user_count', 'interaction_carts_orders_ratio', 'item_carts_item_count', 'user_item_count', 'interaction_carts_count', 'item_last_orders_ts', 'item_buy_ratio', 'item_user_count', 'interaction_orders_carts_ratio', 'interaction_behavor_period'

 14%|███████████████████▏                                                                                                                  | 1/7 [00:04<00:25,  4.20s/it]

n_estimators=400 get Recall@20: 0.513791919423143


 29%|██████████████████████████████████████▎                                                                                               | 2/7 [00:06<00:15,  3.14s/it]

n_estimators=350 get Recall@20: 0.5136774636602953


 43%|█████████████████████████████████████████████████████████▍                                                                            | 3/7 [00:08<00:11,  2.76s/it]

n_estimators=300 get Recall@20: 0.5140208309488383


 57%|████████████████████████████████████████████████████████████████████████████▌                                                         | 4/7 [00:11<00:07,  2.58s/it]

n_estimators=250 get Recall@20: 0.5138491473045668


 71%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 5/7 [00:13<00:04,  2.43s/it]

n_estimators=200 get Recall@20: 0.5138491473045668


 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 6/7 [00:15<00:02,  2.38s/it]

n_estimators=150 get Recall@20: 0.5133340963717523


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:17<00:00,  2.53s/it]

n_estimators=100 get Recall@20: 0.5125901339132425
CPU times: user 1min 35s, sys: 2.57 s, total: 1min 37s
Wall time: 17.7 s





# Chunking kaggle test dataset to save memory(don't execute it)

In [None]:
# df_kaggle_test = pd.read_parquet(os.path.join(DATA_PATH, "input/otto-candidates/test_candidates/test_orders_candidates_top50.parquet"))
INFERENCE_CHUNK_NUM = 10
df_kaggle_test = pd.read_parquet(os.path.join(DATA_PATH, f"input/otto-candidates/test_candidates/test_{MODEL_TYPE}_candidates_top{TOPN}.parquet"))

inference_session_list = df_kaggle_test["session"].unique().tolist()
INFERENCE_CHUNK_SIZE = len(inference_session_list)//INFERENCE_CHUNK_NUM + 2
n = 1
for i in range(0, len(inference_session_list), INFERENCE_CHUNK_SIZE):
    df_kaggle_test[df_kaggle_test["session"].isin(inference_session_list[i:i+INFERENCE_CHUNK_SIZE])].to_parquet(
        os.path.join(DATA_PATH, f"input/otto-candidates/test_candidates/test_{MODEL_TYPE}_candidates_top{TOPN}_chunk_{n}.parquet")
    )
    n += 1
df_kaggle_test.head(10)

# Inference

In [None]:
# INFERENCE_CHUNK_NUM = 10

In [11]:
%%time
INFERENCE_CHUNK_NUM = 10
df_kaggle_test = None
for n in range(1, INFERENCE_CHUNK_NUM+1):
    print(f"chunk {n} start ...")
    df_kaggle_test_chunk = pd.read_parquet(
        os.path.join(DATA_PATH, f"input/otto-candidates/test_candidates/test_{MODEL_TYPE}_candidates_top{TOPN}_chunk_{n}.parquet")
    )
    df_kaggle_test_chunk["recall_order"] = df_kaggle_test_chunk.groupby('session').cumcount()
    test_user_feature_path = sorted(glob.glob(
        os.path.join(DATA_PATH, "input/feature/test/user_feature_*.parquet")
    ))
    for p in tqdm(test_user_feature_path):
        df_kaggle_test_chunk = df_kaggle_test_chunk.merge(pd.read_parquet(p), on="session", how="left")


    test_item_feature_path = sorted(glob.glob(
        os.path.join(DATA_PATH, "input/feature/test/item_feature_*.parquet")
    ))
    for p in tqdm(test_item_feature_path):
        df_kaggle_test_chunk = df_kaggle_test_chunk.merge(pd.read_parquet(p), on="aid", how="left")

    test_interaction_feature_path = sorted(glob.glob(
        os.path.join(DATA_PATH, "input/feature/test/interaction_feature_*.parquet")
    ))
    for p in tqdm(test_interaction_feature_path):
        df_kaggle_test_chunk = df_kaggle_test_chunk.merge(pd.read_parquet(p), on=["session", "aid"], how="left")


    df_kaggle_test_chunk = fillna_default(df_kaggle_test_chunk)
    df_kaggle_test_chunk.to_parquet(f"df_kaggle_test_{MODEL_TYPE}_chunk_{n}.parquet")
    
# df_kaggle_test.to_parquet("df_kaggle_test.parquet")
# inference_session_list = df_kaggle_test["session"].unique().tolist()
# INFERENCE_CHUNK_NUM = 10
# INFERENCE_CHUNK_SIZE = len(inference_session_list)//INFERENCE_CHUNK_NUM + 2
# n = 1
# for i in range(0, len(inference_session_list), INFERENCE_CHUNK_SIZE):
#     df_kaggle_test[df_kaggle_test["session"].isin(inference_session_list[i:INFERENCE_CHUNK_SIZE])].to_parquet(f"df_kaggle_test_chunk_{n}.parquet")
#     n += 1
# df_kaggle_test.head(10)

chunk 1 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.23s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:16<00:00,  2.35s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.41s/it]


chunk 2 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:15<00:00,  2.22s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:25<00:00,  5.20s/it]


chunk 3 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.28s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:17<00:00,  2.45s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.64s/it]


chunk 4 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:15<00:00,  2.24s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.22s/it]


chunk 5 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.26s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:16<00:00,  2.40s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.49s/it]


chunk 6 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:15<00:00,  2.22s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.21s/it]


chunk 7 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:15<00:00,  2.24s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.23s/it]


chunk 8 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.30s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:17<00:00,  2.48s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.69s/it]


chunk 9 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.26s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:16<00:00,  2.41s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.64s/it]


chunk 10 start ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.24s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:16<00:00,  2.35s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.44s/it]


CPU times: user 11min, sys: 2min 12s, total: 13min 13s
Wall time: 12min 36s


# For Ensemble

In [12]:
%%time

with open(f"FEATURE_COL_{MODEL_TYPE}.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
print(FEATURE_COL)


for n_es in tqdm(n_estimators_candidates):
# best_n_estimators = "300"
    ranker = joblib.load(f"models/{MODEL_TYPE}_xgbranker_{n_es}.m")
    df_kaggle_test = None
    for n in tqdm(range(1, INFERENCE_CHUNK_NUM+1)):
        df = pd.read_parquet(f"df_kaggle_test_{MODEL_TYPE}_chunk_{n}.parquet")
        df["score"] = ranker.predict(df[FEATURE_COL])
        df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
        df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
        if df_kaggle_test is None:
            df_kaggle_test = df
        else:
            df_kaggle_test = pd.concat([df_kaggle_test, df])

    df_kaggle_test["session_type"] = df_kaggle_test["session"].apply(lambda x: str(x)+f"_{MODEL_TYPE}")
    df_kaggle_test = df_kaggle_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
    df_kaggle_test["labels"] = df_kaggle_test["labels"].apply(lambda x: " ".join([str(_) for _ in x]))
    df_kaggle_test.to_csv(f"../data/output/submission_part_{MODEL_TYPE}_n_es{n_es}.csv", index=False)
    df_kaggle_test.shape


['item_first_orders_ts', 'item_first_carts_ts', 'user_last_orders_time', 'user_carts_ratio', 'interaction_behavior_count', 'item_orders_item_count', 'user_last_carts_time', 'item_clicks_item_count', 'user_first_orders_time', 'item_first_clicks_ts', 'interaction_type_core', 'item_item_count', 'user_clicks_count', 'interaction_clicks_carts_ratio', 'user_user_count', 'interaction_clicks_orders_ratio', 'user_orders_ratio', 'user_behavior_count', 'item_orders_user_count', 'interaction_clicks_count', 'item_last_clicks_ts', 'interaction_orders_count', 'user_carts_count', 'user_clicks_ratio', 'user_orders_count', 'user_last_clicks_time', 'interaction_timing_decay_score', 'recall_order', 'interaction_orders_clicks_ratio', 'user_first_carts_time', 'item_carts_user_count', 'interaction_carts_orders_ratio', 'item_carts_item_count', 'user_item_count', 'interaction_carts_count', 'item_last_orders_ts', 'item_buy_ratio', 'item_user_count', 'interaction_orders_carts_ratio', 'interaction_behavor_period'

  0%|                                                                                                                                              | 0/7 [00:00<?, ?it/s]
  0%|                                                                                                                                             | 0/10 [00:00<?, ?it/s][A
 10%|█████████████▎                                                                                                                       | 1/10 [00:12<01:50, 12.25s/it][A
 20%|██████████████████████████▌                                                                                                          | 2/10 [00:24<01:36, 12.09s/it][A
 30%|███████████████████████████████████████▉                                                                                             | 3/10 [00:36<01:24, 12.14s/it][A
 40%|█████████████████████████████████████████████████████▏                                                                               

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:27<00:00, 14.75s/it][A
 57%|████████████████████████████████████████████████████████████████████████████                                                         | 4/7 [09:41<07:26, 148.96s/it]
  0%|                                                                                                                                             | 0/10 [00:00<?, ?it/s][A
 10%|█████████████▎                                                                                                                       | 1/10 [00:15<02:15, 15.06s/it][A
 20%|██████████████████████████▌                                                                                                          | 2/10 [00:30<02:01, 15.23s/it][A
 30%|███████████████████████████████████████▉                                                                                             

CPU times: user 1h 12min 26s, sys: 5min 20s, total: 1h 17min 47s
Wall time: 18min 5s





# Please restart kernel to save memory

In [None]:
%%time

with open(f"FEATURE_COL_{MODEL_TYPE}.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
print(FEATURE_COL)

best_n_estimators = "200"
ranker = joblib.load(f"models/{MODEL_TYPE}_xgbranker_{best_n_estimators}.m")
df_kaggle_test = None
for n in tqdm(range(1, INFERENCE_CHUNK_NUM+1)):
    df = pd.read_parquet(f"df_kaggle_test_{MODEL_TYPE}_chunk_{n}.parquet")
    df["score"] = ranker.predict(df[FEATURE_COL])
    df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    if df_kaggle_test is None:
        df_kaggle_test = df
    else:
        df_kaggle_test = pd.concat([df_kaggle_test, df])

# df_kaggle_test["score"] = ranker.predict(df_kaggle_test[FEATURE_COL])
# df_kaggle_test = df_kaggle_test.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
# df_kaggle_test = df_kaggle_test.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
df_kaggle_test

In [None]:
%%time

df_kaggle_test["session_type"] = df_kaggle_test["session"].apply(lambda x: str(x)+f"_{MODEL_TYPE}")
# df_test["session_type"] = df_test["session"].apply(lambda x: str(x)+"_carts")
df_kaggle_test = df_kaggle_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
df_kaggle_test["labels"] = df_kaggle_test["labels"].apply(lambda x: " ".join([str(_) for _ in x]))
df_submission = pd.read_csv("../data/output/submission_583.csv")
# df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = df_submission[~df_submission.session_type.str.contains(f"_{MODEL_TYPE}$")]
df_submission = pd.concat([df_kaggle_test, df_submission])
df_submission.to_csv(f"../data/output/submission_optim_{MODEL_TYPE}.csv", index=False)
df_submission.shape