In [1]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
import cudf
from sklearn.model_selection import GroupKFold
import joblib
from openfe import openfe, transform

warnings.filterwarnings("ignore")

# PARAMS

In [2]:
# TRAIN_SESSION_SAMPLE_SIZE = 5e6
CORES_NUM = 8
TRAIN_SESSION_SAMPLE_SIZE = 5e5
DOWNSAMPLE_RATE = 6
VALID_PICKLE_PATH = "/home/search2/lichunyu/otto-recommender-system/data/input/otto-validation/test_pickle"

valid_pickle = sorted(glob.glob(os.path.join(VALID_PICKLE_PATH, "*_candidates.pkl")))
# valid_pickle

In [3]:
DATASET_PATH = "/home/search2/lichunyu/otto-recommender-system/data/input"

TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

In [4]:
test_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/test_parquet/*.parquet")))
# test_files

# Carts

用户最近20个行为商品 + 用户top商品20 + 全部top商品10

In [5]:
train_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/train_parquet/*.parquet")))
# train_files

In [6]:
%%time
def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

df_train = None
for i in train_files:
    df = read_parquet(i)
#     df = cudf.DataFrame(read_parquet(i))
    df = df.sort_values(['session','ts'],ascending=[True,False])
#     df = df.to_pandas()
    if df_train is None:
        df_train = df
    else:
        df_train = pd.concat([df_train, df])
#     break
        
df_train

CPU times: user 54.8 s, sys: 17.3 s, total: 1min 12s
Wall time: 1min 6s


Unnamed: 0,session,aid,ts,type
146,0,1110548,1661103727,0
145,0,724999,1661103701,0
144,0,30373,1661103687,0
143,0,102416,1661019639,0
142,0,504365,1661017998,0
...,...,...,...,...
656387,11098523,175715,1661119197,0
656388,11098524,1088524,1661119198,0
656389,11098525,182927,1661119199,0
656390,11098526,510055,1661119199,0


In [7]:
aid_candidates = df_train["aid"].unique()
aid_candidates

array([1110548,  724999,   30373, ...,  720469,  582525,  397245],
      dtype=int32)

## Sample

In [8]:
%%time

OFFSET = 0

df_train = df_train[df_train["session"].isin(pd.DataFrame(np.random.choice(df_train.session.unique(), int(TRAIN_SESSION_SAMPLE_SIZE))).rename({0: "session"}, axis=1)["session"])]
train_ts_maximal = df_train["ts"].max()
# train_ts_minimal = df_train["ts"].min()
df_train_label = df_train[((train_ts_maximal-df_train["ts"]) < (OFFSET+14)*24*60*60)&((train_ts_maximal-df_train["ts"]) >= (OFFSET)*24*60*60)].reset_index(drop=True)
df_train_label = df_train_label[(df_train_label["type"]==TYPE_MAP["carts"])|(df_train_label["type"]==TYPE_MAP["orders"])]
df_train_label = df_train_label.sort_values(['session','ts'],ascending=[True,True])
df_train_label['n'] = df_train_label.groupby('session').cumcount()
df_train_label = df_train_label.loc[df_train_label.n<20].drop('n',axis=1)

df_train = df_train[((train_ts_maximal-df_train["ts"]) >= (OFFSET+14)*24*60*60)&((train_ts_maximal-df_train["ts"]) < (OFFSET+21)*24*60*60)].reset_index(drop=True)
df_train_label["label"] = 1
df_train_label = df_train_label[["session", "aid", "label"]].drop_duplicates(subset=["session", "aid"])
train_ts_minimal = df_train["ts"].min()
df_train = df_train.merge(df_train_label[["session", "aid", "label"]], on=["session", "aid"], how="outer").fillna({"label": 0, "type": 3, "ts": train_ts_minimal})
# df_train = df_train.merge(df_train_label[["session", "aid", "label"]], on=["session", "aid"], how="left").fillna({"label": 0, "type": 3, "ts": train_ts_minimal})
df_train["label"] = df_train["label"].astype("int8")
df_train["type"] = df_train["type"].astype("int8")
df_train["ts"] = df_train["ts"].astype("int32")
df_train["label"].value_counts()
# df_train_label

CPU times: user 2.24 s, sys: 380 ms, total: 2.62 s
Wall time: 2.62 s


0    2314083
1     340466
Name: label, dtype: int64

## Downsample

1:20

In [9]:
def downsample(df, candidates=None, n=-1):
    ts_minimal = df["ts"].min()
    session_num = TRAIN_SESSION_SAMPLE_SIZE
    session_list = df["session"].unique()
    if n == -1:
        n = DOWNSAMPLE_RATE
    if candidates is None:
        candidates = []
    df_negative = df[df["label"]==0]
    df_postive = df[df["label"]==1]
    r = len(df_negative)/len(df_postive)
    print(f"current negative size: {len(df_negative)}, postive size: {len(df_postive)}, rate: {r}")
    if r > n:
        gloden_negative_size = n * len(df_postive)
        df_negative = df_negative.sample(gloden_negative_size)
        df = pd.concat([df_postive, df_negative])
    elif r < n:
        gloden_negative_size = n * len(df_postive)
        difference = gloden_negative_size - len(df_negative)
        session_difference = difference // session
        data4df = {"session": [], "aid": []}
        for i in session_list:
            data4df["aid"].append(np.random.choice(candidates, session_difference).tolist())
            data4df["session"].append(i)
        df_addition = pd.DataFrame(data4df)
        df_addition = df_addition.explode("aid")
        duplicated_columns = df_addition[["session", "aid"]].set_index(["session", "aid"]).index.intersection(df_postive[["session", "aid"]].set_index(["session", "aid"]).index)
        df_addition = df_addition.drop(duplicated_columns, axis=0).reset_index()
        df_addition["label"] = 0
        df_addition["ts"] = ts_minimal
        df_addition["type"] = 3
        df = pd.concat([df, df_addition])
        df = df.drop_duplicates(subset=["session", "aid", "ts"])
    df["_noise"] = np.random.randn(len(df))
    df = df.sort_values(["session", "_noise"])
    df = df.drop("_noise", axis=1).reset_index(drop=True)
    return df

In [10]:
df_train = downsample(df_train, aid_candidates)
df_train["label"] = df_train["label"].astype("int8")
df_train["type"] = df_train["type"].astype("int8")
df_train["ts"] = df_train["ts"].astype("int32")
df_train

current negative size: 2314083, postive size: 340466, rate: 6.796810841611204


Unnamed: 0,session,aid,ts,type,label
0,27,1251955,1659362219,0,0
1,27,1466252,1659304814,0,0
2,27,1627638,1659362208,0,0
3,27,644734,1659304800,0,0
4,27,555996,1659304910,0,0
...,...,...,...,...,...
2383257,11097742,881203,1659304800,3,1
2383258,11097742,1207772,1659304800,3,1
2383259,11097834,1714583,1659304800,3,1
2383260,11098122,530377,1659304800,3,1


# Feature Engineering

In [6]:
%%time

def ts_day(df):
    ts_minimal = df["ts"].min()
    df["ts_day"] = (df["ts"]-ts_minimal) // (24*60*60)
    return df

def carts_count(df):
    df_tmp = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_count.pkl"))
    df = df.merge(df_tmp, on="aid", how="left").fillna({"count": 0}).rename({"count": "carts_count"}, axis=1)
    df["carts_count"] = df["carts_count"].astype("int32")
    return df

def clicks_count(df):
    df_tmp = pd.read_pickle(os.path.join(DATASET_PATH, "feature/clicks_count.pkl"))
    df = df.merge(df_tmp, on="aid", how="left").fillna({"count": 0}).rename({"count": "clicks_count"}, axis=1)
    df["clicks_count"] = df["clicks_count"].astype("int32")
    return df

def orders_count(df):
    df_tmp = pd.read_pickle(os.path.join(DATASET_PATH, "feature/orders_count.pkl"))
    df = df.merge(df_tmp, on="aid", how="left").fillna({"count": 0}).rename({"count": "orders_count"}, axis=1)
    df["orders_count"] = df["orders_count"].astype("int32")
    return df

def session_clicks_count(df):
    df_tmp = df[df["type"]==TYPE_MAP["clicks"]].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_clicks_count"}, axis=1)
    df["session_clicks_count"] = df["session_clicks_count"].astype("int32")
    return df

def session_orders_count(df):
    df_tmp = df[df["type"]==TYPE_MAP["orders"]].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_orders_count"}, axis=1)
    df["session_orders_count"] = df["session_orders_count"].astype("int32")
    return df

def session_carts_count(df):
    df_tmp = df[df["type"]==TYPE_MAP["carts"]].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_carts_count"}, axis=1)
    df["session_carts_count"] = df["session_carts_count"].astype("int32")
    return df

def session_unknow_count(df):
    df_tmp = df[df["type"]==3].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna({"count": 0}).rename({"count": "session_unknow_count"}, axis=1)
    df["session_unknow_count"] = df["session_unknow_count"].astype("int32")
    return df

# ---- public -----
def add_session_length(df):
    # If not using cuDF, remove .to_pandas()
    df['session_length'] = df.groupby('session')['ts'].transform('count')
    return df

def add_action_num_reverse_chrono(df):
    df['action_num_reverse_chrono'] = df.session_length - df.groupby('session').cumcount() - 1
    return df

def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    df['log_recency_score'] = (2 ** linear_interpolation - 1).fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    type_weights = {0:1, 1:6, 2:3, 3:1}
    df['type_weighted_log_recency_score'] = df['log_recency_score'] / df['type'].map(type_weights)
    return df

# --- new ---
def rate(df):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    df = df.join(df[["session", "aid", "ts", "type"]].groupby("session").agg([
        (((pl.col("type")==TYPE_MAP["carts"])&(pl.col("type")==TYPE_MAP["orders"])).sum()/(pl.col("type")==TYPE_MAP["orders"]).sum()).alias("carts_rate_in_orders"),
        (((pl.col("type")==TYPE_MAP["carts"])&(pl.col("type")==TYPE_MAP["orders"])).sum()/(pl.col("type")==TYPE_MAP["carts"]).sum()).alias("orders_rate_in_carts"),
        (((pl.col("type")==TYPE_MAP["clicks"])&(pl.col("type")==TYPE_MAP["orders"])).sum()/(pl.col("type")==TYPE_MAP["orders"]).sum()).alias("clicks_rate_in_orders"),
        (((pl.col("type")==TYPE_MAP["clicks"])&(pl.col("type")==TYPE_MAP["orders"])).sum()/(pl.col("type")==TYPE_MAP["clicks"]).sum()).alias("orders_rate_in_clicks"),
        (((pl.col("type")==TYPE_MAP["carts"])&(pl.col("type")==TYPE_MAP["clicks"])).sum()/(pl.col("type")==TYPE_MAP["clicks"]).sum()).alias("carts_rate_in_clicks"),
        (((pl.col("type")==TYPE_MAP["carts"])&(pl.col("type")==TYPE_MAP["clicks"])).sum()/(pl.col("type")==TYPE_MAP["carts"]).sum()).alias("clicks_rate_in_carts"),
    ]), on=["session"])
    df = df.with_column(
        pl.col([
            "carts_rate_in_orders",
            "orders_rate_in_carts",
            "clicks_rate_in_orders",
            "orders_rate_in_clicks",
            "carts_rate_in_clicks",
            "clicks_rate_in_carts"
        ]).fill_nan(pl.lit(0)),
    )
    df = df.to_pandas()
    return df



def clicks_rate_in_carts(df):
    def func(x):
        x_clicks = x[x["type"]==TYPE_MAP["clicks"]]
        x_carts = x[x["type"]==TYPE_MAP["carts"]]
        x_hit = x_carts.merge(x_clicks, on="aid", how="inner").drop_duplicates()
        rate = (len(x_hit) / len(x_carts)) if len(x_carts) != 0 else 0
        dual_rate = (len(x_carts) / len(x_hit)) if len(x_hit) != 0 else 0
#         r = pd.Series({"clicks_rate_in_carts": rate})
        r = pd.Series({"clicks_rate_in_carts": rate, "carts_rate_in_clicks": dual_rate})
        return r
    df_tmp = df[["session", "aid", "ts", "type"]].groupby("session")[["aid", "type"]].apply(func)
    df = df.merge(df_tmp, on="session")
    return df

def clicks_rate_in_orders(df):
    def func(x):
        x_clicks = x[x["type"]==TYPE_MAP["clicks"]]
        x_orders = x[x["type"]==TYPE_MAP["orders"]]
        x_hit = x_orders.merge(x_clicks, on="aid", how="inner").drop_duplicates()
        rate = (len(x_hit) / len(x_orders)) if len(x_orders) != 0 else 0
        dual_rate = (len(x_orders) / len(x_hit)) if len(x_hit) != 0 else 0
#         r = pd.Series({"clicks_rate_in_orders": rate})
        r = pd.Series({"clicks_rate_in_orders": rate, "orders_rate_in_clicks": dual_rate})
        return r
    df_tmp = df[["session", "aid", "ts", "type"]].groupby("session")[["aid", "type"]].apply(func)
    df = df.merge(df_tmp, on="session")
    return df

def carts_rate_in_orders(df):
    def func(x):
        x_carts = x[x["type"]==TYPE_MAP["carts"]]
        x_orders = x[x["type"]==TYPE_MAP["orders"]]
        x_hit = x_orders.merge(x_carts, on="aid", how="inner").drop_duplicates()
        rate = (len(x_hit) / len(x_orders)) if len(x_orders) != 0 else 0
        dual_rate = (len(x_orders) / len(x_hit)) if len(x_hit) != 0 else 0
        r = pd.Series({"carts_rate_in_orders": rate, "orders_rate_in_carts": dual_rate})
        return r
    df_tmp = df[["session", "aid", "ts", "type"]].groupby("session")[["aid", "type"]].apply(func)
    df = df.merge(df_tmp, on="session")
    return df




pipeline = [
    ts_day,
    carts_count,
    clicks_count,
    orders_count,
    session_clicks_count,
    session_orders_count,
    session_carts_count,
    session_unknow_count,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score,
    clicks_rate_in_carts,
    clicks_rate_in_orders,
    carts_rate_in_orders
#     rate
]

# for p in tqdm(pipeline):
#     df_train = p(df_train)

# FEATURE_COL = list(set(df_train.columns.tolist()) - set(["session", "aid", "ts", "label"]))
# with open("FEATURE_COL.txt", "w") as f:
#     f.write("\n".join(FEATURE_COL))

# df_train

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 4.53 µs


In [12]:
%%time
for p in tqdm(pipeline):
    df_train = p(df_train)

FEATURE_COL = list(set(df_train.columns.tolist()) - set(["session", "aid", "ts", "label"]))
with open("FEATURE_COL.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))

df_train

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [27:04<00:00, 108.33s/it]


CPU times: user 26min 50s, sys: 2.55 s, total: 26min 53s
Wall time: 27min 4s


Unnamed: 0,session,aid,ts,type,label,ts_day,carts_count,clicks_count,orders_count,session_clicks_count,...,session_length,action_num_reverse_chrono,log_recency_score,type_weighted_log_recency_score,clicks_rate_in_carts,carts_rate_in_clicks,clicks_rate_in_orders,orders_rate_in_clicks,carts_rate_in_orders,orders_rate_in_carts
0,27,1251955,1659362219,0,0,0,0,5,0,6,...,6,5,0.071773,0.071773,0.0,0.0,0.0,0.0,0.0,0.0
1,27,1466252,1659304814,0,0,0,0,22,0,6,...,6,4,0.214195,0.214195,0.0,0.0,0.0,0.0,0.0,0.0
2,27,1627638,1659362208,0,0,0,0,7,0,6,...,6,3,0.375542,0.375542,0.0,0.0,0.0,0.0,0.0,0.0
3,27,644734,1659304800,0,0,0,0,14,0,6,...,6,2,0.558329,0.558329,0.0,0.0,0.0,0.0,0.0,0.0
4,27,555996,1659304910,0,0,0,7,36,1,6,...,6,1,0.765406,0.765406,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2383257,11097742,881203,1659304800,3,1,0,0,2,0,0,...,2,1,0.071773,0.071773,0.0,0.0,0.0,0.0,0.0,0.0
2383258,11097742,1207772,1659304800,3,1,0,0,1,0,0,...,2,0,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0
2383259,11097834,1714583,1659304800,3,1,0,0,14,0,0,...,1,0,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0
2383260,11098122,530377,1659304800,3,1,0,33,326,8,0,...,1,0,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_train["clicks_rate_in_orders"].value_counts()

0.000000    781875
1.000000     91082
0.500000     24899
0.666667      9947
0.333333      7212
0.750000      4238
0.600000      2878
0.250000      2478
0.800000      2241
0.400000      2019
0.200000       959
0.428571       909
0.833333       832
0.714286       792
0.166667       627
0.285714       556
0.571429       507
0.375000       451
0.625000       410
0.555556       315
0.444444       292
0.857143       184
0.111111       164
0.142857       141
0.700000       139
0.454545       136
0.125000       130
0.541667       119
0.411765       116
0.777778       107
0.181818       100
0.300000        92
0.416667        75
0.888889        68
0.583333        68
0.222222        65
0.538462        62
0.733333        55
0.384615        55
0.363636        54
0.083333        50
0.357143        44
0.545455        42
0.461538        39
0.153846        35
0.272727        32
0.727273        31
0.230769        30
0.100000        24
Name: clicks_rate_in_orders, dtype: int64

# Model

In [7]:
n_estimators_candidates = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600]

In [18]:
%%time
n_estimators_candidates = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600]

for n_estimators in n_estimators_candidates:
    ranker = xgb.XGBRanker(
        tree_method='gpu_hist',
        booster='gbtree',
        objective='rank:pairwise',
        random_state=42, 
        learning_rate=0.1,
        colsample_bytree=0.9,  # 0.9
        eta=0.05, 
        max_depth=6, 
        n_estimators=n_estimators,
        subsample=0.85,
        n_jobs=11
    )

    feature_cols = FEATURE_COL
    label_col = 'label'

    ranker.fit(
        X=df_train[feature_cols],
        y=df_train[label_col],
        group=df_train.groupby("session").count()["label"]
    )

    joblib.dump(ranker, f"carts_xgbranker_{str(n_estimators)}.m")

CPU times: user 2min 3s, sys: 2.24 s, total: 2min 6s
Wall time: 52.3 s


# Validation

In [8]:
%%time

# ranker = joblib.load("carts_xgbranker_1.m")

# FEATURE_COL = list(ranker.feature_names_in_)
with open("FEATURE_COL.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
    
print(FEATURE_COL)


def parallel_run(func, df_list):
    cores_num = min([CORES_NUM, len(df_list), BATCH])
    pool = Pool(cores_num)
    df_list = pool.map(func, df_list)
    print("--- batch complete ---")
    pool.close()
    pool.join()
    return df_list


def validate(df_valid, scope=None):
    if scope is None:
#         scope = ["clicks", "carts", "orders"]
            scope = ["carts"]
    label = pd.read_parquet(os.path.join(DATASET_PATH, "otto-validation/test_labels.parquet"))
    df_valid = df_valid.merge(label, on=["session", "type"])
    df_valid["hits"] = df_valid.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
    df_valid['gt_count'] = df_valid.ground_truth.str.len().clip(0,20)
    recall = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for s in scope:
        df = df_valid[df_valid["type"]==s]
        recall_s = df["hits"].sum() / df['gt_count'].sum()
        print(f"{s} Recall: {recall_s}")
        recall += recall_s*weights[s]
#     print("")
#     print(f"CV Recall: {recall}")
    return 


def processer(df_item):
    print("---- processer start ----")
    for p in pipeline:
        df_item = p(df_item)
    print("---- processer complete ----")
    return df_item


BATCH = 4

for n_estimators in n_estimators_candidates[::-1]:
    ranker = joblib.load(f"carts_xgbranker_{n_estimators}.m")
    df_valid = None
    for i in tqdm(range(0, len(valid_pickle), BATCH)):
    #     df = pd.concat([pd.read_pickle(_) for _ in valid_pickle[i:i+BATCH]]).reset_index(drop=True)
        df = [pd.read_pickle(_) for _ in valid_pickle[i:i+BATCH]]
        df = parallel_run(processer, df)
    #     df = processer(df)
    #     print(type(df))
        df = pd.concat(df)
        df["score"] = ranker.predict(df[FEATURE_COL])
        df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
        df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
        df["type"] = "carts"
        if df_valid is None:
            df_valid = df
        else:
            df_valid = pd.concat([df_valid, df])
        print(f"n_estimators={n_estimators} get {validate(df_valid, ['carts'])}")
        break

#     validate(df_valid, ["carts"])

['carts_rate_in_clicks', 'clicks_rate_in_orders', 'session_clicks_count', 'session_orders_count', 'type', 'clicks_count', 'action_num_reverse_chrono', 'type_weighted_log_recency_score', 'clicks_rate_in_carts', 'carts_count', 'ts_day', 'session_length', 'orders_rate_in_clicks', 'orders_rate_in_carts', 'carts_rate_in_orders', 'orders_count', 'log_recency_score', 'session_carts_count', 'session_unknow_count']


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:57<?, ?it/s]


carts Recall: 0.05789738660808275
n_estimators=600 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [16:15<?, ?it/s]


carts Recall: 0.0576914275252872
n_estimators=550 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [13:35<?, ?it/s]


carts Recall: 0.0602163333180771
n_estimators=500 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [12:10<?, ?it/s]


carts Recall: 0.06284040459517598
n_estimators=450 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:37<?, ?it/s]

carts Recall: 0.062375089630341586
n_estimators=400 get None



  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:35<?, ?it/s]


carts Recall: 0.05258059102628648
n_estimators=350 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:33<?, ?it/s]


carts Recall: 0.05289334370756862
n_estimators=300 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:31<?, ?it/s]

carts Recall: 0.05556318366973317
n_estimators=250 get None



  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:28<?, ?it/s]

carts Recall: 0.09142294841869193
n_estimators=200 get None



  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:22<?, ?it/s]


carts Recall: 0.09294094314003692
n_estimators=150 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:17<?, ?it/s]


carts Recall: 0.036698857308496195
n_estimators=100 get None


  0%|                                                                                                                         | 0/5 [00:00<?, ?it/s]

---- processer start ----
---- processer start ----
---- processer start ----
---- processer start ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
---- processer complete ----
--- batch complete ---


  0%|                                                                                                                         | 0/5 [09:13<?, ?it/s]

carts Recall: 0.08127755656246663
n_estimators=50 get None
CPU times: user 39min 6s, sys: 2min 26s, total: 41min 33s
Wall time: 2h 7min 41s





In [17]:
FEATURE_COL = list(ranker.feature_names_in_)

# Inference

In [15]:
test_data_path = os.path.join(DATASET_PATH, "otto-chunk-data-inparquet-format/test_parquet/*.parquet")
test_data = sorted(glob.glob(test_data_path))
# ranker = joblib.load("carts_xgbranker_1.m")

In [16]:
%%time

test_data_path = sorted(glob.glob("/home/search2/lichunyu/otto-recommender-system/data/input/feature/test_chunk_data/*_candidates.pkl"))

# df_test = pd.read_pickle(os.path.join(DATASET_PATH, "feature/candidate_comatrix_exploded_details.pkl"))

def parallel_run(func, df_list):
    cores_num = min([CORES_NUM, len(df_list), BATCH])
    pool = Pool(cores_num)
    df_list = pool.map(func, df_list)
    print("--- batch complete ---")
    pool.close()
    pool.join()
    return df_list


def test_processer(df_item):
    for p in pipeline:
        df_item = p(df_item)
    return df_item


BATCH_SIZE = 4

df_test = None
for i in tqdm(range(0, len(test_data_path), BATCH_SIZE)) :
#     df = pd.read_pickle(i)
    df = [pd.read_pickle(_) for _ in test_data_path[i:i+BATCH_SIZE]]
    df = parallel_run(test_processer, df)
    df = pd.concat(df)
    df["score"] = ranker.predict(df[FEATURE_COL])
    df = df.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
    df = df.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
    if df_test is None:
        df_test = df
    else:
        df_test = pd.concat([df_test, df])
        
        
df_test = df_test.reset_index(drop=True)
df_test

  0%|                                                                                                                                              | 0/5 [00:00<?, ?it/s]

--- batch complete ---


 20%|██████████████████████████▌                                                                                                          | 1/5 [09:11<36:45, 551.41s/it]

--- batch complete ---


 40%|█████████████████████████████████████████████████████▏                                                                               | 2/5 [18:17<27:24, 548.17s/it]

--- batch complete ---


 60%|███████████████████████████████████████████████████████████████████████████████▊                                                     | 3/5 [27:53<18:41, 560.84s/it]

--- batch complete ---


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4/5 [38:04<09:40, 580.69s/it]

--- batch complete ---


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [52:43<00:00, 632.71s/it]

CPU times: user 8min 41s, sys: 3min 8s, total: 11min 49s
Wall time: 52min 43s





Unnamed: 0,session,aid
0,12899779,"[59625, 469285, 1493965, 438191, 731692, 73744..."
1,12899780,"[1142000, 1142000, 736515, 973453, 582732, 150..."
2,12899781,"[918667, 199008, 199008, 199008, 199008, 19900..."
3,12899782,"[1007613, 595994, 1033148, 834354, 834354, 834..."
4,12899783,"[1817895, 607638, 1754419, 1216820, 1729553, 3..."
...,...,...
1671798,13899774,"[393555, 868822, 1383522, 321393, 575971, 4058..."
1671799,13899775,"[182882, 943641, 146605, 292423, 1217686, 1116..."
1671800,13899776,"[1236234, 1236234, 688174, 578577, 799127, 947..."
1671801,13899777,"[1382226, 1150166, 259263, 1556931, 434028, 53..."


In [17]:
df_test["session_type"] = df_test["session"].apply(lambda x: str(x)+"_carts")
df_test = df_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
df_test

Unnamed: 0,session_type,labels
0,12899779_carts,"[59625, 469285, 1493965, 438191, 731692, 73744..."
1,12899780_carts,"[1142000, 1142000, 736515, 973453, 582732, 150..."
2,12899781_carts,"[918667, 199008, 199008, 199008, 199008, 19900..."
3,12899782_carts,"[1007613, 595994, 1033148, 834354, 834354, 834..."
4,12899783_carts,"[1817895, 607638, 1754419, 1216820, 1729553, 3..."
...,...,...
1671798,13899774_carts,"[393555, 868822, 1383522, 321393, 575971, 4058..."
1671799,13899775_carts,"[182882, 943641, 146605, 292423, 1217686, 1116..."
1671800,13899776_carts,"[1236234, 1236234, 688174, 578577, 799127, 947..."
1671801,13899777_carts,"[1382226, 1150166, 259263, 1556931, 434028, 53..."


In [18]:
df_carts_top50 = pd.read_pickle(os.path.join(DATASET_PATH, "feature/carts_top50.pkl"))
carts_top50 = df_carts_top50["aid"].tolist()

def func(x):
    if len(x) < 20:
        n = 20 - len(x)
        x = x + carts_top50[:n]
    return x

df_test["labels"] = df_test["labels"].apply(func)
df_test["labels"] = df_test["labels"].apply(lambda x: " ".join([str(_) for _ in x]))
df_test

Unnamed: 0,session_type,labels
0,12899779_carts,59625 469285 1493965 438191 731692 737445 1253...
1,12899780_carts,1142000 1142000 736515 973453 582732 1502122 4...
2,12899781_carts,918667 199008 199008 199008 199008 199008 1990...
3,12899782_carts,1007613 595994 1033148 834354 834354 834354 83...
4,12899783_carts,1817895 607638 1754419 1216820 1729553 300127 ...
...,...,...
1671798,13899774_carts,393555 868822 1383522 321393 575971 405819 126...
1671799,13899775_carts,182882 943641 146605 292423 1217686 1116425 30...
1671800,13899776_carts,1236234 1236234 688174 578577 799127 947126 90...
1671801,13899777_carts,1382226 1150166 259263 1556931 434028 534366 5...


In [19]:
df_submission = pd.read_csv("../data/output/submission_ensemble.csv")
df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = pd.concat([df_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_carts.csv", index=False)

In [16]:
df_submission

Unnamed: 0,session_type,labels
0,12899779_carts,59625 984047 898986 438191 1667087 737445 1825...
1,12899780_carts,1142000 1502122 973453 1142000 582732 1419849 ...
2,12899781_carts,199008 918667 3542 199008 518425 199008 129797...
3,12899782_carts,889671 595994 834354 1007613 127404 987399 740...
4,12899783_carts,1502122 607638 1754419 1216820 198385 1142000 ...
...,...,...
5015404,14362948_orders,44180 1633529 1661189 290775 1734371 441577 13...
5015405,13525273_clicks,1745620 1068655 223523 1530242 1703143 1314705...
5015406,13665806_clicks,1402107 426140 1636071 185602 995813 1523231 1...
5015407,13179133_clicks,1792659 490652 102965 1677119 1375098 284105 1...


# submission csv must have 5015409 rows

In [21]:
suffix = "_clicks$"

df_submission = pd.read_csv("../data/output/submission.csv")
df_submission_ensemble = pd.read_csv("../data/output/submission_ensemble.csv")
df_submission = df_submission[~df_submission.session_type.str.contains(suffix)]
df_submission_ensemble = df_submission_ensemble[df_submission_ensemble.session_type.str.contains(suffix)]
df_submission = pd.concat([df_submission_ensemble, df_submission])
df_submission.to_csv("../data/output/submission_optim_carts.csv", index=False)
df_submission.shape

(5015409, 2)

In [40]:
(0.578-0.553)/0.3

0.08333333333333304

In [31]:
df_submission_ensemble

Unnamed: 0,session_type,labels
0,13399527_clicks,1632206 349016 703890 84773 537304 1689819 637...
1,13712096_clicks,84550 1012174 340291 229829 1732553 1739076 98...
3,13557868_orders,1204405 1675318 1611581 102345 331708 399992 1...
4,13443775_clicks,755127 956575 723612 385521 969639 1632206 777...
5,14012641_orders,1052172 822418 1363906 925411 997207 903224 90...
...,...,...
5015404,14362948_orders,44180 1633529 1661189 290775 1734371 441577 13...
5015405,13525273_clicks,1745620 1068655 223523 1530242 1703143 1314705...
5015406,13665806_clicks,1402107 426140 1636071 185602 995813 1523231 1...
5015407,13179133_clicks,1792659 490652 102965 1677119 1375098 284105 1...


In [34]:
df_submission_ensemble = df_submission_ensemble.rename({"labels": "ensemble_labels"}, axis=1)
df_submission_ensemble

Unnamed: 0,session_type,ensemble_labels
0,13399527_clicks,1632206 349016 703890 84773 537304 1689819 637...
1,13712096_clicks,84550 1012174 340291 229829 1732553 1739076 98...
2,14163812_carts,357318 114120 1314974 389924 91574 1459262 552...
3,13557868_orders,1204405 1675318 1611581 102345 331708 399992 1...
4,13443775_clicks,755127 956575 723612 385521 969639 1632206 777...
...,...,...
5015404,14362948_orders,44180 1633529 1661189 290775 1734371 441577 13...
5015405,13525273_clicks,1745620 1068655 223523 1530242 1703143 1314705...
5015406,13665806_clicks,1402107 426140 1636071 185602 995813 1523231 1...
5015407,13179133_clicks,1792659 490652 102965 1677119 1375098 284105 1...


In [35]:
df_e2 = df_submission.merge(df_submission_ensemble, on="session_type")
df_e2

Unnamed: 0,session_type,labels,ensemble_labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...,1667087 1660089 1138236 967363 742709 620510 3...
1,12899780_clicks,1142000 736515 973453 582732 889686 487136 141...,1032776 1383529 1586171 618310 1515511 1263108...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...,1422724 1836671 1628918 528496 1681537 1767530...
3,12899782_clicks,834354 740494 987399 889671 779477 127404 1711...,413962 229748 562753 1669402 479970 1033148 47...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...,1157882 1420411 230028 294573 1492009 998637 1...
...,...,...,...
5015404,14571577_carts,1141710 1276792 1004292 1666114 367734 86916 8...,493115 1302088 150132 631085 934971 1187209 84...
5015405,14571578_carts,519105 977826 1811714 822641 815460 1671592 29...,5573 1139638 1005419 1580943 1290293 735459 29...
5015406,14571579_carts,739876 1209992 1750859 1550479 51363 785544 77...,1286038 832213 702275 707204 857928 210534 770...
5015407,14571580_carts,202353 1314576 433425 1231403 925638 888228 87...,682237 1627186 985380 891417 356096 1682397 88...


In [36]:
df_e2["is_same"] = df_e2["labels"]==df_e2["ensemble_labels"]
df_e2

Unnamed: 0,session_type,labels,ensemble_labels,is_same
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...,1667087 1660089 1138236 967363 742709 620510 3...,False
1,12899780_clicks,1142000 736515 973453 582732 889686 487136 141...,1032776 1383529 1586171 618310 1515511 1263108...,False
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...,1422724 1836671 1628918 528496 1681537 1767530...,False
3,12899782_clicks,834354 740494 987399 889671 779477 127404 1711...,413962 229748 562753 1669402 479970 1033148 47...,False
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...,1157882 1420411 230028 294573 1492009 998637 1...,False
...,...,...,...,...
5015404,14571577_carts,1141710 1276792 1004292 1666114 367734 86916 8...,493115 1302088 150132 631085 934971 1187209 84...,False
5015405,14571578_carts,519105 977826 1811714 822641 815460 1671592 29...,5573 1139638 1005419 1580943 1290293 735459 29...,False
5015406,14571579_carts,739876 1209992 1750859 1550479 51363 785544 77...,1286038 832213 702275 707204 857928 210534 770...,False
5015407,14571580_carts,202353 1314576 433425 1231403 925638 888228 87...,682237 1627186 985380 891417 356096 1682397 88...,False


# Unused Function

In [None]:
def bool_count(df):
    df_tmp = df[df["label"]==0].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "negative_count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna(0)
    df["negative_count"] = df["negative_count"].astype("int32")
    df_tmp = df[df["label"]==1].groupby("session").agg({"aid": "count"}).reset_index().rename({"aid": "postive_count"}, axis=1)
    df = df.merge(df_tmp, on="session", how="left").fillna(0)
    df["postive_count"] = df["postive_count"].astype("int32")
    return df