In [1]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Param

In [2]:
DATA_PATH = "/home/search3/lichunyu/otto-recommender-system/data"
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

# Feature Engineering

## Time Feature

In [3]:
def ts_day(df):
    ts_minimal = df["ts"].min()
    df["ts_day"] = (df["ts"]-ts_minimal) // (24*60*60)
    return df

def add_session_length(df):
    # If not using cuDF, remove .to_pandas()
    df['session_length'] = df.groupby('session')['ts'].transform('count')
    return df

def add_action_num_reverse_chrono(df):
    df['action_num_reverse_chrono'] = df.session_length - df.groupby('session').cumcount() - 1
    return df

def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    df['log_recency_score'] = (2 ** linear_interpolation - 1).fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    type_weights = {0:1, 1:6, 2:3, 3:0.1}
    df['type_weighted_log_recency_score'] = df['log_recency_score'] / df['type'].map(type_weights)
    return df



pipeline = [
    ts_day,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score
]

## Item Feature

In [22]:
def item_feature_1(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    df = df.groupby("aid").agg([
        pl.col("session").n_unique().alias("item_user_count"),
        pl.col("aid").count().alias("item_item_count"),
        pl.col("type").mean().alias("item_buy_ratio"),
    ]).to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_1.parquet")
    )
    return df

# User Feature

In [23]:
def user_feature_1(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    df = df.groupby("session").agg([
        pl.col("session").count().alias("user_user_count"),
        pl.col("aid").n_unique().alias("user_item_count"),
        pl.col("type").mean().alias("user_buy_ratio")
    ]).to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/user_feature_1.parquet")
    )
    return df

# Train

## Item Feature

In [3]:
%%time

train_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/train_parquet/*.parquet")
))

valid_a_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
))

df_train_item = pd.concat([
    pd.concat([read_parquet(i) for i in train_path]),
    pd.concat([read_parquet(i) for i in valid_a_path])
])

df_train_item = pl.DataFrame(df_train_item)

CPU times: user 25.1 s, sys: 5.33 s, total: 30.5 s
Wall time: 20.4 s


Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
316324,12899774,33035,1661723968,0
316325,12899775,1743151,1661723970,0
316326,12899776,548599,1661723972,0
316327,12899777,384045,1661723976,0


In [15]:
%%time

df_train_item.groupby("aid").agg([
    pl.col("session").n_unique().alias("item_user_count"),
    pl.col("aid").count().alias("item_item_count"),
    pl.col("type").mean().alias("item_buy_ratio"),
]).to_pandas().to_parquet(
    os.path.join(DATA_PATH, "input/feature/train/item_feature_1.parquet")
)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 17.2 µs


## User Feature

In [19]:
%%time

valid_a_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
))

df_train_user = pd.concat([read_parquet(i) for i in valid_a_path])
df_train_user = pl.DataFrame(df_train_user)
df_train_user.head(10)

CPU times: user 1.53 s, sys: 117 ms, total: 1.64 s
Wall time: 1.28 s


session,aid,ts,type
i32,i32,i32,i8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0
11098530,409236,1661120532,1
11098531,452188,1661119200,0
11098531,1239060,1661119227,0


In [18]:
df_train_user.groupby("session").agg([
    pl.col("session").count().alias("user_user_count"),
    pl.col("aid").n_unique().alias("user_item_count"),
    pl.col("type").mean().alias("user_buy_ratio")
]).to_pandas().to_parquet(
    os.path.join(DATA_PATH, "input/feature/train/user_feature_1.parquet")
)

## Model Train

In [4]:
%%time

valid_a_candidates_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-test-chunk-candidates-top100/valid_chunk_data/valid_chunk_data/*_candidates.pkl")
))

df_train_candidates = pd.concat([pd.read_pickle(i) for i in valid_a_candidates_path])

df_train_label = pd.read_parquet(
    os.path.join(DATA_PATH, "input/otto-validation/test_labels.parquet")
)
df_train_label = df_train_label[df_train_label["type"]=="orders"][["session", "ground_truth"]]
df_train_label = df_train_label.explode("ground_truth").reset_index(drop=True)
df_train_label = df_train_label.rename({"ground_truth": "aid"}, axis=1)
df_train_label["label"] = 1

df_train_candidates = df_train_candidates.merge(df_train_label, on=["session", "aid"], how="left").fillna({"label": 0})
df_train_candidates["label"] = df_train_candidates["label"].astype("int")
df_train_candidates.shape

CPU times: user 1min 37s, sys: 13.1 s, total: 1min 50s
Wall time: 1min 50s


(182272665, 5)

In [5]:
def downsample(df, n=-1):
    if n == -1:
        n = DOWNSAMPLE_RATE
    df_negative = df[df["label"]==0]
    df_postive = df[df["label"]==1]
    r = len(df_negative)//len(df_postive)
    print(f"current negative size: {len(df_negative)}, postive size: {len(df_postive)}, rate: {r}")
    if r > n:
        gloden_negative_size = n * len(df_postive)
        df_negative = df_negative.sample(gloden_negative_size)
        df = pd.concat([df_postive, df_negative])
    df["_noise"] = np.random.randn(len(df))
    df = df.sort_values(["session", "_noise"])
    df = df.drop("_noise", axis=1).reset_index(drop=True)
    return df


def train_valid_split(df):
    print(f"origin all data size: {len(df)}")
    valid_session = np.random.choice(df_train_candidates.session.unique(), int(len(df_train_candidates.session.unique())*0.2))
    df_train = df[~df["session"].isin(valid_session)]
    df_valid = df[df["session"].isin(valid_session)]
    df_train = downsample(df_train)
    return df_train, df_valid


df_train_candidates, df_valid_candidates = train_valid_split(df_train_candidates)
df_train_candidates["label"] = df_train_candidates["label"].astype("int")
df_valid_candidates["label"] = df_valid_candidates["label"].astype("int")
print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")

# storage
# df_train_candidates.to_pickle("df_train_candidates.pkl")
# df_valid_candidates.to_pickle("df_valid_candidates.pkl")

origin all data size: 182272665
current negative size: 148785554, postive size: 419345, rate: 354
df_train_candidates size: 8806245, df_valid_candidates size: 33067766


In [6]:
%%time

for p in pipeline:
    df_train_candidates = p(df_train_candidates)
    df_valid_candidates = p(df_valid_candidates)

CPU times: user 7.41 s, sys: 1.16 s, total: 8.56 s
Wall time: 8.56 s


In [7]:
df_train_candidates

Unnamed: 0,session,aid,ts,type,label,ts_day,session_length,action_num_reverse_chrono,log_recency_score,type_weighted_log_recency_score
0,11098528,1467813,1661119200,3,0,0,10,9,0.071773,0.717735
1,11098528,11830,1661119200,0,1,0,10,8,0.148698,0.148698
2,11098528,1586171,1661119200,3,0,0,10,7,0.231144,2.311444
3,11098528,1764621,1661119200,3,0,0,10,6,0.319508,3.195079
4,11098528,307904,1661119200,3,0,0,10,5,0.414214,4.142136
...,...,...,...,...,...,...,...,...,...,...
8806240,12899778,674409,1661119200,3,0,0,7,4,0.319508,3.195079
8806241,12899778,1845894,1661119200,3,0,0,7,3,0.464086,4.640857
8806242,12899778,631041,1661119200,3,0,0,7,2,0.624505,6.245048
8806243,12899778,863998,1661119200,3,0,0,7,1,0.802501,8.025009


In [8]:
%%time

train_item_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/item_feature_*.parquet")
))

for p in tqdm(train_item_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on="aid", how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on="aid", how="left")

print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")
df_train_candidates.head(5)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:25<00:00, 25.72s/it]

df_train_candidates size: 8806245, df_valid_candidates size: 33067766
CPU times: user 23.8 s, sys: 2.09 s, total: 25.9 s
Wall time: 25.7 s





Unnamed: 0,session,aid,ts,type,label,ts_day,session_length,action_num_reverse_chrono,log_recency_score,type_weighted_log_recency_score,item_user_count,item_item_count,item_buy_ratio
0,11098528,1467813,1661119200,3,0,0,10,9,0.071773,0.717735,2950.0,4355.0,0.174053
1,11098528,11830,1661119200,0,1,0,10,8,0.148698,0.148698,19211.0,33776.0,0.17397
2,11098528,1586171,1661119200,3,0,0,10,7,0.231144,2.311444,33479.0,50753.0,0.131657
3,11098528,1764621,1661119200,3,0,0,10,6,0.319508,3.195079,3895.0,6228.0,0.201349
4,11098528,307904,1661119200,3,0,0,10,5,0.414214,4.142136,5352.0,8581.0,0.121897


In [9]:
%%time

train_user_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/train/user_feature_*.parquet")
))

for p in tqdm(train_user_feature_path):
    df_train_candidates = df_train_candidates.merge(pd.read_parquet(p), on="session", how="left")
    df_valid_candidates = df_valid_candidates.merge(pd.read_parquet(p), on="session", how="left")

print(f"df_train_candidates size: {len(df_train_candidates)}, df_valid_candidates size: {len(df_valid_candidates)}")
df_train_candidates.head(5)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.47s/it]

df_train_candidates size: 8806245, df_valid_candidates size: 33067766
CPU times: user 7.44 s, sys: 1.25 s, total: 8.69 s
Wall time: 8.47 s





Unnamed: 0,session,aid,ts,type,label,ts_day,session_length,action_num_reverse_chrono,log_recency_score,type_weighted_log_recency_score,item_user_count,item_item_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio
0,11098528,1467813,1661119200,3,0,0,10,9,0.071773,0.717735,2950.0,4355.0,0.174053,1,1,0.0
1,11098528,11830,1661119200,0,1,0,10,8,0.148698,0.148698,19211.0,33776.0,0.17397,1,1,0.0
2,11098528,1586171,1661119200,3,0,0,10,7,0.231144,2.311444,33479.0,50753.0,0.131657,1,1,0.0
3,11098528,1764621,1661119200,3,0,0,10,6,0.319508,3.195079,3895.0,6228.0,0.201349,1,1,0.0
4,11098528,307904,1661119200,3,0,0,10,5,0.414214,4.142136,5352.0,8581.0,0.121897,1,1,0.0


## Storage

In [10]:
df_train_candidates.to_pickle("df_train_candidates.pkl")
df_valid_candidates.to_pickle("df_valid_candidates.pkl")

## Feature Name

In [11]:
FEATURE_COL = list(set(df_train_candidates.columns.tolist()) - set(["session", "aid", "ts", "label"]))
with open("FEATURE_COL.txt", "w") as f:
    f.write("\n".join(FEATURE_COL))
    
FEATURE_COL

['type',
 'user_user_count',
 'item_buy_ratio',
 'action_num_reverse_chrono',
 'user_item_count',
 'session_length',
 'item_item_count',
 'user_buy_ratio',
 'type_weighted_log_recency_score',
 'log_recency_score',
 'ts_day',
 'item_user_count']

## Training

In [12]:
%%time
n_estimators_candidates = list(range(10,91,10))

for n_estimators in tqdm(n_estimators_candidates):
    ranker = xgb.XGBRanker(
#         tree_method='gpu_hist',
        tree_method="hist",
        booster='gbtree',
        objective='rank:pairwise',
        random_state=42, 
        learning_rate=0.1,
        colsample_bytree=0.9,  # 0.9
        eta=0.05, 
        max_depth=6, 
        n_estimators=n_estimators,
        subsample=0.8,
        n_jobs=11
    )

    feature_cols = FEATURE_COL
    label_col = 'label'

    ranker.fit(
        X=df_train_candidates[feature_cols],
        y=df_train_candidates[label_col],
        group=df_train_candidates.groupby("session").count()["label"]
    )

    joblib.dump(ranker, f"orders_xgbranker_{str(n_estimators)}.m")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [03:31<00:00, 23.54s/it]

CPU times: user 33min 58s, sys: 29.6 s, total: 34min 28s
Wall time: 3min 31s





# Valid

In [13]:
%%time

# ranker = joblib.load("carts_xgbranker_1.m")

# FEATURE_COL = list(ranker.feature_names_in_)
with open("FEATURE_COL.txt", "r") as f:
    FEATURE_COL = f.read().splitlines()
    
print(FEATURE_COL)

for n_estimators in tqdm(n_estimators_candidates[::-1]):
    df_valid_candidates = pd.read_pickle("df_valid_candidates.pkl")
    ranker = joblib.load(f"orders_xgbranker_{n_estimators}.m")
    df_valid_candidates["score"] = ranker.predict(df_valid_candidates[FEATURE_COL])
    df_valid_candidates = df_valid_candidates.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid', "label", "score"]].reset_index(drop=True)
    recall = pl.DataFrame(df_valid_candidates).groupby("session").agg([
        (pl.col("label").head(20).sum()/pl.col("label").sum()).alias("gt_ratio")
    ]).select(pl.col("gt_ratio").drop_nans()).mean()["gt_ratio"][0]
    print(f"n_estimators={n_estimators} get Recall@20: {recall}")

['type', 'user_user_count', 'item_buy_ratio', 'action_num_reverse_chrono', 'user_item_count', 'session_length', 'item_item_count', 'user_buy_ratio', 'type_weighted_log_recency_score', 'log_recency_score', 'ts_day', 'item_user_count']


 11%|███████████                                                                                         | 1/9 [00:30<04:04, 30.55s/it]

n_estimators=90 get Recall@20: 0.8034475026144497


 22%|██████████████████████▏                                                                             | 2/9 [01:00<03:32, 30.31s/it]

n_estimators=80 get Recall@20: 0.8029117092544046


 33%|█████████████████████████████████▎                                                                  | 3/9 [01:30<02:59, 29.86s/it]

n_estimators=70 get Recall@20: 0.8032645775668841


 44%|████████████████████████████████████████████▍                                                       | 4/9 [01:58<02:26, 29.34s/it]

n_estimators=60 get Recall@20: 0.8020121828036556


 56%|███████████████████████████████████████████████████████▌                                            | 5/9 [02:26<01:54, 28.67s/it]

n_estimators=50 get Recall@20: 0.8020352846866952


 67%|██████████████████████████████████████████████████████████████████▋                                 | 6/9 [02:51<01:23, 27.68s/it]

n_estimators=40 get Recall@20: 0.805536668398937


 78%|█████████████████████████████████████████████████████████████████████████████▊                      | 7/9 [03:16<00:53, 26.63s/it]

n_estimators=30 get Recall@20: 0.8058955404946412


 89%|████████████████████████████████████████████████████████████████████████████████████████▉           | 8/9 [03:40<00:25, 25.84s/it]

n_estimators=20 get Recall@20: 0.806478196221058


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [04:02<00:00, 26.92s/it]

n_estimators=10 get Recall@20: 0.806092901946874
CPU times: user 9min 1s, sys: 37.9 s, total: 9min 39s
Wall time: 4min 2s





In [54]:
(0.8097062407307981 - 0.8072849174254828)*0.6

0.0014527939831891822

# Inference

## Feature

In [21]:
%%time

kaggle_train_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/train_parquet/*.parquet")
))

kaggle_test_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/test_parquet/*.parquet")
))

df_test_item = pd.concat([
    pd.concat([read_parquet(i) for i in kaggle_train_path]),
    pd.concat([read_parquet(i) for i in kaggle_test_path])
])


CPU times: user 38 s, sys: 9.34 s, total: 47.4 s
Wall time: 28.8 s


In [24]:
%%time

df_test_item = pl.DataFrame(df_test_item)
item_feature_1(df_test_item, "test")
user_feature_1(df_test_item, "test")

## Mege

In [14]:
kaggle_test_candidates_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-test-chunk-candidates-top100/test_chunk_data/test_chunk_data/*_candidates.pkl")
))

df_kaggle_test = pd.concat([pd.read_pickle(i) for i in kaggle_test_candidates_path])
df_kaggle_test

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899779,469285,1661724000,3
2,12899779,1493965,1661724000,3
3,12899779,438191,1661724000,3
4,12899779,731692,1661724000,3
...,...,...,...,...
101041929,13899778,880834,1661724000,3
101041930,13899778,894123,1661724000,3
101041931,13899778,907634,1661724000,3
101041932,13899778,937690,1661724000,3


In [15]:
%%time

for p in tqdm(pipeline):
    df_kaggle_test = p(df_kaggle_test)


test_user_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/test/user_feature_*.parquet")
))
for p in tqdm(train_user_feature_path):
    df_kaggle_test = df_kaggle_test.merge(pd.read_parquet(p), on="session", how="left")


test_item_feature_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/feature/test/item_feature_*.parquet")
))
for p in tqdm(train_item_feature_path):
    df_kaggle_test = df_kaggle_test.merge(pd.read_parquet(p), on="aid", how="left")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:31<00:00,  6.34s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:24<00:00, 24.11s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:41<00:00, 101.02s/it]

CPU times: user 2min 14s, sys: 22.1 s, total: 2min 36s
Wall time: 2min 36s





In [16]:
%%time
best_n_estimators = "20"
ranker = joblib.load(f"orders_xgbranker_{best_n_estimators}.m")
df_kaggle_test["score"] = ranker.predict(df_kaggle_test[FEATURE_COL])
df_kaggle_test = df_kaggle_test.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
df_kaggle_test = df_kaggle_test.groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
df_kaggle_test

CPU times: user 4min 14s, sys: 12.5 s, total: 4min 27s
Wall time: 2min 25s


Unnamed: 0,session,aid
0,12899779,"[475447, 448688, 985764, 1340695, 1422133, 696..."
1,12899780,"[973453, 736515, 582732, 1142000, 1142000, 137..."
2,12899781,"[199008, 57315, 141736, 199008, 199008, 199008..."
3,12899782,"[740494, 987399, 834354, 834354, 889671, 12740..."
4,12899783,"[255297, 198385, 255297, 1729553, 1817895, 111..."
...,...,...
1671798,14571577,"[1141710, 166065, 30148, 512809, 287041, 63870..."
1671799,14571578,"[1403962, 131032, 1174989, 40598, 5095, 231888..."
1671800,14571579,"[739876, 304799, 630181, 653990, 1590992, 1557..."
1671801,14571580,"[202353, 391852, 1652005, 441903, 142475, 2488..."


In [17]:
%%time

df_kaggle_test["session_type"] = df_kaggle_test["session"].apply(lambda x: str(x)+"_orders")
# df_test["session_type"] = df_test["session"].apply(lambda x: str(x)+"_carts")
df_kaggle_test = df_kaggle_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
df_kaggle_test["labels"] = df_kaggle_test["labels"].apply(lambda x: " ".join([str(_) for _ in x]))
df_submission = pd.read_csv("../data/output/submission_578.csv")
# df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = df_submission[~df_submission.session_type.str.contains("_orders$")]
df_submission = pd.concat([df_kaggle_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_orders.csv", index=False)
df_submission.shape

CPU times: user 31.4 s, sys: 1.54 s, total: 32.9 s
Wall time: 36.4 s


(5015409, 2)

In [41]:
df_submission = pd.read_csv("../data/output/submission_578.csv")
# df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = df_submission[~df_submission.session_type.str.contains("_orders$")]
df_submission = pd.concat([df_kaggle_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_orders.csv", index=False)
df_submission.shape

(5015409, 2)