In [2]:
import os
import time
import glob

import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
import cudf
from sklearn.model_selection import GroupKFold
import joblib

In [1]:
DATASET_PATH = "/home/search2/lichunyu/otto-recommender-system/data/input"

TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
test_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/test_parquet/*.parquet")))
# test_files

# Carts

In [4]:
train_files: list = sorted(glob.glob(os.path.join(DATASET_PATH, "otto-validation/train_parquet/*.parquet")))
# train_files

In [5]:
def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

i = train_files[0]
df_train = cudf.DataFrame(read_parquet(i))
df_train = df_train.sort_values(['session','ts'],ascending=[True,False])
df_train['n'] = df_train.groupby('session').cumcount()
df_train["label"] = ((df_train["n"]<100)&df_train["type"]==TYPE_MAP["carts"]).astype('int8')
df_train = df_train.to_pandas()
df_train

Unnamed: 0,session,aid,ts,type,n,label,session_length
146,0,1110548,1661103727,0,0,0,147
145,0,724999,1661103701,0,1,0,147
144,0,30373,1661103687,0,2,0,147
143,0,102416,1661019639,0,3,0,147
142,0,504365,1661017998,0,4,0,147
...,...,...,...,...,...,...,...
4763698,110984,111814,1659328494,0,15,0,17
4763697,110984,334359,1659328485,0,16,0,17
4763716,110985,314992,1659782929,0,0,0,3
4763715,110985,1644439,1659782912,0,1,0,3


In [110]:
def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

df_train = None
for i in train_files:
# i = train_files[0]
    df = cudf.DataFrame(read_parquet(i))
    df = df.sort_values(['session','ts'],ascending=[True,False])
    df['n'] = df.groupby('session').cumcount()
    df["label"] = ((df["n"]<100)&df["type"]==TYPE_MAP["carts"]).astype('int8')
    df['session_length'] = df.groupby('session')['ts'].transform('count')
    df = df.to_pandas()
    if df_train is None:
        df_train = df
    else:
        df_train = pd.concat([df_train, df])

In [111]:
df_train

Unnamed: 0,session,aid,ts,type,n,label,session_length
146,0,1110548,1661103727,0,0,0,147
145,0,724999,1661103701,0,1,0,147
144,0,30373,1661103687,0,2,0,147
143,0,102416,1661019639,0,3,0,147
142,0,504365,1661017998,0,4,0,147
...,...,...,...,...,...,...,...
656387,11098523,175715,1661119197,0,0,0,1
656388,11098524,1088524,1661119198,0,0,0,1
656389,11098525,182927,1661119199,0,0,0,1
656390,11098526,510055,1661119199,0,0,0,1


# Model

In [None]:
ranker = xgb.XGBRanker(
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
)

feature_cols = ['aid', 'type']
label_col = 'label'
group_col = "session_length"


ranker.fit(
    X=df_train[feature_cols],
    y=df_train[label_col],
    group=df_train.groupby("session").count()["label"]
)

joblib.dump(ranker, f"carts_xgbranker_1.m")

# Validation

In [89]:
df_valid = None
for f in test_files:
    df = cudf.DataFrame(read_parquet(f))
    if df_valid is None:
        df_valid = df
    else:
        df_valid = cudf.concat([df_valid, df])
    break

df_valid["score"] = ranker.predict(df_valid[feature_cols].to_pandas())
df_valid = df_valid.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
df_valid = df_valid.to_pandas().groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
df_valid["type"] = "carts"
df_valid

Unnamed: 0,session,aid,type
0,11098528,[11830],carts
1,11098529,[1105029],carts
2,11098530,"[409236, 264500, 264500, 409236, 409236, 409236]",carts
3,11098531,"[1365569, 1365569, 1365569, 624163, 1449555, 1...",carts
4,11098532,"[876469, 7651]",carts
...,...,...,...
90058,11188586,"[871551, 1531151, 489812, 871551]",carts
90059,11188587,"[416407, 409907, 1365091, 542343, 161453, 4164...",carts
90060,11188588,"[982423, 1663535, 1347624]",carts
90061,11188589,"[641024, 1627743]",carts


In [78]:
def validate(df_valid, scope=None):
    if scope is None:
        scope = ["clicks", "carts", "orders"]
    label = pd.read_parquet(os.path.join(DATASET_PATH, "otto-validation/test_labels.parquet"))
    df_valid = df_valid.merge(label, on=["session", "type"])
    df_valid["hits"] = df_valid.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
    df_valid['gt_count'] = df_valid.ground_truth.str.len().clip(0,20)
    recall = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for s in scope:
        df = df_valid[df_valid["type"]==s]
        recall_s = df["hits"].sum() / df['gt_count'].sum()
        print(f"{s} Recall: {recall_s}")
        recall += recall_s*weights[s]
    print("")
    print(f"CV Recall: {recall}")

In [83]:
validate(df_valid, ["carts"])

carts Recall: 0.2952269108142145

CV Recall: 0.08856807324426434


# Inference

In [84]:
test_data_path = os.path.join(DATASET_PATH, "otto-chunk-data-inparquet-format/test_parquet/*.parquet")
test_data = sorted(glob.glob(test_data_path))
ranker = joblib.load("carts_xgbranker_1.m")

['/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000000000_000100000.parquet',
 '/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000100000_000200000.parquet',
 '/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000200000_000300000.parquet',
 '/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000300000_000400000.parquet',
 '/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000400000_000500000.parquet',
 '/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000500000_000600000.parquet',
 '/home/search2/lichunyu/kaggle/dataset/otto-recommender-system/otto-chunk-data-inparquet-format/test_parquet/000600000_000700000.parquet',
 '/home/search2/lich

In [92]:
df_test = None
for f in test_data:
    df = cudf.DataFrame(read_parquet(f))
    if df_test is None:
        df_test = df
    else:
        df_test = cudf.concat([df_test, df])

df_test["score"] = ranker.predict(df_test[feature_cols].to_pandas())
df_test = df_test.sort_values(by=['session', 'score'], ascending=False)[['session', 'aid']].reset_index(drop=True)
df_test = df_test.to_pandas().groupby('session').head(20).groupby('session').agg(list).reset_index(drop=False)
df_test

Unnamed: 0,session,aid
0,12899779,[59625]
1,12899780,"[582732, 1142000, 1142000, 973453, 736515]"
2,12899781,"[199008, 57315, 141736, 918667, 199008, 194067..."
3,12899782,"[1711180, 476063, 987399, 562753, 889671, 1494..."
4,12899783,"[1729553, 1216820, 255297, 255297, 300127, 300..."
...,...,...
1671798,14571577,[1141710]
1671799,14571578,[519105]
1671800,14571579,[739876]
1671801,14571580,[202353]


In [101]:
df_test["session_type"] = df_test["session"].apply(lambda x: str(x)+"_carts")
df_test = df_test.rename({"aid": "labels"}, axis=1)[["session_type", "labels"]]
df_test

Unnamed: 0,session_type,labels
0,12899779_carts,[59625]
1,12899780_carts,"[582732, 1142000, 1142000, 973453, 736515]"
2,12899781_carts,"[199008, 57315, 141736, 918667, 199008, 194067..."
3,12899782_carts,"[1711180, 476063, 987399, 562753, 889671, 1494..."
4,12899783_carts,"[1729553, 1216820, 255297, 255297, 300127, 300..."
...,...,...
1671798,14571577_carts,[1141710]
1671799,14571578_carts,[519105]
1671800,14571579_carts,[739876]
1671801,14571580_carts,[202353]


In [4]:
df_submission = pd.read_csv("../data/output/submission.csv")
df_submission = df_submission[~df_submission.session_type.str.contains("_carts$")]
df_submission = pd.concat([df_test, df_submission])
df_submission.to_csv("../data/output/submission_optim_carts.csv", index=False)

Unnamed: 0,session_type,labels
0,12899779_clicks,59625
1,12899780_clicks,1142000 736515 973453 582732
2,12899781_clicks,199008 918667 194067 57315 141736
3,12899782_clicks,834354 595994 740494 889671 987399 779477 8291...
4,12899783_clicks,1817895 607638 1754419 300127 1216820 1729553 ...
...,...,...
5015404,14571577_carts,1141710 1276792 1666114 631085 1004292 367734 ...
5015405,14571578_carts,519105 977826 1811714 822641 1671592 815460 15...
5015406,14571579_carts,739876 1209992 1550479 1750859 785544 51363 21...
5015407,14571580_carts,202353 433425 1314576 925638 1231403 871658 88...


# submission csv must have 5015409 rows

In [107]:
df_submission.shape

(5015409, 2)