In [1]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Setup

In [2]:
# Root directory holding the `input/...` folders read and written by this notebook.
# Override it by setting the OTTO_DATA_PATH environment variable; the default keeps
# the original author's location so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "OTTO_DATA_PATH",
    "/home/search3/lichunyu/otto-recommender-system/data",
)
# Integer codes that read_parquet substitutes for the textual event types.
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
# NOTE(review): not referenced in the visible cells; presumably a negative-sampling
# ratio consumed elsewhere in the project — confirm before changing.
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    """Load one parquet chunk and compact its ``ts`` and ``type`` columns.

    Args:
        f: Path (or anything ``pd.read_parquet`` accepts) of the chunk to load.

    Returns:
        A pandas DataFrame in which ``ts`` has been divided by 1000 and cast to
        int32, and ``type`` has been translated through ``TYPE_MAP`` and cast
        to int8.
    """
    frame = pd.read_parquet(f)
    # Divide by 1000 (presumably milliseconds -> seconds) so the value fits int32.
    scaled_ts = frame.ts / 1000
    frame.ts = scaled_ts.astype('int32')
    # Replace the event-type labels by their small integer codes.
    type_codes = frame['type'].map(TYPE_MAP)
    frame['type'] = type_codes.astype('int8')
    return frame

# Item Feature

In [9]:
def item_feature_1(df, prefix):
    """Aggregate overall per-item statistics and persist them as parquet.

    For every ``aid`` the output holds:

    * ``item_user_count`` - number of distinct sessions containing the item,
    * ``item_item_count`` - count of the item's ``aid`` entries,
    * ``item_buy_ratio``  - mean of the ``type`` column for the item (with the
      0/1/2 codes from ``TYPE_MAP`` this is an average type code rather than a
      strict ratio).

    Args:
        df: Event table with ``session``, ``aid`` and ``type`` columns. A
            pandas DataFrame is converted to polars; anything else is used
            as-is and must offer the polars ``groupby``/``agg`` API.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated polars DataFrame that was written to disk.
    """
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    df = df.groupby("aid").agg([
        pl.col("session").n_unique().alias("item_user_count"),
        pl.col("aid").count().alias("item_item_count"),
        pl.col("type").mean().alias("item_buy_ratio"),
    ])
    # Write as a separate statement: pandas' to_parquet returns None when given
    # a path, so chaining it onto the assignment made this function return None
    # while every sibling item_feature_* returns the aggregated frame.
    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_1.parquet")
    )
    return df

def item_feature_2(df, prefix):
    """Count click activity per item and persist the result as parquet.

    Only rows whose ``type`` equals ``TYPE_MAP["clicks"]`` are kept. For every
    ``aid`` the output holds the number of distinct sessions
    (``item_clicks_user_count``) and the count of its ``aid`` entries
    (``item_clicks_item_count``).

    Args:
        df: Event table with ``session``, ``aid`` and ``type`` columns. A
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated frame that was written to disk.
    """
    events = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df
    is_click = pl.col("type") == TYPE_MAP["clicks"]
    stats = events.filter(is_click).groupby("aid").agg([
        pl.col("session").n_unique().alias("item_clicks_user_count"),
        pl.col("aid").count().alias("item_clicks_item_count"),
    ])
    target = os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_2.parquet")
    stats.to_pandas().to_parquet(target)
    return stats

def item_feature_3(df, prefix):
    """Count cart activity per item and persist the result as parquet.

    Only rows whose ``type`` equals ``TYPE_MAP["carts"]`` are kept. For every
    ``aid`` the output holds the number of distinct sessions
    (``item_carts_user_count``) and the count of its ``aid`` entries
    (``item_carts_item_count``).

    Args:
        df: Event table with ``session``, ``aid`` and ``type`` columns. A
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated frame that was written to disk.
    """
    events = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df
    is_cart = pl.col("type") == TYPE_MAP["carts"]
    stats = events.filter(is_cart).groupby("aid").agg([
        pl.col("session").n_unique().alias("item_carts_user_count"),
        pl.col("aid").count().alias("item_carts_item_count"),
    ])
    target = os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_3.parquet")
    stats.to_pandas().to_parquet(target)
    return stats

def item_feature_4(df, prefix):
    """Count order activity per item and persist the result as parquet.

    Only rows whose ``type`` equals ``TYPE_MAP["orders"]`` are kept. For every
    ``aid`` the output holds the number of distinct sessions
    (``item_orders_user_count``) and the count of its ``aid`` entries
    (``item_orders_item_count``).

    Args:
        df: Event table with ``session``, ``aid`` and ``type`` columns. A
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated frame that was written to disk.
    """
    events = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df
    is_order = pl.col("type") == TYPE_MAP["orders"]
    stats = events.filter(is_order).groupby("aid").agg([
        pl.col("session").n_unique().alias("item_orders_user_count"),
        pl.col("aid").count().alias("item_orders_item_count"),
    ])
    target = os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_4.parquet")
    stats.to_pandas().to_parquet(target)
    return stats

def item_feature_5(df, prefix):
    """Measure how long ago each item was last and first clicked.

    The reference point is the largest ``ts`` of the whole input (taken before
    filtering). For every ``aid`` among the ``TYPE_MAP["clicks"]`` rows the
    output holds the gap to the item's latest click (``item_last_clicks_ts``)
    and to its earliest click (``item_first_clicks_ts``), each divided by
    ``24*60*60`` - i.e. expressed in days if ``ts`` is in seconds.

    Args:
        df: Event table with ``aid``, ``ts`` and ``type`` columns. A pandas
            DataFrame is converted to polars first.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated frame that was written to disk.
    """
    events = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df
    latest_ts = events["ts"].max()
    day = 24*60*60
    clicks = events.filter(pl.col("type") == TYPE_MAP["clicks"])
    stats = clicks.groupby("aid").agg([
        ((latest_ts - pl.col("ts").max()) / day).alias("item_last_clicks_ts"),
        ((latest_ts - pl.col("ts").min()) / day).alias("item_first_clicks_ts"),
    ])
    target = os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_5.parquet")
    stats.to_pandas().to_parquet(target)
    return stats

def item_feature_6(df, prefix):
    """Measure how long ago each item was last and first added to a cart.

    The reference point is the largest ``ts`` of the whole input (taken before
    filtering). For every ``aid`` among the ``TYPE_MAP["carts"]`` rows the
    output holds the gap to the item's latest cart event
    (``item_last_carts_ts``) and to its earliest one
    (``item_first_carts_ts``), each divided by ``24*60*60`` - i.e. expressed in
    days if ``ts`` is in seconds.

    Args:
        df: Event table with ``aid``, ``ts`` and ``type`` columns. A pandas
            DataFrame is converted to polars first.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated frame that was written to disk.
    """
    events = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df
    latest_ts = events["ts"].max()
    day = 24*60*60
    carts = events.filter(pl.col("type") == TYPE_MAP["carts"])
    stats = carts.groupby("aid").agg([
        ((latest_ts - pl.col("ts").max()) / day).alias("item_last_carts_ts"),
        ((latest_ts - pl.col("ts").min()) / day).alias("item_first_carts_ts"),
    ])
    target = os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_6.parquet")
    stats.to_pandas().to_parquet(target)
    return stats

def item_feature_7(df, prefix):
    """Measure how long ago each item was last and first ordered.

    The reference point is the largest ``ts`` of the whole input (taken before
    filtering). For every ``aid`` among the ``TYPE_MAP["orders"]`` rows the
    output holds the gap to the item's latest order (``item_last_orders_ts``)
    and to its earliest order (``item_first_orders_ts``), each divided by
    ``24*60*60`` - i.e. expressed in days if ``ts`` is in seconds.

    Args:
        df: Event table with ``aid``, ``ts`` and ``type`` columns. A pandas
            DataFrame is converted to polars first.
        prefix: Sub-directory under ``input/feature`` that receives the file.

    Returns:
        The aggregated frame that was written to disk.
    """
    events = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df
    latest_ts = events["ts"].max()
    day = 24*60*60
    orders = events.filter(pl.col("type") == TYPE_MAP["orders"])
    stats = orders.groupby("aid").agg([
        ((latest_ts - pl.col("ts").max()) / day).alias("item_last_orders_ts"),
        ((latest_ts - pl.col("ts").min()) / day).alias("item_first_orders_ts"),
    ])
    target = os.path.join(DATA_PATH, f"input/feature/{prefix}/item_feature_7.parquet")
    stats.to_pandas().to_parquet(target)
    return stats

In [10]:
# Item-level feature builders. The cells below iterate over this list and call
# each entry as p(df, prefix); every builder writes its own parquet file.
pipeline = [
    item_feature_1,
    item_feature_2,
    item_feature_3,
    item_feature_4,
    item_feature_5,
    item_feature_6,
    item_feature_7
]

# Build

## Train

In [5]:
%%time

# Parquet chunks of the local validation split: its train part ...
train_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/train_parquet/*.parquet")
))

# ... and its test part.
valid_a_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
))

# Stack all chunks of both parts into one pandas frame (train rows first).
df_train_item = pd.concat([
    pd.concat([read_parquet(i) for i in train_path]),
    pd.concat([read_parquet(i) for i in valid_a_path])
])

# The frame stays in pandas here; each item_feature_* function converts a
# pandas input to polars on its own.
df_train_item.head(10)

CPU times: user 25.8 s, sys: 5.94 s, total: 31.7 s
Wall time: 21.2 s


Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
5,0,1152674,1659367885,0
6,0,1649869,1659369893,1
7,0,461689,1659369898,1
8,0,305831,1659370027,2
9,0,461689,1659370027,2


In [11]:
# Run every builder in `pipeline` on the validation-split events; the "train"
# prefix selects the input/feature/train output folder.
for p in tqdm(pipeline):
    p(df_train_item, "train")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.31s/it]


## Test

In [12]:
%%time

# Parquet chunks of the full competition data: train part ...
kaggle_train_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/train_parquet/*.parquet")
))

# ... and test part.
kaggle_test_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/test_parquet/*.parquet")
))

# Stack all chunks of both parts into one pandas frame (train rows first),
# then convert once to polars for the filtering below.
df_test_item = pd.concat([
    pd.concat([read_parquet(i) for i in kaggle_train_path]),
    pd.concat([read_parquet(i) for i in kaggle_test_path])
])
df_test_item = pl.DataFrame(df_test_item)
print(df_test_item.shape)
# Keep only rows strictly later than (earliest ts + 7*24*60*60), i.e. drop the
# first seven days when ts is in seconds.
# NOTE(review): presumably this aligns the time span with the validation split
# used for the "train" features — confirm.
df_test_item = df_test_item.filter(
    pl.col("ts") > df_test_item["ts"].min()+7*24*60*60
)
print(df_test_item.shape)

(223644219, 4)
(170483281, 4)
CPU times: user 40.2 s, sys: 12.1 s, total: 52.3 s
Wall time: 42.4 s


In [13]:
%%time

# Run every builder in `pipeline` on the filtered competition events; the
# "test" prefix selects the input/feature/test output folder.
for p in tqdm(pipeline):
    p(df_test_item, "test")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.49s/it]

CPU times: user 1min 53s, sys: 3.23 s, total: 1min 56s
Wall time: 10.4 s





In [None]:
def add_session_length(df):
    """Attach the number of events in each row's session.

    Adds a ``session_length`` column holding, for every row, the count of
    ``ts`` values that share the row's ``session``. The frame is modified in
    place and also returned.

    Args:
        df: pandas DataFrame with ``session`` and ``ts`` columns.

    Returns:
        The same DataFrame object, now carrying ``session_length``.
    """
    events_per_session = df.groupby('session')['ts'].transform('count')
    df['session_length'] = events_per_session
    return df

def add_action_num_reverse_chrono(df):
    """Number each event counting backwards from the end of its session.

    Adds an ``action_num_reverse_chrono`` column computed as
    ``session_length - (row position within the session) - 1``, so the last
    row of a session gets 0 when ``session_length`` is that session's row
    count. The frame is modified in place and also returned.

    Args:
        df: pandas DataFrame with ``session`` and ``session_length`` columns.

    Returns:
        The same DataFrame object, now carrying ``action_num_reverse_chrono``.
    """
    lengths = df.session_length
    # 0-based position of each row inside its session, in current row order.
    positions = df.groupby('session').cumcount()
    df['action_num_reverse_chrono'] = lengths - positions - 1
    return df

def add_log_recency_score(df):
    """Score each event by how late it occurs inside its session.

    A weight is interpolated linearly from 0.1 (first event) to 1.0 (last
    event) and mapped through ``2 ** weight - 1``. Rows where that
    computation yields NaN - e.g. sessions of length 1, whose interpolation
    step divides by zero - receive 1.0. The frame is modified in place and
    also returned.

    Args:
        df: pandas DataFrame with ``session_length`` and
            ``action_num_reverse_chrono`` columns.

    Returns:
        The same DataFrame object, now carrying ``log_recency_score``.
    """
    length = df['session_length']
    # Increase of the weight per step within a session.
    step = (1-0.1) / (length-1)
    # 0-based forward position recovered from the reverse counter.
    forward_index = length - df['action_num_reverse_chrono'] - 1
    weight = 0.1 + step * forward_index
    score = 2 ** weight - 1
    df['log_recency_score'] = score.fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    """Scale the recency score by a per-event-type weight.

    Adds ``type_weighted_log_recency_score`` as ``log_recency_score`` divided
    by the weight looked up for the row's ``type`` (0 -> 1, 1 -> 6, 2 -> 3,
    3 -> 0.1). Types without an entry produce NaN. The frame is modified in
    place and also returned.

    Args:
        df: pandas DataFrame with ``log_recency_score`` and ``type`` columns.

    Returns:
        The same DataFrame object, now carrying the weighted score.
    """
    weight_by_type = {0:1, 1:6, 2:3, 3:0.1}
    recency = df['log_recency_score']
    divisor = df['type'].map(weight_by_type)
    df['type_weighted_log_recency_score'] = recency / divisor
    return df



# Session-level feature steps; the add_* functions each take a DataFrame and
# return it with one extra column, and later steps read columns added by
# earlier ones, so the order matters.
# NOTE(review): `ts_day` is not defined in the visible part of this notebook -
# confirm it exists before running this cell.
# NOTE(review): this rebinds `pipeline`, replacing the item-feature list that
# the earlier cells iterate over - confirm that is intended.
pipeline = [
    ts_day,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score
]