In [2]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Setup

In [3]:
DATA_PATH = "/home/search3/lichunyu/otto-recommender-system/data"
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(TYPE_MAP).astype('int8')
    return df

# User Feature

In [4]:
def user_feature_1(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    df = df.groupby("session").agg([
        pl.col("session").count().alias("user_user_count"),
        pl.col("aid").n_unique().alias("user_item_count"),
        pl.col("type").mean().alias("user_buy_ratio")
    ]).to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/user_feature_1.parquet")
    )
    return df

def user_feature_2(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
#     ts_minimal = df["ts"].min()
    ts_maximal = df["ts"].max()
    df = df.filter(
        pl.col("type")==TYPE_MAP["orders"]
    ).groupby("session").agg([
        ((ts_maximal - pl.col("ts").min())/(24*60*60)).alias("user_first_orders_time"),
        ((ts_maximal - pl.col("ts").max())/(24*60*60)).alias("user_last_orders_time")
    ])
    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/user_feature_2.parquet")
    )
    return df

def user_feature_3(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    df = df.groupby("session").agg([
        (pl.col("type")==TYPE_MAP["orders"]).sum().alias("user_orders_count"),
        (pl.col("type")==TYPE_MAP["clicks"]).sum().alias("user_clicks_count"),
        (pl.col("type")==TYPE_MAP["carts"]).sum().alias("user_carts_count"),
        (pl.col("type")<5).sum().alias("user_behavior_count")
    ]).with_columns([
        (pl.col("user_orders_count")/pl.col("user_behavior_count")).alias("user_orders_ratio"),
        (pl.col("user_clicks_count")/pl.col("user_behavior_count")).alias("user_clicks_ratio"),
        (pl.col("user_carts_count")/pl.col("user_behavior_count")).alias("user_carts_ratio"),
    ])
    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/user_feature_3.parquet")
    )
    return df

def user_feature_4(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
#     ts_minimal = df["ts"].min()
    ts_maximal = df["ts"].max()
    df = df.filter(
        pl.col("type")==TYPE_MAP["clicks"]
    ).groupby("session").agg([
        ((ts_maximal - pl.col("ts").min())/(24*60*60)).alias("user_first_clicks_time"),
        ((ts_maximal - pl.col("ts").max())/(24*60*60)).alias("user_last_clicks_time")
    ])
    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/user_feature_4.parquet")
    )
    return df

def user_feature_5(df, prefix):
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
#     ts_minimal = df["ts"].min()
    ts_maximal = df["ts"].max()
    df = df.filter(
        pl.col("type")==TYPE_MAP["carts"]
    ).groupby("session").agg([
        ((ts_maximal - pl.col("ts").min())/(24*60*60)).alias("user_first_carts_time"),
        ((ts_maximal - pl.col("ts").max())/(24*60*60)).alias("user_last_carts_time")
    ])
    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/user_feature_5.parquet")
    )
    return df

In [None]:
def clicks_rate_in_carts(df):
    def func(x):
        x_clicks = x[x["type"]==TYPE_MAP["clicks"]]
        x_carts = x[x["type"]==TYPE_MAP["carts"]]
        x_hit = x_carts.merge(x_clicks, on="aid", how="inner").drop_duplicates()
        rate = (len(x_hit) / len(x_carts)) if len(x_carts) != 0 else 0
        dual_rate = (len(x_carts) / len(x_hit)) if len(x_hit) != 0 else 0
#         r = pd.Series({"clicks_rate_in_carts": rate})
        r = pd.Series({"clicks_rate_in_carts": rate, "carts_rate_in_clicks": dual_rate})
        return r
    df_tmp = df[["session", "aid", "ts", "type"]].groupby("session")[["aid", "type"]].apply(func)
    df = df.merge(df_tmp, on="session")
    return df

def clicks_rate_in_orders(df):
    def func(x):
        x_clicks = x[x["type"]==TYPE_MAP["clicks"]]
        x_orders = x[x["type"]==TYPE_MAP["orders"]]
        x_hit = x_orders.merge(x_clicks, on="aid", how="inner").drop_duplicates()
        rate = (len(x_hit) / len(x_orders)) if len(x_orders) != 0 else 0
        dual_rate = (len(x_orders) / len(x_hit)) if len(x_hit) != 0 else 0
#         r = pd.Series({"clicks_rate_in_orders": rate})
        r = pd.Series({"clicks_rate_in_orders": rate, "orders_rate_in_clicks": dual_rate})
        return r
    df_tmp = df[["session", "aid", "ts", "type"]].groupby("session")[["aid", "type"]].apply(func)
    df = df.merge(df_tmp, on="session")
    return df

def carts_rate_in_orders(df):
    def func(x):
        x_carts = x[x["type"]==TYPE_MAP["carts"]]
        x_orders = x[x["type"]==TYPE_MAP["orders"]]
        x_hit = x_orders.merge(x_carts, on="aid", how="inner").drop_duplicates()
        rate = (len(x_hit) / len(x_orders)) if len(x_orders) != 0 else 0
        dual_rate = (len(x_orders) / len(x_hit)) if len(x_hit) != 0 else 0
        r = pd.Series({"carts_rate_in_orders": rate, "orders_rate_in_carts": dual_rate})
        return r
    df_tmp = df[["session", "aid", "ts", "type"]].groupby("session")[["aid", "type"]].apply(func)
    df = df.merge(df_tmp, on="session")
    return df

In [5]:
pipeline = [
    user_feature_2,
    user_feature_4,
    user_feature_5
]

In [9]:
pipeline = [
    user_feature_1
]

# Build

## Train

In [6]:
%%time

valid_a_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
))

df_train_user = pd.concat([read_parquet(i) for i in valid_a_path])
df_train_user = pl.DataFrame(df_train_user)
df_train_user.head(5)

CPU times: user 1.49 s, sys: 229 ms, total: 1.72 s
Wall time: 1.3 s


session,aid,ts,type
i32,i32,i32,i8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0


In [7]:
for p in tqdm(pipeline):
    p(df_train_user, "train")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.71it/s]


## Test

In [8]:
%%time

kaggle_test_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/test_parquet/*.parquet")
))

df_test_user = pd.concat([read_parquet(i) for i in kaggle_test_path])
df_test_user = pl.DataFrame(df_test_user)
df_test_user.shape

CPU times: user 1.78 s, sys: 173 ms, total: 1.95 s
Wall time: 1.22 s


(6928123, 4)

In [9]:
for p in tqdm(pipeline):
    p(df_test_user, "test")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  5.01it/s]


In [None]:
def ts_day(df):
    ts_minimal = df["ts"].min()
    df["ts_day"] = (df["ts"]-ts_minimal) // (24*60*60)
    return df

def add_session_length(df):
    # If not using cuDF, remove .to_pandas()
    df['session_length'] = df.groupby('session')['ts'].transform('count')
    return df

def add_action_num_reverse_chrono(df):
    df['action_num_reverse_chrono'] = df.session_length - df.groupby('session').cumcount() - 1
    return df

def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    df['log_recency_score'] = (2 ** linear_interpolation - 1).fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    type_weights = {0:1, 1:6, 2:3, 3:0.1}
    df['type_weighted_log_recency_score'] = df['log_recency_score'] / df['type'].map(type_weights)
    return df



pipeline = [
    ts_day,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score
]