In [3]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Setup

In [4]:
# Root directory holding the competition inputs and the generated features.
# It can be overridden with the OTTO_DATA_PATH environment variable so the
# notebook is not tied to a single machine; the original location is kept as
# the default, so behaviour is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    "OTTO_DATA_PATH",
    "/home/search3/lichunyu/otto-recommender-system/data",
)
# Integer codes substituted for the labels in the raw `type` column.
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
# Not referenced in the visible part of the notebook.
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    """Load one parquet file and normalise its ``ts`` and ``type`` columns.

    ``ts`` is divided by 1000 and cast to int32 (presumably milliseconds to
    seconds -- confirm against the data source). ``type`` is translated
    through ``TYPE_MAP`` and cast to int8.

    Args:
        f: Path of the parquet file, passed straight to ``pd.read_parquet``.

    Returns:
        The loaded ``pandas.DataFrame`` with both columns converted.
    """
    events = pd.read_parquet(f)
    events["ts"] = (events["ts"] / 1000).astype("int32")
    type_codes = events["type"].map(TYPE_MAP)
    events["type"] = type_codes.astype("int8")
    return events

# User Feature

In [8]:
def user_feature_1(df, prefix):
    """Compute per-session aggregate features and persist them as parquet.

    One row is produced per ``session`` with these columns:

    * ``user_user_count`` -- count of ``session`` values in the group,
    * ``user_item_count`` -- number of distinct ``aid`` values,
    * ``user_buy_ratio``  -- mean of the ``type`` column.

    The result is written to
    ``<DATA_PATH>/input/feature/<prefix>/user_feature_1.parquet``; the target
    directory is created if it does not exist yet.

    Args:
        df: Event table (polars or pandas DataFrame) with ``session``,
            ``aid`` and ``type`` columns. A pandas frame is converted to
            polars first.
        prefix: Sub-directory name under ``input/feature`` (the notebook
            passes ``"train"`` and ``"test"``).

    Returns:
        The aggregated features as a ``pandas.DataFrame`` -- the same frame
        that was written to disk.
    """
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)

    # polars renamed ``groupby`` to ``group_by``; use whichever this version has.
    group_by = getattr(df, "group_by", None) or df.groupby
    features = group_by("session").agg([
        pl.col("session").count().alias("user_user_count"),
        pl.col("aid").n_unique().alias("user_item_count"),
        pl.col("type").mean().alias("user_buy_ratio")
    ]).to_pandas()

    out_dir = os.path.join(DATA_PATH, f"input/feature/{prefix}")
    # Writing the parquet file does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    features.to_parquet(os.path.join(out_dir, "user_feature_1.parquet"))

    # ``to_parquet`` returns None when given a path, so its result must not be
    # what gets returned; hand back the feature frame itself.
    return features

In [9]:
# Feature builders applied by the Build cells; each one is invoked as
# fn(events, prefix).
pipeline = [user_feature_1]

# Build

## Train

In [None]:
%%time

# Collect the validation parquet files in a deterministic (sorted) order.
valid_a_path = glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
)
valid_a_path.sort()

# Read every file through read_parquet, stack the results, convert to polars.
df_train_user = pl.DataFrame(
    pd.concat(list(map(read_parquet, valid_a_path)))
)
df_train_user.head(5)

In [None]:
# Apply each feature builder to the training events, passing "train" as the
# output prefix.
for feature_fn in tqdm(pipeline):
    feature_fn(df_train_user, "train")

## Test

In [5]:
%%time

# Collect the Kaggle test parquet files in a deterministic (sorted) order.
kaggle_test_path = glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/test_parquet/*.parquet")
)
kaggle_test_path.sort()

# Read every file through read_parquet, stack the results, convert to polars.
df_test_user = pl.DataFrame(
    pd.concat(list(map(read_parquet, kaggle_test_path)))
)
df_test_user.shape

CPU times: user 1.89 s, sys: 230 ms, total: 2.13 s
Wall time: 2.04 s


(6928123, 4)

In [13]:
# Apply each feature builder to the test events, passing "test" as the
# output prefix.
for feature_fn in tqdm(pipeline):
    feature_fn(df_test_user, "test")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.37it/s]


In [6]:
# Show the loaded test events using the frame's default notebook repr.
df_test_user

session,aid,ts,type
i64,i64,i32,i8
12899779,59625,1661724000,0
12899780,1142000,1661724000,0
12899780,582732,1661724058,0
12899780,973453,1661724109,0
12899780,736515,1661724136,0
12899780,1142000,1661724155,0
12899781,141736,1661724000,0
12899781,199008,1661724022,0
12899781,57315,1661724170,0
12899781,194067,1661724246,0


In [None]:
def ts_day(df):
    """Add a ``ts_day`` column counting 86400-unit steps since the earliest ``ts``.

    With ``ts`` in seconds (assumed -- confirm against the loader) this is
    the zero-based day index of each row relative to the first event.

    Args:
        df: DataFrame with a ``ts`` column. It is modified in place.

    Returns:
        The same ``df`` object, now carrying ``ts_day``.
    """
    seconds_per_day = 24 * 60 * 60
    earliest = df["ts"].min()
    df["ts_day"] = df["ts"].sub(earliest).floordiv(seconds_per_day)
    return df

def add_session_length(df):
    """Add ``session_length``: the number of non-null ``ts`` values in each row's session.

    Args:
        df: DataFrame with ``session`` and ``ts`` columns. It is modified in
            place.

    Returns:
        The same ``df`` object, now carrying ``session_length``.
    """
    ts_by_session = df.groupby("session")["ts"]
    # transform broadcasts the per-session count back onto every row.
    df["session_length"] = ts_by_session.transform("count")
    return df

def add_action_num_reverse_chrono(df):
    """Add ``action_num_reverse_chrono``: rows remaining after this one in its session.

    The value is ``session_length - position - 1``, where ``position`` is the
    zero-based row order within the session, so the last row of a session
    gets 0.

    Args:
        df: DataFrame with ``session`` and ``session_length`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object, now carrying ``action_num_reverse_chrono``.
    """
    position_in_session = df.groupby("session").cumcount()
    df["action_num_reverse_chrono"] = df["session_length"] - position_in_session - 1
    return df

def add_log_recency_score(df):
    """Add ``log_recency_score``, a recency weight that grows towards the end of a session.

    Each row's position within its session is mapped linearly onto
    ``[0.1, 1.0]`` (first row 0.1, last row 1.0) and then passed through
    ``2 ** x - 1``. For a session of length 1 the interpolation is undefined
    (NaN), and the score is filled with 1.0.

    Args:
        df: DataFrame with ``session_length`` and
            ``action_num_reverse_chrono`` columns. It is modified in place.

    Returns:
        The same ``df`` object, now carrying ``log_recency_score``.
    """
    length = df["session_length"]
    # Zero-based position from the start of the session.
    position = length - df["action_num_reverse_chrono"] - 1
    # Step per position; becomes inf when length == 1, and inf * 0 is NaN.
    step = (1 - 0.1) / (length - 1)
    interpolated = 0.1 + step * position
    score = 2 ** interpolated - 1
    df["log_recency_score"] = score.fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    """Add ``type_weighted_log_recency_score``: the recency score divided by a per-type weight.

    Weights are looked up from the ``type`` code: 0 -> 1, 1 -> 6, 2 -> 3,
    3 -> 0.1. A code outside this table yields NaN.

    NOTE(review): the score is *divided* by the weight, so a larger weight
    produces a smaller value -- confirm this direction is intended.

    Args:
        df: DataFrame with ``type`` and ``log_recency_score`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object, now carrying
        ``type_weighted_log_recency_score``.
    """
    weight_by_type = {0: 1, 1: 6, 2: 3, 3: 0.1}
    divisor = df["type"].map(weight_by_type)
    df["type_weighted_log_recency_score"] = df["log_recency_score"].div(divisor)
    return df



# Per-event feature steps in dependency order: each one takes the event frame
# and returns it with one more column that later steps read.
# NOTE(review): this rebinds the `pipeline` name that the Build loops iterate
# and call as fn(frame, prefix); these steps take a single argument -- confirm
# before re-running those cells after this one.
pipeline = [
    ts_day,                               # adds ts_day
    add_session_length,                   # adds session_length
    add_action_num_reverse_chrono,        # needs session_length
    add_log_recency_score,                # needs both columns above
    add_type_weighted_log_recency_score,  # needs log_recency_score
]