In [4]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Setup

In [5]:
# Root directory of the project data; every input/feature path in this notebook
# is joined onto it. Set the OTTO_DATA_PATH environment variable to run on
# another machine; when unset, the original location is used.
DATA_PATH = os.environ.get(
    "OTTO_DATA_PATH",
    "/home/search3/lichunyu/otto-recommender-system/data",
)
# Integer codes that read_parquet substitutes for the event-type names.
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
# NOTE(review): not referenced in the visible cells — presumably a negative
# down-sampling factor used further down; confirm.
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    """Load one parquet shard and compact its ``ts`` and ``type`` columns.

    ``ts`` is divided by 1000 and stored as int32; ``type`` is translated
    through TYPE_MAP and stored as int8.

    Args:
        f: Path of the parquet file to read.

    Returns:
        The loaded pandas DataFrame with the two columns rewritten.
    """
    frame = pd.read_parquet(f)
    frame["ts"] = (frame["ts"] / 1000).astype("int32")
    type_codes = frame["type"].map(TYPE_MAP)
    frame["type"] = type_codes.astype("int8")
    return frame

# Item Feature

In [6]:
def item_feature_1(df, prefix):
    """Aggregate per-item (``aid``) statistics and persist them as parquet.

    Args:
        df: Event frame with ``aid``, ``session`` and ``type`` columns. A
            pandas DataFrame is converted to polars before aggregating.
        prefix: Sub-directory name under ``input/feature/`` that receives the
            output file (the build loops pass "train" and "test").

    Returns:
        A polars DataFrame with one row per ``aid`` and the columns
        ``item_user_count`` (distinct sessions), ``item_item_count`` (number
        of events) and ``item_buy_ratio`` (mean of the ``type`` codes).
    """
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)
    features = df.groupby("aid").agg([
        pl.col("session").n_unique().alias("item_user_count"),
        pl.col("aid").count().alias("item_item_count"),
        pl.col("type").mean().alias("item_buy_ratio"),
    ])
    out_path = os.path.join(
        DATA_PATH, f"input/feature/{prefix}/item_feature_1.parquet"
    )
    # Create the per-split feature directory on first use so the write cannot
    # fail merely because the folder is missing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # to_parquet() returns None when given a path, so its result must not be
    # what gets returned; keep the aggregated frame and hand that back.
    features.to_pandas().to_parquet(out_path)
    return features

In [7]:
# Item-feature builders run by the build loops below; each entry is invoked as
# builder(events_frame, split_prefix).
pipeline = [
    item_feature_1
]

# Build

## Train

In [None]:
%%time

# Shards of the local validation split: its train part and its test part.
train_path = glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/train_parquet/*.parquet")
)
train_path.sort()

valid_a_path = glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
)
valid_a_path.sort()

# Stack every shard (train part first, then test part) into one pandas frame
# and hand the result to polars for the feature builders.
df_train_item = pl.DataFrame(
    pd.concat([
        pd.concat(list(map(read_parquet, train_path))),
        pd.concat(list(map(read_parquet, valid_a_path))),
    ])
)

In [None]:
# Run every registered feature builder on the validation-split events,
# writing its output under the "train" prefix.
for build_feature in tqdm(pipeline):
    build_feature(df_train_item, "train")

## Test

In [24]:
%%time

# Shards of the full Kaggle data set: train part and test part.
kaggle_train_path = glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/train_parquet/*.parquet")
)
kaggle_train_path.sort()

kaggle_test_path = glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/test_parquet/*.parquet")
)
kaggle_test_path.sort()

# Stack all shards (train part first, then test part) and convert to polars.
df_test_item = pl.DataFrame(
    pd.concat([
        pd.concat(list(map(read_parquet, kaggle_train_path))),
        pd.concat(list(map(read_parquet, kaggle_test_path))),
    ])
)
print(df_test_item.shape)

# Keep only events whose timestamp is more than 7 days (in seconds) past the
# earliest timestamp in the combined frame.
df_test_item = df_test_item.filter(
    pl.col("ts") > df_test_item["ts"].min() + 7 * 24 * 60 * 60
)
print(df_test_item.shape)

(223644219, 4)
(170483281, 4)
CPU times: user 40.6 s, sys: 12.4 s, total: 53 s
Wall time: 43.3 s


In [25]:
%%time

# Run every registered feature builder on the filtered Kaggle events,
# writing its output under the "test" prefix.
for build_feature in tqdm(pipeline):
    build_feature(df_test_item, "test")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.74s/it]

CPU times: user 40.9 s, sys: 609 ms, total: 41.5 s
Wall time: 2.75 s





In [22]:
# Display the (rows, columns) shape of df_test_item as a sanity check.
df_test_item.shape

(170483281, 4)

In [None]:
def ts_day(df):
    """Add ``ts_day``: whole days elapsed since the earliest ``ts`` in ``df``.

    The column is written onto ``df`` itself, which is also returned.
    """
    seconds_per_day = 24 * 60 * 60
    elapsed = df["ts"] - df["ts"].min()
    df["ts_day"] = elapsed // seconds_per_day
    return df

def add_session_length(df):
    """Add ``session_length``: the count of ``ts`` values in each row's session.

    The column is written onto ``df`` itself, which is also returned.
    """
    ts_by_session = df.groupby("session")["ts"]
    df["session_length"] = ts_by_session.transform("count")
    return df

def add_action_num_reverse_chrono(df):
    """Add ``action_num_reverse_chrono``: rows remaining after this one in its session.

    Computed as ``session_length`` minus the row's 0-based position within its
    session minus 1, so the last row of each session gets 0. Requires the
    ``session_length`` column. The column is written onto ``df`` itself, which
    is also returned.
    """
    position_in_session = df.groupby("session").cumcount()
    df["action_num_reverse_chrono"] = (
        df["session_length"] - position_in_session - 1
    )
    return df

def add_log_recency_score(df):
    """Add ``log_recency_score`` from each row's position in its session.

    An exponent is interpolated linearly from 0.1 (first row of the session)
    to 1.0 (last row) and the score is ``2 ** exponent - 1``. Rows for which
    that is NaN (single-row sessions, where the slope divides by zero) get
    1.0. Requires ``session_length`` and ``action_num_reverse_chrono``. The
    column is written onto ``df`` itself, which is also returned.
    """
    length = df["session_length"]
    steps_from_start = length - df["action_num_reverse_chrono"] - 1
    slope = (1 - 0.1) / (length - 1)
    exponent = 0.1 + slope * steps_from_start
    score = 2 ** exponent - 1
    df["log_recency_score"] = score.fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    """Add ``type_weighted_log_recency_score``.

    Each row's ``log_recency_score`` is divided by the weight looked up for
    its ``type`` code (0 -> 1, 1 -> 6, 2 -> 3, 3 -> 0.1). The column is
    written onto ``df`` itself, which is also returned.
    """
    weight_by_type = {0: 1, 1: 6, 2: 3, 3: 0.1}
    divisor = df["type"].map(weight_by_type)
    df["type_weighted_log_recency_score"] = df["log_recency_score"].div(divisor)
    return df



# Session-level feature steps, listed in dependency order: session_length is
# read by add_action_num_reverse_chrono, both are read by
# add_log_recency_score, and its output is read by
# add_type_weighted_log_recency_score. Each step takes a single frame.
# NOTE(review): this rebinds the name `pipeline` that the item-feature build
# loops iterate over; re-running those loops after this cell would call these
# one-argument functions with two arguments. Consider a distinct name.
pipeline = [
    ts_day,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score
]