In [1]:
import os
import time
import glob
import gc
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

# Setup

In [2]:
# Root directory holding the competition inputs and the generated feature files.
# Override it with the OTTO_DATA_PATH environment variable to run the notebook on
# another machine; the original location remains the default.
DATA_PATH = os.environ.get(
    "OTTO_DATA_PATH",
    "/home/search3/lichunyu/otto-recommender-system/data",
)
# Integer code stored in the `type` column for each event name.
TYPE_MAP = {'clicks':0, 'carts':1, 'orders':2}
DOWNSAMPLE_RATE = 20

def read_parquet(f):
    """Load one parquet shard and normalise its `ts` and `type` columns.

    Args:
        f: Path of the parquet file, passed straight to ``pd.read_parquet``.

    Returns:
        A pandas DataFrame in which ``ts`` has been divided by 1000 and cast
        to int32, and ``type`` has been mapped through ``TYPE_MAP`` and cast
        to int8.
    """
    frame = pd.read_parquet(f)
    scaled_ts = frame["ts"] / 1000
    frame["ts"] = scaled_ts.astype('int32')
    type_codes = frame["type"].map(TYPE_MAP)
    frame["type"] = type_codes.astype('int8')
    return frame

# Interaction Feature

In [3]:
def interaction_feature_1(df, prefix):
    """Sum a per-event type weight for every (session, aid) pair.

    Each event is weighted by its type (clicks -> 1, carts -> 3, orders -> 6)
    and the weights are summed per (session, aid) group into the column
    ``interaction_type_core``.

    Args:
        df: Event frame with ``session``, ``aid`` and ``type`` columns; a
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory of ``input/feature`` that receives the parquet
            file.

    Returns:
        The aggregated polars DataFrame, which is also written to
        ``interaction_feature_1.parquet``.
    """
    frame = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df

    # Lookup table from type code to weight; unknown codes yield None.
    weight_by_code = {
        TYPE_MAP["clicks"]: 1,
        TYPE_MAP["orders"]: 6,
        TYPE_MAP["carts"]: 3,
    }

    def func(x):
        return weight_by_code.get(x)

    scored = frame.with_columns([
        pl.col("type").apply(func).alias("type_score")
    ])
    summed = scored.groupby(["session", "aid"]).agg([
        pl.col("type_score").sum().alias("interaction_type_core")
    ])

    out_path = os.path.join(DATA_PATH, f"input/feature/{prefix}/interaction_feature_1.parquet")
    summed.to_pandas().to_parquet(out_path)
    return summed

def interaction_feature_2(df, prefix):
    """Compute pairwise event-type count ratios for every (session, aid) pair.

    For each ordered pair of event types (A, B) the output column
    ``interaction_A_B_ratio`` holds ``count(A events) / count(B events)``
    within the (session, aid) group.

    The previous implementation divided by ``...count().count()``, which is
    always 1, so every "ratio" was just the numerator count. The denominator
    is now the real count. When the denominator count is zero the ratio is
    undefined and is stored as null rather than inf/NaN.

    Args:
        df: Event frame with ``session``, ``aid`` and ``type`` columns; a
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory of ``input/feature`` that receives the parquet
            file.

    Returns:
        A polars DataFrame with ``session``, ``aid`` and the six ratio
        columns, which is also written to ``interaction_feature_2.parquet``.
    """
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)

    # Output column order is kept identical to the original definition.
    pairs = [
        ("clicks", "carts"),
        ("carts", "clicks"),
        ("clicks", "orders"),
        ("orders", "clicks"),
        ("orders", "carts"),
        ("carts", "orders"),
    ]

    def type_count(name):
        # Rows of the group whose type code matches `name`, as a float so the
        # division below is a true division.
        code = TYPE_MAP[name]
        return (
            pl.col("type").filter(pl.col("type") == code).count()
            .cast(pl.Float64)
            .alias(f"_{name}_count")
        )

    def ratio(numerator, denominator):
        num = pl.col(f"_{numerator}_count")
        den = pl.col(f"_{denominator}_count")
        return (
            pl.when(den > 0).then(num / den).otherwise(pl.lit(None))
            .alias(f"interaction_{numerator}_{denominator}_ratio")
        )

    # Stage 1: one count per event type and group.
    counts = df.groupby(["session", "aid"]).agg([
        type_count(name) for name in ("clicks", "carts", "orders")
    ])
    # Stage 2: derive the ratios, then drop the helper count columns.
    df = counts.with_columns([ratio(n, d) for n, d in pairs]).select(
        ["session", "aid"] + [f"interaction_{n}_{d}_ratio" for n, d in pairs]
    )

    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/interaction_feature_2.parquet")
    )
    return df

def interaction_feature_3(df, prefix):
    """Compute a time-decayed, type-weighted score per (session, aid) pair.

    Every event gets
    ``type_score * 10 / (ts_day + 1)``, where ``ts_day`` is the event's age in
    days relative to the latest timestamp in ``df`` and ``type_score`` is
    1 / 3 / 6 for clicks / carts / orders. The per-event values are summed
    per (session, aid) into ``interaction_timing_decay_score``.

    The reference timestamp is taken from ``df`` itself. It used to be read
    from the global ``df_train_interaction``, which made the function fail on
    a fresh kernel and, for any frame containing timestamps later than the
    train maximum, produced negative ages and therefore zero or negative
    decay denominators.

    Args:
        df: Event frame with ``session``, ``aid``, ``ts`` and ``type``
            columns; a pandas DataFrame is converted to polars first.
        prefix: Sub-directory of ``input/feature`` that receives the parquet
            file.

    Returns:
        The aggregated polars DataFrame, which is also written to
        ``interaction_feature_3.parquet``.
    """
    if isinstance(df, pd.DataFrame):
        df = pl.DataFrame(df)

    def func(x):
        # Weight of one event by type code; unknown codes yield None.
        if x == TYPE_MAP["clicks"]:
            return 1
        elif x == TYPE_MAP["orders"]:
            return 6
        elif x == TYPE_MAP["carts"]:
            return 3

    # Latest timestamp of the frame being processed, so that ts_day >= 0.
    ts_maximal = df["ts"].max()
    df = df.with_columns([
        # Age of the event in (fractional) days; 0 for the newest event.
        (-(pl.col("ts")-ts_maximal)/(24*60*60)).alias("ts_day")
    ]).with_columns([
        # Hyperbolic decay: 10 for the newest event, shrinking with age.
        (10/(pl.col("ts_day")+1)).alias("ts_day_score"),
        pl.col("type").apply(func).alias("type_score")
    ]).with_columns([
        (pl.col("type_score")*pl.col("ts_day_score")).alias("ts_day_type_score")
    ]).groupby(["session", "aid"]).agg([
        pl.col("ts_day_type_score").sum().alias("interaction_timing_decay_score")
    ])
    df.to_pandas().to_parquet(
        os.path.join(DATA_PATH, f"input/feature/{prefix}/interaction_feature_3.parquet")
    )
    return df

def interaction_feature_4(df, prefix):
    """Count events per type, and in total, for every (session, aid) pair.

    Args:
        df: Event frame with ``session``, ``aid`` and ``type`` columns; a
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory of ``input/feature`` that receives the parquet
            file.

    Returns:
        A polars DataFrame with the orders / clicks / carts counts and
        ``interaction_behavior_count``, which is also written to
        ``interaction_feature_4.parquet``.
    """
    frame = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df

    event_type = pl.col("type")
    # One boolean-sum counter per event type, in the original column order.
    aggregations = [
        (event_type == TYPE_MAP[name]).sum().alias(f"interaction_{name}_count")
        for name in ("orders", "clicks", "carts")
    ]
    # Counts rows whose type code is below 5; TYPE_MAP only defines codes 0-2.
    aggregations.append((event_type < 5).sum().alias("interaction_behavior_count"))

    counts = frame.groupby(["session", "aid"]).agg(aggregations)

    out_path = os.path.join(DATA_PATH, f"input/feature/{prefix}/interaction_feature_4.parquet")
    counts.to_pandas().to_parquet(out_path)
    return counts

def interaction_feature_5(df, prefix):
    """Measure how long each (session, aid) interaction spans, in days.

    Args:
        df: Event frame with ``session``, ``aid`` and ``ts`` columns; a
            pandas DataFrame is converted to polars first.
        prefix: Sub-directory of ``input/feature`` that receives the parquet
            file.

    Returns:
        A polars DataFrame whose ``interaction_behavor_period`` column is
        ``(max ts - min ts) / 86400`` per group; it is also written to
        ``interaction_feature_5.parquet``.
    """
    frame = pl.DataFrame(df) if isinstance(df, pd.DataFrame) else df

    timestamp = pl.col("ts")
    span_in_days = (timestamp.max() - timestamp.min()) / (24*60*60)
    periods = frame.groupby(["session", "aid"]).agg([
        span_in_days.alias("interaction_behavor_period")
    ])

    out_path = os.path.join(DATA_PATH, f"input/feature/{prefix}/interaction_feature_5.parquet")
    periods.to_pandas().to_parquet(out_path)
    return periods

In [None]:
# Full set of interaction feature builders, applied in this order.
# NOTE: the next cell rebinds `pipeline` to a subset, so this list is only
# used if that cell is skipped.
pipeline = [
    interaction_feature_1,
    interaction_feature_2,
    interaction_feature_3,
    interaction_feature_4,
    interaction_feature_5,
]

In [4]:
# Active pipeline: only features 3-5 are built by the loops below.
# NOTE(review): features 1 and 2 are left out here — presumably on purpose;
# confirm before relying on their parquet files being up to date.
pipeline = [
    interaction_feature_3,
    interaction_feature_4,
    interaction_feature_5,
]

# Build

## Train

In [5]:
%%time

# Parquet shards under input/otto-validation/test_parquet, sorted by path.
valid_a_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-validation/test_parquet/*.parquet")
))

# Load every shard with read_parquet and stack them into one pandas frame.
df_train_interaction = pd.concat([read_parquet(i) for i in valid_a_path])
# df_train_interaction = pl.DataFrame(df_train_interaction)
print(f"df_train_interaction size: {len(df_train_interaction)}")
df_train_interaction.head(5)

df_train_interaction size: 7683577
CPU times: user 1.43 s, sys: 221 ms, total: 1.66 s
Wall time: 1.33 s


Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0


In [6]:
# Run each feature builder on the train events; every builder writes its
# parquet file under input/feature/train. The returned frames are discarded.
for p in tqdm(pipeline):
    p(df_train_interaction, "train")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.41s/it]


## Test

In [7]:
%%time

# Parquet shards under input/otto-chunk-data-inparquet-format/test_parquet,
# sorted by path.
kaggle_test_path = sorted(glob.glob(
    os.path.join(DATA_PATH, "input/otto-chunk-data-inparquet-format/test_parquet/*.parquet")
))

# Load every shard with read_parquet and stack them into one pandas frame.
df_test_interaction = pd.concat([read_parquet(i) for i in kaggle_test_path])
# df_test_interaction = pl.DataFrame(df_test_interaction)
df_test_interaction.shape

CPU times: user 1.78 s, sys: 238 ms, total: 2.02 s
Wall time: 1.95 s


(6928123, 4)

In [8]:
# Run each feature builder on the test events; every builder writes its
# parquet file under input/feature/test. The returned frames are discarded.
for p in tqdm(pipeline):
    p(df_test_interaction, "test")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.57s/it]


# Todo

In [None]:
def ts_day(df):
    """Add a `ts_day` column: whole days elapsed since the earliest `ts`.

    The frame is modified in place and also returned.
    """
    seconds_per_day = 24*60*60
    elapsed = df["ts"].sub(df["ts"].min())
    df["ts_day"] = elapsed.floordiv(seconds_per_day)
    return df

def add_session_length(df):
    """Add `session_length`: the number of non-null `ts` rows in each session.

    The frame is modified in place and also returned.
    """
    ts_by_session = df.groupby('session')['ts']
    # transform broadcasts the per-session count back onto every row.
    df['session_length'] = ts_by_session.transform('count')
    return df

def add_action_num_reverse_chrono(df):
    """Add `action_num_reverse_chrono`: rows remaining after this one in its session.

    Requires the `session_length` column. The last row of a session (in frame
    order) gets 0. The frame is modified in place and also returned.
    """
    # Zero-based position of each row inside its session, in frame order.
    position_in_session = df.groupby('session').cumcount()
    df['action_num_reverse_chrono'] = df['session_length'] - position_in_session - 1
    return df

def add_log_recency_score(df):
    """Add `log_recency_score`, growing from the first to the last row of a session.

    The row position is mapped linearly onto [0.1, 1] and passed through
    ``2 ** x - 1``. Requires `session_length` and `action_num_reverse_chrono`.
    The frame is modified in place and also returned.
    """
    session_length = df['session_length']
    # Zero-based position counted from the start of the session.
    position = session_length - df['action_num_reverse_chrono'] - 1
    step = (1-0.1) / (session_length-1)
    exponent = 0.1 + step * position
    # Single-row sessions give inf * 0 = NaN above; they receive the top score.
    df['log_recency_score'] = (2 ** exponent - 1).fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    """Add `type_weighted_log_recency_score`: the recency score divided by a per-type weight.

    Requires `log_recency_score` and `type`. Type codes missing from the
    weight table produce NaN. The frame is modified in place and also returned.
    """
    # NOTE(review): code 1 maps to 6 and code 2 to 3 here, while
    # interaction_feature_1 weights carts (code 1) as 3 and orders (code 2)
    # as 6, and the score is divided rather than multiplied — confirm intent.
    type_weights = {0:1, 1:6, 2:3, 3:0.1}
    divisor = df['type'].map(type_weights)
    df['type_weighted_log_recency_score'] = df['log_recency_score'].div(divisor)
    return df



# Draft pandas pipeline. Order matters: add_action_num_reverse_chrono reads
# `session_length`, add_log_recency_score reads both of those columns, and
# add_type_weighted_log_recency_score reads `log_recency_score`.
# NOTE: this rebinds the `pipeline` name used by the feature-building loops.
pipeline = [
    ts_day,
    add_session_length,
    add_action_num_reverse_chrono,
    add_log_recency_score,
    add_type_weighted_log_recency_score
]