In [1]:
import os
import time
import glob
import gc
import pickle
import warnings
from multiprocessing import Pool

from tqdm import tqdm
import pandas as pd
import polars as pl
import numpy as np
import xgboost as xgb
# import cudf
from sklearn.model_selection import GroupKFold
import joblib
# from openfe import openfe, transform

warnings.filterwarnings("ignore")

In [2]:
# Root data directory. Set the OTTO_DATA_PATH environment variable to run the
# notebook on another machine; the original location remains the default.
DATA_PATH = os.environ.get(
    "OTTO_DATA_PATH",
    "/home/search3/lichunyu/otto-recommender-system/data",
)
# Integer code for each event type (applied to the `type` column in read_parquet).
TYPE_MAP = {'clicks': 0, 'carts': 1, 'orders': 2}
# NOTE(review): TOPN, DOWNSAMPLE_RATE and VALID_DATA_RATIO are not referenced
# by the cells visible here -- confirm whether they are still needed.
TOPN = 50
DOWNSAMPLE_RATE = 20
VALID_DATA_RATIO = 0.2
# Event type being ensembled; selects both the input files and the rows kept.
MODEL_TYPE = "clicks"

def read_parquet(f):
    """Read one parquet file and compact its `ts` and `type` columns.

    `ts` is divided by 1000 and stored as int32; `type` is translated through
    TYPE_MAP and stored as int8. Returns the resulting pandas DataFrame.
    """
    frame = pd.read_parquet(f)
    scaled_ts = frame.ts / 1000
    frame['ts'] = scaled_ts.astype('int32')
    type_codes = frame['type'].map(TYPE_MAP)
    frame['type'] = type_codes.astype('int8')
    return frame

In [3]:
# Collect every partial submission written for the selected event type,
# ordered lexicographically by file name.
submission_pattern = os.path.join(DATA_PATH, f"output/submission_part_{MODEL_TYPE}_n_es*.csv")
paths = glob.glob(submission_pattern)
paths.sort()
paths

['/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es100.csv',
 '/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es150.csv',
 '/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es200.csv',
 '/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es250.csv',
 '/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es300.csv',
 '/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es350.csv',
 '/home/search3/lichunyu/otto-recommender-system/data/output/submission_part_clicks_n_es400.csv']

In [4]:
def read_sub(path, weight=1, contains_str=""):
    """Load one submission CSV and explode it to one row per (session, aid).

    Parameters
    ----------
    path : str
        CSV file with a `session_type` column and a space-separated `labels`
        column.
    weight : int, default 1
        Vote given to every prediction of this submission. With the default of
        1 for every file the ensemble is a plain vote count. The value is
        stored as UInt8, so only integers that fit in that type are preserved.
    contains_str : str, default ""
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    pl.DataFrame
        Columns `session_type`, `aid` (UInt32) and `vote` (UInt8), restricted
        to rows whose `session_type` ends with `_{MODEL_TYPE}`.
    """
    return (
        pl.read_csv(path)
            # Anchored with `$`, like the filter applied to the base
            # submission when the final file is assembled.
            .filter(pl.col("session_type").str.contains(f"_{MODEL_TYPE}$"))
            .with_columns([
                pl.col('labels').str.split(by=' '),
                # Small unsigned dtypes keep the exploded frame compact.
                pl.lit(weight).cast(pl.UInt8).alias('vote'),
            ])
            .explode('labels')
            .rename({'labels': 'aid'})
            .with_columns([pl.col('aid').cast(pl.UInt32)])
    )

In [5]:
# Load every submission with the default weight of 1 (plain vote ensemble).
subs = list(map(read_sub, paths))
subs[0].head()

session_type,aid,vote
str,u32,u8
"""12899779_click...",59625,1
"""12899779_click...",1253524,1
"""12899779_click...",737445,1
"""12899779_click...",731692,1
"""12899779_click...",438191,1


In [6]:
# Outer-join all submissions on (session_type, aid). The first frame keeps its
# `vote` column; the frame at position N contributes `vote_rightN`. A pair
# that is missing from a submission gets a null in that submission's column.
merged = None
for position, frame in enumerate(subs):
    if merged is None:
        merged = frame
    else:
        merged = merged.join(
            frame,
            how='outer',
            on=['session_type', 'aid'],
            suffix=f'_right{position}',
        )
subs = merged
subs.head()

session_type,aid,vote,vote_right1,vote_right2,vote_right3,vote_right4,vote_right5,vote_right6
str,u32,u8,u8,u8,u8,u8,u8,u8
"""12899779_click...",59625,1,1,1,1,1,1,1
"""12899779_click...",1253524,1,1,1,1,1,1,1
"""12899779_click...",737445,1,1,1,1,1,1,1
"""12899779_click...",731692,1,1,1,1,1,1,1
"""12899779_click...",438191,1,1,1,1,1,1,1


In [7]:
# Per-submission vote columns produced by the join: `vote` from the first
# submission plus one `vote_right{idx}` for each additional one. Discovering
# them from the frame keeps this cell working when the number of input files
# changes, instead of hard-coding exactly seven columns.
vote_cols = [name for name in subs.columns if name == 'vote' or name.startswith('vote_right')]
if not vote_cols:
    raise ValueError("no vote columns found in `subs`; re-run the join cell first")

# Build vote + vote_right1 + ... as a single expression.
total_votes = pl.col(vote_cols[0])
for name in vote_cols[1:]:
    total_votes = total_votes + pl.col(name)

subs = (subs
    # (session, aid) pairs absent from a submission count as 0 votes.
    .fill_null(0)
    # Votes are UInt8, so the total is only meaningful while it fits in that type.
    .with_columns([total_votes.alias('vote_sum')])
    .drop(vote_cols)
    # Ascending sort followed by reverse puts the highest totals first.
    .sort(by='vote_sum')
    .reverse()
)

subs.head()

session_type,aid,vote_sum
str,u32,u8
"""14571581_click...",579690,7
"""14571581_click...",754412,7
"""14571581_click...",1551213,7
"""14571581_click...",1764910,7
"""14571581_click...",984794,7


In [8]:
def _join_aids(aids):
    """Render a sequence of aids as one space-separated string."""
    return ' '.join(map(str, aids))

# Keep the first 20 aids of every session in the current row order of `subs`,
# then serialise each list into the space-separated `labels` format.
top_aids = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])
preds = top_aids.with_column(pl.col('labels').apply(_join_aids))
preds.head()

session_type,labels
str,str
"""13869315_click...","""471073 579690 ..."
"""13353257_click...","""579690 1551213..."
"""13255544_click...","""471073 579690 ..."
"""13734976_click...","""717965 471073 ..."
"""13294045_click...","""245725 216829 ..."


In [9]:
# Convert the ensembled predictions to pandas. The guard lets this cell be
# re-run: without it a second run would call .to_pandas() on a pandas frame.
if isinstance(preds, pl.DataFrame):
    preds = preds.to_pandas()

# NOTE(review): these relative paths presumably resolve to the same directory
# as DATA_PATH/output -- confirm before unifying them.
base_submission = pd.read_csv("../data/output/submission_583.csv")

# Keep the base submission's rows for the other event types and replace its
# `_{MODEL_TYPE}` rows with the ensembled predictions.
other_types = base_submission[~base_submission.session_type.str.contains(f"_{MODEL_TYPE}$")]
# ignore_index avoids duplicate index labels in the combined frame; the CSV is
# written without the index either way.
df_submission = pd.concat([preds, other_types], ignore_index=True)
df_submission.to_csv(f"../data/output/submission_ensemble_{MODEL_TYPE}.csv", index=False)
df_submission.shape

(5015409, 2)