In [14]:
from datetime import timedelta

import implicit
import pandas as pd
import polars as pl
from lightfm import LightFM
from lightfm.data import Dataset
from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    EASEModel,
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    ImplicitItemKNNWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
)
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm

from tools import generate_lightfm_recs_mapper, load_data_actions, recall_at

# First Stage
# Notebook-wide configuration.
SEED = 42
top_N = 40  # shared cut-off: passed as K to the BM25 model and as k to model.recommend
DATA_DIR = 'data/'

# DATA_DIR already ends with '/', so file names are appended directly;
# inserting another '/' would produce 'data//<file>' paths.
df_test_users = pl.read_parquet(f'{DATA_DIR}test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}clickstream.pq')
df_event = pl.read_parquet(f'{DATA_DIR}events.pq')

# Split the raw logs into the training interactions and the evaluation frame.
# NOTE(review): the exact split rule lives in tools.load_data_actions - confirm there.
df_train, df_eval = load_data_actions(df_clickstream, df_event)

def dataframe2rectools(df):
    """Convert a polars interactions frame into the pandas layout RecTools expects.

    Keeps the ``cookie``, ``node`` and ``event_date`` columns, adds a constant
    ``weight`` of 1 to every row, renames the four columns to the RecTools
    ``Columns`` names and returns the result as a pandas DataFrame.
    """
    rectools_names = {
        "cookie": Columns.User,
        "node": Columns.Item,
        "weight": Columns.Weight,
        "event_date": Columns.Datetime,
    }
    interactions = df.select(["cookie", "node", "event_date"])
    # Every interaction counts equally.
    interactions = interactions.with_columns(pl.lit(1).alias('weight'))
    return interactions.rename(rectools_names).to_pandas()
# Replace the training split with its pandas / RecTools-column version.
df_train = dataframe2rectools(df_train)

In [15]:
# Build the RecTools dataset from interactions only: no user features are
# supplied, and item features are not passed either.
dataset = RTDataset.construct(
    interactions_df=df_train, user_features_df=None, cat_user_features=None
)

In [16]:
%%time
model = ImplicitItemKNNWrapperModel(implicit.nearest_neighbours.BM25Recommender(K=top_N))

CPU times: user 11 μs, sys: 68 μs, total: 79 μs
Wall time: 83.9 μs


In [17]:
# Fit the wrapped BM25 item-kNN model on the interactions dataset built above.
model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x4e02a4850>

In [18]:
# Recommend up to top_N not-yet-viewed nodes for every cookie in the evaluation frame.
recs_eval = model.recommend(
    users=list(df_eval["cookie"].unique()),
    dataset=dataset,
    k=top_N,
    filter_viewed=True,
)
# Convert the RecTools output to the polars (cookie, node) layout that is
# passed to recall_at and exported to CSV. Columns.User / Columns.Item are the
# same constants dataframe2rectools renames to, so the two stay in sync.
df_pred = pl.DataFrame(
    recs_eval[[Columns.User, Columns.Item]],
    schema={Columns.User: pl.Int64, Columns.Item: pl.Int64},
).rename({Columns.User: "cookie", Columns.Item: "node"})
# Evaluate at the same cut-off that was used to generate the recommendations.
recall_at(df_eval, df_pred, k=top_N)

0.08697969190158865

In [20]:
# Persist the first-stage candidates: at most top_N nodes per cookie.
# maintain_order=True keeps cookies in their order of appearance in df_pred,
# so the CSV row order is reproducible between runs.
# NOTE: the file name encodes the cut-off (40); keep it in sync with top_N.
(
    df_pred
    .group_by("cookie", maintain_order=True)
    .head(top_N)
    .select(["cookie", "node"])
    .write_csv('results/first_stage_prediction_BM25_40.csv')
)

# Submission

In [21]:
# Submission stage: rebuild the interactions from the complete clickstream.
# This rebinds df_train and dataset, replacing the first-stage objects.
df_train = dataframe2rectools(df_clickstream)
dataset = RTDataset.construct(
    interactions_df=df_train, user_features_df=None, cat_user_features=None
)


In [22]:
%%time
model = ImplicitItemKNNWrapperModel(implicit.nearest_neighbours.BM25Recommender(K=top_N))
model.fit(dataset)

CPU times: user 34.6 s, sys: 353 ms, total: 35 s
Wall time: 35.2 s


<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x4e0639390>

In [23]:
# Recommend up to top_N not-yet-viewed nodes for every cookie in the test-user list.
test_cookies = list(df_test_users["cookie"].unique())
df_pred = model.recommend(
    users=test_cookies, dataset=dataset, k=top_N, filter_viewed=True
)

In [None]:
# Keep only the user/item columns as Int64 in a polars frame and rename them
# to cookie/node. This cell consumes the recommend() output held in df_pred,
# so it must be run exactly once after the recommend cell.
reco_schema = {"user_id": pl.Int64, "item_id": pl.Int64}
reco_renames = {"user_id": "cookie", "item_id": "node"}
df_pred = pl.DataFrame(df_pred[list(reco_schema)], schema=reco_schema).rename(reco_renames)

In [None]:
# Persist the test-user candidates: at most top_N nodes per cookie.
# maintain_order=True keeps cookies in their order of appearance in df_pred,
# so the CSV row order is reproducible between runs.
# NOTE: the file name encodes the cut-off (40); keep it in sync with top_N.
(
    df_pred
    .group_by("cookie", maintain_order=True)
    .head(top_N)
    .select(["cookie", "node"])
    .write_csv('data/test_hybrid_stage_candidates_BM25_40.csv')
)