In [2]:
from pathlib import Path
import pandas as pd
from rectools import Columns
import numpy as np
import logging

from rectools.models.sasrec import save_pickle 

from rectools.models.sasrec import (
    SASRecConfig,
    SASRecProcessorConfig,
    TrainPreprocessingConfig,
    TrainConfig,
    ItemModelConfig,
    SequenceTaskConverterConfig,
    train_sasrec_script,
    SASRecRecommeder
)

from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.dataset import Dataset

In [3]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

logger = logging.getLogger()

# Data

In [3]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 241 ms, sys: 91.3 ms, total: 332 ms
Wall time: 30.3 s


In [26]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

# Split dataset

In [27]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

# leave item features for items only from train
items = train["item_id"].drop_duplicates().to_list()
users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
test_users = test[Columns.User].unique().astype(str)

item_features = pd.DataFrame()
item_features["item_id"] = train["item_id"].copy().drop_duplicates()
item_features["value"] = ""
item_features["feature"] = "tags_set"

catalog=train[Columns.Item].unique().astype(str)


In [28]:
dataset = Dataset.construct(
    interactions_df=train,
    item_features_df=item_features,
    cat_item_features=["tags_set"]
)

# Specify configs

In [29]:
hidden_units=128
session_maxlen=32


item_model_config = ItemModelConfig(
    name="idemb",
    hidden_units=hidden_units,
)

model_cfg = SASRecConfig(
    maxlen=session_maxlen,
    hidden_units=hidden_units,  # 50
    num_blocks=2,
    num_heads=1,
    dropout_rate=0.2,
    use_pos_emb=True,
    use_sm_head=True,
    item_model=item_model_config,
)

# TODO reused in train config
processor_config = SASRecProcessorConfig(
    session_maxlen=session_maxlen,
    enable_item_features=False,  # True,
    # item_tags_maxlen=64,
)

train_preprocessing_config = TrainPreprocessingConfig(
    min_item_freq=20,  # 1
    min_user_freq=2,  # 2 is minimal to generate target
    # keep_tags_types=["genres_"],
    keep_tags_types=[],
)

train_config = TrainConfig(
    lr=1e-3,
    batch_size=128,
    epochs=5,
    l2_emb=0.0,
    device="cuda:1",
    # device="cpu",
    negative_samples=0,
    loss="sm_ce",
    processor_config=processor_config,
)

task_converter_config = SequenceTaskConverterConfig()

# Train model

In [31]:
from rectools.models.sasrec import run_train_script
model_dir = "rectools/models/sasrec_test_model"

def train_sasrec(
        dataset: Dataset,
        processor_config: SASRecProcessorConfig,
        model_config: SASRecConfig,
        train_config: TrainConfig,
        train_preprocessing_config: TrainPreprocessingConfig,
        task_converter_config: SequenceTaskConverterConfig,
):
    train = dataset.get_raw_interactions()
    train["user_id"] = train["user_id"].astype(str)
    train["item_id"] = train["item_id"].astype(str)
    item_features = pd.DataFrame()
    item_features["item_id"] = train["item_id"].copy().drop_duplicates().astype(str)
    item_features["tags_set"] = ""
    train.rename(columns={"datetime": "first_intr_dt", "weight": "score"}, inplace=True)

    if task_converter_config is None:
        task_converter_config = SequenceTaskConverterConfig() # TODO do we need it?

    logger.info("running training script")
    run_train_script(
        user_item_interactions=train,
        item_features=item_features,
        user_features=None,
        processor_config=processor_config,
        model_config=model_config,
        train_config=train_config,
        train_preprocessing_config=train_preprocessing_config,
        task_converter_config=task_converter_config,
        model_dir=model_dir,
    )

In [32]:
train_sasrec(dataset, processor_config, model_cfg, train_config, train_preprocessing_config, task_converter_config)

INFO:root:running training script
INFO:rectools.models.sasrec:testing dataset
INFO:rectools.models.sasrec:converting datasets to task format
INFO:rectools.models.sasrec:sessions lens: 0.95q: 24.0; 0.5q: 4.0
INFO:rectools.models.sasrec:building preprocessor
INFO:rectools.models.sasrec:building train dataset
INFO:rectools.models.sasrec:building model
INFO:rectools.models.sasrec:building trainer
INFO:rectools.models.sasrec:used sm_ce loss
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.0.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.0.bias with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.1.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.s

# Test model

In [10]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)


In [33]:
from rectools import AnyIds
import typing as tp

def recommend(
      dataset: Dataset,
      k: int,
      items_to_recommend: tp.Optional[AnyIds] = None,
):
      train = dataset.get_raw_interactions()

      train["user_id"] = train["user_id"].astype(str)
      train["item_id"] = train["item_id"].astype(str)
      item_features = pd.DataFrame()
      item_features["item_id"] = train["item_id"].copy().drop_duplicates().astype(str)
      item_features["tags_set"] = ""
      train.rename(columns={"datetime": "first_intr_dt", "weight": "score"}, inplace=True)

      test["user_id"] = test["user_id"].astype(str)
      test["item_id"] = test["item_id"].astype(str)

      rec_df = train[train["user_id"].isin(users)]
      recommender = SASRecRecommeder.load(model_dir)

      recs = recommender.recommend(
            user_item_interactions=rec_df, 
            user_features=users, 
            item_features=item_features,
            top_k=k, 
            candidate_items=items_to_recommend, 
      )
      return recs

In [34]:
candidate_items = interactions['item_id'].drop_duplicates().astype(str)
recs = recommend(test_users, dataset, k=10, items_to_recommend=candidate_items)

INFO:rectools.models.sasrec:sessions lens: 0.95q: 41.0; 0.5q: 7.0
100%|██████████| 740/740 [00:08<00:00, 84.37it/s]
100%|██████████| 94705/94705 [00:02<00:00, 31979.40it/s]


In [35]:
calc_metrics(metrics, recs, test, train, catalog)

{'MAP@1': 0.048131569733957345,
 'MAP@5': 0.0821316195172286,
 'MAP@10': 0.09132761212357746,
 'MIUF@1': 18.824620072061013,
 'MIUF@5': 18.824620072061013,
 'MIUF@10': 18.824620072061013,
 'Serendipity@1': 0.09934005596325432,
 'Serendipity@5': 0.06045158580034103,
 'Serendipity@10': 0.04451197076609255}