In [14]:
from pathlib import Path
import pandas as pd
from rectools import Columns
import numpy as np
import os

In [15]:
import pandas as pd
import logging

from rectools.models.sasrec import save_pickle 

from rectools.models.sasrec import (
    SASRecConfig,
    SASRecProcessorConfig,
    TrainPreprocessingConfig,
    TrainConfig,
    ItemModelConfig,
    SequenceTaskConverterConfig,
    train_sasrec_script,
    SASRecRecommeder
)

from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity

In [16]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

logger = logging.getLogger()

# Data

In [17]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 1.12 s, sys: 573 ms, total: 1.69 s
Wall time: 59.5 s


In [18]:
DATA_PATH = Path("data_original")

user_features = pd.read_csv(DATA_PATH / 'users.csv')
item_features = pd.read_csv(DATA_PATH / 'items.csv')
user_item_interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "first_intr_dt", "total_dur": "score"})
)

In [19]:
user_features["user_id"] = user_features["user_id"].astype(str)
item_features["item_id"] = item_features["item_id"].astype(str)
user_item_interactions["user_id"] = user_item_interactions["user_id"].astype(str)
user_item_interactions["item_id"] = user_item_interactions["item_id"].astype(str)

# Split dataset

In [20]:
train_ds_path = "rectools/models/train_interactions_splitted_wtg.pkl"
test_ds_path="rectools/models/test_interactions_splitted_wtg.pkl"

user_item_interactions[Columns.Weight] = np.where(user_item_interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = user_item_interactions["first_intr_dt"].max()
train_df = user_item_interactions[user_item_interactions["first_intr_dt"] < max_date - pd.Timedelta(days=7)].copy()
test_df = user_item_interactions[user_item_interactions["first_intr_dt"] >= max_date - pd.Timedelta(days=7)].copy()
train_df.drop(train_df.query("score < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train_df['item_id'].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train_df = train_df[train_df['item_id'].isin(items)]
    
# drop users with less than 2 interactions in train
users = train_df['user_id'].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train_df = train_df[(train_df['user_id'].isin(users)) & (train_df['item_id'].isin(items))]

# leave item features for items only from train
items = train_df['item_id'].drop_duplicates().to_list()
users = train_df['user_id'].drop_duplicates().to_list()
item_features_train = item_features[item_features['item_id'].isin(items)]

# drop cold users from test
cold_users = set(test_df[Columns.User]) - set(train_df[Columns.User])
test_df.drop(test_df[test_df[Columns.User].isin(cold_users)].index, inplace=True)
test_users = test_df[Columns.User].unique()

train_ds = (train_df, pd.DataFrame(), pd.DataFrame())
test_ds = (test_df, pd.DataFrame(), pd.DataFrame())

save_pickle(train_ds, train_ds_path)
save_pickle(test_ds, test_ds_path)


In [21]:
savepath = "rectools/models/kion_pers_recs_item_features_test.parquet"
item_features_train["tags_set"] = ''
item_features_train = item_features_train[['item_id', 'tags_set']].copy().drop_duplicates()
os.makedirs(os.path.dirname(savepath), exist_ok=True)
item_features_train.to_parquet(savepath)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_features_train["tags_set"] = ''


In [22]:
savepath = "rectools/models/kion_pers_recs_item_features_test.parquet"
item_features["tags_set"] = ''
item_features = item_features[['item_id', 'tags_set']].copy().drop_duplicates()
# os.makedirs(os.path.dirname(savepath), exist_ok=True)
# item_features.to_parquet(savepath)

# Specify configs

In [23]:
hidden_units=128
session_maxlen=32


item_model_config = ItemModelConfig(
    name="idemb",
    hidden_units=hidden_units,
)

model_cfg = SASRecConfig(
    maxlen=session_maxlen,
    hidden_units=hidden_units,  # 50
    num_blocks=2,
    num_heads=1,
    dropout_rate=0.2,
    use_pos_emb=True,
    use_sm_head=True,
    item_model=item_model_config,
)

# TODO reused in train config
processor_config = SASRecProcessorConfig(
    session_maxlen=session_maxlen,
    enable_item_features=False,  # True,
    # item_tags_maxlen=64,
)

train_preprocessing_config = TrainPreprocessingConfig(
    min_item_freq=20,  # 1
    min_user_freq=2,  # 2 is minimal to generate target
    # keep_tags_types=["genres_"],
    keep_tags_types=[],
)

train_config = TrainConfig(
    lr=1e-3,
    batch_size=128,
    epochs=5,
    l2_emb=0.0,
    device="cuda:1",
    # device="cpu",
    negative_samples=0,
    loss="sm_ce",
    processor_config=processor_config,
)

task_converter_config = SequenceTaskConverterConfig()

# Train model

In [24]:
model_dir = "rectools/models/sasrec_test_model"

train_sasrec_script(
    train_ds_path="rectools/models/train_interactions_splitted_wtg.pkl",
    item_features_path="rectools/models/kion_pers_recs_item_features_test.parquet",
    model_dir=model_dir,
    processor_config=processor_config,
    model_config=model_cfg,
    train_config=train_config,
    train_preprocessing_config=train_preprocessing_config,
    task_converter_config=task_converter_config,
)

INFO:rectools.models.sasrec:loading datasets
INFO:rectools.models.sasrec:running training script
INFO:rectools.models.sasrec:testing dataset
INFO:rectools.models.sasrec:converting datasets to task format
INFO:rectools.models.sasrec:sessions lens: 0.95q: 24.0; 0.5q: 4.0
INFO:rectools.models.sasrec:building preprocessor
INFO:rectools.models.sasrec:building train dataset
INFO:rectools.models.sasrec:building model
INFO:rectools.models.sasrec:building trainer
INFO:rectools.models.sasrec:used sm_ce loss
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.0.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.0.bias with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.1.weight with xavier: Fan in and fan out can not be computed

# Test model

In [25]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)


In [26]:
rec_df = train_df[train_df['user_id'].isin(np.unique(test_df['user_id']))]

In [27]:
recommender = SASRecRecommeder.load(model_dir)
x = rec_df
y = test_df
candidate_items = item_features['item_id']

pred_top_k = 10
recs = recommender.recommend(
    user_item_interactions=x, 
    user_features=user_features, 
    item_features=item_features,
    top_k=pred_top_k, 
    candidate_items=candidate_items, 
)



INFO:rectools.models.sasrec:sessions lens: 0.95q: 41.0; 0.5q: 7.0
100%|██████████| 740/740 [00:10<00:00, 71.10it/s]
100%|██████████| 94705/94705 [00:03<00:00, 25686.73it/s]


In [28]:
logger.info("computing metrics")
catalog=train_df[Columns.Item].unique().astype(str)
calc_metrics(metrics, recs, test_df, train_df, catalog)

INFO:root:computing metrics


{'MAP@1': 0.03996276581801799,
 'MAP@5': 0.06481679718888116,
 'MAP@10': 0.07257142857770682,
 'MIUF@1': 3.3982965266764427,
 'MIUF@5': 4.820015979809291,
 'MIUF@10': 5.19301814529131,
 'Serendipity@1': 0.0003231127097184948,
 'Serendipity@5': 0.0002893187185587979,
 'Serendipity@10': 0.0002688154142529778}