In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from rectools import Columns

In [2]:
import pandas as pd

from rectools.models.sasrec import load_pickle, save_pickle 

from rectools.models.sasrec import (
    SASRecConfig,
    SASRecProcessorConfig,
    TrainPreprocessingConfig,
    TrainConfig,
    ItemModelConfig,
    SequenceTaskConverterConfig,
    TargetTransform,
    train_sasrec_script,
    run_test_sasrec_script,
    SASRecRecommeder
)
from datetime import date, timedelta
from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.dataset import Dataset

# Data

In [3]:
%%time
# Fetch the KION dataset archive from a URL pinned to a specific commit,
# unpack it (-o overwrites files that already exist), then delete the archive.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 400 ms, sys: 131 ms, total: 532 ms
Wall time: 55.2 s


In [4]:
# Folder created by unpacking the downloaded archive.
DATA_PATH = Path("data_original")

# User and item side tables are read as-is.
user_features, item_features = (
    pd.read_csv(DATA_PATH / file_name) for file_name in ('users.csv', 'items.csv')
)

# Interaction log: parse the watch date while reading, then rename the date and
# duration columns to the names the rest of the notebook works with.
user_item_interactions = pd.read_csv(
    DATA_PATH / 'interactions.csv',
    parse_dates=["last_watch_dt"],
)
user_item_interactions = user_item_interactions.rename(
    columns={"last_watch_dt": "first_intr_dt", "total_dur": "score"}
)

In [5]:
# Cast the id columns of all three tables to str.
user_features = user_features.astype({"user_id": str})
item_features = item_features.astype({"item_id": str})
user_item_interactions = user_item_interactions.astype({"user_id": str, "item_id": str})

# Split dataset

In [6]:
# df = user_item_interactions.copy()
# intr = user_item_interactions
# test_ratio = 0.1
# SPLIT_KEY = "user_id"
# x_min_score = 0
# y_pos_watch_seconds_th = 300

# train_ds_path = "rectools/models/train_interactions_splitted_wtg.pkl"
# test_ds_path="rectools/models/test_interactions_splitted_wtg.pkl"


# assert 0 <= test_ratio <= 1

# users = df[SPLIT_KEY].unique()
# test_users_cnt = int(len(users) * test_ratio)

# users = np.random.permutation(users)
# test_users, train_users = users[:test_users_cnt], users[test_users_cnt:]

# train_df = df[df[SPLIT_KEY].isin(train_users)]
# test_df = df[df[SPLIT_KEY].isin(test_users)]

# train_intr = train_df
# test_intr = test_df
# assert train_intr["user_id"].isin(test_intr["user_id"]).any() == False

# train_item_candidates = train_intr["item_id"].drop_duplicates().to_list()
# test_item_candidates = intr["item_id"].drop_duplicates().to_list()

# max_date = user_item_interactions["first_intr_dt"].max()
# test_start_date = max_date - pd.Timedelta(days=7)

# train_intr_x = train_intr[train_intr["first_intr_dt"].dt.date < test_start_date]
# test_intr_x = test_intr[test_intr["first_intr_dt"].dt.date < test_start_date]

# train_intr_y = train_intr[train_intr["first_intr_dt"].dt.date >= test_start_date]
# test_intr_y = test_intr[test_intr["first_intr_dt"].dt.date >= test_start_date]

# # remove users from y part which did not appear in x part
# train_intr_y = train_intr_y[train_intr_y["user_id"].isin(train_intr_x["user_id"])]
# test_intr_y = test_intr_y[test_intr_y["user_id"].isin(test_intr_x["user_id"])]


# x_target_transform=TargetTransform.THRESHOLD

# train_intr_x = train_intr_x[train_intr_x["score"] > x_min_score]
# test_intr_x = test_intr_x[test_intr_x["score"] > x_min_score]

# train_intr_y = train_intr_y[train_intr_y["score"] > y_pos_watch_seconds_th]
# test_intr_y = test_intr_y[test_intr_y["score"] > y_pos_watch_seconds_th]

# train_ds = (train_intr_x, train_intr_y, train_item_candidates)
# test_ds = (test_intr_x, test_intr_y, test_item_candidates)

# save_pickle(train_ds, train_ds_path)
# save_pickle(test_ds, test_ds_path)

In [7]:
train_ds_path = "rectools/models/train_interactions_splitted_wtg.pkl"
test_ds_path = "rectools/models/test_interactions_splitted_wtg.pkl"

# Interaction weight: 3 when more than 10% of the item was watched, otherwise 1.
# A missing watched_pct compares False and therefore also gets weight 1.
user_item_interactions[Columns.Weight] = np.where(user_item_interactions['watched_pct'] > 10, 3, 1)

# Split to train / test: everything from (latest date - 7 days) onwards is test.
# Both sides use an explicit comparison, so a row with a missing date lands in
# neither part.
max_date = user_item_interactions["first_intr_dt"].max()
test_start_date = max_date - pd.Timedelta(days=7)
train_df = user_item_interactions[user_item_interactions["first_intr_dt"] < test_start_date]
test_df = user_item_interactions[user_item_interactions["first_intr_dt"] >= test_start_date]

# Drop train interactions with score below 300 (score is the renamed total_dur column).
train_df = train_df.drop(index=train_df.query("score < 300").index)

# drop items with less than 20 interactions in train
item_counts = train_df['item_id'].value_counts()
items = item_counts[item_counts >= 20].index.to_list()
train_df = train_df[train_df['item_id'].isin(items)]

# drop users with less than 2 interactions in train
# (counted after the item filter; the item filter itself is not re-applied
# because train_df already contains only the kept items)
user_counts = train_df['user_id'].value_counts()
users = user_counts[user_counts >= 2].index.to_list()
train_df = train_df[train_df['user_id'].isin(users)].copy()

# drop cold users from test (users absent from the filtered train set)
cold_users = set(test_df[Columns.User]) - set(train_df[Columns.User])
test_df = test_df[~test_df[Columns.User].isin(cold_users)]
# NOTE(review): test_users is taken before cold items are removed, so it may
# contain users that end up with no rows in test_df — confirm this is intended.
test_users = test_df[Columns.User].unique()

# drop cold items from test (items absent from the filtered train set)
cold_items = set(test_df[Columns.Item]) - set(train_df[Columns.Item])
test_df = test_df[~test_df[Columns.Item].isin(cold_items)].copy()

# Keep item features only for items that survived the train filtering.
item_features = item_features.loc[item_features[Columns.Item].isin(train_df[Columns.Item])].copy()

# Each dataset is stored as a 3-tuple; only the first slot carries data here.
# Presumably the slots follow an (x, y, item_candidates) layout — TODO confirm
# against the loader used by the training script.
train_ds = (train_df, pd.DataFrame(), pd.DataFrame())
test_ds = (test_df, pd.DataFrame(), pd.DataFrame())

# Create the output folder up front: it is otherwise only created by a later
# cell, so on a fresh checkout it may not exist yet when the pickles are written.
for ds_path in (train_ds_path, test_ds_path):
    os.makedirs(os.path.dirname(ds_path), exist_ok=True)

save_pickle(train_ds, train_ds_path)
save_pickle(test_ds, test_ds_path)


In [8]:
savepath = "rectools/models/kion_pers_recs_item_features_test.parquet"

# Reduce the item table to one row per item id and attach an empty-string
# `tags_set` column next to it.
item_features = item_features[['item_id']].drop_duplicates()
item_features["tags_set"] = ''

# Ensure the target folder exists, then write the table as parquet.
output_dir = os.path.dirname(savepath)
os.makedirs(output_dir, exist_ok=True)
item_features.to_parquet(savepath)

# Specify configs

In [9]:
# Sizes shared by several configs below.
hidden_units = 128
session_maxlen = 32

# Training device. The second GPU ("cuda:1") is kept as the preferred choice;
# fall back to the first GPU and then to CPU so the notebook also runs on
# machines that do not have two GPUs.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

# Item model: uses the same hidden size as the sequence model.
item_model_config = ItemModelConfig(
    name="idemb",
    hidden_units=hidden_units,
)

model_cfg = SASRecConfig(
    maxlen=session_maxlen,
    hidden_units=hidden_units,  # 50
    num_blocks=2,
    num_heads=1,
    dropout_rate=0.2,
    use_pos_emb=True,
    use_sm_head=True,
    item_model=item_model_config,
)

# TODO reused in train config
processor_config = SASRecProcessorConfig(
    session_maxlen=session_maxlen,
    enable_item_features=False,  # True,
    # item_tags_maxlen=64,
)

# Frequency thresholds match the manual filtering applied when the train set
# was built (items >= 20 interactions, users >= 2 interactions).
train_preprocessing_config = TrainPreprocessingConfig(
    min_item_freq=20,  # 1
    min_user_freq=2,  # 2 is minimal to generate target
    # keep_tags_types=["genres_"],
    keep_tags_types=[],
)

train_config = TrainConfig(
    lr=1e-3,
    batch_size=128,
    epochs=5,
    l2_emb=0.0,
    device=device,
    negative_samples=0,
    loss="sm_ce",
    processor_config=processor_config,
)

task_converter_config = SequenceTaskConverterConfig()

# Train model

In [10]:
# Directory the trained model is written to; the evaluation cell loads from it.
model_dir = "rectools/models/sasrec_test_model"

# Reuse the paths defined where the artifacts were written (train_ds_path for
# the pickled train set, savepath for the item-features parquet) instead of
# repeating the literals, so the writer and the reader cannot drift apart.
train_sasrec_script(
    train_ds_path=train_ds_path,
    item_features_path=savepath,
    model_dir=model_dir,
    processor_config=processor_config,
    model_config=model_cfg,
    train_config=train_config,
    train_preprocessing_config=train_preprocessing_config,
    task_converter_config=task_converter_config,
)

# Test model

In [11]:
# Metric classes to evaluate, keyed by the short label used in the report.
metrics_name = {'MAP': MAP, 'MIUF': MeanInvUserFreq, 'Serendipity': Serendipity}

# One metric instance per (metric, cutoff) pair, keyed as "<label>@<k>".
metrics = {
    f'{metric_name}@{k}': metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in (1, 5, 10)
}


In [12]:
# Train interactions of the users that appear in the test set.
test_user_ids = np.unique(test_df['user_id'])
rec_df = train_df[train_df['user_id'].isin(test_user_ids)]

In [13]:
# Load the recommender from the directory the training step wrote to.
recommender = SASRecRecommeder.load(model_dir)

# x: interactions passed to the recommender; y: held-out test interactions.
x, y = rec_df, test_df
candidate_items = item_features['item_id']
pred_top_k = 10

recs = recommender.recommend(
    user_item_interactions=x,
    user_features=user_features,
    item_features=item_features,
    top_k=pred_top_k,
    candidate_items=candidate_items,
)

100%|██████████| 731/731 [00:10<00:00, 71.63it/s]
100%|██████████| 93516/93516 [00:02<00:00, 34247.56it/s]


In [14]:
# Catalog passed to the metrics: every distinct item id in the filtered train set.
train_item_ids = train_df[Columns.Item].unique()
catalog = train_item_ids.astype(str)
calc_metrics(metrics, recs, test_df, train_df, catalog)

{'MAP@1': 0.041042036775140275,
 'MAP@5': 0.0667955345783929,
 'MAP@10': 0.07467896800535924,
 'MIUF@1': 3.3346319783960356,
 'MIUF@5': 4.735062167926775,
 'MIUF@10': 5.132194375143549,
 'Serendipity@1': 0.0003361417990098743,
 'Serendipity@5': 0.0002930853827333228,
 'Serendipity@10': 0.000272484260108242}

In [27]:
# run_test_sasrec_script(
#     ds_path="rectools/models/train_interactions_splitted_wtg.pkl",
#     label="train",
#     item_features_path="rectools/models/kion_pers_recs_item_features_test.parquet",
#     model_path="rectools/models/sasrec_test_model",
#     #genres_test_ds_path="data/processed/genres_dataset.parquet",
# )

In [28]:
# run_test_sasrec_script(
#     ds_path="rectools/models/test_interactions_splitted_wtg.pkl",
#     label="test",
#     item_features_path="rectools/models/kion_pers_recs_item_features_test.parquet",
#     model_path="rectools/models/sasrec_test_model",
#     #genres_test_ds_path="data/processed/genres_dataset.parquet",
# )

In [30]:
# pd.read_csv("rectools/models/sasrec_test_model/artifacts/test/metrics.csv")

In [29]:
# pd.read_csv("rectools/models/sasrec_test_model/artifacts/train/metrics.csv")