In [1]:
!pwd

/data/home/dmtikhono1/git_project/sasrec/RecTools/examples


In [2]:
import sys
sys.path.append("/data/home/dmtikhono1/git_project/sasrec/RecTools/")

In [3]:
from pathlib import Path
import pandas as pd
from rectools import Columns
import numpy as np
import logging
import os
import torch
from lightning_fabric import seed_everything

from rectools.models import ImplicitALSWrapperModel
from implicit.als import AlternatingLeastSquares
from rectools.models.sasrec import SasRecModel

from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.dataset import Dataset

In [4]:
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["OPENBLAS_NUM_THREADS"] = "1"

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

logger = logging.getLogger()

# Data

In [5]:
# %%time
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
# !unzip -o data_original.zip
# !rm data_original.zip

In [6]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

# Split dataset

In [7]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

# leave item features for items only from train
# items = train["item_id"].drop_duplicates().to_list()
users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
test_users = test[Columns.User].unique()
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

catalog=train[Columns.Item].unique()


In [8]:
dataset = Dataset.construct(
    interactions_df=train,
)

# sasrec

In [9]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

In [10]:
factors=128
session_maxlen=32
model = SasRecModel(
    factors=factors,  # 50
    n_blocks=2,
    n_heads=1,
    dropout_rate=0.2,
    use_pos_emb=True,
    session_maxlen=session_maxlen,
    lr=1e-3,
    batch_size=128,
    epochs=5,
    device="cuda:1",
    loss="softmax",
)

In [11]:

%%time
model.fit(dataset)

INFO:rectools.models.sasrec:training epoch 1
INFO:rectools.models.sasrec:training epoch 2
INFO:rectools.models.sasrec:training epoch 3
INFO:rectools.models.sasrec:training epoch 4
INFO:rectools.models.sasrec:training epoch 5


CPU times: user 4min 50s, sys: 8.14 s, total: 4min 58s
Wall time: 4min 53s


<rectools.models.sasrec.SasRecModel at 0x7fec53893d60>

In [17]:
%%time
recs = model.recommend(
    users = test_users, 
    dataset = dataset,
    k = 10,
    filter_viewed = True,
    on_unsupported_targets="warn"
)

            because of missing known items
  interactions[Columns.User] = dataset.user_id_map.convert_to_external(interactions[Columns.User])
                Model `<class 'rectools.models.sasrec.SasRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
100%|██████████| 740/740 [00:02<00:00, 267.59it/s]


CPU times: user 2min 15s, sys: 11min 24s, total: 13min 40s
Wall time: 22 s


In [18]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)


In [19]:
recs["item_id"] = recs["item_id"].apply(str)
test["item_id"] = test["item_id"].astype(str)
features_results = []
metric_values = calc_metrics(metrics, recs[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec"
features_results.append(metric_values)

In [20]:
# major recommend
recs.sort_values(["user_id", "rank"])

Unnamed: 0,user_id,item_id,score,rank
575550,3,7793,2.755187,1
575551,3,7829,2.623583,2
575552,3,15297,2.618209,3
575553,3,3784,2.395707,4
575554,3,14899,1.994578,5
...,...,...,...,...
224955,1097544,3734,2.108971,6
224956,1097544,13865,2.089862,7
224957,1097544,14431,2.058302,8
224958,1097544,4151,1.943950,9


In [21]:
features_results

[{'MAP@1': 0.04896729054820606,
  'MAP@5': 0.08284725776567772,
  'MAP@10': 0.09202214080523476,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.10074441687344914,
  'Serendipity@5': 0.06064590171647837,
  'Serendipity@10': 0.04443191713787037,
  'model': 'sasrec'}]

### Item to item

In [22]:
target_items = [13865, 4457, 15297]

In [23]:
%%time
recs = model.recommend_to_items(
    target_items = target_items, 
    dataset = dataset,
    k = 10,
    filter_itself = True,
    items_to_recommend=None, #white_list,
)

CPU times: user 1.76 s, sys: 2.4 s, total: 4.16 s
Wall time: 1.14 s


In [24]:
recs

Unnamed: 0,target_item_id,item_id,score,rank
0,13865,9728,0.753347,1
1,13865,4151,0.740239,2
2,13865,3734,0.716284,3
3,13865,6809,0.673116,4
4,13865,142,0.650436,5
5,13865,1844,0.646556,6
6,13865,7571,0.645828,7
7,13865,15297,0.624771,8
8,13865,8636,0.623193,9
9,13865,10440,0.582206,10


In [25]:
raise ValueError()  # skip updating cells below

ValueError: 

# ALS

In [None]:
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

In [None]:
# Process user features to the form of a flatten dataframe
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

# Process item features to the form of a flatten dataframe
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

candidate_items = interactions['item_id'].drop_duplicates().astype(int)
test["user_id"] = test["user_id"].astype(int)
test["item_id"] = test["item_id"].astype(int)
catalog=train[Columns.Item].unique()

In [None]:
dataset_no_features = Dataset.construct(
    interactions_df=train,
)

dataset_full_features = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [None]:
K_RECOS = 10
NUM_THREADS = 32
RANDOM_STATE = 32
ITERATIONS = 10

def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [None]:
n_factors = 128
regularization = 0.5
alpha = 10

model = make_base_model(factors=n_factors, regularization=regularization, alpha=alpha)
model.fit(dataset_no_features)
recos = model.recommend(
    users=test_users.astype(int),
    dataset=dataset_no_features,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train, catalog)
metric_values["model"] = "no_features_factors_128_alpha_10_reg_0.5"
features_results.append(metric_values)

  check_blas_config()


In [None]:
model = make_base_model(factors = n_factors, regularization=regularization, alpha=alpha, fit_features_together=True)
model.fit(dataset_full_features)
recos = model.recommend(
    users=test_users.astype(int),
    dataset=dataset_full_features,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train, catalog)
metric_values["model"] = "full_features_factors_128_fit_together_True"
features_results.append(metric_values)



In [None]:
features_df = (
    pd.DataFrame(features_results)
    .set_index("model")
    .sort_values(by=["MAP@10", "Serendipity@10"], ascending=False)
)
features_df

Unnamed: 0_level_0,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sasrec,0.047579,0.081093,0.090322,18.82462,18.82462,18.82462,0.098168,0.059983,0.044268
full_features_factors_128_fit_together_True,0.033849,0.056533,0.062486,4.339514,5.338082,6.044169,0.000429,0.00046,0.000459
no_features_factors_128_alpha_10_reg_0.5,0.01553,0.028466,0.03282,6.603847,6.943217,7.146507,0.001047,0.000904,0.000815
