In [1]:
from pathlib import Path
import pandas as pd
from rectools import Columns
import numpy as np
import logging

from rectools.models import ImplicitALSWrapperModel
from implicit.als import AlternatingLeastSquares
from rectools.models.sasrec import (SasRecRecommenderModel)

from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.dataset import Dataset

In [2]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

logger = logging.getLogger()

# Data

In [3]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 611 ms, sys: 142 ms, total: 752 ms
Wall time: 41.1 s


In [4]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

# Split dataset

In [5]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

# leave item features for items only from train
items = train["item_id"].drop_duplicates().to_list()
users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
test_users = test[Columns.User].unique().astype(str)

catalog=train[Columns.Item].unique().astype(str)


In [6]:
dataset = Dataset.construct(
    interactions_df=train,
)

# sasrec

In [7]:
hidden_units=128
session_maxlen=32
model = SasRecRecommenderModel(
    name="idemb",
    hidden_units_item=hidden_units,
    maxlen=session_maxlen,
    hidden_units=hidden_units,  # 50
    num_blocks=2,
    num_heads=1,
    dropout_rate=0.2,
    use_pos_emb=True,
    use_sm_head=True,
    session_maxlen=session_maxlen,
    lr=1e-3,
    batch_size=128,
    epochs=5,
    l2_emb=0.0,
    device="cuda:1",
    # device="cpu",
    negative_samples=0,
    loss="sm_ce",
)

In [8]:
model._fit(dataset)

INFO:rectools.models.sasrec:converting datasets to task format
INFO:rectools.models.sasrec:sessions lens: 0.95q: 24.0; 0.5q: 4.0
INFO:rectools.models.sasrec:building preprocessor
INFO:rectools.models.sasrec:building train dataset
INFO:rectools.models.sasrec:building model
INFO:rectools.models.sasrec:building trainer
INFO:rectools.models.sasrec:used sm_ce loss
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.0.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.0.bias with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.1.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:undable to init param encoder.attention_layernorms.1.bias with xavier: F

In [9]:
# recommender = SASRecRecommeder(model.processor, model.model, model.task_converter)
candidate_items = interactions['item_id'].drop_duplicates().astype(str)
recs = model.recommend(dataset, 10, test_users, candidate_items)

INFO:rectools.models.sasrec:sessions lens: 0.95q: 41.0; 0.5q: 7.0
100%|██████████| 740/740 [00:09<00:00, 74.49it/s]
100%|██████████| 94705/94705 [00:03<00:00, 27916.90it/s]


In [10]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)


In [11]:
test["user_id"] = test["user_id"].astype(str)
test["item_id"] = test["item_id"].astype(str)
features_results = []
metric_values = calc_metrics(metrics, recs, test, train, catalog)
metric_values["model"] = "sasrec"
features_results.append(metric_values)

In [12]:
features_results

[{'MAP@1': 0.04726213852168703,
  'MAP@5': 0.08124079823642769,
  'MAP@10': 0.09035620127339238,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.0981046407264664,
  'Serendipity@5': 0.06047044537352403,
  'Serendipity@10': 0.04435788554276184,
  'model': 'sasrec'}]

# ALS

In [13]:
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

In [14]:
# Process user features to the form of a flatten dataframe
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

# Process item features to the form of a flatten dataframe
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

candidate_items = interactions['item_id'].drop_duplicates().astype(int)
test["user_id"] = test["user_id"].astype(int)
test["item_id"] = test["item_id"].astype(int)
catalog=train[Columns.Item].unique()

In [15]:
dataset_no_features = Dataset.construct(
    interactions_df=train,
)

dataset_full_features = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [16]:
K_RECOS = 10
NUM_THREADS = 32
RANDOM_STATE = 32
ITERATIONS = 10

def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [17]:
n_factors = 128
regularization = 0.5
alpha = 10

model = make_base_model(factors=n_factors, regularization=regularization, alpha=alpha)
model.fit(dataset_no_features)
recos = model.recommend(
    users=test_users.astype(int),
    dataset=dataset_no_features,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train, catalog)
metric_values["model"] = "no_features_factors_128_alpha_10_reg_0.5"
features_results.append(metric_values)

  check_blas_config()


In [18]:
model = make_base_model(factors = n_factors, regularization=regularization, alpha=alpha, fit_features_together=True)
model.fit(dataset_full_features)
recos = model.recommend(
    users=test_users.astype(int),
    dataset=dataset_full_features,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train, catalog)
metric_values["model"] = "full_features_factors_128_fit_together_True"
features_results.append(metric_values)



In [19]:
features_df = (
    pd.DataFrame(features_results)
    .set_index("model")
    .sort_values(by=["MAP@10", "Serendipity@10"], ascending=False)
)
features_df

Unnamed: 0_level_0,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sasrec,0.047262,0.081241,0.090356,18.82462,18.82462,18.82462,0.098105,0.06047,0.044358
full_features_factors_128_fit_together_True,0.033849,0.056533,0.062486,4.339514,5.338082,6.044169,0.000429,0.00046,0.000459
no_features_factors_128_alpha_10_reg_0.5,0.01553,0.028466,0.03282,6.603847,6.943217,7.146507,0.001047,0.000904,0.000815
