In [1]:
from pathlib import Path
import pandas as pd
from rectools import Columns
import numpy as np
import logging
import os

from rectools.models import ImplicitALSWrapperModel
from implicit.als import AlternatingLeastSquares
from rectools.models.sasrec import SasRecRecommenderModel

from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.dataset import Dataset

In [2]:
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

logger = logging.getLogger()

# Data

In [15]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of data_original.zip or
        data_original.zip.zip, and cannot find data_original.zip.ZIP, period.
CPU times: user 8.75 ms, sys: 369 ms, total: 377 ms
Wall time: 1.23 s


In [3]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

# Split dataset

In [4]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

# leave item features for items only from train
# items = train["item_id"].drop_duplicates().to_list()
users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
test_users = test[Columns.User].unique()
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

catalog=train[Columns.Item].unique()


In [5]:
dataset = Dataset.construct(
    interactions_df=train,
)

# sasrec

In [6]:
factors=128
session_maxlen=32
model = SasRecRecommenderModel(
    random_state=32,
    factors=factors,  # 50
    n_blocks=2,
    n_heads=1,
    dropout_rate=0.2,
    item_net_dropout_rate=0.2,
    use_pos_emb=True,
    session_maxlen=session_maxlen,
    lr=1e-3,
    batch_size=128,
    epochs=5,
    device="cuda:1",
    loss="softmax",
)

Seed set to 32


In [7]:
%%time
model.fit(dataset)

INFO:rectools.models.sasrec:building model
INFO:rectools.models.sasrec:building trainer
INFO:rectools.models.sasrec:unable to init param encoder.attention_layernorms.0.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:unable to init param encoder.attention_layernorms.0.bias with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:unable to init param encoder.attention_layernorms.1.weight with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:unable to init param encoder.attention_layernorms.1.bias with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.models.sasrec:unable to init param encoder.attention_layers.0.in_proj_bias with xavier: Fan in and fan out can not be computed for tensor with fewer than 2 dimensions
INFO:rectools.model

CPU times: user 5min 12s, sys: 3.65 s, total: 5min 15s
Wall time: 5min 19s


<rectools.models.sasrec.SasRecRecommenderModel at 0x7f58340f6400>

In [8]:
%%time
recs = model.recommend(
    users = test_users, 
    dataset = dataset,
    k = 10,
    filter_viewed = True,
    assume_external_ids = True,
)

100%|██████████| 740/740 [00:02<00:00, 301.83it/s]


CPU times: user 2min 35s, sys: 12min 59s, total: 15min 34s
Wall time: 28.1 s


In [9]:
recs

Unnamed: 0,user_id,item_id,score,rank
0,3,7793,3.059519,1
1,3,15297,2.744265,2
2,3,3784,2.680801,3
3,3,7829,2.679824,4
4,3,14899,2.337226,5
...,...,...,...,...
947045,1097544,11118,2.289302,6
947046,1097544,6162,2.216834,7
947047,1097544,10440,2.208683,8
947048,1097544,5434,2.208431,9


In [10]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)


In [11]:
recs["item_id"] = recs["item_id"].apply(str)
test["item_id"] = test["item_id"].astype(str)
features_results = []
metric_values = calc_metrics(metrics, recs[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec"
features_results.append(metric_values)

In [12]:
features_results

[{'MAP@1': 0.047579110560324996,
  'MAP@5': 0.081092572796464,
  'MAP@10': 0.09032230539383843,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.09816799535399398,
  'Serendipity@5': 0.059982713365428986,
  'Serendipity@10': 0.044268025003728055,
  'model': 'sasrec'}]

# ALS

In [13]:
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

In [14]:
# Process user features to the form of a flatten dataframe
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

# Process item features to the form of a flatten dataframe
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

candidate_items = interactions['item_id'].drop_duplicates().astype(int)
test["user_id"] = test["user_id"].astype(int)
test["item_id"] = test["item_id"].astype(int)
catalog=train[Columns.Item].unique()

In [None]:
dataset_no_features = Dataset.construct(
    interactions_df=train,
)

dataset_full_features = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [None]:
K_RECOS = 10
NUM_THREADS = 32
RANDOM_STATE = 32
ITERATIONS = 10

def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [None]:
n_factors = 128
regularization = 0.5
alpha = 10

model = make_base_model(factors=n_factors, regularization=regularization, alpha=alpha)
model.fit(dataset_no_features)
recos = model.recommend(
    users=test_users.astype(int),
    dataset=dataset_no_features,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train, catalog)
metric_values["model"] = "no_features_factors_128_alpha_10_reg_0.5"
features_results.append(metric_values)

  check_blas_config()


In [None]:
model = make_base_model(factors = n_factors, regularization=regularization, alpha=alpha, fit_features_together=True)
model.fit(dataset_full_features)
recos = model.recommend(
    users=test_users.astype(int),
    dataset=dataset_full_features,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train, catalog)
metric_values["model"] = "full_features_factors_128_fit_together_True"
features_results.append(metric_values)



In [None]:
features_df = (
    pd.DataFrame(features_results)
    .set_index("model")
    .sort_values(by=["MAP@10", "Serendipity@10"], ascending=False)
)
features_df

Unnamed: 0_level_0,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sasrec,0.047579,0.081093,0.090322,18.82462,18.82462,18.82462,0.098168,0.059983,0.044268
full_features_factors_128_fit_together_True,0.033849,0.056533,0.062486,4.339514,5.338082,6.044169,0.000429,0.00046,0.000459
no_features_factors_128_alpha_10_reg_0.5,0.01553,0.028466,0.03282,6.603847,6.943217,7.146507,0.001047,0.000904,0.000815
