In [1]:
import os
from datetime import timedelta, datetime

from tqdm.auto import tqdm
import seaborn as sns
import polars as pl
from tqdm.auto import tqdm, trange
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from implicit.als import AlternatingLeastSquares
from implicit import gpu
from catboost import Pool, CatBoostRanker, FeaturesData

In [2]:
gpu.HAS_CUDA

True

In [3]:
# Apply seaborn's "darkgrid" style globally for plots drawn in this notebook.
sns.set_style("darkgrid")

In [4]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, model_cross_validate, mean_average_prec, join_true_pred_and_preprocess
from recs_utils.load_data import MTSDataset
from recs_utils.split import TimeRangeSplit
from recs_utils.implicit_model import ImplicitRecommender
from recs_utils.utils import get_direct_and_inv_mapping

In [5]:
# Directory (relative to the notebook) that holds the raw CSV files.
data_dir = "data"

In [6]:
# Resolve the three raw CSV files relative to `data_dir`, then parse each one
# with the matching project loader.
interactions_path = os.path.join(data_dir, "interactions.csv")
users_path = os.path.join(data_dir, "users.csv")
items_path = os.path.join(data_dir, "items.csv")

df = MTSDataset.load_interactions(interactions_path)
df_users = MTSDataset.load_users(users_path)
df_items = MTSDataset.load_items(items_path)

In [7]:
len(df)

1532998

In [8]:
df.head()

user_id,item_id,progress,rating,start_date
u32,u32,u8,f32,date
126706,14433,80,,2018-01-01
127290,140952,58,,2018-01-01
66991,198453,89,,2018-01-01
46791,83486,23,5.0,2018-01-01
79313,188770,88,5.0,2018-01-01


In [9]:
df_users.head()

user_id,age,sex
u32,cat,f32
1,"""45_54""",
2,"""18_24""",0.0
3,"""65_inf""",0.0
4,"""18_24""",0.0
5,"""35_44""",0.0


In [10]:
df_items.head()

item_id,title,genres,authors,year
u32,str,cat,cat,cat
128115,"""Ворон-челобитч…","""Зарубежные дет…","""Михаил Салтыко…","""1886"""
210979,"""Скрипка Ротшил…","""Классическая п…","""Антон Чехов""","""1894"""
95632,"""Испорченные де…","""Зарубежная кла…","""Михаил Салтыко…","""1869"""
247906,"""Странный челов…","""Пьесы и драмат…","""Михаил Лермонт…","""1831"""
294280,"""Господа ташкен…","""Зарубежная кла…","""Михаил Салтыко…","""1873"""


In [11]:
# Build the direct and inverse lookup tables for the "user_id" column of the interactions.
# NOTE(review): presumably raw id <-> contiguous index; confirm in recs_utils.utils.
users_mapping, users_inv_mapping = get_direct_and_inv_mapping(df, "user_id")
# Display how many entries the direct mapping holds.
len(users_mapping)

151600

In [12]:
# Build the direct and inverse lookup tables for the "item_id" column of the interactions.
# NOTE(review): presumably raw id <-> contiguous index; confirm in recs_utils.utils.
items_mapping, items_inv_mapping = get_direct_and_inv_mapping(df, "item_id")
# Display how many entries the direct mapping holds.
len(items_mapping)

59599

In [13]:
# Normalise the text item columns: trim surrounding whitespace and lower-case.
# `genres` is cast to a string column before the string operations are applied.
title_clean = pl.col("title").str.strip().str.to_lowercase()
genres_clean = pl.col("genres").cast(str).str.strip().str.to_lowercase()

df_items = df_items.with_columns(
    [
        title_clean.alias("title"),
        genres_clean.alias("genres"),
    ]
)

In [14]:
# The validation window covers the last `folds` days before the most recent interaction date.
folds = 7
last_date = df.select(pl.col("start_date").max()).row(0)[0]
start_date = last_date - timedelta(days=folds)
(start_date, last_date)

(datetime.date(2019, 12, 24), datetime.date(2019, 12, 31))

In [15]:
# Time-based splitter: starts at `start_date`, uses `folds` folds and a one-day interval.
cv = TimeRangeSplit(start_date=start_date, folds=folds, interval=timedelta(days=1))
# Show the configured maximum next to the number of splits reported for `df`.
cv.max_n_splits, cv.get_n_splits(df, datetime_column='start_date')

(7, 7)

In [16]:
# Materialise every split once; each element unpacks as (train_idx, test_idx, info).
fold_iterator = cv.split(
    df,
    user_column='user_id',
    item_column='item_id',
    datetime_column='start_date',
    fold_stats=True,
)
folds_with_stats = list(fold_iterator)

# Gather the per-fold statistics dictionaries (third element of each triple) into one table.
fold_infos = [fold[2] for fold in folds_with_stats]
folds_info_with_stats = pl.from_dicts(fold_infos)

In [17]:
folds_info_with_stats

Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
date,date,i64,i64,i64,i64,i64,i64,i64
2019-12-24,2019-12-25,1515946,3,3,0,0,0,2045
2019-12-25,2019-12-26,1517994,1,1,0,0,0,2141
2019-12-26,2019-12-27,1520136,0,0,0,0,0,2177
2019-12-27,2019-12-28,1522313,0,0,0,0,0,2110
2019-12-28,2019-12-29,1524423,2,4,0,0,0,2205
2019-12-29,2019-12-30,1526632,4,4,0,0,0,2118
2019-12-30,2019-12-31,1528754,1,1,0,0,0,2168


In [18]:
# Base list length; the negative-sampling candidate count is derived from it.
top_N = 10
# Number of negative items drawn per (user_id, start_date) row of the training fold.
num_neg_samples = 4

In [19]:
# Candidate pool per user is twice `top_N`; negatives are sampled from this pool.
top_N_negative_sampling = top_N * 2

In [21]:
# Work with the first fold only: recover the full interaction rows by joining
# the split indices back onto `df` on (user_id, item_id).
train_idx, test_idx, info = folds_with_stats[0]
pair_keys = ["user_id", "item_id"]

train = df.join(train_idx, on=pair_keys, how="inner")
test = df.join(test_idx, on=pair_keys, how="inner")
(train.shape, test.shape)

((1515946, 5), (2045, 5))

In [22]:
# BM25 nearest-neighbour model wrapped in the project's recommender class.
# K is set to the negative-sampling candidate count (top_N_negative_sampling).
recommender = ImplicitRecommender(
    BM25Recommender(K=top_N_negative_sampling),
    users_mapping, 
    items_mapping,
    items_inv_mapping
)

In [23]:
# Fit the BM25 model on the training interactions of the selected fold.
recommender.fit(train)



  0%|          | 0/59599 [00:00<?, ?it/s]

In [24]:
# Request `top_N_negative_sampling` recommendations for every distinct user
# in the training fold; these serve as the candidate pool for negatives.
train_user_ids = train.select(pl.col("user_id").unique()).to_series()
train_pred_neg_sampling = recommender.recommend(train_user_ids, n=top_N_negative_sampling)

In [25]:
# The recommendations are already stored, so drop the reference to the fitted model.
del recommender

In [26]:
train_pred_neg_sampling.head()

user_id,item_id,rank
u32,u32,u32
0,51473,1
0,289133,2
0,234741,3
0,169090,4
0,177262,5


In [27]:
# Aggregate the recommended item ids per user into a single `neg_item_id` column.
candidate_items = pl.col("item_id").implode().flatten().alias("neg_item_id")
neg_samples = (
    train_pred_neg_sampling
    .lazy()
    .groupby("user_id")
    .agg([candidate_items])
    .collect()
)

In [28]:
def _draw_negatives(candidates):
    """Sample up to `num_neg_samples` items from one user's candidate list.

    Sampling is without replacement, so the request is capped at the list
    length: asking for more items than are available would raise.
    """
    n_draw = min(num_neg_samples, len(candidates))
    # NOTE(review): the seed is fixed, so the same candidate list always yields
    # the same draw -- a user gets identical negatives on each of their days.
    return candidates.sample(n=n_draw, seed=232443)


# One row per (user, day) on which the user has a training interaction.
user_days = train.lazy().select(pl.col("user_id"), pl.col("start_date")).unique()

# Every (user, item) pair observed as a positive, regardless of the day.
seen_pairs = train.lazy().select(pl.col("user_id"), pl.col("item_id")).unique()

neg_train = (
    user_days
    .join(neg_samples.lazy(), on="user_id")
    .select(
        pl.col("user_id"),
        pl.col("neg_item_id").apply(_draw_negatives).alias("item_id"),
        pl.col("start_date"),
        pl.lit(0).alias("target"),
    )
    .explode("item_id")
    # Drop a sampled item if the user interacted with it on ANY day. Matching on
    # start_date as well would keep items read on a different day as negatives,
    # giving the same (user, item) pair both target 0 and target 1.
    .join(seen_pairs, on=["user_id", "item_id"], how="anti")
    .collect()
)

In [29]:
# The per-user candidate lists are no longer needed once `neg_train` is built.
del neg_samples

In [30]:
# Sanity check: list every (user_id, item_id, start_date) key that occurs more than once
# among the sampled negatives; an empty frame means the keys are unique.
neg_train.groupby(["user_id", "item_id", "start_date"]).count().filter(pl.col("count") > 1)

user_id,item_id,start_date,count
u32,u32,date,u32


In [31]:
# Label the observed interactions as positives (target = 1) and stack the
# sampled negatives (target = 0) underneath them.
pos_train = train.select(
    [
        pl.col("user_id"),
        pl.col("item_id"),
        pl.col("start_date"),
        pl.lit(1).alias("target"),
    ]
)
train_full = pos_train.vstack(neg_train)

In [32]:
# `train_full` now carries the training rows; drop the `train` reference.
del train

In [33]:
# Invariant: each (user_id, item_id, start_date) key occurs at most once in `train_full`.
key_counts = train_full.groupby(["user_id", "item_id", "start_date"]).count()
duplicated_keys = key_counts.filter(pl.col("count") > 1)
assert duplicated_keys.is_empty(), "(user_id, item_id, start_date) must be unique but found duplicates"


In [34]:
train_full.sort(["user_id", "start_date"]).head()

user_id,item_id,start_date,target
u32,u32,date,i32
0,48485,2018-01-29,1
0,177262,2018-01-29,0
0,28031,2018-01-29,0
0,234741,2018-01-29,0
0,77583,2018-01-29,0


In [35]:
# Turn the user attributes into plain string columns with an explicit "unknown" level
# for missing values.
age_feature = pl.col("age").cast(str).fill_null("unknown")
sex_feature = (
    pl.col("sex")
    .cast(pl.Int32)
    .cast(str)
    .str.replace("0", "female")
    .str.replace("1", "male")
    .fill_null("unknown")
)

df_users_features = df_users.with_columns(
    [
        age_feature.alias("age"),
        sex_feature.alias("sex"),
    ]
)

In [36]:
# Keep only the item columns used as features and replace missing values with
# explicit placeholder strings.
df_items_features = df_items.select(
    [
        pl.col("item_id"),
        pl.col("genres").cast(str).fill_null("genre_unknown").alias("genres"),
        pl.col("year").cast(str).fill_null("unknown").alias("year"),
    ]
)

In [37]:
# Attach user and item attributes to every training row.
# No `how` is given, so polars' default inner join applies: rows whose user_id or
# item_id has no entry in the feature tables are dropped here.
users_lf = df_users_features.lazy()
items_lf = df_items_features.lazy()

train_full = (
    train_full
    .lazy()
    .join(users_lf, on="user_id")
    .join(items_lf, on="item_id")
    .collect()
)

In [38]:
# Overwrite `start_date` with its timestamp in microseconds ("us").
train_full = train_full.with_columns(pl.col("start_date").dt.timestamp("us").alias("start_date"))

In [39]:
# Pull the label and the timestamp out as standalone Series, then remove both
# columns so `train_full` holds model features only.
train_target = train_full.get_column("target")
timestamps = train_full.get_column("start_date")
train_full = train_full.drop(["target", "start_date"])

In [40]:
train_full.head()

user_id,item_id,age,sex,genres,year
u32,u32,str,str,str,str
126706,14433,"""25_34""","""female""","""историческая л…","""2007"""
127290,140952,"""25_34""","""female""","""боевики""","""2006"""
46791,83486,"""18_24""","""male""","""здоровье""","""2003"""
79313,188770,"""55_64""","""female""","""стихи и поэзия…","""unknown"""
63454,78434,"""25_34""","""female""","""боевики""","""2012"""


In [44]:
# Split the remaining columns by dtype: numeric columns vs. string (Utf8) columns.
feature_schema = train_full.schema
num_features = [name for name, dtype in feature_schema.items() if dtype in pl.NUMERIC_DTYPES]
cat_features = [name for name, dtype in feature_schema.items() if dtype.is_(pl.Utf8)]

In [45]:
num_features, cat_features

(['user_id', 'item_id'], ['age', 'sex', 'genres', 'year'])

In [47]:
import numpy as np

In [48]:
# Export the numeric columns as a float32 matrix and the string columns as a
# separate matrix, then hand both to CatBoost's FeaturesData container.
numeric_block = train_full.select(pl.col(num_features)).to_numpy().astype(np.float32)
categorical_block = train_full.select(pl.col(cat_features)).to_numpy()

train_dataset = FeaturesData(
    num_feature_data=numeric_block,
    cat_feature_data=categorical_block,
    num_feature_names=num_features,
    cat_feature_names=cat_features,
)

In [50]:
# The features now live in `train_dataset`; drop the DataFrame reference.
del train_full

In [49]:
train_dataset

<_catboost.FeaturesData at 0x7faf1f6c24d0>

In [53]:
# CatBoostRanker optimises a group-wise ranking loss, so the pool needs a group
# (query) id for every row, and all rows of one group must be contiguous.
# One group = one user: recover user_id from the numeric feature block and
# reorder every per-row array with the same stable permutation.
user_ids = train_dataset.num_feature_data[:, num_features.index("user_id")].astype(np.int64)
group_order = np.argsort(user_ids, kind="stable")

train_pool = Pool(
    data=FeaturesData(
        num_feature_data=train_dataset.num_feature_data[group_order],
        cat_feature_data=train_dataset.cat_feature_data[group_order],
        num_feature_names=num_features,
        cat_feature_names=cat_features,
    ),
    label=train_target.to_numpy()[group_order],
    group_id=user_ids[group_order],
    timestamp=timestamps.to_numpy()[group_order],
)

In [56]:
# Ranking model trained on GPU; no other hyper-parameters are overridden here.
ranker = CatBoostRanker(task_type='GPU')

In [57]:
# Train the ranker on the prepared training pool.
ranker.fit(train_pool)

: 

: 