In [1]:
import os
from datetime import timedelta, datetime

import seaborn as sns
import polars as pl
import numpy as np
from implicit.nearest_neighbours import BM25Recommender
from catboost import Pool, CatBoostRanker, FeaturesData

In [2]:
sns.set_style("darkgrid")

In [9]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, model_cross_validate, mean_average_prec, join_true_pred_and_preprocess
from recs_utils.load_data import MTSDataset
from recs_utils.split import TimeRangeSplit
from recs_utils.implicit_model import ImplicitRecommender
from recs_utils.utils import get_direct_and_inv_mapping
from recs_utils.ranking import add_features, add_group_ids, features_target_ranking, convert_to_features_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
data_dir = "data"

In [11]:
df = MTSDataset.load_interactions(os.path.join(data_dir, "interactions.csv"))
df_users = MTSDataset.load_users(os.path.join(data_dir, "users.csv"))
df_items = MTSDataset.select_genres(MTSDataset.load_items(os.path.join(data_dir, "items.csv")), 0.98)

In [12]:
df = df.filter(pl.col("item_id").is_in(df_items.select(pl.col("item_id").unique()).to_series()) & 
          pl.col("user_id").is_in(df_users.select(pl.col("user_id").unique()).to_series())
)

In [13]:
num_items = df.select(pl.col("item_id").n_unique())[0, 0]

По причинам произвоидительности берём только элементы с наибольшим количеством оценок

In [14]:
selected_item_ids = df.lazy().groupby("item_id").count().top_k(min(20_000, num_items), by="count").select(pl.col("item_id").unique()).collect().to_series()

In [16]:
df = df.filter(pl.col("item_id").is_in(selected_item_ids))

In [17]:
del selected_item_ids

In [18]:
len(df)

903513

In [19]:
df.head()

user_id,item_id,progress,rating,start_date
u32,u32,u8,f32,date
126706,14433,80,,2018-01-01
127451,14876,69,,2018-01-01
47287,258483,22,,2018-01-01
47551,64060,55,,2018-01-01
59484,161786,27,,2018-01-01


In [20]:
df_users.head()

user_id,age,sex
u32,cat,i8
1,"""45_54""",
2,"""18_24""",0.0
3,"""65_inf""",0.0
4,"""18_24""",0.0
5,"""35_44""",0.0


In [21]:
df_items.head()

item_id,title,genres,authors,year
u32,str,str,cat,cat
128115,"""ворон-челобитч…","""зарубежная кла…","""Михаил Салтыко…","""1886"""
210979,"""скрипка ротшил…","""классическая п…","""Антон Чехов""","""1894"""
95632,"""испорченные де…","""зарубежная кла…","""Михаил Салтыко…","""1869"""
247906,"""странный челов…","""литература 19 …","""Михаил Лермонт…","""1831"""
294280,"""господа ташкен…","""зарубежная кла…","""Михаил Салтыко…","""1873"""


In [22]:
users_mapping, users_inv_mapping = get_direct_and_inv_mapping(df, "user_id")
len(users_mapping)

134947

In [23]:
items_mapping, items_inv_mapping = get_direct_and_inv_mapping(df, "item_id")
len(items_mapping)

20000

In [24]:
last_date = df.select(pl.col("start_date").max())[0, 0]
folds = 3
start_date = last_date - timedelta(weeks=folds)
start_date, last_date

(datetime.date(2019, 12, 10), datetime.date(2019, 12, 31))

In [25]:
cv = TimeRangeSplit(start_date=start_date, folds=folds, interval=timedelta(weeks=1))
cv.max_n_splits, cv.get_n_splits(df, datetime_column='start_date')

(3, 3)

In [26]:
folds_with_stats = list(cv.split(
    df, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='start_date',
    fold_stats=True
)
)

folds_info_with_stats = pl.from_dicts([info for _, _, info in folds_with_stats])

In [27]:
folds_info_with_stats

Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
date,date,i64,i64,i64,i64,i64,i64,i64
2019-12-10,2019-12-17,878312,173,261,0,0,0,7877
2019-12-17,2019-12-24,886450,177,240,0,0,0,7780
2019-12-24,2019-12-31,894470,191,261,0,0,0,7862


In [28]:
top_N = 10
num_neg_samples = 5

In [29]:
top_N_negative_sampling = top_N * 2

In [71]:
train_idx, test_idx, info = folds_with_stats[0]

train = df.join(train_idx, on=["user_id", "item_id"], how="inner")
test = df.join(test_idx, on=["user_id", "item_id"], how="inner")
train.shape, test.shape

((878312, 5), (7877, 5))

In [72]:
recommender = ImplicitRecommender(
    BM25Recommender(K=top_N_negative_sampling),
    users_mapping, 
    items_mapping,
    items_inv_mapping
)

In [73]:
recommender.fit(train)



  0%|          | 0/20000 [00:00<?, ?it/s]

In [74]:
train_pred_neg_sampling = recommender.recommend(
    train.select(pl.col("user_id").unique()).to_series(), 
    n=top_N_negative_sampling)

In [75]:
test_pred = recommender.recommend(
    test.select(pl.col("user_id").unique()).to_series(), 
    n=top_N)

In [76]:
baseline_map = mean_average_prec(join_true_pred_and_preprocess(test, test_pred))

In [78]:
baseline_map

0.018184587227608056

In [79]:
train_pred_neg_sampling = train_pred_neg_sampling.filter(pl.col("item_id").is_not_null())

In [80]:
neg_samples = train_pred_neg_sampling.lazy().groupby("user_id").agg(
    [
        pl.col("item_id").implode().flatten().alias("neg_item_id"),
    ]
).collect()

In [81]:
neg_train = train.lazy().select(pl.col("user_id"), pl.col("start_date")).unique().join(neg_samples.lazy(), on="user_id").select(
    pl.col("user_id"),
    pl.col("neg_item_id").apply(lambda x: x.sample(n=num_neg_samples, seed=232443)).alias("item_id"),
    pl.col("start_date"),
    pl.lit(0).alias("target")
).explode("item_id").join(
    train.lazy().select(pl.col("user_id"), pl.col("item_id"), pl.col("start_date")).unique(), on=["user_id", "item_id", "start_date"], how="anti").collect()

In [82]:
neg_samples.head(n=3)

user_id,neg_item_id
u32,list[u32]
152024,"[81693, 48361, … 63000]"
152848,"[152558, 275805, … 291560]"
128608,"[103229, 262016, … 27600]"


In [83]:
del neg_samples

In [84]:
train_full = train.select(
    pl.col("user_id"),
    pl.col("item_id"),
    pl.col("start_date"),
    pl.lit(1).alias("target")
).vstack(neg_train)

In [85]:
del train
del neg_train

In [86]:
assert train_full.groupby(["user_id", "item_id", "start_date"]).count().filter(
    pl.col("count") > 1).is_empty(), "(user_id, item_id, start_date) must be unique but found duplicates"


In [87]:
train_full.sort(["user_id", "start_date"]).head()

user_id,item_id,start_date,target
u32,u32,date,i32
1,284459,2018-02-26,1
1,52368,2018-02-26,0
1,150986,2018-02-26,0
1,286091,2018-02-26,0
1,184147,2018-02-26,0


In [88]:
df_users_features  = df_users.with_columns(
    pl.col("age").cast(str).fill_null("unknown"),
    pl.col("sex").cast(str).fill_null("unknown")
)

In [89]:
df_items_features = df_items.select(
    pl.col("item_id"),
    pl.col("genres"),
    pl.col("year")
).with_columns(
    pl.col("genres").cast(str).fill_null("genre_unknown").alias("genres"),
    pl.col("year").cast(str).fill_null("unknown").alias("year")
)

In [90]:
train_full = add_features(train_full, df_items_features, df_users_features)

In [91]:
train_full.head(n=3)

user_id,item_id,start_date,target,age,sex,genres,year
u32,u32,date,i32,str,str,str,str
126706,14433,2018-01-01,1,"""25_34""","""0""","""историческая л…","""2007"""
127451,14876,2018-01-01,1,"""45_54""","""0""","""зарубежные дет…","""1843"""
47287,258483,2018-01-01,1,"""18_24""","""0""","""боевая фантаст…","""2018"""


In [92]:
train_full = train_full.with_columns(pl.col("start_date").dt.timestamp("us"))

In [93]:
train_full.head()

user_id,item_id,start_date,target,age,sex,genres,year
u32,u32,i64,i32,str,str,str,str
126706,14433,1514764800000000,1,"""25_34""","""0""","""историческая л…","""2007"""
127451,14876,1514764800000000,1,"""45_54""","""0""","""зарубежные дет…","""1843"""
47287,258483,1514764800000000,1,"""18_24""","""0""","""боевая фантаст…","""2018"""
47551,64060,1514764800000000,1,"""35_44""","""unknown""","""современные де…","""2002"""
59484,161786,1514764800000000,1,"""45_54""","""1""","""научная фантас…","""1989"""


In [94]:
train_full = add_group_ids(train_full, ["user_id", "start_date"])

In [95]:
train_full.head()

user_id,item_id,start_date,target,age,sex,genres,year,group_id
u32,u32,i64,i32,str,str,str,str,u32
133008,281585,1514764800000000,1,"""25_34""","""1""","""зарубежная кла…","""1853""",0
133008,39160,1514764800000000,0,"""25_34""","""1""","""современные де…","""2018""",0
133008,271233,1514764800000000,0,"""25_34""","""1""","""остросюжетные …","""2005""",0
133008,70926,1514764800000000,0,"""25_34""","""1""","""современные де…","""2018""",0
133008,164367,1514764800000000,0,"""25_34""","""1""","""современные де…","""unknown""",0


In [96]:
ranker_train_info = features_target_ranking(train_full)

In [97]:
train_pool = Pool(
    data=ranker_train_info.features,
    label=ranker_train_info.target,
    timestamp=ranker_train_info.timestamps,
    group_id=ranker_train_info.group_ids
)

In [100]:
ranker = CatBoostRanker(task_type='GPU', random_seed=3466, metric_period=25, eval_metric="MAP", iterations=500, max_ctr_complexity=3)

In [101]:
if os.path.exists("model.cbm"):
    ranker.load_model("model.cbm")
else:
    ranker.fit(train_pool)
    ranker.save_model("model.cbm")

In [102]:
test_with_features = add_features(test_pred, df_items_features, df_users_features).unique()

In [103]:
test_with_features.head()

user_id,item_id,rank,age,sex,genres,year
u32,u32,u32,str,str,str,str
146078,210979,4,"""65_inf""","""0""","""классическая п…","""1894"""
99259,133138,9,"""18_24""","""0""","""зарубежные дет…","""unknown"""
62308,167994,3,"""35_44""","""0""","""зарубежная дра…","""1600"""
36,80487,8,"""65_inf""","""1""","""литература 20 …","""1916"""
98296,80487,3,"""35_44""","""0""","""литература 20 …","""1916"""


In [104]:
len(test_pred), len(test_with_features)

(48220, 48220)

In [106]:
scores = []

for user_id, data in test_with_features.groupby("user_id", maintain_order=True):
    features = convert_to_features_data(data, ["rank", "item_id"])
    scores.append(ranker.predict(features))

In [110]:
ranked_pred = pl.concat([test_with_features, pl.DataFrame({"score": np.concatenate(scores, axis=-1)})], how="horizontal")

In [112]:
ranked_pred = ranked_pred.sort(["user_id", "score"], descending=True).with_columns(
    (pl.col("item_id").cumcount() + 1).over("user_id").alias("new_rank")
)

In [113]:
ranked_pred.head()

user_id,item_id,rank,age,sex,genres,year,score,new_rank
u32,u32,u32,str,str,str,str,f64,u32
159573,6456,7,"""55_64""","""0""","""любовное фэнте…","""2015""",1.799081,1
159573,178771,2,"""55_64""","""0""","""городское фэнт…","""2018""",0.24672,2
159573,39513,4,"""55_64""","""0""","""любовное фэнте…","""2015""",-0.203573,3
159573,15891,9,"""55_64""","""0""","""городское фэнт…","""2016""",-0.304394,4
159573,311133,5,"""55_64""","""0""","""любовное фэнте…","""2018""",-0.427979,5


In [120]:
joined_rank_data = join_true_pred_and_preprocess(test, ranked_pred.select(
    pl.col(["user_id", "item_id"]),
    pl.col("new_rank").alias("rank")
))

In [121]:
mean_average_prec(joined_rank_data)

0.01412668001792624

In [117]:
baseline_map

0.018184587227608056