# Colab

In [2]:
# !git clone -b tree-ranking https://github.com/KernelA/education-recsys.git

In [3]:
# %cd /content/education-recsys/

In [1]:
# !pip install --no-cache-dir -r ./requirements.txt  kaggle

In [4]:
# %load_ext dotenv
# %dotenv

In [5]:
# !kaggle datasets download -d sharthz23/mts-library -p ./data --unzip 

In [6]:
import os

In [7]:
# Set to True when running on Google Colab: the cells below then mount Google Drive,
# store experiment artifacts under it and enable the custom widget manager.
is_colab = False

In [8]:
# Root directory for experiment artifacts: the Drive mount point on Colab,
# otherwise the current working directory.
remote_dir = "/content/drive" if is_colab else os.getcwd()

In [9]:
# On Colab only: mount Google Drive at `remote_dir`.
# The import is kept inside the branch so nothing Colab-specific is imported
# when `is_colab` is False.
if is_colab:
    from google.colab import drive
    drive.mount(remote_dir)

In [10]:
# Experiment output directory: a folder inside the mounted Drive on Colab,
# a local "catboost-dir" folder otherwise.
experiment_subdir = "MyDrive/EducationRecSys/tree-ranking" if is_colab else "catboost-dir"
out_exp_dir = os.path.join(remote_dir, experiment_subdir)

In [11]:
# On Colab only: enable the custom widget manager.
# The import is kept inside the branch so nothing Colab-specific is imported
# when `is_colab` is False.
if is_colab:
    from google.colab import output
    output.enable_custom_widget_manager()

# Основной код

In [12]:
import os
from datetime import timedelta, datetime

import numpy as np
import polars as pl
import seaborn as sns
from catboost import Pool, CatBoostRanker, FeaturesData
from catboost.utils import get_gpu_device_count
from implicit.nearest_neighbours import BM25Recommender

In [13]:
sns.set_style("darkgrid")

In [14]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, model_cross_validate, mean_average_prec, join_true_pred_and_preprocess
from recs_utils.load_data import MTSDataset, MovieLens100K
from recs_utils.split import TimeRangeSplit
from recs_utils.implicit_model import ImplicitRecommender
from recs_utils.utils import get_direct_and_inv_mapping
from recs_utils.ranking import add_features, add_group_ids, features_target_ranking, convert_to_features_data, features_target_pairwise_ranking

In [15]:
# Directory (relative to the working directory) that holds the dataset CSV files.
data_dir = "data"

In [16]:
# Resolve the three dataset files first, then load them.
interactions_path = os.path.join(data_dir, "interactions.csv")
users_path = os.path.join(data_dir, "users.csv")
items_path = os.path.join(data_dir, "items.csv")

df = MTSDataset.load_interactions(interactions_path)
df_users = MTSDataset.load_users(users_path)
# NOTE(review): the meaning of the 0.98 threshold is defined by MTSDataset.select_genres —
# confirm in recs_utils.load_data.
df_items = MTSDataset.select_genres(MTSDataset.load_items(items_path), 0.98)

In [17]:
# Keep only interactions whose item and user are both present in the metadata tables.
known_item_ids = df_items.select(pl.col("item_id").unique()).to_series()
known_user_ids = df_users.select(pl.col("user_id").unique()).to_series()

df = df.filter(
    pl.col("item_id").is_in(known_item_ids) & pl.col("user_id").is_in(known_user_ids)
)

In [18]:
# Number of distinct items left in the interactions after filtering.
num_items = df.get_column("item_id").n_unique()

In [19]:
print("Num items:", num_items)

Num items: 59599


По причинам производительности берём только элементы с наибольшим количеством взаимодействий (не более 30 000)

In [20]:
# Pick the most frequently interacted items (at most 30 000) to keep the run tractable.
max_items = min(30_000, num_items)
selected_item_ids = (
    df.lazy()
    .groupby("item_id")
    .count()
    .top_k(max_items, by="count")
    .select(pl.col("item_id").unique())
    .collect()
    .to_series()
)

In [21]:
# Restrict the interactions to the selected (most frequently interacted) items.
df = df.filter(pl.col("item_id").is_in(selected_item_ids))

In [22]:
del selected_item_ids

In [23]:
len(df)

1063269

In [24]:
df.head()

user_id,item_id,progress,rating,start_date
u32,u32,u8,f32,date
126706,14433,80,,2018-01-01
127290,140952,58,,2018-01-01
127451,14876,69,,2018-01-01
47287,258483,22,,2018-01-01
47551,64060,55,,2018-01-01


In [25]:
df_users.head()

user_id,age,sex
u32,cat,i8
1,"""45_54""",
2,"""18_24""",0.0
3,"""65_inf""",0.0
4,"""18_24""",0.0
5,"""35_44""",0.0


In [26]:
df_items.head(n=3)

item_id,title,genres,authors,year
u32,str,str,cat,cat
128115,"""ворон-челобитч…","""зарубежная кла…","""Михаил Салтыко…","""1886"""
210979,"""скрипка ротшил…","""классическая п…","""Антон Чехов""","""1894"""
95632,"""испорченные де…","""зарубежная кла…","""Михаил Салтыко…","""1869"""


In [27]:
# Build the direct and inverse id mappings for users from the filtered interactions
# (presumably raw id <-> contiguous index; confirm in recs_utils.utils).
users_mapping, users_inv_mapping = get_direct_and_inv_mapping(df, "user_id")
len(users_mapping)

135520

In [28]:
# Build the direct and inverse id mappings for items from the filtered interactions
# (presumably raw id <-> contiguous index; confirm in recs_utils.utils).
items_mapping, items_inv_mapping = get_direct_and_inv_mapping(df, "item_id")
len(items_mapping)

30000

In [29]:
# Validation covers the last `folds` weeks of the data: one week per fold.
folds = 3
last_date = df.get_column("start_date").max()
start_date = last_date - timedelta(weeks=folds)
(start_date, last_date)

(datetime.date(2019, 12, 10), datetime.date(2019, 12, 31))

In [30]:
# Time-based splitter with one-week folds starting at `start_date`.
fold_interval = timedelta(weeks=1)
cv = TimeRangeSplit(start_date=start_date, folds=folds, interval=fold_interval)
(cv.max_n_splits, cv.get_n_splits(df, datetime_column="start_date"))

(3, 3)

In [31]:
# Materialise every fold as (train index, test index, statistics) and collect
# the per-fold statistics into one table.
fold_iterator = cv.split(
    df,
    user_column="user_id",
    item_column="item_id",
    datetime_column="start_date",
    fold_stats=True,
)
folds_with_stats = list(fold_iterator)

fold_statistics = [stats for _train_idx, _test_idx, stats in folds_with_stats]
folds_info_with_stats = pl.from_dicts(fold_statistics)

In [32]:
folds_info_with_stats

Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
date,date,i64,i64,i64,i64,i64,i64,i64
2019-12-10,2019-12-17,1032838,74,100,0,0,0,9640
2019-12-17,2019-12-24,1042578,75,101,0,0,0,9615
2019-12-24,2019-12-31,1052294,67,96,0,0,0,9675


In [33]:
# Number of recommendations requested per test user for evaluation.
top_N = 10
# Number of negative items sampled for every positive training interaction.
num_neg_samples = 3

In [34]:
# Size of the candidate list used as the negative-sampling pool: at least twice the
# number of negatives to draw, and never smaller than top_N.
top_N_negative_sampling = max(num_neg_samples * 2, top_N)

In [35]:
# Use the first fold only: recover the full interaction rows for its train and test pairs.
train_idx, test_idx, info = folds_with_stats[0]

pair_keys = ["user_id", "item_id"]
train = df.join(train_idx, on=pair_keys, how="inner")
test = df.join(test_idx, on=pair_keys, how="inner")
(train.shape, test.shape)

((1032838, 5), (9640, 5))

In [36]:
# First-stage candidate generator: BM25 item-kNN wrapped in the project recommender.
# (Passing rating_col_name="rating" was tried and is intentionally left disabled.)
candidate_model = BM25Recommender(K=top_N_negative_sampling)
recommender = ImplicitRecommender(
    candidate_model,
    users_mapping,
    items_mapping,
    items_inv_mapping,
)

In [37]:
# Fit the candidate generator on the training interactions of the selected fold.
recommender.fit(train)



  0%|          | 0/30000 [00:00<?, ?it/s]

In [38]:
# Candidate lists for every training user; they serve as the pool of negatives.
train_user_ids = train.select(pl.col("user_id").unique()).to_series()
train_pred_neg_sampling = recommender.recommend(train_user_ids, n=top_N_negative_sampling)

In [39]:
# Top-N first-stage recommendations for every user in the test fold.
test_user_ids = test.select(pl.col("user_id").unique()).to_series()
test_pred = recommender.recommend(test_user_ids, n=top_N)

In [40]:
# MAP of the first-stage recommender alone; the re-ranker is compared against it.
baseline_joined = join_true_pred_and_preprocess(test, test_pred)
baseline_map = mean_average_prec(baseline_joined)

In [41]:
baseline_map

0.01453943859233516

In [42]:
# Every training user must have at least `num_neg_samples` candidates to sample from.
candidates_per_user = train_pred_neg_sampling.lazy().groupby("user_id").count()
users_with_too_few = candidates_per_user.filter(pl.col("count") < num_neg_samples).collect()
assert users_with_too_few.is_empty(), f"You must sample {num_neg_samples} for each user"

In [43]:
# Tag every positive interaction with a unique row index; it is used below as the id
# that ties a positive to its sampled negatives.
# NOTE(review): not idempotent — re-running this cell on a frame that already has a
# `pair_id` column will presumably fail on the duplicate name; re-create `train` first.
train = train.with_row_count("pair_id")

In [44]:
# Negative sampling: for every positive (user, item) row, draw `num_neg_samples`
# items from that user's candidate list, skipping the positive item itself.
# The result has the same columns as the positives, with target = 0.
neg_candidates = train_pred_neg_sampling.lazy().select(pl.col(["user_id", "item_id"]))

neg_train = (
    train.lazy()
    .join(neg_candidates, on="user_id", suffix="_neg")
    # Only the positive item of the current row is excluded from the candidates.
    .filter(pl.col("item_id") != pl.col("item_id_neg"))
    .groupby(["user_id", "item_id", "start_date", "pair_id"])
    .agg(
        pl.col("item_id_neg")
        .implode()
        .flatten()
        .sample(n=num_neg_samples, seed=1455, with_replacement=False)
        .alias("neg_item_id")
    )
    .select(pl.all().exclude("item_id"))
    # One output row per sampled negative.
    .explode("neg_item_id")
    .select(
        pl.col("user_id"),
        pl.col("neg_item_id").alias("item_id"),
        pl.col("start_date"),
        pl.col("pair_id"),
        pl.lit(0).cast(pl.Int8).alias("target"),
    )
    .collect()
)

In [45]:
neg_train.head()

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
3,38745,2018-12-28,508317,0
3,316205,2018-12-28,508317,0
3,214016,2018-12-28,508317,0
6,177822,2019-06-11,760729,0
6,109493,2019-06-11,760729,0


In [46]:
# Positives (target = 1) in the same column layout as the negatives, stacked on top of them.
pos_train = train.select(
    pl.col("user_id"),
    pl.col("item_id"),
    pl.col("start_date"),
    pl.col("pair_id"),
    pl.lit(1).cast(pl.Int8).alias("target"),
)
train_full = pos_train.vstack(neg_train)

In [48]:
# Deduplicate on (pair_id, target): each pair keeps one positive row and one negative row,
# i.e. only one of the sampled negatives survives per positive.
# NOTE(review): which negative is kept depends on polars' default `keep`/ordering
# behaviour for `unique` — confirm this is acceptable for reproducibility.
train_full = train_full.unique(["pair_id", "target"])

In [49]:
train_full.sort("pair_id", "target").head(n=6)

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
126706,307855,2018-01-01,0,0
126706,14433,2018-01-01,0,1
127290,189678,2018-01-01,1,0
127290,140952,2018-01-01,1,1
127451,232360,2018-01-01,2,0
127451,14876,2018-01-01,2,1


In [50]:
train_full.groupby(["pair_id"]).count().filter(
    pl.col("count") <= 1)

pair_id,count
u32,u32


In [51]:
# Every pair_id must have at least two rows (its positive and a negative).
rows_per_pair = train_full.groupby(["pair_id"]).count()
incomplete_pairs = rows_per_pair.filter(pl.col("count") < 2)
assert incomplete_pairs.is_empty(), "Expected at least 2 pairs: positive and negative (user_id, item_id, start_date) "

In [52]:
# User features: cast age and sex to strings and replace missing values with "unknown".
user_feature_columns = ("age", "sex")
df_users_features = df_users.with_columns(
    [pl.col(column).cast(str).fill_null("unknown") for column in user_feature_columns]
)

In [53]:
# Item features: genres and year as strings, with explicit placeholders for missing values.
df_items_features = df_items.select(
    pl.col("item_id"),
    pl.col("genres").cast(str).fill_null("genre_unknown").alias("genres"),
    pl.col("year").cast(str).fill_null("unknown").alias("year"),
)

In [54]:
# Attach item features first, then user features, to every training row.
train_with_item_features = add_features(train_full, df_items_features, "item_id")
train_full = add_features(train_with_item_features, df_users_features, "user_id")

In [55]:
train_full.head(n=3)

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,date,u32,i8,str,str,str,str
47551,64060,2018-01-01,4,1,"""современные де…","""2002""","""35_44""","""unknown"""
1011,18519,2018-01-01,10,1,"""сказки""","""unknown""","""18_24""","""0"""
137220,167814,2018-01-01,13,1,"""любовное фэнте…","""2013""","""25_34""","""0"""


In [56]:
# Replace the `start_date` date column with its integer timestamp in microseconds.
# NOTE(review): not idempotent — a second run would apply `.dt` to the already
# converted integer column.
train_full = train_full.with_columns(pl.col("start_date").dt.timestamp("us"))

In [57]:
train_full.head()

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,i64,u32,i8,str,str,str,str
47551,64060,1514764800000000,4,1,"""современные де…","""2002""","""35_44""","""unknown"""
1011,18519,1514764800000000,10,1,"""сказки""","""unknown""","""18_24""","""0"""
137220,167814,1514764800000000,13,1,"""любовное фэнте…","""2013""","""25_34""","""0"""
50171,54384,1514764800000000,39,1,"""зарубежные при…","""1851""","""35_44""","""1"""
90067,72200,1514764800000000,58,1,"""боевое фэнтези…","""2003""","""18_24""","""1"""


In [58]:
# train_full = add_group_ids(train_full, ["user_id", "start_date"])

In [59]:
train_full.head()

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,i64,u32,i8,str,str,str,str
47551,64060,1514764800000000,4,1,"""современные де…","""2002""","""35_44""","""unknown"""
1011,18519,1514764800000000,10,1,"""сказки""","""unknown""","""18_24""","""0"""
137220,167814,1514764800000000,13,1,"""любовное фэнте…","""2013""","""25_34""","""0"""
50171,54384,1514764800000000,39,1,"""зарубежные при…","""1851""","""35_44""","""1"""
90067,72200,1514764800000000,58,1,"""боевое фэнтези…","""2003""","""18_24""","""1"""


In [60]:
# Build the pairwise-ranking inputs with `pair_id` as the group id column.
# The result exposes `features`, `target`, `timestamps`, `pairs` and `group_ids`,
# which are passed to the CatBoost Pool below.
ranker_train_info = features_target_pairwise_ranking(train_full, group_id_col="pair_id")

In [61]:
# Sanity check: the number of objects is exactly twice the number of pairs,
# i.e. every pair contributes one positive and one negative object.
assert 2 * len(ranker_train_info.pairs) == ranker_train_info.features.get_object_count()

In [62]:
ranker_train_info.features.get_feature_names()

['genres', 'year', 'age', 'sex']

In [63]:
# Assemble the CatBoost training pool from the prepared pairwise-ranking inputs.
train_pool = Pool(
    data=ranker_train_info.features,
    label=ranker_train_info.target,
    group_id=ranker_train_info.group_ids,
    pairs=ranker_train_info.pairs,
    timestamp=ranker_train_info.timestamps,
)

In [64]:
# Directory passed to CatBoost as `train_dir`.
train_dir = os.path.join(out_exp_dir, "log")

In [65]:
# Create the training log directory; a no-op when it already exists.
os.makedirs(train_dir, exist_ok=True)

In [66]:
# Pairwise CatBoost ranker used as the second-stage re-ranker.
# Train on the GPU when CatBoost can see one; otherwise fall back to the CPU so the
# notebook still runs end-to-end (Restart & Run All) on machines without CUDA.
ranker_task_type = "GPU" if get_gpu_device_count() > 0 else "CPU"

ranker = CatBoostRanker(
    task_type=ranker_task_type,
    random_seed=3466,
    # Report metrics every 50 iterations.
    metric_period=50,
    eval_metric="MAP:top=2",
    custom_metric=["RecallAt:top=1", "PrecisionAt:top=3"],
    iterations=800,
    train_dir=train_dir,
    loss_function="PairLogitPairwise",
    # max_ctr_complexity=3
)

In [67]:
# Location of the serialized CatBoost model inside the experiment directory.
model_path = os.path.join(out_exp_dir, "model", "model.cbm")

In [68]:
# Make sure the directory that will hold the model file exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

In [69]:
# When True, the ranker is retrained even if a saved model already exists at `model_path`.
force_train = True

In [70]:
# Train and persist the ranker unless a saved model exists and retraining is not forced;
# in that case load the saved model instead.
needs_training = force_train or not os.path.exists(model_path)

if needs_training:
    ranker.fit(train_pool)
    ranker.save_model(model_path)
else:
    ranker.load_model(model_path)

Groupwise loss function. OneHotMaxSize set to 10


Metric MAP:top=2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=1 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=3 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7514983	total: 353ms	remaining: 4m 41s
50:	learn: 0.7997745	total: 4.38s	remaining: 1m 4s
100:	learn: 0.8098516	total: 8.64s	remaining: 59.8s
150:	learn: 0.8135317	total: 13s	remaining: 56s
200:	learn: 0.8158477	total: 17.1s	remaining: 51.1s
250:	learn: 0.8178146	total: 21.2s	remaining: 46.3s
300:	learn: 0.8191977	total: 25.5s	remaining: 42.3s
350:	learn: 0.8203799	total: 29.6s	remaining: 37.9s
400:	learn: 0.8212571	total: 34.2s	remaining: 34s
450:	learn: 0.8220854	total: 38.8s	remaining: 30s
500:	learn: 0.8228977	total: 43.2s	remaining: 25.8s
550:	learn: 0.8236461	total: 47.3s	remaining: 21.4s
600:	learn: 0.8242387	total: 51.4s	remaining: 17s
650:	learn: 0.8247934	total: 55.5s	remaining: 12.7s
700:	learn: 0.8252635	total: 59.5s	remaining: 8.41s
750:	learn: 0.8256411	total: 1m 3s	remaining: 4.15s
799:	learn: 0.8259674	total: 1m 7s	remaining: 0us


In [71]:
# Attach item features first, then user features, to the first-stage test candidates.
test_with_item_features = add_features(test_pred, df_items_features, ["item_id"])
test_with_features = add_features(test_with_item_features, df_users_features, ["user_id"])

In [72]:
test_with_features.head()

user_id,item_id,rank,genres,year,age,sex
u32,u32,u32,str,str,str,str
21,222616,1,"""боевое фэнтези…","""2006""","""18_24""","""0"""
21,196161,2,"""героическое фэ…","""2017""","""18_24""","""0"""
21,163270,3,"""героическое фэ…","""2018""","""18_24""","""0"""
21,281909,4,"""боевое фэнтези…","""2008""","""18_24""","""0"""
21,81827,5,"""любовное фэнте…","""2016""","""18_24""","""0"""


In [73]:
len(test_pred), len(test_with_features)

(61300, 61300)

In [74]:
# Score all candidate rows with a single predict call.
# The previous per-user loop collected scores in group-iteration order, while the scores
# are later concatenated horizontally against `test_with_features` in row order; the two
# only agree when every user's rows are contiguous, which the feature joins do not
# guarantee. One call over the whole frame keeps scores aligned row-for-row and avoids
# one predict call per user.
# NOTE(review): assumes `convert_to_features_data` builds features row by row, with the
# second argument listing columns to leave out — confirm in recs_utils.ranking.
features = convert_to_features_data(test_with_features, ["rank", "item_id"])

# Kept as a one-element list so the downstream `np.concatenate(scores, axis=-1)` still works.
scores = [ranker.predict(features)]

In [75]:
# Append the ranker scores as a new `score` column next to the candidate rows.
score_column = pl.DataFrame({"score": np.concatenate(scores, axis=-1)})
ranked_pred = pl.concat([test_with_features, score_column], how="horizontal")

In [76]:
# Order candidates by descending score within each user and number them from 1.
rank_within_user = (pl.col("item_id").cumcount() + 1).over("user_id").alias("new_rank")
ranked_pred = ranked_pred.sort(["user_id", "score"], descending=True).with_columns(
    rank_within_user
)

In [77]:
ranked_pred.head()

user_id,item_id,rank,genres,year,age,sex,score,new_rank
u32,u32,u32,str,str,str,str,f64,u32
159592,80324,3,"""справочная лит…","""2012""","""18_24""","""1""",1.220586,1
159592,86864,2,"""детские приклю…","""unknown""","""18_24""","""1""",1.151036,2
159592,185744,6,"""русское фэнтез…","""unknown""","""18_24""","""1""",0.263253,3
159592,96057,8,"""зарубежная фан…","""unknown""","""18_24""","""1""",0.015402,4
159592,38320,7,"""современная ру…","""unknown""","""18_24""","""1""",-0.118408,5


In [78]:
# Evaluate the re-ranked lists: expose the ranker's position under the `rank` column
# before joining with the ground-truth test interactions.
reranked_pred = ranked_pred.select(
    pl.col(["user_id", "item_id"]),
    pl.col("new_rank").alias("rank"),
)
joined_rank_data = join_true_pred_and_preprocess(test, reranked_pred)

In [79]:
mean_average_prec(joined_rank_data)

0.010392250033126285

In [80]:
baseline_map

0.01453943859233516