# Colab

In [151]:
# !git clone -b tree-ranking https://github.com/KernelA/education-recsys.git

In [152]:
# %cd /content/education-recsys/

In [153]:
# !pip install --no-cache-dir -r ./requirements.txt  kaggle

In [154]:
# %load_ext dotenv
# %dotenv

In [155]:
# !kaggle datasets download -d sharthz23/mts-library -p ./data --unzip 

In [156]:
import os

In [157]:
# Switch for the Colab-only steps below (Google Drive mount, Drive-based output
# directory, custom widget manager). False selects the local-run branches.
is_colab = False

In [158]:
# Root under which experiment artifacts are stored: the Drive mount point on Colab,
# otherwise the current working directory.
remote_dir = "/content/drive" if is_colab else os.getcwd()

In [159]:
# Colab only: mount Google Drive at `remote_dir` so artifacts outlive the session.
if is_colab:
    from google.colab import drive as colab_drive

    colab_drive.mount(remote_dir)

In [160]:
# Experiment output directory, relative to `remote_dir`; the sub-path differs between
# a Colab (Drive) run and a local run.
exp_subdir = "MyDrive/EducationRecSys/tree-ranking" if is_colab else "catboost-dir"
out_exp_dir = os.path.join(remote_dir, exp_subdir)

In [161]:
# Colab only: enable the custom widget manager.
if is_colab:
    from google.colab import output as colab_output

    colab_output.enable_custom_widget_manager()

# Основной код

In [162]:
import os
import pathlib
from datetime import timedelta, datetime

import seaborn as sns
import polars as pl
import numpy as np
from sklearn.preprocessing import LabelEncoder
from implicit.nearest_neighbours import BM25Recommender
from catboost import Pool, CatBoostRanker, FeaturesData

In [163]:
sns.set_style("darkgrid")

In [164]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, model_cross_validate, mean_average_prec, join_true_pred_and_preprocess
from recs_utils.load_data import MTSDataset, MovieLens100K
from recs_utils.split import TimeRangeSplit
from recs_utils.implicit_model import ImplicitRecommender
from recs_utils.utils import get_direct_and_inv_mapping
from recs_utils.ranking import add_features, add_group_ids, features_target_ranking, convert_to_features_data, features_target_pairwise_ranking

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [165]:
# Directory holding the parquet dump of the MTS dataset, relative to the working directory.
data_dir = pathlib.Path("./data") / "mts" / "dump"

In [166]:
# Load the three tables of the dataset dump: interactions, users and items.
df = pl.read_parquet(data_dir / "interactions.parquet")
df_users = pl.read_parquet(data_dir / "users.parquet")

# Items go through MTSDataset.select_genres with a 0.98 argument
# (presumably a genre-coverage threshold — TODO confirm in recs_utils.load_data).
items_raw = pl.read_parquet(data_dir / "items.parquet")
df_items = MTSDataset.select_genres(items_raw, 0.98)

In [167]:
# Keep only interactions whose item is present in df_items and whose user is present in df_users.
known_item_ids = df_items.select(pl.col("item_id").unique()).to_series()
known_user_ids = df_users.select(pl.col("user_id").unique()).to_series()
df = df.filter(
    pl.col("item_id").is_in(known_item_ids) & pl.col("user_id").is_in(known_user_ids)
)

In [168]:
# Number of distinct items that remain in the interactions table.
num_items = df.get_column("item_id").n_unique()

In [169]:
print("Num items:", num_items)

Num items: 59599


In [170]:
# Upper bound on how many of the most-interacted items are kept.
# Setting it to num_items keeps every item; the commented value is a smaller
# alternative cap.
# max_items = 30_000
max_items = num_items

По причинам производительности берём только элементы с наибольшим количеством оценок

In [171]:
# Count interactions per item and keep the ids of the most frequent items.
# The cut-off is `max_items`, capped at the number of distinct items.
items_to_keep = min(max_items, num_items)
selected_item_ids = (
    df.lazy()
    .groupby("item_id")
    .count()
    .top_k(items_to_keep, by="count")
    .select(pl.col("item_id").unique())
    .collect()
    .to_series()
)

In [172]:
# Drop interactions with items that are not among the selected ones.
is_selected_item = pl.col("item_id").is_in(selected_item_ids)
df = df.filter(is_selected_item)

In [173]:
del selected_item_ids

In [174]:
len(df)

1399932

In [175]:
df.head()

user_id,item_id,progress,rating,start_date
u32,u32,u8,f32,date
126706,14433,80,,2018-01-01
127290,140952,58,,2018-01-01
46791,83486,23,5.0,2018-01-01
79313,188770,88,5.0,2018-01-01
63454,78434,87,,2018-01-01


In [176]:
df_users.head()

user_id,age,sex
u32,cat,i8
1,"""45_54""",
2,"""18_24""",0.0
3,"""65_inf""",0.0
4,"""18_24""",0.0
5,"""35_44""",0.0


In [177]:
df_items.head(n=3)

item_id,title,genres,authors,year
u32,str,str,cat,cat
128115,"""ворон-челобитч…","""зарубежная кла…","""Михаил Салтыко…","""1886"""
210979,"""скрипка ротшил…","""классическая п…","""Антон Чехов""","""1894"""
95632,"""испорченные де…","""зарубежная кла…","""Михаил Салтыко…","""1869"""


In [181]:
# The evaluation period covers the last `folds` weeks of the interaction log.
folds = 3
last_date = df.get_column("start_date").max()
start_date = last_date - timedelta(weeks=folds)
start_date, last_date

(datetime.date(2019, 12, 10), datetime.date(2019, 12, 31))

In [182]:
# Time-based splitter: `folds` folds with a one-week interval, beginning at `start_date`.
fold_interval = timedelta(weeks=1)
cv = TimeRangeSplit(start_date=start_date, folds=folds, interval=fold_interval)
cv.max_n_splits, cv.get_n_splits(df, datetime_column="start_date")

(3, 3)

In [183]:
# Materialise every fold as a (train index, test index, statistics) triple.
fold_iter = cv.split(
    df,
    user_column="user_id",
    item_column="item_id",
    datetime_column="start_date",
    fold_stats=True,
)
folds_with_stats = list(fold_iter)

# Collect the per-fold statistics dictionaries into one table for inspection.
fold_infos = [fold_info for _fold_train, _fold_test, fold_info in folds_with_stats]
folds_info_with_stats = pl.from_dicts(fold_infos)

In [184]:
folds_info_with_stats

Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
date,date,i64,i64,i64,i64,i64,i64,i64
2019-12-10,2019-12-17,1358452,5,5,0,0,0,13115
2019-12-17,2019-12-24,1371572,9,19,0,0,0,13219
2019-12-24,2019-12-31,1384810,9,12,0,0,0,13246


In [185]:
# top_N: number of recommendations requested per user for the test predictions.
# num_neg_samples: number of negative items sampled for each positive train interaction.
top_N, num_neg_samples = 10, 3

In [186]:
# Size of the candidate list used as the negative-sampling pool: at least twice the
# number of negatives to sample, and never smaller than top_N.
top_N_negative_sampling = max(2 * num_neg_samples, top_N)

In [187]:
# Work with the first fold only; each fold is (train index, test index, statistics).
train_idx, test_idx, info = folds_with_stats[0]

# Recover the full interaction rows for each side via an inner join on the pair key.
pair_keys = ["user_id", "item_id"]
train = df.join(train_idx, on=pair_keys, how="inner")
test = df.join(test_idx, on=pair_keys, how="inner")
train.shape, test.shape

((1358452, 5), (13115, 5))

In [188]:
# First-stage candidate generator: a BM25 nearest-neighbour model wrapped by the project's
# ImplicitRecommender. K is set to the size of the negative-sampling candidate list.
bm25_model = BM25Recommender(K=top_N_negative_sampling)
recommender = ImplicitRecommender(bm25_model)

In [189]:
recommender.fit(train)



In [190]:
train_pred_neg_sampling = recommender.recommend(train, num_recs_per_user=top_N_negative_sampling)

In [191]:
test_pred = recommender.recommend(test, num_recs_per_user=top_N)

In [192]:
# Baseline quality: mean average precision of the first-stage recommendations on the test fold.
test_true_vs_pred = join_true_pred_and_preprocess(test, test_pred)
baseline_map = mean_average_prec(test_true_vs_pred)

In [193]:
baseline_map

0.00948556606380896

In [194]:
# Every user must have at least `num_neg_samples` candidates to sample negatives from.
users_short_of_candidates = (
    train_pred_neg_sampling.lazy()
    .groupby("user_id")
    .count()
    .filter(pl.col("count") < num_neg_samples)
    .collect()
)
assert users_short_of_candidates.is_empty(), f"You must sample {num_neg_samples} for each user"

In [195]:
train = train.with_row_count("pair_id")

In [196]:
# Build the negative examples for pairwise ranking.
# Each train interaction (identified by pair_id) is joined on user_id with that user's
# candidate list from the first-stage recommender. Candidates equal to the interaction's
# own item are dropped, then num_neg_samples candidates are sampled per interaction
# without replacement using a fixed seed. The sampled list is exploded to one row per
# negative item, which takes the place of item_id and is labelled target = 0.
# NOTE(review): sample(n=...) without replacement presumably raises when fewer than
# num_neg_samples candidates remain after the filter — confirm this cannot happen.
neg_train = train.lazy().join(train_pred_neg_sampling.lazy().select(pl.col(["user_id", "item_id"])), on="user_id", suffix="_neg").filter(
    pl.col("item_id") != pl.col("item_id_neg")
).groupby(
    ["user_id", "item_id", "start_date", "pair_id"]
).agg(
    pl.col("item_id_neg").implode().flatten().sample(n=num_neg_samples, seed=1455, with_replacement=False).alias("neg_item_id")
).select(
    pl.all().exclude("item_id")
).explode("neg_item_id").select(
    pl.col("user_id"),
    pl.col("neg_item_id").alias("item_id"),
    pl.col("start_date"),
    pl.col("pair_id"),
    pl.lit(0).cast(pl.Int8).alias("target")).collect()

In [197]:
neg_train.head()

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
63454,107047,2018-01-01,4,0
63454,221471,2018-01-01,4,0
63454,131689,2018-01-01,4,0
11142,30383,2018-01-01,25,0
11142,130223,2018-01-01,25,0


In [198]:
# Positives: every observed train interaction, labelled target = 1 with the same
# column layout as neg_train so the two frames can be stacked.
positive_label = pl.lit(1).cast(pl.Int8).alias("target")
pos_train = train.select(
    pl.col(["user_id", "item_id", "start_date", "pair_id"]),
    positive_label,
)

# Full training table: positives followed by the sampled negatives.
train_full = pos_train.vstack(neg_train)

In [199]:
train_full.filter(pl.col("pair_id") == 660976)

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
58790,203882,2018-12-16,660976,1
58790,193358,2018-12-16,660976,0
58790,1070,2018-12-16,660976,0
58790,231555,2018-12-16,660976,0


In [200]:
# Reduce every pair_id to one row per target value, i.e. the positive and one negative.
# keep="first" together with maintain_order=True pins WHICH negative survives; without
# them the retained row is not guaranteed, so the fixed sampling seed used to build the
# negatives would not make the training set reproducible.
train_full = train_full.unique(["pair_id", "target"], keep="first", maintain_order=True)

In [201]:
train_full.sort("pair_id", "target").head(n=6)

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
126706,108460,2018-01-01,0,0
126706,14433,2018-01-01,0,1
127290,227589,2018-01-01,1,0
127290,140952,2018-01-01,1,1
46791,125654,2018-01-01,2,0
46791,83486,2018-01-01,2,1


In [202]:
train_full.groupby("pair_id").count().filter(
    pl.col("count") <= 1)

pair_id,count
u32,u32


In [203]:
# Every pair_id must have at least two rows (its positive and a negative); a pair with
# fewer rows cannot form a training pair for the ranker.
# The message is a plain string: it has no placeholders, so the f-prefix was redundant.
undersized_pairs = train_full.groupby(["pair_id"]).count().filter(pl.col("count") < 2)
assert undersized_pairs.is_empty(), "Expected at least 2 pairs: positive and negative (user_id, item_id, start_date) "

In [204]:
# User features for the ranker: age and sex are turned into strings, with missing values
# replaced by the explicit "unknown" category.
df_users_features = df_users.with_columns(
    [pl.col(column).cast(str).fill_null("unknown") for column in ("age", "sex")]
)

In [205]:
# Item features for the ranker: only the id, genres and year are kept. Genres and year
# are turned into strings, with missing values replaced by explicit placeholder categories.
df_items_features = df_items.select(
    pl.col("item_id"),
    pl.col("genres").cast(str).fill_null("genre_unknown"),
    pl.col("year").cast(str).fill_null("unknown"),
)

In [206]:
# Attach item features first, then user features, to every training row.
train_with_item_features = add_features(train_full, df_items_features, "item_id")
train_full = add_features(train_with_item_features, df_users_features, "user_id")

In [207]:
train_full.head(n=3)

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,date,u32,i8,str,str,str,str
59484,161786,2018-01-01,10,1,"""научная фантас…","""1989""","""45_54""","""1"""
73724,306961,2018-01-01,39,1,"""альтернативная…","""unknown""","""45_54""","""0"""
36913,292838,2018-01-01,84,1,"""иронические де…","""2010""","""25_34""","""0"""


In [208]:
train_full = train_full.with_columns(pl.col("start_date").dt.timestamp("us"))

In [209]:
train_full.head()

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,i64,u32,i8,str,str,str,str
59484,161786,1514764800000000,10,1,"""научная фантас…","""1989""","""45_54""","""1"""
73724,306961,1514764800000000,39,1,"""альтернативная…","""unknown""","""45_54""","""0"""
36913,292838,1514764800000000,84,1,"""иронические де…","""2010""","""25_34""","""0"""
24438,185522,1514764800000000,147,1,"""зарубежные дет…","""2003""","""45_54""","""0"""
10040,80663,1514764800000000,148,1,"""биографии и ме…","""2015""","""18_24""","""0"""


In [210]:
# train_full = add_group_ids(train_full, ["user_id", "start_date"])

In [211]:
train_full.head()

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,i64,u32,i8,str,str,str,str
59484,161786,1514764800000000,10,1,"""научная фантас…","""1989""","""45_54""","""1"""
73724,306961,1514764800000000,39,1,"""альтернативная…","""unknown""","""45_54""","""0"""
36913,292838,1514764800000000,84,1,"""иронические де…","""2010""","""25_34""","""0"""
24438,185522,1514764800000000,147,1,"""зарубежные дет…","""2003""","""45_54""","""0"""
10040,80663,1514764800000000,148,1,"""биографии и ме…","""2015""","""18_24""","""0"""


In [212]:
ranker_train_info = features_target_pairwise_ranking(train_full, group_id_col="pair_id")

In [213]:
assert 2 * len(ranker_train_info.pairs) == ranker_train_info.features.get_object_count()

In [214]:
ranker_train_info.features.get_feature_names()

['genres', 'year', 'age', 'sex']

In [215]:
# Assemble the CatBoost training pool from the pairwise ranking data: features, labels,
# timestamps, the pairs themselves and the group id of every row.
pool_kwargs = dict(
    data=ranker_train_info.features,
    label=ranker_train_info.target,
    timestamp=ranker_train_info.timestamps,
    pairs=ranker_train_info.pairs,
    group_id=ranker_train_info.group_ids,
)
train_pool = Pool(**pool_kwargs)

In [216]:
train_dir = os.path.join(out_exp_dir, "log")

In [217]:
os.makedirs(train_dir, exist_ok=True)

In [233]:
# Second-stage ranker trained with the pairwise YetiRank objective on GPU.
# Metrics use top=1; the training groups are built from one positive and one negative
# row per pair_id. Metrics are reported every 50 iterations.
# eval_metric is a plain string: it has no placeholders, so the f-prefix was redundant.
ranker = CatBoostRanker(
    task_type="GPU",
    random_seed=3466,
    metric_period=50,
    eval_metric="MAP:top=1",
    custom_metric=["RecallAt:top=1", "PrecisionAt:top=1"],
    iterations=1200,
    train_dir=train_dir,
    loss_function="YetiRankPairwise",
    # max_ctr_complexity=3
)

In [234]:
model_path = os.path.join(out_exp_dir, "model", "model.cbm")

In [235]:
os.makedirs(os.path.dirname(model_path), exist_ok=True)

In [236]:
force_train = True

In [237]:
# Train and save the model when training is forced or no saved model exists;
# otherwise reuse the model stored at `model_path`.
if force_train or not os.path.exists(model_path):
    ranker.fit(train_pool)
    ranker.save_model(model_path)
else:
    ranker.load_model(model_path)

Groupwise loss function. OneHotMaxSize set to 10


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=1 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=1 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=1 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.6779989	total: 397ms	remaining: 7m 55s
50:	learn: 0.7346325	total: 7.52s	remaining: 2m 49s
100:	learn: 0.7486124	total: 14.7s	remaining: 2m 39s
150:	learn: 0.7513876	total: 21.8s	remaining: 2m 31s
200:	learn: 0.7557948	total: 29s	remaining: 2m 24s
250:	learn: 0.7576793	total: 36.3s	remaining: 2m 17s
300:	learn: 0.7592915	total: 43.5s	remaining: 2m 9s
350:	learn: 0.7609713	total: 50.7s	remaining: 2m 2s
400:	learn: 0.7625268	total: 58s	remaining: 1m 55s
450:	learn: 0.7642397	total: 1m 5s	remaining: 1m 48s
500:	learn: 0.7652497	total: 1m 12s	remaining: 1m 41s
550:	learn: 0.7667595	total: 1m 19s	remaining: 1m 34s
600:	learn: 0.7679741	total: 1m 27s	remaining: 1m 26s
650:	learn: 0.7690783	total: 1m 34s	remaining: 1m 19s
700:	learn: 0.7701855	total: 1m 41s	remaining: 1m 12s
750:	learn: 0.7710026	total: 1m 49s	remaining: 1m 5s
800:	learn: 0.7717932	total: 1m 56s	remaining: 58s
850:	learn: 0.7726118	total: 2m 3s	remaining: 50.7s
900:	learn: 0.7732839	total: 2m 10s	remaining: 43.5s


In [223]:
# Attach item features first, then user features, to the first-stage test recommendations.
test_with_item_features = add_features(test_pred, df_items_features, ["item_id"])
test_with_features = add_features(test_with_item_features, df_users_features, ["user_id"])

In [224]:
test_with_features.head()

user_id,item_id,rank,genres,year,age,sex
u32,u32,u32,str,str,str,str
21,222616,1,"""боевое фэнтези…","""2006""","""18_24""","""0"""
21,196161,2,"""героическое фэ…","""2017""","""18_24""","""0"""
21,7122,3,"""книги про волш…","""2019""","""18_24""","""0"""
21,281909,4,"""боевое фэнтези…","""2008""","""18_24""","""0"""
21,81827,5,"""любовное фэнте…","""2016""","""18_24""","""0"""


In [225]:
len(test_pred), len(test_with_features)

(88170, 88170)

In [226]:
# Score each user's candidate list with the trained ranker, one user at a time.
# Each result frame holds user_id, item_id (with the dtypes of the source frame) and a
# Float32 score.
key_columns = ("user_id", "item_id")
predicted_scores = []

for _user_id, user_candidates in test_with_features.groupby("user_id"):
    score_schema = {
        name: dtype
        for name, dtype in user_candidates.schema.items()
        if name in key_columns
    }
    score_schema["score"] = pl.Float32

    # "rank" and "item_id" are handed to convert_to_features_data
    # (presumably the columns to leave out of the features — TODO confirm in recs_utils.ranking).
    user_features = convert_to_features_data(user_candidates, ["rank", "item_id"])
    user_scores = ranker.predict(user_features)

    user_score_frame = pl.DataFrame(
        {
            "user_id": user_candidates.get_column("user_id"),
            "item_id": user_candidates.get_column("item_id"),
            "score": user_scores,
        },
        schema=score_schema,
    )
    predicted_scores.append(user_score_frame)

In [227]:
ranked_pred = test_with_features.join(pl.concat(predicted_scores), on=["user_id", "item_id"], how="inner")

In [228]:
# Order rows by user and score, both descending, then number each user's rows from 1
# in that order to obtain the re-ranked position.
new_rank_expr = (pl.col("item_id").cumcount() + 1).over("user_id").alias("new_rank")
ranked_pred = ranked_pred.sort(["user_id", "score"], descending=True).with_columns(new_rank_expr)

In [229]:
ranked_pred.head()

user_id,item_id,rank,genres,year,age,sex,score,new_rank
u32,u32,u32,str,str,str,str,f32,u32
159595,12699,9,"""критика""","""1911""","""18_24""","""unknown""",0.429674,1
159595,86073,8,"""книги для дете…","""unknown""","""18_24""","""unknown""",0.280971,2
159595,229850,3,"""общая история""","""1990""","""18_24""","""unknown""",0.133104,3
159595,237005,4,"""городское фэнт…","""2017""","""18_24""","""unknown""",0.116347,4
159595,178771,2,"""городское фэнт…","""2018""","""18_24""","""unknown""",-0.042737,5


In [230]:
# Present the re-ranked list with the ranker's position under the column name "rank",
# then join it with the true test interactions for metric computation.
reranked_recs = ranked_pred.select(
    pl.col(["user_id", "item_id"]),
    pl.col("new_rank").alias("rank"),
)
joined_rank_data = join_true_pred_and_preprocess(test, reranked_recs)

In [231]:
mean_average_prec(joined_rank_data)

0.007476633439598811

In [232]:
baseline_map

0.00948556606380896