# Colab

In [None]:
# !git clone -b tree-ranking https://github.com/KernelA/education-recsys.git

In [None]:
# %cd /content/education-recsys/

In [None]:
# !pip install --no-cache-dir -r ./requirements.txt  kaggle

In [None]:
# %load_ext dotenv
# %dotenv

In [None]:
# !kaggle datasets download -d sharthz23/mts-library -p ./data --unzip 

In [83]:
import os

In [None]:
# Runtime switch: set to True when the notebook runs in Google Colab.
# It controls where artifacts are stored and whether Colab-only helpers are used.
is_colab = False

In [None]:
# Root directory for artifacts: the Drive mount point on Colab,
# the current working directory everywhere else.
remote_dir = "/content/drive" if is_colab else os.getcwd()

In [None]:
if is_colab:
    # Imported only on this branch, so the import is never attempted when is_colab is False.
    from google.colab import drive
    # Mount Google Drive at remote_dir.
    drive.mount(remote_dir)

In [None]:
# Experiment output directory: a Drive sub-folder on Colab, a local folder otherwise.
out_exp_dir = os.path.join(
    remote_dir,
    "MyDrive/EducationRecSys/tree-ranking" if is_colab else "catboost-dir",
)

In [None]:
if is_colab:
    from google.colab import output
    # Turn on Colab's support for custom (third-party) Jupyter widgets.
    output.enable_custom_widget_manager()

# Основной код

In [1]:
import os
from datetime import timedelta, datetime

import seaborn as sns
import polars as pl
import numpy as np
from implicit.nearest_neighbours import BM25Recommender
from catboost import Pool, CatBoostRanker, FeaturesData

In [2]:
# Use seaborn's "darkgrid" style for plots.
sns.set_style("darkgrid")

In [3]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, model_cross_validate, mean_average_prec, join_true_pred_and_preprocess
from recs_utils.load_data import MTSDataset, MovieLens100K
from recs_utils.split import TimeRangeSplit
from recs_utils.implicit_model import ImplicitRecommender
from recs_utils.utils import get_direct_and_inv_mapping
from recs_utils.ranking import add_features, add_group_ids, features_target_ranking, convert_to_features_data, features_target_pairwise_ranking

In [4]:
# Directory with the dataset CSV files, relative to the working directory.
data_dir = "data"

In [5]:
# Load the interactions, users and items tables from data_dir.
df = MTSDataset.load_interactions(os.path.join(data_dir, "interactions.csv"))
df_users = MTSDataset.load_users(os.path.join(data_dir, "users.csv"))
# NOTE(review): the meaning of 0.98 is defined by MTSDataset.select_genres
# (presumably a genre coverage/frequency threshold) — confirm before changing it.
df_items = MTSDataset.select_genres(MTSDataset.load_items(os.path.join(data_dir, "items.csv")), 0.98)

In [6]:
# Keep only interactions whose item is present in df_items and whose user is present in df_users.
df = df.filter(
    pl.col("item_id").is_in(df_items.get_column("item_id").unique())
    & pl.col("user_id").is_in(df_users.get_column("user_id").unique())
)

In [7]:
# Number of distinct items left after filtering.
num_items = df.get_column("item_id").n_unique()

In [8]:
# Report the number of distinct items before pruning to the most popular ones.
print("Num items:", num_items)

Num items: 59599


По причинам производительности берём только элементы с наибольшим количеством оценок

In [9]:
# Ids of the (at most) 30 000 items with the largest number of interactions.
selected_item_ids = (
    df.lazy()
    .groupby("item_id")
    .count()
    .top_k(min(30_000, num_items), by="count")
    .select(pl.col("item_id").unique())
    .collect()
    .to_series()
)

In [10]:
# Restrict the interactions to the selected most-interacted items.
df = df.filter(pl.col("item_id").is_in(selected_item_ids))

In [11]:
# The id list is no longer needed once df has been filtered; drop the name.
del selected_item_ids

In [12]:
# Number of interactions left after item pruning.
len(df)

1063269

In [13]:
# Preview the filtered interactions.
df.head()

user_id,item_id,progress,rating,start_date
u32,u32,u8,f32,date
126706,14433,80,,2018-01-01
127451,14876,69,,2018-01-01
47287,258483,22,,2018-01-01
47551,64060,55,,2018-01-01
59484,161786,27,,2018-01-01


In [14]:
# Preview the user metadata.
df_users.head()

user_id,age,sex
u32,cat,i8
1,"""45_54""",
2,"""18_24""",0.0
3,"""65_inf""",0.0
4,"""18_24""",0.0
5,"""35_44""",0.0


In [15]:
# Preview the first three rows of the item metadata.
df_items.head(n=3)

item_id,title,genres,authors,year
u32,str,str,cat,cat
128115,"""ворон-челобитч…","""зарубежная кла…","""Михаил Салтыко…","""1886"""
210979,"""скрипка ротшил…","""классическая п…","""Антон Чехов""","""1894"""
95632,"""испорченные де…","""зарубежная кла…","""Михаил Салтыко…","""1869"""


In [16]:
# Direct and inverse mappings built from the "user_id" column
# (presumably raw id <-> contiguous index; see get_direct_and_inv_mapping).
users_mapping, users_inv_mapping = get_direct_and_inv_mapping(df, "user_id")
# Size of the direct user mapping.
len(users_mapping)

135515

In [17]:
# Direct and inverse mappings built from the "item_id" column
# (presumably raw id <-> contiguous index; see get_direct_and_inv_mapping).
items_mapping, items_inv_mapping = get_direct_and_inv_mapping(df, "item_id")
# Size of the direct item mapping.
len(items_mapping)

30000

In [18]:
# The validation window covers the last `folds` weeks of the interaction log.
folds = 3
last_date = df.select(pl.col("start_date").max())[0, 0]
start_date = last_date - timedelta(weeks=folds)
start_date, last_date

(datetime.date(2019, 12, 10), datetime.date(2019, 12, 31))

In [19]:
# Time-based splitter starting at start_date, with `folds` folds and a one-week interval.
cv = TimeRangeSplit(start_date=start_date, folds=folds, interval=timedelta(weeks=1))
# Show the configured maximum number of splits next to the number reported for df.
cv.max_n_splits, cv.get_n_splits(df, datetime_column='start_date')

(3, 3)

In [20]:
# Materialise all folds; every element is a (train index, test index, statistics) triple.
folds_with_stats = [
    *cv.split(
        df,
        user_column="user_id",
        item_column="item_id",
        datetime_column="start_date",
        fold_stats=True,
    )
]

# Collect the per-fold statistics dictionaries into one table.
folds_info_with_stats = pl.from_dicts(
    [fold_stats for _train_part, _test_part, fold_stats in folds_with_stats]
)

In [21]:
# Per-fold split statistics.
folds_info_with_stats

Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
date,date,i64,i64,i64,i64,i64,i64,i64
2019-12-10,2019-12-17,1032812,72,101,0,0,0,9665
2019-12-17,2019-12-24,1042578,77,112,0,0,0,9595
2019-12-24,2019-12-31,1052285,61,83,0,0,0,9693


In [22]:
# Number of recommendations requested per test user.
top_N = 10
# Number of negative items sampled for every positive interaction.
num_neg_samples = 3

In [23]:
# Size of the candidate pool for negative sampling:
# at least twice the number of negatives drawn, and never smaller than top_N.
top_N_negative_sampling = max(num_neg_samples * 2, top_N)

In [24]:
# Work with the first fold only.
train_idx, test_idx, info = folds_with_stats[0]

# Select the interaction rows belonging to each part of the fold by (user_id, item_id).
train, test = (
    df.join(part_idx, on=["user_id", "item_id"], how="inner")
    for part_idx in (train_idx, test_idx)
)
train.shape, test.shape

((1032812, 5), (9665, 5))

In [25]:
# First-stage candidate generator: BM25 item-neighbour model wrapped by the project's
# ImplicitRecommender. K is set to the candidate pool size used for negative sampling.
recommender = ImplicitRecommender(
    BM25Recommender(K=top_N_negative_sampling),
    users_mapping, 
    items_mapping,
    items_inv_mapping,
    # rating_col_name="rating"
)

In [26]:
# Fit the first-stage recommender on the training part of the fold.
recommender.fit(train)



  0%|          | 0/30000 [00:00<?, ?it/s]

In [27]:
# Candidate pool for negative sampling: top_N_negative_sampling recommendations
# for every distinct user of the training part.
train_pred_neg_sampling = recommender.recommend(
    train.select(pl.col("user_id").unique()).to_series(), 
    n=top_N_negative_sampling)

In [30]:
# top_N first-stage recommendations for every distinct test user.
# Used as the baseline and as the candidate set that the ranker re-orders.
test_pred = recommender.recommend(
    test.select(pl.col("user_id").unique()).to_series(), 
    n=top_N)

In [31]:
# Mean average precision of the first-stage recommendations against the test interactions.
baseline_map = mean_average_prec(join_true_pred_and_preprocess(test, test_pred))

In [32]:
# Show the baseline MAP.
baseline_map

0.014352939513304056

In [33]:
# Sanity check: no user in the candidate pool has fewer than num_neg_samples candidates.
# NOTE(review): the sampling step later also discards the candidate equal to the positive
# item before drawing num_neg_samples without replacement, so a user with exactly
# num_neg_samples candidates could still come up short — confirm whether the recommender
# already excludes items the user has interacted with.
assert train_pred_neg_sampling.lazy().groupby("user_id").count().filter(pl.col("count") < num_neg_samples).collect().is_empty(), f"You must sample {num_neg_samples} for each user"

In [56]:
# Tag every positive interaction with a unique row index. It is used later as the
# ranking group id that ties a positive to its sampled negatives.
# Guarded so that re-running this cell does not fail on an already existing "pair_id" column.
if "pair_id" not in train.columns:
    train = train.with_row_count("pair_id")

In [60]:
# Build negative examples for every positive training interaction:
#   1. join each positive row with all first-stage candidates of the same user;
#   2. drop the candidate that equals the positive item of that row;
#   3. per positive (identified by pair_id) draw num_neg_samples candidates
#      without replacement, using a fixed seed;
#   4. explode to one row per negative and label it with target = 0.
# NOTE(review): only the row's own item is excluded in step 2; candidates that are other
# positives of the same user are not filtered here — confirm the recommender excludes them.
neg_train = train.lazy().join(train_pred_neg_sampling.lazy().select(pl.col(["user_id", "item_id"])), on="user_id", suffix="_neg").filter(
    pl.col("item_id") != pl.col("item_id_neg")
).groupby(
    ["user_id", "item_id", "start_date", "pair_id"]
).agg(
    pl.col("item_id_neg").implode().flatten().sample(n=num_neg_samples, seed=1455, with_replacement=False).alias("neg_item_id")
).select(
    # The positive item id is dropped; the sampled negative takes its place below.
    pl.all().exclude("item_id")
).explode("neg_item_id").select(
    pl.col("user_id"),
    pl.col("neg_item_id").alias("item_id"),
    pl.col("start_date"),
    pl.col("pair_id"),
    pl.lit(0).cast(pl.Int8).alias("target")).collect()

In [61]:
# Preview the sampled negatives.
neg_train.head()

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
2,155763,2018-02-14,56793,0
2,34831,2018-02-14,56793,0
2,38183,2018-02-14,56793,0
2,155763,2019-02-26,596795,0
2,34831,2019-02-26,596795,0


In [62]:
# Positives (target = 1) stacked on top of the sampled negatives (target = 0);
# both parts share the same column layout.
train_full = train.select(
    pl.col(["user_id", "item_id", "start_date", "pair_id"]),
    pl.lit(1).cast(pl.Int8).alias("target"),
).vstack(neg_train)

In [65]:
# Inspect the first group: rows ordered by pair_id, then by target.
train_full.sort("pair_id", "target").head(n=6)

user_id,item_id,start_date,pair_id,target
u32,u32,date,u32,i8
126706,307855,2018-01-01,0,0
126706,99126,2018-01-01,0,0
126706,276863,2018-01-01,0,0
126706,14433,2018-01-01,0,1
127451,232360,2018-01-01,1,0
127451,14142,2018-01-01,1,0


In [66]:
# List groups that contain at most one row (i.e. no negative was attached).
train_full.groupby(["pair_id"]).count().filter(
    pl.col("count") <= 1)

pair_id,count
u32,u32


In [67]:
# Every pair_id group must contain at least two rows.
assert train_full.groupby(["pair_id"]).count().filter(
    pl.col("count") < 2).is_empty(), f"Expected at least 2 pairs: positive and negative (user_id, item_id, start_date) "

In [68]:
# df_users_features = df_users
# User features: "age" and "sex" as strings, missing values replaced by "unknown".
df_users_features = df_users.with_columns(
    [
        pl.col(feature_name).cast(str).fill_null("unknown")
        for feature_name in ("age", "sex")
    ]
)

In [69]:
# df_items_features = df_items.select(pl.all().exclude(["IMDb_URL", "title"])).with_columns(pl.col("release_date").dt.timestamp("us"))
# Item features: the id plus "genres" and "year" as strings with explicit placeholders for nulls.
df_items_features = df_items.select(
    pl.col("item_id"),
    pl.col("genres").cast(str).fill_null("genre_unknown"),
    pl.col("year").cast(str).fill_null("unknown"),
)

In [70]:
# Attach item features (keyed by item_id) and then user features (keyed by user_id).
# NOTE(review): the cell reassigns train_full, so running it twice applies add_features twice.
train_full = add_features(add_features(train_full, df_items_features, "item_id"), df_users_features, "user_id")

In [71]:
# Preview the training frame with features attached.
train_full.head(n=3)

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,date,u32,i8,str,str,str,str
126706,14433,2018-01-01,0,1,"""историческая л…","""2007""","""25_34""","""0"""
127451,14876,2018-01-01,1,1,"""зарубежные дет…","""1843""","""45_54""","""0"""
47287,258483,2018-01-01,2,1,"""боевая фантаст…","""2018""","""18_24""","""0"""


In [72]:
# Convert the interaction date into an integer timestamp in microseconds.
# Guarded by dtype so the cell can be re-run: once the column is Int64 the `.dt`
# namespace no longer applies and the conversion would raise.
if train_full.schema["start_date"] != pl.Int64:
    train_full = train_full.with_columns(pl.col("start_date").dt.timestamp("us"))

In [73]:
# Preview the frame after the timestamp conversion.
train_full.head()

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,i64,u32,i8,str,str,str,str
126706,14433,1514764800000000,0,1,"""историческая л…","""2007""","""25_34""","""0"""
127451,14876,1514764800000000,1,1,"""зарубежные дет…","""1843""","""45_54""","""0"""
47287,258483,1514764800000000,2,1,"""боевая фантаст…","""2018""","""18_24""","""0"""
47551,64060,1514764800000000,3,1,"""современные де…","""2002""","""35_44""","""unknown"""
59484,161786,1514764800000000,4,1,"""научная фантас…","""1989""","""45_54""","""1"""


In [74]:
# train_full = add_group_ids(train_full, ["user_id", "start_date"])

In [75]:
# Preview the training frame that is passed to the ranker input builder.
train_full.head()

user_id,item_id,start_date,pair_id,target,genres,year,age,sex
u32,u32,i64,u32,i8,str,str,str,str
126706,14433,1514764800000000,0,1,"""историческая л…","""2007""","""25_34""","""0"""
127451,14876,1514764800000000,1,1,"""зарубежные дет…","""1843""","""45_54""","""0"""
47287,258483,1514764800000000,2,1,"""боевая фантаст…","""2018""","""18_24""","""0"""
47551,64060,1514764800000000,3,1,"""современные де…","""2002""","""35_44""","""unknown"""
59484,161786,1514764800000000,4,1,"""научная фантас…","""1989""","""45_54""","""1"""


In [76]:
# Build the ranker inputs from train_full, using pair_id as the group id column.
# The result exposes features, target, timestamps and group_ids consumed by the Pool below.
ranker_train_info = features_target_ranking(train_full, group_id_col="pair_id")

In [77]:
# assert 2 * len(ranker_train_info.pairs) == ranker_train_info.features.get_object_count()

In [78]:
# Names of the features that will be fed to the ranker.
ranker_train_info.features.get_feature_names()

['genres', 'year', 'age', 'sex']

In [79]:
# CatBoost training pool: features, labels, per-object timestamps and group ids.
# Explicit pairs are not supplied (kept commented out below).
train_pool = Pool(
    data=ranker_train_info.features,
    label=ranker_train_info.target,
    timestamp=ranker_train_info.timestamps,
    # pairs=ranker_train_info.pairs,
    group_id=ranker_train_info.group_ids
)

In [None]:
# Directory passed to CatBoost as train_dir.
train_dir = os.path.join(out_exp_dir, "log")

In [None]:
# Create the directory (and parents) if it does not exist yet.
os.makedirs(train_dir, exist_ok=True)

In [80]:
# Second-stage ranker configuration.
# The metric string is a plain literal: the previous f-string had no placeholders.
ranker = CatBoostRanker(
    task_type="GPU",  # training is configured for a GPU device
    random_seed=3466,  # fixed seed for reproducibility
    metric_period=50,  # evaluate metrics every 50 iterations
    eval_metric="MAP:top=2",
    custom_metric=["RecallAt:top=1", "PrecisionAt:top=3"],
    iterations=800,
    train_dir=train_dir,
    loss_function="YetiRankPairwise",
    # max_ctr_complexity=3
)

In [None]:
# Location of the serialized ranker inside the experiment directory.
model_path = os.path.join(out_exp_dir, "model", "model.cbm")

In [None]:
# Make sure the parent directory of model_path exists before saving.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

In [81]:
# When True the ranker is retrained even if a saved model already exists at model_path.
force_train = True

In [82]:
# Train and persist the ranker unless a saved model exists and retraining is not forced;
# in that case load the saved model instead.
if force_train or not os.path.exists(model_path):
    ranker.fit(train_pool)
    ranker.save_model(model_path)
else:
    ranker.load_model(model_path)

Groupwise loss function. OneHotMaxSize set to 10


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	total: 1.05s	remaining: 13m 58s
50:	total: 46.5s	remaining: 11m 22s
100:	total: 2m 34s	remaining: 17m 47s
150:	total: 4m 27s	remaining: 19m 8s


KeyboardInterrupt: 

In [53]:
# Attach item and user features to the first-stage candidates of the test users.
# Join keys are passed as plain strings, exactly as for train_full, so that the train
# and test feature frames go through the same add_features call pattern.
test_with_features = add_features(
    add_features(test_pred, df_items_features, "item_id"),
    df_users_features,
    "user_id",
)

In [54]:
# Preview the test candidates with features attached.
test_with_features.head()

user_id,item_id,rank,release_date,genres,age,gender,occupation,zip_code
u32,u32,u32,i64,cat,i16,cat,cat,cat
13,15,6,822873600000000,"""drama""",47,"""M""","""educator""","""29206"""
54,125,5,836006400000000,"""drama,romance""",22,"""M""","""executive""","""66315"""
54,300,2,852076800000000,"""action,thrille…",22,"""M""","""executive""","""66315"""
54,323,10,855273600000000,"""action,thrille…",22,"""M""","""executive""","""66315"""
102,237,6,850435200000000,"""drama,romance""",38,"""M""","""programmer""","""30220"""


In [55]:
# Attaching features must not change the number of candidate rows; compare both lengths.
len(test_pred), len(test_with_features)

(440, 440)

In [56]:
# Score every candidate row with a single batch call.
# Predicting per user group and concatenating the results lines up with the rows of
# test_with_features only when each user's rows are contiguous; otherwise the scores
# would be attached to the wrong rows when they are joined horizontally afterwards.
# One predict call over the whole frame returns scores in the frame's own row order
# and avoids one model call per user.
# NOTE(review): only "rank" and "item_id" are excluded here, while the training features
# are listed as ['genres', 'year', 'age', 'sex'] — confirm that convert_to_features_data
# also leaves "user_id" out of the model input.
features = convert_to_features_data(test_with_features, ["rank", "item_id"])
# Kept as a list of arrays so that np.concatenate(scores, axis=-1) keeps working downstream.
scores = [ranker.predict(features)]

In [57]:
# Append the predicted scores as a "score" column.
# This is a positional (horizontal) concat, so the scores must be in the same row order
# as test_with_features.
ranked_pred = pl.concat([test_with_features, pl.DataFrame({"score": np.concatenate(scores, axis=-1)})], how="horizontal")

In [58]:
# Order candidates by user and by descending score, then number them 1..n within each user:
# the running count over the "user_id" window becomes the re-ranked position "new_rank".
ranked_pred = ranked_pred.sort(["user_id", "score"], descending=True).with_columns(
    (pl.col("item_id").cumcount() + 1).over("user_id").alias("new_rank")
)

In [59]:
# Preview the re-ranked candidates.
ranked_pred.head()

user_id,item_id,rank,release_date,genres,age,gender,occupation,zip_code,score,new_rank
u32,u32,u32,i64,cat,i16,cat,cat,cat,f64,u32
903,176,10,504921600000000,"""action,sci-fi,…",28,"""M""","""educator""","""20850""",-2.911295,1
903,117,6,834105600000000,"""action,adventu…",28,"""M""","""educator""","""20850""",-3.308877,2
903,69,7,757382400000000,"""comedy,romance…",28,"""M""","""educator""","""20850""",-3.569425,3
903,172,3,315532800000000,"""action,adventu…",28,"""M""","""educator""","""20850""",-3.719976,4
903,56,2,757382400000000,"""crime,drama""",28,"""M""","""educator""","""20850""",-4.752843,5


In [60]:
# Evaluate the re-ranked list with the same helper as the baseline:
# new_rank is exposed under the column name "rank".
joined_rank_data = join_true_pred_and_preprocess(test, ranked_pred.select(
    pl.col(["user_id", "item_id"]),
    pl.col("new_rank").alias("rank")
))

In [61]:
# MAP of the re-ranked recommendations.
mean_average_prec(joined_rank_data)

0.13284188946922712

In [62]:
# Baseline MAP of the first-stage recommendations, shown for comparison with the re-ranked MAP.
baseline_map

0.1390233073839528