In [35]:
from datetime import timedelta
import numpy as np
import polars as pl
import pandas as pd
import implicit

from tools import load_data_actions, generate_lightfm_recs_mapper
from tqdm import tqdm

from lightfm.data import Dataset
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel
)

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score

import mlflow
from mlflow.models import infer_signature

In [2]:
def dataframe2rectools(df):
    return (
        df[["cookie", "node", "event_date"]]
        .with_columns(pl.lit(1).alias('weight'))
        .rename({
            "cookie": Columns.User,
            "node": Columns.Item,
            "weight": Columns.Weight,
            "event_date": Columns.Datetime,
        })
    ).to_pandas()

In [3]:
SEED = 42
top_N = 40
DATA_DIR = 'data/'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features_preproc_20.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

df_train, df_eval = load_data_actions(df_clickstream, df_event)

mean_by_cat = (
    df_text_features
    .join(df_cat_features["item", "node"], on="item", how="left")
    .to_pandas()
    .groupby("node")["title_projection"]
    .apply(lambda vs: np.mean(np.stack(vs.values), axis=0).tolist())
    .reset_index(name="mean_title_projection")
)

addit_features = pl.read_parquet(DATA_DIR+"cat_features_preproc_20.pq")
atr_cols = [i for i in addit_features.columns if "attr" in i]
addit_features = (
    addit_features.group_by("node")
    .agg([pl.col(atr).mean() for atr in atr_cols])
)

item_features = pl.DataFrame(mean_by_cat)
item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")
item_features = item_features.join(addit_features, how="left", on="node")
item_features = item_features.rename({"node": "id"})
item_features = item_features.rename({"id": "node"})

df_train = dataframe2rectools(df_train)

  item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")


In [4]:
candidates_ALS = pl.read_csv('data/first_stage_candidates_ALS_200.csv')
candidates_ANN = pl.read_csv('data/first_stage_prediction_ANN_40.csv')
candidates_BM25 = pl.read_csv('data/first_stage_prediction_BM25_40.csv')

In [5]:
candidates = pl.concat([candidates_ALS, candidates_BM25, candidates_ANN])
candidates = candidates.unique(subset=["cookie", "node"], maintain_order=True)
candidates = candidates.to_pandas()
candidates['rank'] = candidates.groupby('cookie').cumcount() + 1 

In [6]:
df_eval = df_eval.to_pandas()
df_eval = df_eval.astype({"node": int})

In [7]:
# positive interactions
pos = candidates.merge(
    df_eval,
    on=['cookie', 'node'],
    how='inner'
    )

pos.loc[:, 'target'] = 1
print(pos.shape)
# negative interactions
neg = candidates.set_index(['cookie', 'node'])\
        .join(df_eval.set_index(['cookie', 'node']))

neg = neg[neg['event'].isnull()].reset_index()     
neg = neg.sample(frac=0.03)
neg.loc[:, 'target'] = 0

print(neg.shape)


# train test
ctb_train_users, ctb_test_users = train_test_split(
    df_eval['cookie'].unique(),
    random_state=SEED,
    test_size=0.2
    )
# train eval
# ctb_train_users, ctb_eval_users = train_test_split(
#     ctb_train_users,
#     random_state=SEED,
#     test_size=0.1
#     )

select_col = ["cookie", "node", "rank", 'target']


# Catboost train
ctb_train = shuffle(
    pd.concat([
        pos[pos['cookie'].isin(ctb_train_users)],
        neg[neg['cookie'].isin(ctb_train_users)]
])[select_col]
)
# Catboost test
ctb_test = shuffle(
    pd.concat([
        pos[pos['cookie'].isin(ctb_test_users)],
        neg[neg['cookie'].isin(ctb_test_users)]
])[select_col]
)
# # for early stopping
# ctb_eval = shuffle(
#     pd.concat([
#         pos[pos['cookie'].isin(ctb_eval_users)],
#         neg[neg['cookie'].isin(ctb_eval_users)]
# ])[select_col]
# )

(47261, 5)
(413161, 5)


In [8]:
ctb_train['target'].value_counts(normalize=True), ctb_test['target'].value_counts(normalize=True)

(target
 0    0.897621
 1    0.102379
 Name: proportion, dtype: float64,
 target
 0    0.896279
 1    0.103721
 Name: proportion, dtype: float64)

In [11]:
item_features = item_features.to_pandas()
user_col = ['cookie']
item_col = item_features.columns

train_feat = (
    ctb_train
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left')
)
# eval_feat = (
#     ctb_eval
#     # .merge(
#     #     users[user_col],
#     #     on=['user_id'],
#     #     how='left')
#     .merge(
#         item_features[item_col],
#         on=['node'],
#         how='left')
# )
test_feat = (
    ctb_test
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left'
        )
)
train_feat.isna().sum().sum(), test_feat.isna().sum().sum()

AttributeError: 'DataFrame' object has no attribute 'to_pandas'

In [12]:
train_feat = train_feat.sort_values(by=["cookie", "target", "rank"], ascending=[True, False, True])
# eval_feat = eval_feat.sort_values(by=["cookie", "target", "rank"], ascending=[True, False, True])
test_feat = test_feat.sort_values(by=["cookie", "target", "rank"], ascending=[True, False, True])

In [13]:
drop_col = ['cookie', 'node']
target_col = ['target']
cat_col = [f"field_{i}" for i in range(64)]

X_train, y_train = train_feat.drop(drop_col + target_col, axis=1), train_feat[target_col]
train_cookies = train_feat["cookie"]
# X_val, y_val = eval_feat.drop(drop_col + target_col, axis=1), eval_feat[target_col]
# val_cookies = eval_feat["cookie"]
X_test, y_test = test_feat.drop(drop_col + target_col, axis=1), test_feat['target']
test_cookies = test_feat["cookie"]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((368512, 85), (368512, 1), (91910, 85), (91910,))

In [15]:
from catboost import Pool
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=train_cookies,
    cat_features=["rank"]
)
# val_pool = Pool(
#     data=X_val,
#     label=y_val,
#     group_id=val_cookies
# )
test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=test_cookies,
    cat_features=["rank"]
)

In [16]:
from catboost import CatBoostClassifier, CatBoostRanker
from copy import deepcopy

# параметры для обучения
est_params = {
  'subsample': 0.9,
  'max_depth': 4,
  'n_estimators': 5000,
  'learning_rate': 0.03,
  'thread_count': 20,
  'random_state': SEED,
  'verbose': 200,
#   "loss_function":'YetiRankPairwise',
#  "eval_metric":'NDCG'
}

ctb_model = CatBoostClassifier(**est_params)


default_parameters = {
    'iterations': 5000,
    'custom_metric': ['NDCG', 'AUC'],
    'verbose': 100,
    # "loss_function":'YetiRankPairwise',
    'thread_count': 20,
    'learning_rate': 0.1,
    'max_depth': 4,
    'random_seed': SEED,
}

parameters = {}

def fit_model(loss_function, additional_params=None, train_pool=None, test_pool=None):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True, early_stopping_rounds=100,)

    return model

In [18]:
model = fit_model(
    'RMSE', 
    {'custom_metric': ['PrecisionAt:top=40', 'RecallAt:top=40', 'MAP:top=40', 'AUC']},
    train_pool=train_pool,
    test_pool=test_pool
    )



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.3014027	test: 0.3031835	best: 0.3031835 (0)	total: 26ms	remaining: 2m 9s
100:	learn: 0.2891574	test: 0.2908572	best: 0.2908572 (100)	total: 2.62s	remaining: 2m 7s
200:	learn: 0.2878517	test: 0.2898062	best: 0.2898062 (200)	total: 5.09s	remaining: 2m 1s
300:	learn: 0.2870966	test: 0.2893645	best: 0.2893645 (300)	total: 7.66s	remaining: 1m 59s
400:	learn: 0.2865652	test: 0.2891557	best: 0.2891557 (400)	total: 10.2s	remaining: 1m 57s
500:	learn: 0.2861316	test: 0.2890014	best: 0.2890014 (500)	total: 12.7s	remaining: 1m 54s
600:	learn: 0.2857741	test: 0.2889353	best: 0.2889353 (600)	total: 15.3s	remaining: 1m 51s
700:	learn: 0.2854879	test: 0.2888529	best: 0.2888529 (700)	total: 17.8s	remaining: 1m 49s
800:	learn: 0.2852241	test: 0.2887887	best: 0.2887887 (800)	total: 20.5s	remaining: 1m 47s
900:	learn: 0.2849846	test: 0.2887726	best: 0.2887686 (846)	total: 23s	remaining: 1m 44s
1000:	learn: 0.2847254	test: 0.2887414	best: 0.2887405 (950)	total: 25.5s	remaining: 1m 41s
1100:	le

In [None]:
# 0.7585 # RMSE

0.7585

In [36]:
y_pred = model.predict(X_test)
signature = infer_signature(X_test, y_pred)



In [39]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("avito_cup_pers_recs")
with mlflow.start_run():
    roc_auc = roc_auc_score(y_test, y_pred)

    mlflow.log_params(model.get_all_params())
    mlflow.log_metric("roc_auc", roc_auc)

    mlflow.set_tag("Training Info", "Final CatboostRanker model of competition")

    model_info = mlflow.catboost.log_model(
        model, 
        "ctb_model_ranker", 
        signature=signature,
        registered_model_name="comp-final-ranker",
        )

    print(f"ROC AUC score = {roc_auc:.4f}")

Successfully registered model 'comp-final-ranker'.
2025/06/09 23:28:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: comp-final-ranker, version 1


ROC AUC score = 0.7571
🏃 View run mysterious-whale-163 at: http://127.0.0.1:5000/#/experiments/2/runs/ccf1f433a3b146b2a2244b7f0f776b45
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2


Created version '1' of model 'comp-final-ranker'.


In [73]:
# # save model 
# import dill 
# with open(f"ctb_model_ranker.dill", 'wb') as f:
#     dill.dump(ctb_model, f)

# Submission

In [None]:
# from catboost import CatBoostClassifier, CatBoostRanker

In [74]:
# import dill
# with open("ctb_model_baseline.dill", "rb") as f:
#     ctb_model = dill.load(f)

In [75]:
df_final = dataframe2rectools(df_clickstream[["cookie", "node", "event_date"]])

In [76]:
candidates_ALS = pl.read_csv('data/test_hybrid_stage_candidates_ALS_200.csv')
candidates_ANN = pl.read_csv('data/test_prediction_ANN_40.csv')
candidates_BM25 = pl.read_csv('data/test_hybrid_stage_candidates_BM25_40.csv')

In [77]:
candidates = pl.concat([candidates_ALS, candidates_BM25, candidates_ANN])
candidates = candidates.unique(subset=["cookie", "node"], maintain_order=True)
candidates = candidates.to_pandas()
candidates['rank'] = candidates.groupby('cookie').cumcount() + 1 

In [78]:
candidates.shape

(23032214, 3)

In [80]:
user_col = ['cookie']
drop_col = ['cookie', 'node']
target_col = ['target']
cat_col = [f"field_{i}" for i in range(64)]
item_col = item_features.columns
item_features = item_features.to_pandas()
final_feat = (
    candidates
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left')
)
X_final = final_feat.drop(drop_col, axis=1)

In [81]:
del item_features
del candidates

In [83]:
y_pred = model.predict(X_final)

In [84]:
del X_final

In [86]:
final_feat["score"] = y_pred
res = final_feat.sort_values(by=["cookie", "score"], ascending=[True, False]).groupby("cookie").head(40)
res = res[["cookie", "node"]].drop_duplicates()
res = pl.DataFrame(res)
addit = df_test_users.filter(pl.col("cookie").is_in(res["cookie"]) == False).with_columns(pl.lit(1).cast(pl.Int64).alias('node'))
addit

cookie,node
i64,i64


In [87]:
res.head()

cookie,node
i64,i64
1,57915
1,239954
1,198003
1,195257
1,199289


In [88]:
pl.concat([res, addit]).group_by("cookie").head(40)["cookie","node"].write_csv('results/prediction_hybrid_ranker_final.csv')