### Полный датасет, без embeddings

In [15]:
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np
file_path_train_interactions = "../data/train_interactions.parquet"
file_path_users_meta = "../data/users_meta.parquet.parquet"
file_path_items_meta = "../data/items_meta.parquet.parquet"
file_path_test_pair = "../data/test_pairs.csv.csv"

print("Загружаем данные...")
train = pl.scan_parquet(file_path_train_interactions)
# Первые N строк
N = 3_000_000
train = train.slice(0, N)

print("Обрабатываем данные...")
train_i = pl.scan_parquet(file_path_items_meta)
train_u = pl.scan_parquet(file_path_users_meta)
train_i = train_i.drop('embeddings')
train = train.join(train_i, on='item_id')
train = train.join(train_u, on='user_id')
train = train.with_columns((pl.col('timespent') / pl.col('duration')).alias('engagement_ratio'))
print(train.collect().head())

Загружаем данные...
Обрабатываем данные...
shape: (5, 12)
┌─────────┬─────────┬───────────┬──────┬───┬──────────┬────────┬─────┬──────────────────┐
│ user_id ┆ item_id ┆ timespent ┆ like ┆ … ┆ duration ┆ gender ┆ age ┆ engagement_ratio │
│ ---     ┆ ---     ┆ ---       ┆ ---  ┆   ┆ ---      ┆ ---    ┆ --- ┆ ---              │
│ u32     ┆ u32     ┆ u8        ┆ u8   ┆   ┆ u8       ┆ u8     ┆ u8  ┆ f64              │
╞═════════╪═════════╪═══════════╪══════╪═══╪══════════╪════════╪═════╪══════════════════╡
│ 3810    ┆ 138979  ┆ 6         ┆ 0    ┆ … ┆ 54       ┆ 1      ┆ 36  ┆ 0.111111         │
│ 101874  ┆ 331160  ┆ 6         ┆ 0    ┆ … ┆ 6        ┆ 2      ┆ 52  ┆ 1.0              │
│ 150332  ┆ 73709   ┆ 11        ┆ 0    ┆ … ┆ 16       ┆ 1      ┆ 24  ┆ 0.6875           │
│ 4982    ┆ 189745  ┆ 5         ┆ 0    ┆ … ┆ 25       ┆ 1      ┆ 40  ┆ 0.2              │
│ 149601  ┆ 289643  ┆ 1         ┆ 0    ┆ … ┆ 23       ┆ 1      ┆ 34  ┆ 0.043478         │
└─────────┴─────────┴───────────┴──────┴──

In [21]:
train.collect().head()

user_id,item_id,timespent,like,dislike,share,bookmarks,source_id,duration,gender,age,engagement_ratio
u32,u32,u8,u8,u8,u8,u8,u32,u8,u8,u8,f64
3810,138979,6,0,0,0,0,4278,54,1,36,0.111111
101874,331160,6,0,0,0,0,2049,6,2,52,1.0
150332,73709,11,0,0,0,0,16375,16,1,24,0.6875
4982,189745,5,0,0,0,0,166,25,1,40,0.2
149601,289643,1,0,0,1,0,1459,23,1,34,0.043478


Простейшая реализация CatBoost

In [58]:
import polars as pl
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Целевая переменная
df = train.with_columns(
    pl.when(pl.col('like') == 1).then(1)
        .when(pl.col('dislike') == 1).then(-1)
    .otherwise(0)
    .alias('target')
).collect()

features = ['timespent', 'share', 'bookmarks', 'source_id', 'duration',
            'gender', 'age', 'engagement_ratio']

# Разделение данных(у Polar нет своего метода деления:()
user_ids = df.select("user_id").to_pandas()["user_id"].values
X = df.select(features).to_pandas()
y = df['target'].to_numpy()

X_train, X_test, y_train, y_test, user_ids_train, user_ids_test =\
    train_test_split(X, y, user_ids, test_size=0.2, random_state=42)

# cat_feature_indices = [features.index(cat) for cat in categorical_features]
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [None]:
data_pd = df.to_pandas()
data_pd.describe()

In [59]:
# Баланс классов
df.group_by("target").len().sort("target")

target,len
i32,u32
-1,1112
0,2859639
1,139249


In [76]:
params = {
        'iterations': 7,
        'depth': 6,
        'learning_rate': 0.1,
        'loss_function': 'PairLogit',  # Функция потерь для ранжирования
        'task_type': 'CPU',
        'thread_count': -1,
        'verbose': 100
}

model = CatBoostClassifier(
    iterations=20,
    depth=6,
    learning_rate=0.1,
    loss_function='MultiClass',
    verbose=10
)

model.fit(train_pool)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.9324951	total: 350ms	remaining: 6.65s
10:	learn: 0.3465008	total: 3.01s	remaining: 2.46s
19:	learn: 0.2248666	total: 5.25s	remaining: 0us
Accuracy: 0.9531866666666666


In [86]:
from catboost import CatBoostRanker
model = CatBoostRanker(
    loss_function='YetiRank',
    task_type='CPU',
    iterations=3000,
    reg_lambda=24,
    depth=8,
    min_child_samples=1,
    random_state=17,
    learning_rate=0.05,
    early_stopping_rounds=100,
    verbose=100
)

model.fit(train_pool)
y_pred = model.predict(X_test)

CatBoostError: catboost/private/libs/target/data_providers.cpp:280: Groupwise loss/metrics require nontrivial groups

## Метрики

In [83]:
# Игнорируем класс игнора, только лайки и дизлайки
from sklearn.metrics import roc_auc_score
import numpy as np

y_true = y_test #— реальные метки: -1, 0, 1
y_score = y_pred #— предсказания модели от -1 до 1
# Исключим 0

mask = y_true != 0
filtered_y_true = y_true[mask]
filtered_y_score = y_score[mask]

# Преобразуем классы: -1 -> 0, 1 -> 1
binary_y_true = (filtered_y_true == 1).astype(int)

# ROC AUC
roc_auc = roc_auc_score(binary_y_true, filtered_y_score)
print("ROC AUC:", roc_auc)


ROC AUC: 0.5008608012625085


In [82]:
# Кастомная метрика
from custom_roc_auc import custom_roc_auc
def call_user_auc(y_pred_metric, y_true_metric, group_ids):
    sort_idx = np.argsort(group_ids)
    y_pred_sorted = y_pred_metric[sort_idx]
    y_true_sorted = y_true_metric[sort_idx]
    group_ids_sorted = group_ids[sort_idx]
    _, group_sizes = np.unique(group_ids_sorted, return_counts=True)
    return custom_roc_auc(y_pred_sorted, y_true_sorted, group_sizes)

y_true = y_test #— реальные метки: -1, 0, 1
y_score = y_pred #— предсказания модели от -1 до 1
group_ids = user_ids_test

y_true = np.asarray(y_true, dtype=np.float64)
y_score = np.asarray(y_score, dtype=np.float64)
group_ids = np.asarray(group_ids, dtype=np.int32)

print(y_true.shape)  # Должен быть одномерный массив
print(y_score.shape)  # Должен быть одномерный массив
print(group_ids.shape)  # Должен быть одномерный массив

print(type(y_true), type(y_score), type(group_ids))

score = call_user_auc(y_score, y_true, group_ids)

(600000,)
(600000, 1)
(600000,)
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1m[1mNo implementation of function Function(<function argsort at 0x000001AB39A9E660>) found for signature:
 
 >>> argsort(array(float64, 2d, C))
 
There are 2 candidate implementations:
[1m      - Of which 2 did not match due to:
      Overload of function 'argsort': File: numba\core\typing\npydecl.py: Line 368.
        With argument(s): '(array(float64, 2d, C))':[0m
[1m       No match.[0m
[0m
[0m[1mDuring: resolving callee type: Function(<function argsort at 0x000001AB39A9E660>)[0m
[0m[1mDuring: typing of call at C:\Users\Mi\PycharmProjects\PythonProject\custom_roc_auc.py (20)
[0m
[1m
File "..\custom_roc_auc.py", line 20:[0m
[1mdef compute_group_auc(y_pred, y_true, group_indices):
    <source elided>
            
[1m        order = np.argsort(p)
[0m        [1m^[0m[0m

[0m[1mDuring: Pass nopython_type_inference[0m