### Полный датасет, без embeddings

In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_roc_auc import custom_roc_auc
from catboost import CatBoostRanker, Pool
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [81]:
file_path_train_interactions = "../data/train_interactions.parquet"
file_path_users_meta = "../data/users_meta.parquet.parquet"
file_path_items_meta = "../data/items_meta.parquet.parquet"
file_path_test_pair = "../data/test_pairs.csv.csv"

print("Загружаем данные...")
train = pd.read_parquet(file_path_train_interactions)
# Первые N строк
N = 3_000_000
train = train.head(N)

print("Обрабатываем данные...")
train_i = pd.read_parquet(file_path_items_meta)
train_u = pd.read_parquet(file_path_users_meta)
train_i = train_i.drop(columns='embeddings', errors='ignore')
train = train.merge(train_i, how='left', on='item_id', sort=False)
train = train.merge(train_u, how='left', on='user_id', sort=False)
train['engagement_ratio'] = train['timespent'] / train['duration']
train.head()

Загружаем данные...
Обрабатываем данные...


Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,source_id,duration,gender,age,engagement_ratio
0,3810,138979,6,0,0,0,0,4278,54,1,36,0.111111
1,101874,331160,6,0,0,0,0,2049,6,2,52,1.0
2,150332,73709,11,0,0,0,0,16375,16,1,24,0.6875
3,4982,189745,5,0,0,0,0,166,25,1,40,0.2
4,149601,289643,1,0,0,1,0,1459,23,1,34,0.043478


In [83]:
# Формирование таргета и фичей. Разбивка датасета.
# user_ids - массив со всеми уникальными пользователями.

df = train.copy()  # Создаем копию, чтобы не изменять исходный DataFrame
df['target'] = np.select(
    [df['like'] == 1, df['dislike'] == 1],
    [1, -1],
    default=0
)

features = ['timespent', 'share', 'bookmarks', 'source_id', 'duration',
            'gender', 'age', 'engagement_ratio']

user_ids = df['user_id'].values
X = df[features]
y = df['target'].values

X_train, X_test, y_train, y_test, user_ids_train, user_ids_test =\
    train_test_split(X, y, user_ids, test_size=0.2, random_state=42)

Простейшая реализация CatBoost

In [84]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)
user_ids_train

array([ 83735, 121792,  31437, ..., 154351, 132277,  54535], dtype=uint32)

In [90]:
df.columns()

TypeError: 'Index' object is not callable

In [85]:
# Группировка и сортировка данных для CatBoost
# Временные переменные

X = X_train
y = y_train
group_id = user_ids_train

# Сортировка по user_id
sorted_idx = np.argsort(group_id)
X = X.iloc[sorted_idx].reset_index(drop=True)
y = np.array(y)[sorted_idx]
group_id = np.array(group_id)[sorted_idx]

# group_sizes - массив, содержащий кол-во объектов в каждой группе.
# То есть, количество роликов у каждого пользователя
group_sizes = (
    pd.Series(group_id)
    .value_counts()
    .sort_index()
    .values
)


In [86]:
model = CatBoostRanker(
    loss_function='YetiRank',
    task_type='CPU',
    iterations=1,
    reg_lambda=24,
    depth=5,
    min_child_samples=1,
    random_state=17,
    learning_rate=0.05,
    early_stopping_rounds=100,
    verbose=100
)

model.fit(
    X, y,
    group_id=group_id
)


0:	total: 765ms	remaining: 0us


<catboost.core.CatBoostRanker at 0x23d9af66c90>

In [87]:
y_pred = model.predict(X_test)
print(custom_roc_auc(y_pred, y_test, group_sizes))



0.6730139462289033


In [59]:
y_pred_train = model.predict(X_train)

In [68]:
print(custom_roc_auc(y_pred, y_test, group_sizes))
print(custom_roc_auc(y_pred_train, y_train, group_sizes))

0.7437825076799527
0.741833746021551


In [64]:
# Перебор гиперпараметров

param_grid = {
    'depth': [5],
    'learning_rate': [0.05],
    'reg_lambda': [10, 24],
    'iterations': [1, 10, 20],
}

combinations = list(product(*param_grid.values()))
param_names = list(param_grid.keys())

for combo in combinations:
    params = dict(zip(param_names, combo))

    print(f"📦 Testing params: {params}")

    model = CatBoostRanker(
        loss_function='YetiRank',
        task_type='CPU',
        early_stopping_rounds=100,
        verbose=False,
        random_state=17,
        min_child_samples=1,
        **params
    )

    model.fit(X, y, group_id=group_id)

    y_pred = model.predict(X_test)
    score = custom_roc_auc(y_pred, y_test, user_ids_test)

    print(f"✅ AUC score: {score:.6f}\n")


📦 Testing params: {'depth': 5, 'learning_rate': 0.05, 'reg_lambda': 10, 'iterations': 1}
✅ AUC score: 0.747202

📦 Testing params: {'depth': 5, 'learning_rate': 0.05, 'reg_lambda': 10, 'iterations': 10}
✅ AUC score: 0.743306

📦 Testing params: {'depth': 5, 'learning_rate': 0.05, 'reg_lambda': 10, 'iterations': 20}
✅ AUC score: 0.741273

📦 Testing params: {'depth': 5, 'learning_rate': 0.05, 'reg_lambda': 24, 'iterations': 1}
✅ AUC score: 0.746867

📦 Testing params: {'depth': 5, 'learning_rate': 0.05, 'reg_lambda': 24, 'iterations': 10}
✅ AUC score: 0.743795

📦 Testing params: {'depth': 5, 'learning_rate': 0.05, 'reg_lambda': 24, 'iterations': 20}
✅ AUC score: 0.741309



## Метрики

In [83]:
# Игнорируем класс игнора, только лайки и дизлайки
from sklearn.metrics import roc_auc_score
import numpy as np

y_true = y_test #— реальные метки: -1, 0, 1
y_score = y_pred #— предсказания модели от -1 до 1
# Исключим 0

mask = y_true != 0
filtered_y_true = y_true[mask]
filtered_y_score = y_score[mask]

# Преобразуем классы: -1 -> 0, 1 -> 1
binary_y_true = (filtered_y_true == 1).astype(int)

# ROC AUC
roc_auc = roc_auc_score(binary_y_true, filtered_y_score)
print("ROC AUC:", roc_auc)


ROC AUC: 0.5008608012625085
