In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [3]:
np.random.seed(42)
num_users = 1000
num_products = 500
num_categories = 20
num_interactions = 50000

data = []
purchased_products = set()  # Храним product_id купленных товаров

for _ in range(num_interactions):
    user_id = np.random.randint(1, num_users + 1)
    product_id = np.random.randint(1, num_products + 1)
    category = np.random.randint(1, num_categories + 1)
    price = np.round(np.random.uniform(10, 1000), 2)

    # Проверяем, не куплен ли уже товар (любым пользователем)
    already_purchased = product_id in purchased_products

    viewed = np.random.choice([0, 1], p=[0.3, 0.7])
    bought = 0

    if viewed and not already_purchased:
        if len(purchased_products) < 0.7 * num_products:
            bought = np.random.choice([0, 1], p=[0.9, 0.1])  # 10% шанс покупки
            if bought:
                purchased_products.add(product_id)  # Запоминаем покупку товара

    data.append([user_id, product_id, category, price, viewed, bought])

df = pd.DataFrame(data, columns=[
    'user_id',
    'product_id',
    'category',
    'price',
    'viewed',
    'bought'
])
df

Unnamed: 0,user_id,product_id,category,price,viewed,bought
0,103,436,15,734.67,1,0
1,467,215,11,464.66,1,0
2,131,150,2,724.78,1,0
3,956,277,1,311.20,1,0
4,561,475,10,56.20,1,0
...,...,...,...,...,...,...
49995,639,125,16,514.49,1,0
49996,135,125,14,969.21,1,0
49997,581,450,2,533.90,0,0
49998,478,481,2,654.68,1,0


In [4]:
# Признаки пользователя
user_features = df.groupby('user_id').agg({
    'viewed': 'sum',
    'bought': 'sum',
    'price': 'mean'
}).reset_index()
user_features.columns = ['user_id', 'user_views', 'user_purchases', 'user_avg_price']

# Признаки товара
product_features = df.groupby('product_id').agg({
    'viewed': 'sum',
    'bought': 'sum',
    'price': 'mean'
}).reset_index()
product_features.columns = ['product_id', 'product_views', 'product_purchases', 'product_avg_price']

# Объединение признаков
df = df.merge(user_features, on='user_id')
df = df.merge(product_features, on='product_id')
df

Unnamed: 0,user_id,product_id,category,price,viewed,bought,user_views,user_purchases,user_avg_price,product_views,product_purchases,product_avg_price
0,103,436,15,734.67,1,0,37,0,455.107500,62,1,527.441915
1,467,215,11,464.66,1,0,37,1,508.220784,73,0,515.901818
2,131,150,2,724.78,1,0,25,0,539.808140,81,1,386.159474
3,956,277,1,311.20,1,0,30,0,451.396667,72,1,526.080521
4,561,475,10,56.20,1,0,41,0,539.432321,77,1,514.217115
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,639,125,16,514.49,1,0,25,1,519.315897,66,1,480.845217
49996,135,125,14,969.21,1,0,31,0,494.881064,66,1,480.845217
49997,581,450,2,533.90,0,0,30,0,530.877551,63,1,511.678500
49998,478,481,2,654.68,1,0,37,1,454.162955,60,1,573.988791


In [5]:
X = df[['user_id', 'product_id', 'category', 'price',
        'user_views', 'user_purchases', 'user_avg_price',
        'product_views', 'product_purchases', 'product_avg_price']]
y = df['bought']

# Указание категориальных признаков
cat_features = ['user_id', 'product_id', 'category']
X

Unnamed: 0,user_id,product_id,category,price,user_views,user_purchases,user_avg_price,product_views,product_purchases,product_avg_price
0,103,436,15,734.67,37,0,455.107500,62,1,527.441915
1,467,215,11,464.66,37,1,508.220784,73,0,515.901818
2,131,150,2,724.78,25,0,539.808140,81,1,386.159474
3,956,277,1,311.20,30,0,451.396667,72,1,526.080521
4,561,475,10,56.20,41,0,539.432321,77,1,514.217115
...,...,...,...,...,...,...,...,...,...,...
49995,639,125,16,514.49,25,1,519.315897,66,1,480.845217
49996,135,125,14,969.21,31,0,494.881064,66,1,480.845217
49997,581,450,2,533.90,30,0,530.877551,63,1,511.678500
49998,478,481,2,654.68,37,1,454.162955,60,1,573.988791


Unnamed: 0,product_purchases
count,50000.0
mean,0.70414
std,0.456433
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [6]:
# 4. Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 5. Создание CatBoost Pool
train_pool = Pool(
    X_train,
    y_train,
    cat_features=cat_features
)

test_pool = Pool(
    X_test,
    y_test,
    cat_features=cat_features
)

# 6. Обучение модели
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    early_stopping_rounds=20
)

model.fit(
    train_pool,
    eval_set=test_pool,
    verbose=10
)

0:	test: 0.8992044	best: 0.8992044 (0)	total: 186ms	remaining: 1m 32s
10:	test: 0.8986700	best: 0.9153345 (1)	total: 613ms	remaining: 27.3s
20:	test: 0.8960344	best: 0.9153345 (1)	total: 1.07s	remaining: 24.5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9153344842
bestIteration = 1

Shrink model to first 2 iterations.


<catboost.core.CatBoostClassifier at 0x79e830b36050>

In [7]:
y_pred_proba = model.predict_proba(test_pool)[:, 1]
y_pred = model.predict(test_pool)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      9930
           1       0.00      0.00      0.00        70

    accuracy                           0.99     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.99      0.99      0.99     10000


ROC-AUC Score: 0.9153344842468709


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
def generate_recommendations(user_id, top_n=5):
    user_data = X[X['user_id'] == user_id].copy()
    purchased_products = df[df['bought'] == 1]['product_id'].unique()

    # Создаем датафрейм всех товаров, исключая купленные
    all_products = pd.DataFrame({'product_id': range(1, num_products + 1)})
    available_products = all_products[~all_products['product_id'].isin(purchased_products)]

    if available_products.empty:
        return pd.DataFrame(columns=['product_id', 'probability'])  # Нет доступных товаров

    recs = user_data[['user_id', 'product_id', 'category', 'price',
        'user_views', 'user_purchases', 'user_avg_price',
        'product_views', 'product_purchases', 'product_avg_price']]

    # Предсказание вероятностей
    pool = Pool(recs, cat_features=cat_features)
    probabilities = model.predict_proba(pool)[:, 1]
    recs['probability'] = probabilities
    fin = recs.sort_values('probability', ascending=False).head(top_n)[['product_id', 'probability']]

    # Возвращаем топ-N товаров
    return fin.reset_index(drop=True)

generate_recommendations(100)

Unnamed: 0,product_id,probability
0,287,0.406522
1,345,0.406522
2,109,0.404006
3,407,0.404006
4,67,0.404006
