# Описание
Этот ноутбук используется для обучения пяти моделей CatBoost, каждая из которых предсказывает категории на своём уровне иерархии.

# Import

In [1]:
import pandas as pd
import numpy as np

import networkx as nx

from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from catboost import CatBoostClassifier, Pool

import pickle

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Seed passed as random_state to the undersampler and the train/validation split.
RAND = 42

In [2]:
# Load the feature table and the category hierarchy.
# NOTE(review): the dataset file carries a .parquet extension but is parsed
# with read_csv - confirm the actual on-disk format.
df = pd.read_csv('final_dataset.parquet')
category_tree = pd.read_csv('category_tree.csv')

In [3]:
# Preview the category names column.
category_tree['cat_name']

0                                   Электроника
1                         Ноутбуки и компьютеры
2                               Бытовая техника
3                                Детские товары
4                            Дача, сад и огород
                         ...                   
1891       Аккумуляторы для мобильных телефонов
1892      Коннекторы и контейнеры для телефонов
1893             Корпусные детали для телефонов
1894          Клавиатуры и кнопки для телефонов
1895    Прочие запчасти для мобильных устройств
Name: cat_name, Length: 1896, dtype: object

In [4]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,306,307,308,309,310,311,pickup,delivery,guarantee,labels
0,-0.510629,0.099171,0.125018,-1.031065,-0.573534,0.528563,-0.356196,-0.251334,0.165545,0.034537,...,-0.229483,0.095974,-0.821279,-0.027559,0.199403,-1.176701,1,0,0,10501
1,-0.411902,0.308944,0.570843,-0.950997,-0.160174,0.546029,-0.228088,-0.644098,-0.581524,0.218997,...,-0.370437,0.598914,-0.085835,0.639326,0.7913,-0.577839,1,0,1,140
2,-0.375486,1.006588,1.110223,-1.658172,0.222705,0.20853,-0.198095,-0.289276,0.04166,0.378753,...,-0.22676,0.107944,-0.389043,0.408599,0.087426,-1.061347,1,1,0,1397
3,-0.101817,-0.057519,0.683319,-1.130178,-0.202833,0.138714,-0.025431,-0.187406,0.27403,0.569496,...,-0.226971,0.320086,-0.332852,-0.335904,0.324005,-0.755862,1,0,0,3645
4,-0.445797,0.672504,0.738099,-1.049344,0.012485,0.123203,-0.317788,0.001642,-0.86733,0.124729,...,-0.250246,-0.0724,0.193559,1.014038,0.597358,-0.658637,1,0,1,10421


# Level

In [8]:
# Build the category hierarchy as a directed graph with parent -> child edges.
# All edges are added in one call instead of iterating the frame row by row.
G = nx.DiGraph()
G.add_edges_from(zip(category_tree['parent_id'], category_tree['cat_id']))

# Terminal categories: nodes with no children and exactly one parent.
terminators = [
    node for node in G.nodes()
    if G.out_degree(node) == 0 and G.in_degree(node) == 1
]

print(terminators)
print(len(terminators))

[117, 152, 164, 169, 170, 173, 176, 177, 180, 182, 185, 188, 192, 196, 233, 235, 237, 277, 280, 283, 284, 287, 290, 327, 361, 390, 392, 404, 406, 417, 423, 438, 445, 446, 460, 465, 469, 1001, 1003, 1007, 1019, 1020, 1021, 1025, 1026, 1027, 1030, 1045, 1046, 1054, 1055, 1061, 1064, 1072, 1077, 1078, 1081, 1082, 1083, 1085, 1088, 1090, 1094, 1101, 1102, 1106, 1115, 1116, 1118, 1122, 1124, 1131, 1132, 1133, 1134, 1135, 1137, 1139, 1140, 1141, 1148, 1149, 1152, 1154, 1158, 1161, 1166, 1170, 1171, 1184, 1185, 1186, 1198, 1199, 1205, 1206, 1211, 1216, 1229, 1234, 1238, 1249, 1250, 1251, 1253, 1259, 1260, 1261, 1265, 1268, 1273, 1275, 1276, 1277, 1290, 1291, 1293, 1295, 1300, 1303, 1304, 1305, 1306, 1311, 1312, 1321, 1322, 1327, 1329, 1335, 1338, 1344, 1345, 1347, 1350, 1357, 1361, 1362, 1365, 1367, 1370, 1372, 1375, 1376, 1377, 1378, 1379, 1380, 1382, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1407, 1409, 1412, 1413, 1414, 1415, 1417, 1426, 1427, 1430, 1441, 1447, 1450, 1463

In [9]:
def get_category_path(cat_id: int, graph=None) -> list:
    """
    Return the path from the root category down to ``cat_id``.

    The walk follows the first predecessor of every node, so the hierarchy is
    assumed to give each category a single parent. Nodes already visited are
    never revisited, which keeps the walk finite even if the hierarchy
    contains a cycle or a self-referencing row.

    :param cat_id: identifier of the category to locate.
    :param graph: object supporting ``in`` and ``predecessors(node)``;
        defaults to the module-level graph ``G``.
    :return: node ids ordered from the root to ``cat_id``; an empty list when
        ``cat_id`` is not in the graph.
    """
    graph = G if graph is None else graph

    path = []
    visited = set()
    while cat_id in graph and cat_id not in visited:
        visited.add(cat_id)
        path.append(cat_id)  # record the current category
        parents = list(graph.predecessors(cat_id))
        if not parents:
            break  # reached a root
        cat_id = parents[0]  # step up to the (single) parent

    path.reverse()  # collected leaf -> root, returned root -> leaf
    return path


print(get_category_path(31423))  # example usage

[0, 2, 119, 1243, 14383, 31423]


In [10]:
# Keep only the rows of the category table that describe terminal categories.
is_terminal = category_tree['cat_id'].isin(terminators)
category_tree = category_tree[is_terminal]

In [11]:
# Resolve the root-to-label path once per distinct label and reuse it for
# every row that carries that label, instead of walking the graph per row.
# Rows with the same label share one list object; the lists are only read.
unique_paths = {
    label: get_category_path(label)
    for label in df['labels'].unique().tolist()
}
df['category_path'] = df['labels'].map(unique_paths.__getitem__)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,307,308,309,310,311,pickup,delivery,guarantee,labels,category_path
0,-0.510629,0.099171,0.125018,-1.031065,-0.573534,0.528563,-0.356196,-0.251334,0.165545,0.034537,...,0.095974,-0.821279,-0.027559,0.199403,-1.176701,1,0,0,10501,"[0, 3, 135, 1388, 10501]"
1,-0.411902,0.308944,0.570843,-0.950997,-0.160174,0.546029,-0.228088,-0.644098,-0.581524,0.218997,...,0.598914,-0.085835,0.639326,0.7913,-0.577839,1,0,1,140,"[0, 3, 140]"
2,-0.375486,1.006588,1.110223,-1.658172,0.222705,0.20853,-0.198095,-0.289276,0.04166,0.378753,...,0.107944,-0.389043,0.408599,0.087426,-1.061347,1,1,0,1397,"[0, 3, 137, 1397]"
3,-0.101817,-0.057519,0.683319,-1.130178,-0.202833,0.138714,-0.025431,-0.187406,0.27403,0.569496,...,0.320086,-0.332852,-0.335904,0.324005,-0.755862,1,0,0,3645,"[0, 2, 124, 3645]"
4,-0.445797,0.672504,0.738099,-1.049344,0.012485,0.123203,-0.317788,0.001642,-0.86733,0.124729,...,-0.0724,0.193559,1.014038,0.597358,-0.658637,1,0,1,10421,"[0, 2, 124, 1308, 10421]"


In [12]:
# Spread every path over per-depth columns: level_0 holds the root and
# level_1 .. level_5 the category levels (six columns in total). Paths that
# are shorter than a given depth yield None for that column.
def _node_at(depth):
    """Build a picker returning the path element at ``depth`` or None."""
    def pick(path):
        if depth < len(path):
            return path[depth]
        return None
    return pick


for depth in range(6):
    df[f"level_{depth}"] = df['category_path'].apply(_node_at(depth))

# The root column and the helper columns are not needed past this point.
df = df.drop(columns=['level_0', 'category_path', 'labels'])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,pickup,delivery,guarantee,level_1,level_2,level_3,level_4,level_5
0,-0.510629,0.099171,0.125018,-1.031065,-0.573534,0.528563,-0.356196,-0.251334,0.165545,0.034537,...,0.199403,-1.176701,1,0,0,3,135.0,1388.0,10501.0,
1,-0.411902,0.308944,0.570843,-0.950997,-0.160174,0.546029,-0.228088,-0.644098,-0.581524,0.218997,...,0.7913,-0.577839,1,0,1,3,140.0,,,
2,-0.375486,1.006588,1.110223,-1.658172,0.222705,0.20853,-0.198095,-0.289276,0.04166,0.378753,...,0.087426,-1.061347,1,1,0,3,137.0,1397.0,,
3,-0.101817,-0.057519,0.683319,-1.130178,-0.202833,0.138714,-0.025431,-0.187406,0.27403,0.569496,...,0.324005,-0.755862,1,0,0,2,124.0,3645.0,,
4,-0.445797,0.672504,0.738099,-1.049344,0.012485,0.123203,-0.317788,0.001642,-0.86733,0.124729,...,0.597358,-0.658637,1,0,1,2,124.0,1308.0,10421.0,


In [13]:
# Number of distinct categories observed at each hierarchy level.
for level_name in (f"level_{n}" for n in range(1, 6)):
    print(f"{level_name}:", df[level_name].nunique())

level_1: 23
level_2: 201
level_3: 710
level_4: 731
level_5: 144


# Modeling

In [14]:
def catbost_train(dataset: pd.DataFrame,
                  level: str = None,
                  max_samples_per_class: int = 100) -> CatBoostClassifier:
    """
    Train a multiclass CatBoostClassifier for one level of the category hierarchy.

    Rows are undersampled so that no class keeps more than
    ``max_samples_per_class`` examples, then split 80/20 into stratified
    train and validation parts. The fitted model is pickled to
    ``clf_cat_<level>.pkl`` (``clf_cat_1.pkl`` when ``level`` is None).

    :param dataset: frame holding the feature columns plus ``level_1`` ..
        ``level_5``.
    :param level: name of the target column (``'level_1'`` .. ``'level_5'``).
        With None the target is ``level_1`` and no level column is used as a
        feature. Otherwise rows without a value at ``level`` are dropped and
        the shallower level columns stay among the features.
    :param max_samples_per_class: upper bound on rows kept per class by the
        undersampler.
    :return clf_cat: the fitted model.
    :raises ValueError: if ``level`` is not one of the known level columns.
    """
    # torch is only needed to detect CUDA and free cached GPU memory; import
    # it lazily so training still runs on CPU when it is unavailable.
    try:
        import torch
        use_gpu = torch.cuda.is_available()
    except ImportError:
        torch = None
        use_gpu = False
    if use_gpu:
        torch.cuda.empty_cache()

    level_columns = ['level_1', 'level_2', 'level_3', 'level_4', 'level_5']

    if level is None:
        y_series = dataset['level_1']
        X = dataset.drop(level_columns, axis=1)
    else:
        if level not in level_columns:
            raise ValueError(
                f"level must be one of {level_columns} or None, got {level!r}"
            )
        # Only rows that actually reach this depth can be used as examples.
        dataset = dataset[dataset[level].notna()]
        y_series = dataset[level].astype(int)
        # Drop the target level and everything deeper; shallower levels stay
        # in X as features.
        X = dataset.drop(level_columns[level_columns.index(level):], axis=1)

    print(
        f"Датасет для обучения {level}: содержит {X.columns}, размер {X.shape}"
    )

    X.columns = X.columns.astype('str')

    # Cap every class at max_samples_per_class rows; smaller classes are kept
    # whole.
    sampling_strategy = {
        cls: min(count, max_samples_per_class)
        for cls, count in y_series.value_counts().items()
    }
    undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                      random_state=RAND)
    X_resampled, y_resampled = undersampler.fit_resample(X, y_series)

    X_train, X_val, y_train, y_val = train_test_split(X_resampled,
                                                      y_resampled,
                                                      test_size=0.2,
                                                      shuffle=True,
                                                      stratify=y_resampled,
                                                      random_state=RAND)

    # Balanced weights, one per class in the sorted order np.unique returns.
    # NOTE(review): the weights are passed positionally; presumably CatBoost
    # matches them to classes in the same sorted label order - confirm.
    class_labels = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced',
                                         classes=class_labels,
                                         y=y_train).tolist()

    # Validation data used for early stopping and best-model selection.
    eval_set = [(X_val, y_val)]
    train_pool = Pool(X_train, label=y_train)

    clf_cat = CatBoostClassifier(
        iterations=700,
        grow_policy='Lossguide',
        class_weights=class_weights,
        early_stopping_rounds=75,
        use_best_model=True,
        loss_function='MultiClass',
        task_type='GPU' if use_gpu else 'CPU',
    )

    clf_cat.fit(train_pool, eval_set=eval_set, verbose=10)

    # Persist the fitted model; the file name is kept as downstream code may
    # load it by this exact name.
    model_suffix = 1 if level is None else level
    with open(f'clf_cat_{model_suffix}.pkl', 'wb') as file:
        pickle.dump(clf_cat, file)

    return clf_cat

In [15]:
# Level 1 model: with level=None the target is level_1 and no level column is a feature.
clf_level_1 = catbost_train(df)

Датасет для обучения None: содержит Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '305', '306', '307', '308', '309', '310', '311', 'pickup', 'delivery',
       'guarantee'],
      dtype='object', length=315), размер (1501560, 315)
Learning rate set to 0.130698
0:	learn: 3.0286629	test: 3.0851849	best: 3.0851849 (0)	total: 63.1ms	remaining: 44.1s
10:	learn: 2.3661223	test: 2.7730448	best: 2.7730448 (10)	total: 356ms	remaining: 22.3s
20:	learn: 1.9300907	test: 2.5812529	best: 2.5812529 (20)	total: 667ms	remaining: 21.6s
30:	learn: 1.6370743	test: 2.4586813	best: 2.4586813 (30)	total: 966ms	remaining: 20.8s
40:	learn: 1.4123396	test: 2.3784371	best: 2.3784371 (40)	total: 1.25s	remaining: 20.1s
50:	learn: 1.2478041	test: 2.3259166	best: 2.3259166 (50)	total: 1.56s	remaining: 19.8s
60:	learn: 1.1239299	test: 2.2879336	best: 2.2879336 (60)	total: 1.85s	remaining: 19.4s
70:	learn: 1.0136211	test: 2.2550272	best: 2.2550272 (70)	total: 2.15s	remaining: 19s
80:	learn

In [16]:
# Level 2 model: level_1 stays among the features.
clf_level_2 = catbost_train(df, level='level_2')

Датасет для обучения level_2: содержит Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '306', '307', '308', '309', '310', '311', 'pickup', 'delivery',
       'guarantee', 'level_1'],
      dtype='object', length=316), размер (1500341, 316)
Learning rate set to 0.160336
0:	learn: 4.1529965	test: 4.1908476	best: 4.1908476 (0)	total: 263ms	remaining: 3m 4s
10:	learn: 2.5269935	test: 2.6870504	best: 2.6870504 (10)	total: 3.02s	remaining: 3m 9s
20:	learn: 2.0885290	test: 2.3432084	best: 2.3432084 (20)	total: 5.71s	remaining: 3m 4s
30:	learn: 1.7402846	test: 2.0597086	best: 2.0597086 (30)	total: 8.45s	remaining: 3m 2s
40:	learn: 1.4926689	test: 1.8861950	best: 1.8861950 (40)	total: 11.1s	remaining: 2m 58s
50:	learn: 1.3192151	test: 1.7681509	best: 1.7681509 (50)	total: 13.8s	remaining: 2m 55s
60:	learn: 1.1738786	test: 1.6702028	best: 1.6702028 (60)	total: 16.6s	remaining: 2m 53s
70:	learn: 1.0654787	test: 1.6064004	best: 1.6064004 (70)	total: 19.3s	remaining: 2m 5

In [17]:
# Level 3 model: level_1 and level_2 stay among the features.
clf_level_3 = catbost_train(df, level='level_3')

Датасет для обучения level_3: содержит Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '307', '308', '309', '310', '311', 'pickup', 'delivery', 'guarantee',
       'level_1', 'level_2'],
      dtype='object', length=317), размер (1467228, 317)
Learning rate set to 0.179904
0:	learn: 5.7655197	test: 5.7678677	best: 5.7678677 (0)	total: 2.74s	remaining: 31m 58s
10:	learn: 3.5896928	test: 3.6374364	best: 3.6374364 (10)	total: 30.1s	remaining: 31m 23s
20:	learn: 2.7135113	test: 2.8289683	best: 2.8289683 (20)	total: 56.5s	remaining: 30m 26s
30:	learn: 2.0667304	test: 2.2619912	best: 2.2619912 (30)	total: 1m 23s	remaining: 29m 58s
40:	learn: 1.7343655	test: 1.9891633	best: 1.9891633 (40)	total: 1m 49s	remaining: 29m 25s
50:	learn: 1.4426315	test: 1.7470598	best: 1.7470598 (50)	total: 2m 16s	remaining: 28m 56s
60:	learn: 1.2269050	test: 1.5594897	best: 1.5594897 (60)	total: 2m 43s	remaining: 28m 31s
70:	learn: 1.0652709	test: 1.4246001	best: 1.4246001 (70)	total: 3m

In [18]:
# Level 4 model: level_1 .. level_3 stay among the features.
clf_level_4 = catbost_train(df, level='level_4')

Датасет для обучения level_4: содержит Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '308', '309', '310', '311', 'pickup', 'delivery', 'guarantee',
       'level_1', 'level_2', 'level_3'],
      dtype='object', length=318), размер (773996, 318)
Learning rate set to 0.180063
0:	learn: 6.0838630	test: 6.0863393	best: 6.0863393 (0)	total: 2.86s	remaining: 33m 21s
10:	learn: 3.9640982	test: 3.9963937	best: 3.9963937 (10)	total: 31.4s	remaining: 32m 44s
20:	learn: 2.9585065	test: 3.0431668	best: 3.0431668 (20)	total: 59.3s	remaining: 31m 56s
30:	learn: 2.3269493	test: 2.4809287	best: 2.4809287 (30)	total: 1m 27s	remaining: 31m 23s
40:	learn: 1.8796897	test: 2.0953562	best: 2.0953562 (40)	total: 1m 55s	remaining: 30m 55s
50:	learn: 1.5662349	test: 1.8309299	best: 1.8309299 (50)	total: 2m 23s	remaining: 30m 28s
60:	learn: 1.3266059	test: 1.6289116	best: 1.6289116 (60)	total: 2m 52s	remaining: 30m 1s
70:	learn: 1.1496255	test: 1.4876303	best: 1.4876303 (70)	total: 

In [19]:
# Level 5 model: level_1 .. level_4 stay among the features.
clf_level_5 = catbost_train(df, level='level_5')

Датасет для обучения level_5: содержит Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '309', '310', '311', 'pickup', 'delivery', 'guarantee', 'level_1',
       'level_2', 'level_3', 'level_4'],
      dtype='object', length=319), размер (79166, 319)
Learning rate set to 0.152367
0:	learn: 4.6750998	test: 4.6917580	best: 4.6917580 (0)	total: 196ms	remaining: 2m 16s
10:	learn: 2.0707262	test: 2.1956810	best: 2.1956810 (10)	total: 2.09s	remaining: 2m 10s
20:	learn: 1.5360931	test: 1.6924465	best: 1.6924465 (20)	total: 3.86s	remaining: 2m 4s
30:	learn: 1.2147318	test: 1.4000116	best: 1.4000116 (30)	total: 5.59s	remaining: 2m
40:	learn: 0.9744825	test: 1.1986240	best: 1.1986240 (40)	total: 7.39s	remaining: 1m 58s
50:	learn: 0.7905934	test: 1.0458527	best: 1.0458527 (50)	total: 9.21s	remaining: 1m 57s
60:	learn: 0.6730136	test: 0.9517513	best: 0.9517513 (60)	total: 11s	remaining: 1m 55s
70:	learn: 0.5790376	test: 0.8734951	best: 0.8734951 (70)	total: 12.7s	remainin