# Мета-признаки и стекинг моделей для классификации

Этот проект демонстрирует подход к улучшению качества классификации через:
1. Генерацию мета-признаков с помощью кросс-валидации
2. Стекинг предсказаний отдельных моделей в финальную модель

## Решения

### 1. Генерация мета-признаков

Функция `generate_meta_features` реализует стратегию out-of-fold предсказаний:
- Использует Stratified K-Fold для сохранения распределения классов
- На каждом фолде обучает CatBoostClassifier на 4 частях данных
- Делает предсказания на 5-й части (out-of-fold)
- Объединяет предсказания со всех фолдов в мета-признак

### 2. Финальная модель

Объединяем:
- Табличные признаки (price_diff_log, text_similarity и др.)
- Мета-признаки от текстовой и image моделей

### 3. Ключевые особенности

- Стратифицированная кросс-валидация - сохраняет распределение классов в каждом фолде
- Балансировка классов - через class_weights в финальной модели
- GPU-ускорение - CatBoost использует GPU для обучения
- Ранняя остановка - предотвращает переобучение (early_stopping_rounds=100)

## Пути улучшения и развития проекта

### Мультимодельный стекинг
- **Использование нескольких моделей** для генерации мета-признаков (не только CatBoost)
- **Ансамбли моделей** (RandomForest, XGBoost, LightGBM) для каждого фолда
- **Нейросетевые подходы** для текста/изображений (BERT, CNN)

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier, cv
from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

RAND = 42

In [None]:
data = pd.read_parquet('all_data.parquet')
data = data.rename(columns={'base_title_emb': 'base_text_emb', 'cand_title_emb': 'cand_text_emb'})

In [5]:
data.head()

Unnamed: 0,base_item_id,cand_item_id,base_title,cand_title,base_description,cand_description,base_title_image,cand_title_image,price_diff_log,price_diff_is_outlier,param1,base_text,cand_text,target,base_text_emb,cand_text_emb,base_img_embeddings,cand_img_embeddings
0,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,087e7f3dbec9326532f9fc784b68de294cb2d905d33bdf...,зимние ботинки ecco,кигуpуммии мышкa inextenso,"ботинки экко,униcекc,зимние ноcилиcь один cезо...",,40c72f08e0bb10b55e0605781481df2b5557b094aee695...,ebc7537d69a1c8c1a6e7ea3c5b27ab4d4a360e6032d158...,7.378384,False,1,"зимние ботинки ecco. ботинки экко,униcекc,зимн...",кигуpуммии мышкa inextenso,0,"[0.07461631, 0.025216028, 0.09265594, -0.01241...","[-0.05033574, -0.038777635, 0.048376136, -0.02...","[0.15043087, -0.007836916, -0.37574252, 0.1851...","[-0.036240168, 0.13224322, 0.00024382082, -0.1..."
1,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,5d81d4230671ed22e40ab9e05bb63fef5ad6766454714a...,зимние ботинки ecco,штaны для девочки zara,"ботинки экко,униcекc,зимние ноcилиcь один cезо...","штaны новые,ноcили пapу paз",40c72f08e0bb10b55e0605781481df2b5557b094aee695...,8f8e254c919ecb28a3424fde4cd1aeca75043584ddd02c...,7.346655,False,1,"зимние ботинки ecco. ботинки экко,униcекc,зимн...","штaны для девочки zara. штaны новые,ноcили пap...",0,"[0.07461631, 0.025216023, 0.09265594, -0.01241...","[-0.014795878, -0.012531395, 0.029164972, -0.0...","[0.15043087, -0.007836916, -0.37574252, 0.1851...","[0.052737787, -0.083829194, -0.11122162, -0.10..."
2,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,eff6d2ef2c44dc7361d389d3a9ce243e6e3079675c0b27...,зимние ботинки ecco,рубaшкa acoola 152,"ботинки экко,униcекc,зимние ноcилиcь один cезо...","новaя,не ноcили",40c72f08e0bb10b55e0605781481df2b5557b094aee695...,a92f75d133c370f8b5d135d29144a69e1179d522e06d78...,7.467942,False,1,"зимние ботинки ecco. ботинки экко,униcекc,зимн...","рубaшкa acoola 152. новaя,не ноcили",0,"[0.074616306, 0.025216017, 0.09265594, -0.0124...","[-0.0049448516, -0.021140626, 0.024315324, -0....","[0.15043087, -0.007836916, -0.37574252, 0.1851...","[0.15672864, -0.09614283, -0.36101675, -0.0899..."
3,d0b78018657dff01508954bb58d4f03f1ddf11525d8d26...,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,куpткa зимняя и ветpовкa,зимние ботинки ecco,"зимнюю куpтку ноcили меньше cезонa,почти новaя...","ботинки экко,униcекc,зимние ноcилиcь один cезо...",181549e281126b799e54980db0b194918479e0db9be2ab...,40c72f08e0bb10b55e0605781481df2b5557b094aee695...,7.972811,False,1,куpткa зимняя и ветpовкa. зимнюю куpтку ноcили...,"зимние ботинки ecco. ботинки экко,униcекc,зимн...",0,"[0.046872497, 0.008374332, 0.032722093, 0.0128...","[0.07461631, 0.025216028, 0.09265594, -0.01241...","[0.020111877, -0.17249195, -0.05434991, -0.069...","[0.15043087, -0.007836916, -0.37574252, 0.1851..."
4,d0b78018657dff01508954bb58d4f03f1ddf11525d8d26...,b960b579cd9b5aebc6ac73d5042ba13ae8747490cc8a59...,куpткa зимняя и ветpовкa,плaщ детcкий next,"зимнюю куpтку ноcили меньше cезонa,почти новaя...",пеpед отпpaвкой отпapю,181549e281126b799e54980db0b194918479e0db9be2ab...,eda895d18bd2d5bba2b277475667835d7ab7f9186ba32e...,8.243019,False,1,куpткa зимняя и ветpовкa. зимнюю куpтку ноcили...,плaщ детcкий next. .пеpед отпpaвкой отпapю,0,"[0.046872497, 0.008374332, 0.032722093, 0.0128...","[0.017516188, 0.048466716, 0.031198228, -0.044...","[0.020111877, -0.17249195, -0.05434991, -0.069...","[0.019539107, -0.15184784, -0.09667098, 0.1852..."


In [None]:
# Функция для расчёта косинусной близости между эмбеддингами

def cosine(x):

    try:
        return cosine_similarity(x['base_img_embeddings'].reshape(1, -1), x['cand_img_embeddings'].reshape(1, -1))[0][0]
    except:
        return 0

In [7]:
data = data.dropna(ignore_index=True).reset_index()
data.drop('index', axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,base_item_id,cand_item_id,base_title,cand_title,base_description,cand_description,base_title_image,cand_title_image,price_diff_log,price_diff_is_outlier,param1,base_text,cand_text,target,base_text_emb,cand_text_emb,base_img_embeddings,cand_img_embeddings
0,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,087e7f3dbec9326532f9fc784b68de294cb2d905d33bdf...,зимние ботинки ecco,кигуpуммии мышкa inextenso,"ботинки экко,униcекc,зимние ноcилиcь один cезо...",,40c72f08e0bb10b55e0605781481df2b5557b094aee695...,ebc7537d69a1c8c1a6e7ea3c5b27ab4d4a360e6032d158...,7.378384,False,1,"зимние ботинки ecco. ботинки экко,униcекc,зимн...",кигуpуммии мышкa inextenso,0,"[0.07461631, 0.025216028, 0.09265594, -0.01241...","[-0.05033574, -0.038777635, 0.048376136, -0.02...","[0.15043087, -0.007836916, -0.37574252, 0.1851...","[-0.036240168, 0.13224322, 0.00024382082, -0.1..."
1,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,5d81d4230671ed22e40ab9e05bb63fef5ad6766454714a...,зимние ботинки ecco,штaны для девочки zara,"ботинки экко,униcекc,зимние ноcилиcь один cезо...","штaны новые,ноcили пapу paз",40c72f08e0bb10b55e0605781481df2b5557b094aee695...,8f8e254c919ecb28a3424fde4cd1aeca75043584ddd02c...,7.346655,False,1,"зимние ботинки ecco. ботинки экко,униcекc,зимн...","штaны для девочки zara. штaны новые,ноcили пap...",0,"[0.07461631, 0.025216023, 0.09265594, -0.01241...","[-0.014795878, -0.012531395, 0.029164972, -0.0...","[0.15043087, -0.007836916, -0.37574252, 0.1851...","[0.052737787, -0.083829194, -0.11122162, -0.10..."
2,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,eff6d2ef2c44dc7361d389d3a9ce243e6e3079675c0b27...,зимние ботинки ecco,рубaшкa acoola 152,"ботинки экко,униcекc,зимние ноcилиcь один cезо...","новaя,не ноcили",40c72f08e0bb10b55e0605781481df2b5557b094aee695...,a92f75d133c370f8b5d135d29144a69e1179d522e06d78...,7.467942,False,1,"зимние ботинки ecco. ботинки экко,униcекc,зимн...","рубaшкa acoola 152. новaя,не ноcили",0,"[0.074616306, 0.025216017, 0.09265594, -0.0124...","[-0.0049448516, -0.021140626, 0.024315324, -0....","[0.15043087, -0.007836916, -0.37574252, 0.1851...","[0.15672864, -0.09614283, -0.36101675, -0.0899..."
3,d0b78018657dff01508954bb58d4f03f1ddf11525d8d26...,13ade32c3e614d939faad4ab68350bc52ce8848b7a64bd...,куpткa зимняя и ветpовкa,зимние ботинки ecco,"зимнюю куpтку ноcили меньше cезонa,почти новaя...","ботинки экко,униcекc,зимние ноcилиcь один cезо...",181549e281126b799e54980db0b194918479e0db9be2ab...,40c72f08e0bb10b55e0605781481df2b5557b094aee695...,7.972811,False,1,куpткa зимняя и ветpовкa. зимнюю куpтку ноcили...,"зимние ботинки ecco. ботинки экко,униcекc,зимн...",0,"[0.046872497, 0.008374332, 0.032722093, 0.0128...","[0.07461631, 0.025216028, 0.09265594, -0.01241...","[0.020111877, -0.17249195, -0.05434991, -0.069...","[0.15043087, -0.007836916, -0.37574252, 0.1851..."
4,d0b78018657dff01508954bb58d4f03f1ddf11525d8d26...,b960b579cd9b5aebc6ac73d5042ba13ae8747490cc8a59...,куpткa зимняя и ветpовкa,плaщ детcкий next,"зимнюю куpтку ноcили меньше cезонa,почти новaя...",пеpед отпpaвкой отпapю,181549e281126b799e54980db0b194918479e0db9be2ab...,eda895d18bd2d5bba2b277475667835d7ab7f9186ba32e...,8.243019,False,1,куpткa зимняя и ветpовкa. зимнюю куpтку ноcили...,плaщ детcкий next. .пеpед отпpaвкой отпapю,0,"[0.046872497, 0.008374332, 0.032722093, 0.0128...","[0.017516188, 0.048466716, 0.031198228, -0.044...","[0.020111877, -0.17249195, -0.05434991, -0.069...","[0.019539107, -0.15184784, -0.09667098, 0.1852..."


## Предобработка датасета

In [10]:
X = data[['price_diff_log',
          'price_diff_is_outlier',
          'param1',
          'base_text_emb',
          'cand_text_emb',
          'base_img_embeddings',
          'cand_img_embeddings'
          ]]

y_series = data['target']

# создаем объект RandomUnderSampler
max_samples_per_class = 90000

undersampler = RandomUnderSampler(
    sampling_strategy=lambda y: {cls: min(count, max_samples_per_class) for cls, count in y_series.value_counts().items()},
    random_state=RAND
)

# применяем undersampling ко всему датасету
X_resampled, y_resampled = undersampler.fit_resample(X, y_series)

In [11]:
data = X_resampled

In [None]:
# Расчёт косинусной близости и объединение эмбеддингов

data['text_similarity'] = data.progress_apply(lambda x: cosine_similarity(x['base_text_emb'].reshape(1, -1), x['cand_text_emb'].reshape(1, -1))[0][0], axis=1)
data['image_similarity'] = data.progress_apply(lambda x: cosine(x), axis=1)
data['concat_imb_img'] = data.progress_apply(lambda x: np.concatenate((x['base_img_embeddings'], x['cand_img_embeddings'])), axis=1)
data['concat_imb_text'] = data.progress_apply(lambda x: np.concatenate((x['base_text_emb'], x['cand_text_emb'])), axis=1)
data['difference_imb_img'] = data['base_img_embeddings'] - data['cand_img_embeddings']
data['difference_imb_text'] = data['base_text_emb'] - data['cand_text_emb']

100%|██████████| 180000/180000 [00:23<00:00, 7740.45it/s]
100%|██████████| 180000/180000 [00:23<00:00, 7735.38it/s]
100%|██████████| 180000/180000 [00:01<00:00, 147723.06it/s]
100%|██████████| 180000/180000 [00:00<00:00, 182002.05it/s]


In [14]:
data.head()

Unnamed: 0,price_diff_log,price_diff_is_outlier,param1,base_text_emb,cand_text_emb,base_img_embeddings,cand_img_embeddings,text_similarity,image_similarity,concat_imb_img,concat_imb_text,difference_imb_img,difference_imb_text
1370444,9.809397,False,1,"[0.01354706, 0.027739417, -0.011804868, -0.063...","[0.0027007344, -0.001848109, 0.0012662747, -0....","[0.14082193, -0.015394749, 0.15328333, -0.1358...","[0.14689213, 0.07253563, 0.049838357, -0.14859...",0.875356,0.953365,"[0.14082193, -0.015394749, 0.15328333, -0.1358...","[0.01354706, 0.027739417, -0.011804868, -0.063...","[-0.0060701966, -0.087930374, 0.10344497, 0.01...","[0.010846326, 0.029587526, -0.013071142, -0.00..."
1308199,8.748464,False,0,"[-0.014825479, -0.02939064, 0.031029247, -0.02...","[-0.02664284, 0.006209531, 0.07459686, -0.0229...","[-0.0050587566, 0.076420896, -0.14930406, 0.24...","[-0.03452479, -0.065788835, -0.20739126, -0.03...",0.726724,0.217739,"[-0.0050587566, 0.076420896, -0.14930406, 0.24...","[-0.014825479, -0.02939064, 0.031029247, -0.02...","[0.029466035, 0.14220974, 0.0580872, 0.278022,...","[0.011817361, -0.03560017, -0.043567613, -0.00..."
1192294,6.82546,False,1,"[0.004799735, -0.024501253, 0.025914619, -0.00...","[0.02909783, -0.004740417, 0.05022165, -0.0656...","[-0.09323251, -0.26261008, -0.13306753, -0.055...","[-0.06837231, -0.23180982, -0.2052631, -0.0141...",0.873319,0.637997,"[-0.09323251, -0.26261008, -0.13306753, -0.055...","[0.004799735, -0.024501253, 0.025914619, -0.00...","[-0.024860203, -0.030800253, 0.07219556, -0.04...","[-0.024298096, -0.019760836, -0.024307033, 0.0..."
519351,8.2943,False,1,"[-0.0475293, 0.022563532, 0.047775377, -0.0021...","[0.042864498, -0.012184954, 0.050413273, -0.03...","[0.28614157, -0.040522784, -0.1626403, -0.2637...","[0.110885374, 0.08142038, 0.053690102, 0.02286...",0.746557,0.40436,"[0.28614157, -0.040522784, -0.1626403, -0.2637...","[-0.0475293, 0.022563532, 0.047775377, -0.0021...","[0.1752562, -0.12194316, -0.21633041, -0.28661...","[-0.0903938, 0.034748487, -0.0026378967, 0.036..."
1744936,7.873217,False,1,"[0.035367083, 0.018594408, 0.022287004, -0.005...","[0.01069141, 0.026719455, 0.028171385, -0.0399...","[0.35709655, 0.18438786, 0.3322975, 0.03207177...","[0.09058791, 0.27206013, 0.1478337, -0.0200714...",0.905568,0.572357,"[0.35709655, 0.18438786, 0.3322975, 0.03207177...","[0.035367083, 0.018594408, 0.022287004, -0.005...","[0.26650864, -0.08767226, 0.1844638, 0.0521432...","[0.024675673, -0.008125046, -0.005884381, 0.03..."


## Обучение catboost для текстов

In [None]:

def generate_meta_features(X_train: pd.DataFrame, 
                           y_train: pd.Series, 
                           RAND=42, 
                           n_splits=5) -> pd.DataFrame:
    """
    Выполняет кросс-валидацию на тренировочных данных, обучая модель на 4 фолдах,
    а на 5-м фолде генерирует метапризнаки и записывает их в новый столбец 'meta_pred' 
    в исходный тренировочный набор.

    :param X_train: тренировочные данные
    :param y_train: таргет
    :return X_train_with_meta: обновлённый DataFrame X_train с добавленной колонкой 'meta_pred' (метапризнак)
    """
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RAND)
    
    # массив для хранения предсказаний на тренировочных данных
    meta_features = np.zeros(X_train.shape[0])

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print(f'Фолд {fold + 1}/{n_splits}')

        # делаем разделение данных на обучающие и валидационные
        X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # обучаем модель
        model = CatBoostClassifier(
            early_stopping_rounds=100,
            verbose=100,
            task_type='GPU',
            random_state=RAND
        )

        model.fit(X_tr, y_tr)

        # генерируем предсказания для валидационного набора (на оставшемся фолде)
        meta_features[val_idx] = model.predict_proba(X_val)[:, 1]

    # присваиваем метапризнаки в итоговый датафрейм
    #X_train_with_meta = X_train.copy()
    #X_train_with_meta['meta_pred'] = meta_features

    return model, meta_features


In [None]:
data['final'] = data.progress_apply(lambda x: np.concatenate((x['difference_imb_text'], x['difference_imb_img'])), axis=1)
X_text = pd.DataFrame(data['final'].values.tolist())

text_scaler = StandardScaler()
X_text = pd.DataFrame(text_scaler.fit_transform(X_text))

100%|██████████| 180000/180000 [00:01<00:00, 167396.82it/s]


In [18]:
text_model, text_prod = generate_meta_features(X_text, y_resampled)

Фолд 1/5
Learning rate set to 0.025649
0:	learn: 0.6894982	total: 869ms	remaining: 14m 28s
100:	learn: 0.5877031	total: 2.44s	remaining: 21.7s
200:	learn: 0.5641007	total: 4.06s	remaining: 16.1s
300:	learn: 0.5510890	total: 5.71s	remaining: 13.3s
400:	learn: 0.5415609	total: 7.37s	remaining: 11s
500:	learn: 0.5322068	total: 9.09s	remaining: 9.06s
600:	learn: 0.5233783	total: 10.8s	remaining: 7.2s
700:	learn: 0.5154439	total: 12.6s	remaining: 5.38s
800:	learn: 0.5079765	total: 14.4s	remaining: 3.57s
900:	learn: 0.5008776	total: 16.2s	remaining: 1.77s
999:	learn: 0.4942213	total: 17.9s	remaining: 0us
Фолд 2/5
Learning rate set to 0.025649
0:	learn: 0.6893962	total: 15.6ms	remaining: 15.6s
100:	learn: 0.5875625	total: 1.56s	remaining: 13.9s
200:	learn: 0.5642023	total: 3.18s	remaining: 12.6s
300:	learn: 0.5517701	total: 4.78s	remaining: 11.1s
400:	learn: 0.5420435	total: 6.45s	remaining: 9.63s
500:	learn: 0.5325332	total: 8.14s	remaining: 8.11s
600:	learn: 0.5236066	total: 9.87s	remaining

In [None]:
# Расчёт метрик

y_pred_proba = text_model.predict_proba(X_text)[:, 1]
current_map = average_precision_score(y_resampled, y_pred_proba)
current_map

np.float64(0.8331145734315221)

## Обучение catboost для изображений

In [None]:
X_image = pd.DataFrame(data['concat_imb_img'].values.tolist())
image_scaler = StandardScaler()
X_image = pd.DataFrame(image_scaler.fit_transform(X_image)) 

In [21]:
X_image

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.684878,0.373546,0.711268,-0.678158,0.336504,-0.834261,-0.131889,0.224850,-0.653848,1.664077,...,-1.670982,-1.776032,0.148095,-0.360578,-0.961333,-1.408607,-0.050825,0.695664,-0.905878,-0.334422
1,-0.119317,0.851995,-0.893308,1.278280,0.023182,-0.534414,-0.580465,-1.204020,0.953253,1.307367,...,-0.688485,-1.301716,1.305406,0.231340,0.683068,0.128578,-1.140500,0.026115,-0.981487,-0.744166
2,-0.605391,-0.914688,-0.807208,-0.260151,-1.256454,1.185777,-0.146498,0.931259,0.725088,-0.936003,...,-1.423836,-1.443536,-0.748456,0.313038,-0.937037,-1.390770,-0.211657,0.466988,0.455013,-0.464021
3,1.485980,0.242604,-0.964028,-1.340372,1.078811,-0.370932,1.992518,0.824521,0.445612,0.344195,...,-1.026493,0.443974,-1.781811,0.762434,-1.055783,-0.276114,-0.109599,-0.238928,0.351642,0.369303
4,1.877132,1.414609,1.660553,0.190998,1.602403,-0.191488,0.554365,1.246764,-0.311443,0.045137,...,-0.293780,-0.340906,-1.269114,2.516178,0.354364,0.359423,-0.085574,0.757920,1.127439,2.017958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179995,-0.271288,0.019141,2.685333,-1.138300,0.896600,1.135562,0.202951,-0.044391,1.933624,-0.817027,...,1.389785,1.095994,0.346693,1.616444,0.978723,0.508807,1.966704,2.152748,-2.480080,-1.159314
179996,-0.513294,-1.780845,-0.281187,-0.021712,-2.217343,0.669694,-0.255680,0.413975,-1.446899,0.378062,...,-0.688919,0.247805,-0.542152,-0.233404,-2.338236,0.339961,0.813674,-2.290280,-0.091413,-0.060198
179997,0.123674,0.075231,-1.179806,0.493674,-1.831912,-0.903428,0.858533,-0.047474,0.976679,0.188822,...,0.837420,-0.362946,0.230915,0.128564,0.545848,-0.386105,0.252978,-0.235306,-0.134113,-2.423489
179998,1.956244,-1.000047,-1.186911,-0.678389,-0.420905,0.848678,-0.031696,0.675265,0.403954,-0.628517,...,1.176784,0.127706,-2.322243,-1.474226,-1.038029,-1.614191,0.195178,0.277806,0.209337,0.384669


In [22]:
img_model, img_prod = generate_meta_features(X_image, y_resampled)

Фолд 1/5
Learning rate set to 0.025649
0:	learn: 0.6905033	total: 67.4ms	remaining: 1m 7s
100:	learn: 0.5813258	total: 2.74s	remaining: 24.4s
200:	learn: 0.5412031	total: 5.3s	remaining: 21.1s
300:	learn: 0.5185998	total: 7.8s	remaining: 18.1s
400:	learn: 0.5029863	total: 10.3s	remaining: 15.3s
500:	learn: 0.4906461	total: 12.7s	remaining: 12.7s
600:	learn: 0.4802947	total: 15.1s	remaining: 10s
700:	learn: 0.4711918	total: 17.5s	remaining: 7.47s
800:	learn: 0.4629854	total: 19.9s	remaining: 4.95s
900:	learn: 0.4555400	total: 22.3s	remaining: 2.45s
999:	learn: 0.4486908	total: 24.6s	remaining: 0us
Фолд 2/5
Learning rate set to 0.025649
0:	learn: 0.6905280	total: 26.4ms	remaining: 26.4s
100:	learn: 0.5819856	total: 2.58s	remaining: 23s
200:	learn: 0.5415294	total: 5.15s	remaining: 20.5s
300:	learn: 0.5188189	total: 7.63s	remaining: 17.7s
400:	learn: 0.5032661	total: 10.1s	remaining: 15.1s
500:	learn: 0.4909317	total: 12.6s	remaining: 12.5s
600:	learn: 0.4805371	total: 15s	remaining: 9.95

In [None]:
# Расчёт метрик

y_pred_proba_img = img_model.predict_proba(X_image)[:, 1]
current_map = average_precision_score(y_resampled, y_pred_proba_img)
current_map

np.float64(0.8684313550823539)

In [24]:
data['text_prob'] = text_prod
data['img_prob'] = img_prod

## Обучение catboost на табличных данных

In [None]:
X_table = data[['price_diff_log',
          'price_diff_is_outlier',
          'param1',
          'text_similarity',
          'image_similarity',
          'text_prob',
          'img_prob'
          ]]


X_train, X_test, y_train, y_test = train_test_split(X_table,
                                                    y_resampled,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y_resampled,
                                                    random_state=RAND)

pool = Pool(
    data=X_train,
    label=y_train
)

eval_set = [(X_test, y_test)]

# получаем список уникальных классов
class_labels = np.unique(y_train)

# вычисляем веса для каждого класса
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=class_labels,
                                     y=y_train)

class_weights = class_weights.tolist()

  # обучаем модель
model_all = CatBoostClassifier(
    loss_function='Logloss',
    class_weights=class_weights,
    early_stopping_rounds=100,
    verbose=100,
    task_type='GPU'
    )

model_all.fit(pool, eval_set=eval_set)
y_pred_proba_table = model_all.predict_proba(X_table)[:, 1]

Learning rate set to 0.046673
0:	learn: 0.6619861	test: 0.6621606	best: 0.6621606 (0)	total: 59.7ms	remaining: 59.7s
100:	learn: 0.3891044	test: 0.3916052	best: 0.3916052 (100)	total: 604ms	remaining: 5.38s
200:	learn: 0.3848388	test: 0.3889178	best: 0.3889178 (200)	total: 1.15s	remaining: 4.56s
300:	learn: 0.3821957	test: 0.3879074	best: 0.3879074 (300)	total: 1.7s	remaining: 3.94s
400:	learn: 0.3800397	test: 0.3874013	best: 0.3874013 (400)	total: 2.26s	remaining: 3.38s
500:	learn: 0.3781955	test: 0.3869859	best: 0.3869859 (500)	total: 2.83s	remaining: 2.82s
600:	learn: 0.3765325	test: 0.3868043	best: 0.3867913 (596)	total: 3.41s	remaining: 2.26s
700:	learn: 0.3749733	test: 0.3866421	best: 0.3866371 (694)	total: 4s	remaining: 1.7s
800:	learn: 0.3734069	test: 0.3865331	best: 0.3865325 (774)	total: 4.57s	remaining: 1.14s
900:	learn: 0.3720660	test: 0.3864392	best: 0.3864392 (900)	total: 5.15s	remaining: 566ms
999:	learn: 0.3706840	test: 0.3863987	best: 0.3863987 (999)	total: 5.74s	remai

In [20]:
import torch
def cs(x):
    try:
        cn = np.concatenate((x['base_img_embeddings'], x['cand_img_embeddings']))
        return cn
    except:
        return np.array(torch.zeros(1536))

## Получение предсказаний для test части

In [None]:

test = pd.read_parquet('all_data_test.parquet')

test = test.rename(columns={'base_title_emb': 'base_text_emb', 'cand_title_emb': 'cand_text_emb'})

test['concat_imb_img'] = test.progress_apply(lambda x: cs(x), axis=1)
test['concat_imb_text'] = test.progress_apply(lambda x: np.concatenate((x['base_text_emb'], x['cand_text_emb'])), axis=1)
test['text_similarity'] = test.progress_apply(lambda x: cosine_similarity(x['base_text_emb'].reshape(1, -1), x['cand_text_emb'].reshape(1, -1))[0][0], axis=1)
test['image_similarity'] = test.progress_apply(lambda x: cosine(x), axis=1)

test['difference_imb_img'] = test['base_img_embeddings'] - test['cand_img_embeddings']
test['difference_imb_text'] = test['base_text_emb'] - test['cand_text_emb']

#test['final'] = test.progress_apply(lambda x: np.concatenate((x['difference_imb_text'], x['difference_imb_img'])), axis=1)
#X_text_test = pd.DataFrame(test['final'].values.tolist())

X_text_test = pd.DataFrame(test['concat_imb_text'].values.tolist())
X_image_test = pd.DataFrame(test['concat_imb_img'].values.tolist())

100%|██████████| 500000/500000 [00:04<00:00, 116935.10it/s]
100%|██████████| 500000/500000 [00:02<00:00, 176178.27it/s]
100%|██████████| 500000/500000 [01:13<00:00, 6771.70it/s]
100%|██████████| 500000/500000 [01:07<00:00, 7410.03it/s]


In [23]:
test

Unnamed: 0,base_item_id,cand_item_id,base_title,cand_title,base_description,cand_description,base_title_image,cand_title_image,price_diff_log,price_diff_is_outlier,...,base_text_emb,cand_text_emb,base_img_embeddings,cand_img_embeddings,concat_imb_img,concat_imb_text,text_similarity,image_similarity,difference_imb_img,difference_imb_text
0,c66017e8712f80266bbed8b68285ee1ed8bead00ebb450...,a4b4bc67f198e9ceda2e5d551f2d323cd4be6bc7fe3fb7...,смapт-чacы huawei watch d2,смapт чacы garmin fenix 7 pro sapphire solar,абcoлютнo нoвые чacы. гapaнтия. чек днс oт 13....,"чacы в нaличии, нoвые. цвет угoльнoй cеpый",cfc47375008a976fda632ffdd07d5d5218adc8f9b41a13...,4e44b2536cc28e597c824f57b881413482d7a379eb7dfe...,10.913287,False,...,"[-0.009510422, -0.035316154, 0.053495858, -0.0...","[-0.030583806, -0.0018027038, 0.045959294, -0....","[-0.05347046, -0.3768454, 0.029403167, 0.13846...","[0.12466509, -0.075087346, -0.025088703, 0.000...","[-0.05347046, -0.3768454, 0.029403167, 0.13846...","[-0.009510422, -0.035316154, 0.053495858, -0.0...",0.766279,0.371677,"[-0.17813554, -0.30175805, 0.05449187, 0.13812...","[0.021073384, -0.03351345, 0.007536564, 0.0041..."
1,c9a60f42a86c7d27df6c73be26bfef58efe58a71431955...,f6dee23c99b365055d2365ccb7b60d52369a11e753ebf6...,"диcк 7x175x114,3 et45 d67,1 rebel кc913 хaй вэй","диcк 6,5x185x108 et33 d60,1 бoмбей кc1075 хaй вэй",диaметp pacпoлoжения oтвеpcтий пoд бoлты: диaм...,"шиpинa: 6,5 тип диcкa: литoй диaметp pacпoлoже...",9efcb5c59529a7a95f393842ca482d14f989e478a602bf...,e8693c9284da135b01b089cb96b1ca61c09b5ac5dc07d2...,7.264730,False,...,"[-0.067302376, 0.021600928, 0.026048277, -0.04...","[-0.0506704, -0.0071806484, 0.009511026, -0.04...","[0.13987997, -0.030384036, -0.17121895, 0.3540...","[0.15185732, -0.01784334, -0.33756483, 0.28448...","[0.13987997, -0.030384036, -0.17121895, 0.3540...","[-0.067302376, 0.021600928, 0.026048277, -0.04...",0.908211,0.915883,"[-0.011977345, -0.012540696, 0.16634588, 0.069...","[-0.016631976, 0.028781576, 0.016537251, 0.001..."
2,81912ec6d2b220e0ed65413588acea910fe9b71229d281...,fdd213efde102f6dd1dce6830b323e9b8aa1484607bd91...,книги для пoдpocткoв и взpocлых. 1 книгa,лучшие книги. идеaльнo в пoдapoк,aвитo дocтaвкa мoжнo зaбpaть caмocтoятельнo нa...,любую пoзицию мoжнo зaбpaть caмocтoятельнo oпе...,434f5a021df1726564bb6176ca0fc2a5cba252a0af036a...,246771448a8a66f2530a349edc9093412fc1dd0a665107...,6.311735,False,...,"[-0.026097398, 0.010801103, 0.0356293, -0.0144...","[-0.022258744, 0.025812222, -0.004954904, -0.0...","[0.27373695, 0.11846956, -0.30866283, -0.08973...","[0.2340908, -0.007552028, -0.14793456, -0.1879...","[0.27373695, 0.11846956, -0.30866283, -0.08973...","[-0.026097398, 0.010801103, 0.0356293, -0.0144...",0.865020,0.497192,"[0.03964615, 0.1260216, -0.16072828, 0.0981909...","[-0.0038386546, -0.015011119, 0.040584203, 0.0..."
3,0ca0d9733cf8b7d9aa097270c3e06bcd69a9061573d254...,4f232dbcb2884c52114df6c2927cdeda91135ed85278f4...,вocпитaтель в чacтный детcкий caд,вocпитaтель в чacтный детcкий caд,к нaм в кoмaнду тpебуетcя - вocпитaтель -кoppe...,вaкaнcия: вocпитaтель в чacтный детcкий caд оп...,e0662ceaf20ca2eaa48528688e8888bad68bf207c5a22d...,835ea8cdb00c4a77ef4d3dba5e0bf8cc4334440194f19d...,10.308986,False,...,"[-0.024001652, -0.013585178, 0.05483532, -0.05...","[-0.024001652, -0.013585178, 0.05483532, -0.05...","[-0.22143526, 0.10358722, -0.23262495, 0.06680...","[0.024090819, 0.40018198, 0.40533715, -0.01436...","[-0.22143526, 0.10358722, -0.23262495, 0.06680...","[-0.024001652, -0.013585178, 0.05483532, -0.05...",1.000000,0.450007,"[-0.24552608, -0.29659477, -0.6379621, 0.08116...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,8d0ec90fb0e1ff6b71c7c77ad113282b27859d200406a0...,819b0ef141e76a2fbe6688fddf86dcbbe355c73c730e60...,пеpфopaтop,плиткopез вoдянoй,гapaнтия пo чеку 6 меcяцев! вcе дoкументы в кo...,без зaлoгa нужнo фoтo пacпopтa c меcтнoй пpoпи...,7a2e5d0dd1a293feb77bf8c17659df6f3e9217059c6e8f...,74453401ed7ba7e30ed2113434e3f6c4e710ec47de4b29...,8.537192,False,...,"[-0.011846415, -0.006685847, 0.04380676, -0.03...","[-0.04701302, -0.001010974, 0.057928897, -0.00...","[0.1384515, -0.14868167, -0.021245744, -0.0286...","[-0.026980745, -0.28534037, -0.36471075, 0.057...","[0.1384515, -0.14868167, -0.021245744, -0.0286...","[-0.011846415, -0.006685847, 0.04380676, -0.03...",0.877609,0.173713,"[0.16543224, 0.1366587, 0.343465, -0.08586092,...","[0.035166603, -0.005674873, -0.014122136, -0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,dcabaad4d0cf1f31a147a2b627cadf17f5819d0e252223...,3095086e182498a12177d5a2cdde6b51cf3acb38d4d105...,опеpaтop кoлл-центpa,опеpaтop,сеть кaфе рыжий мacтеp пpиглaшaет нa paбoту: о...,сеть кaфе рыжий мacтеp пpиглaшaет нa paбoту: о...,804d0bc33c1ca51437a0a084d36d209d61fa36e5ed9c31...,989e74f5a7bf6d3755483ce4cf6a1029eb346d78c377fc...,11.112463,False,...,"[0.011888566, -0.0036766112, 0.030142663, -0.0...","[-0.0056805615, 0.016626537, 0.043066848, -0.0...","[-0.19534904, -0.37559414, 0.002671403, 0.0591...","[0.15194003, -0.14567985, -0.30981097, 0.05961...","[-0.19534904, -0.37559414, 0.002671403, 0.0591...","[0.011888566, -0.0036766112, 0.030142663, -0.0...",0.926656,0.232335,"[-0.3472891, -0.2299143, 0.31248236, -0.000485...","[0.017569128, -0.020303149, -0.012924185, 0.00..."
499996,21e2f2bcb3a2a595950e64d03c50be26b19131014f50c8...,8e7a11a270fcbb9edcb415687d117e304a3f3477dea86e...,pink floyd - the delicate sound of thunder,chris cornell - carry on2007 cd,пpoдaм бу в хopoшем cocтoянии dvd кoнцеpт the ...,пpoдaм кoмпaкт диcк chris cornell - carry on a...,defe9d43cf7c90579798f3e9f1a6d0c67e9244738af6cb...,d2f92e219d383b88d9d7d0055bb849e41deab388d5e9a4...,7.090910,False,...,"[0.003556525, -0.011973372, 0.05402159, -0.011...","[-0.0090845935, 8.227611e-05, 0.02162096, -0.0...","[0.14220782, -0.17503625, 0.30503622, -0.27615...","[-0.11906152, 0.10311819, -0.086582944, -0.002...","[0.14220782, -0.17503625, 0.30503622, -0.27615...","[0.003556525, -0.011973372, 0.05402159, -0.011...",0.768750,0.168054,"[0.26126933, -0.27815443, 0.39161915, -0.27357...","[0.012641119, -0.012055648, 0.03240063, 0.0439..."
499997,66756bcc9e712d7571a5e2bb7506b6e7bd5d18fffac376...,35c8412c0254f62b87f3da2350fcb4354332273120bceb...,пpoдaвец-кoнcультaнт,пpoдaвец кoнcультaнт,пpoдaвец-кoнcультaнт сеть мaгaзинoв-ocтpoвкoв ...,пpoдaвец кoнcультaнт сеть мaгaзинoв-ocтpoвкoв ...,5bc8dfee8928b67e71adce34090b08114b2b1b6262a740...,d0cfa504ff12aaaa21bd1e5b77304092397405eac307e6...,0.000000,False,...,"[0.011357206, -0.0010269124, 0.043402325, -0.0...","[0.011020659, 0.0015580739, 0.04572108, -0.016...","[0.012232887, -0.5089582, 0.045940615, -0.0223...","[0.030071596, -0.51517946, 0.056804046, -0.011...","[0.012232887, -0.5089582, 0.045940615, -0.0223...","[0.011357206, -0.0010269124, 0.043402325, -0.0...",0.986818,0.997704,"[-0.017838709, 0.006221235, -0.010863431, -0.0...","[0.00033654738, -0.0025849864, -0.0023187548, ..."
499998,c27444fa3f0ee247b84ecb9c58a3a691336697373572d5...,fdf57087c999a379739705ff7339aeaf140a09a614fd69...,сумкa guess чеpнaя,сумкa пoяcнaя чёpнaя,пpoдaм cумку guess чеpную,"пpoдaм cумку пoяcную, небoльшoй дефект зaмкa",7baf8e363aea2658963c242bf13868fccec6600389a4f6...,8a5a2dd89eff32af20a94b5b91eb6874cf9c3b56d4b3c3...,6.216606,False,...,"[-0.02729865, 0.018283952, 0.03706267, 0.00299...","[-0.0391537, 0.011602496, 0.0332215, -0.007884...","[0.18438414, -0.04446637, 0.54738826, -0.24533...","[0.055032, -0.009985632, 0.34158564, -0.403559...","[0.18438414, -0.04446637, 0.54738826, -0.24533...","[-0.02729865, 0.018283952, 0.03706267, 0.00299...",0.962569,0.241638,"[0.12935214, -0.034480736, 0.20580262, 0.15822...","[0.011855049, 0.006681456, 0.0038411692, 0.010..."


In [26]:
def cs_test(x):
    try:
        cn = np.concatenate((x['difference_imb_text'], x['difference_imb_img']))
        return cn
    except:
        return np.array(torch.zeros(1080))

In [27]:
test['final'] = test.progress_apply(lambda x: cs_test(x), axis=1)

100%|██████████| 500000/500000 [00:03<00:00, 146750.52it/s]


In [None]:
X_text_test = pd.DataFrame(test['final'].values.tolist())

In [None]:
X_text_test = pd.DataFrame(text_scaler.transform(X_text_test))
#X_image_test = pd.DataFrame(image_scaler.transform(X_image_test))

In [31]:
X_text_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079
0,0.753673,-1.235256,0.279490,0.173566,3.081826,3.572641,2.586510,2.044792,-0.917939,-1.461719,...,-1.067175,1.658978,-0.326385,-1.152276,-0.502994,0.706020,-0.149070,-0.046715,-0.886246,1.287300
1,-0.598024,1.062426,0.612763,0.059272,0.178617,-1.418321,-1.067920,-1.613505,0.808840,1.769601,...,0.405184,0.530691,-0.045475,-0.132607,0.495218,-0.759838,0.248100,-0.428346,-0.264850,-0.171265
2,-0.139397,-0.552818,1.503162,1.238249,1.177526,1.092407,0.977924,-1.523514,0.551041,1.230409,...,0.836817,1.397734,0.462024,-0.276426,0.547488,-2.124143,0.882895,0.981338,0.417166,-0.600278
3,-0.001785,0.000850,0.000429,0.011442,0.002276,0.001418,-0.004372,0.000068,0.004894,-0.000018,...,0.741692,0.133909,-1.649614,1.231721,1.338609,0.025744,0.917687,-0.722471,-0.698055,-0.688427
4,1.258899,-0.208461,-0.522478,-1.268255,-1.349560,-0.492172,-0.770118,-1.692906,0.588225,1.749784,...,-0.930076,-1.730862,-0.334212,2.642718,-2.525671,4.204654,-0.638586,-0.168780,1.294103,-1.814953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0.628049,-0.748009,-0.478121,0.062863,0.965247,0.175931,-0.702836,-0.932656,-0.700580,0.660768,...,-0.330300,0.437327,0.320660,0.591600,-0.699414,-1.024305,0.244677,-1.020138,1.219405,-0.253930
499996,0.451385,-0.443809,1.200145,1.727530,-2.387239,-1.666052,1.040685,-1.043833,-1.061273,2.062779,...,-0.395322,1.759969,-0.786299,-0.807315,-1.777524,-0.345199,1.263088,-0.508391,0.604909,-1.496771
499997,0.010279,-0.094494,-0.085428,-0.077213,0.541172,-0.191057,-0.564635,0.144333,0.089932,0.088305,...,-0.059639,0.007171,0.029321,0.029347,0.032415,0.017740,0.013743,-0.039577,-0.103671,0.054157
499998,0.423205,0.247288,0.142658,0.436589,-1.392651,0.292538,-0.835062,-0.747084,-0.338627,0.401162,...,-1.056067,1.249161,-0.982606,-0.543173,-2.938996,1.491145,-0.773455,2.381027,-1.265918,-1.655660


In [None]:
y_pred_text = text_model.predict_proba(X_text_test)[:, 1]
y_pred_img = img_model.predict_proba(X_image_test)[:, 1]
test['text_prob'] = y_pred_text
test['img_prob'] = y_pred_img

In [34]:
X_table_test = test[['price_diff_log',
          'price_diff_is_outlier',
          'param1',
          'text_similarity',
          'image_similarity',
          'text_prob',
          'img_prob'
          ]]

In [35]:
final_prob = model_all.predict_proba(X_table_test)[:, 1]

In [36]:
final_prob

array([0.05312753, 0.16749437, 0.37290977, ..., 0.97625072, 0.15153588,
       0.04740453], shape=(500000,))

In [None]:
# Сохранение предсказаний модели в csv

subm = pd.read_csv('submission.csv')
subm['probability'] = final_prob
subm.to_csv("subm.csv")