In [1]:
import matplotlib.pyplot as plt
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader


from sentence_transformers import SentenceTransformer
from PIL import Image


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
from catboost import CatBoostClassifier, Pool

In [2]:
# Путь к папке с датасетом
DATA_DIR = os.path.join('..', 'data')
TRAIN_DATA_DIR = os.path.join('..', 'data', 'train.csv')
TEST_DATA_DIR = os.path.join('..', 'data', 'test.csv')

# Путь к эмбеддингам текста.
TEXT_EMB_DIR = os.path.join('..', 'notebooks/text_emb')
IMG_EMB_DIR = os.path.join('..', 'notebooks/img_emb')

In [3]:
# Импортируем скрипт для работы с данными
import sys
sys.path.append(str(Path.cwd().parent))
from scripts import data_preprocess

# Загрузка данных.

In [4]:
# Загрузка
df_train = pd.read_csv(TRAIN_DATA_DIR, index_col='id')
df_test = pd.read_csv(TEST_DATA_DIR, index_col='id')

### Обработка данных

Табличные фичи.

In [5]:
import warnings
warnings.filterwarnings('ignore')

# Обрабатываем и получаем данные
df_train_num, df_train_text = data_preprocess.clean_data(df_train)
df_test_num, df_test_text = data_preprocess.clean_data(df_test, type='test')

In [6]:
df_train_full = df_train_num.merge(df_train_text.drop(columns=['ItemID', 'resolution']), how='left', on='id')
df_test_full = df_test_num.merge(df_test_text.drop(columns=['ItemID']), how='left', on='id')

In [7]:
# TEXT
embeddings, embeddings_test = data_preprocess.load_text_embeddings(dir=TEXT_EMB_DIR)
# IMAGES
train_img_emb, test_img_emb = data_preprocess.load_img_embeddings(dir=IMG_EMB_DIR)

## Нормализуем данные.

In [8]:
import pandas as pd
import numpy as np

def normalize_seller_features(df, group_col='SellerID', features=None):
    """
    Нормализует признаки продавцов, беря максимальные значения для каждого продавца
    
    Parameters:
    df - DataFrame с данными
    group_col - колонка для группировки (SellerID)
    features - список признаков для нормализации
    """
    if features is None:
        features = ['seller_time_alive', 'GmvTotal90', 'ExemplarAcceptedCountTotal90', 
                   'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal90']
    
    # Создаем копию датафрейма
    result_df = df.copy()
    
    # Для каждого признака находим максимальное значение по SellerID
    for feature in features:
        max_values = df.groupby(group_col)[feature].transform('max')
        result_df[feature] = max_values
    
    return result_df

In [9]:
def prepare_seller_features(train_df, test_df, group_col='SellerID', features=None):
    """
    Подготавливает признаки продавцов для train и test, используя train данные
    для общих продавцов и test данные для новых продавцов
    """
    if features is None:
        features = ['seller_time_alive', 'GmvTotal90', 'ExemplarAcceptedCountTotal90', 
                   'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal90']
    
    # Нормализуем train данные
    train_normalized = normalize_seller_features(train_df, group_col, features)
    
    # Для test данных используем два подхода:
    test_normalized = test_df.copy()
    
    # Находим общих продавцов между train и test
    common_sellers = set(train_df[group_col].unique()) & set(test_df[group_col].unique())
    new_sellers = set(test_df[group_col].unique()) - set(train_df[group_col].unique())
    
    print(f"Общих продавцов: {len(common_sellers)}")
    print(f"Новых продавцов в test: {len(new_sellers)}")
    
    # Для общих продавцов берем значения из train (максимальные исторические значения)
    seller_max_values = train_df.groupby(group_col)[features].max()
    
    for feature in features:
        # Для общих продавцов используем значения из train
        mask_common = test_normalized[group_col].isin(common_sellers)
        test_normalized.loc[mask_common, feature] = test_normalized.loc[mask_common, group_col].map(seller_max_values[feature])
        
        # Для новых продавцов берем максимальные значения из test
        mask_new = test_normalized[group_col].isin(new_sellers)
        if mask_new.any():
            new_seller_max = test_df.groupby(group_col)[feature].max()
            test_normalized.loc[mask_new, feature] = test_normalized.loc[mask_new, group_col].map(new_seller_max)
    
    return train_normalized, test_normalized

# Применение
train_processed, test_processed = prepare_seller_features(df_train, df_test)

Общих продавцов: 1789
Новых продавцов в test: 1342


In [10]:
# Обрабатываем и получаем данные
df_train_num, df_train_text = data_preprocess.clean_data(train_processed)
df_test_num, df_test_text = data_preprocess.clean_data(test_processed, type='test')

In [11]:
df_train_full = df_train_num.merge(df_train_text.drop(columns=['ItemID', 'resolution']), how='left', on='id')
df_test_full = df_test_num.merge(df_test_text.drop(columns=['ItemID']), how='left', on='id')

# Выявление некоторых полезных фич из картинок.

In [14]:
import cv2
from functools import lru_cache

@lru_cache(maxsize=1000)
def extract_image_features(img_path):
    try:
        # читаем через cv2 (быстро)
        img = cv2.imread(str(img_path))
        
        # размеры
        h, w, c = img.shape
        aspect_ratio = w / h
        area = w * h

        # переводим в RGB и считаем статистики
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mean_colors = img_rgb.mean(axis=(0,1))  # R,G,B среднее
        std_colors = img_rgb.std(axis=(0,1))    # R,G,B std

        # яркость (grayscale)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        mean_gray = gray.mean()
        std_gray = gray.std()

        # резкость через лапласиан
        sharpness = cv2.Laplacian(gray, cv2.CV_64F).var()

        # доля белых / чёрных пикселей
        white_ratio = (gray > 240).mean()
        black_ratio = (gray < 15).mean()

        # Шум
        noise = cv2.medianBlur(gray, 3) - gray
        noise = noise[noise != 0]
    
        
        return {
            "img_w": w,
            "img_h": h,
            "img_aspect_ratio": aspect_ratio,
            "img_area": area,
            "img_r_mean": mean_colors[0],
            "img_g_mean": mean_colors[1],
            "img_b_mean": mean_colors[2],
            "img_r_std": std_colors[0],
            "img_g_std": std_colors[1],
            "img_b_std": std_colors[2],
            "img_gray_mean": mean_gray,
            "img_gray_std": std_gray,
            "img_sharpness": sharpness,
            "img_white_ratio": white_ratio,
            "img_black_ratio": black_ratio,
            "img_noise_level": np.std(noise) if len(noise) > 0 else 0
        }
    except Exception as e:
        print(f"Ошибка с {img_path}: {e}")
        return None
    

def extract_img_features(df, img_dir):
    nan_count = 0
    stats_columns = ["img_w", "img_h", "img_aspect_ratio", "img_area", "img_r_mean", 
     "img_g_mean", "img_b_mean", "img_r_std", "img_g_std", "img_b_std","img_gray_mean",
    "img_gray_std", "img_sharpness", "img_white_ratio", "img_black_ratio", "img_noise_level"]

    new_data = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting image features"):
        img_id = row["ItemID"].astype(int)
        path = img_dir / f"{img_id}.png"
        if os.path.exists(path):
            stats = extract_image_features(path)
            stats["ItemID"] = img_id
            new_data.append(stats)
        else:
            nan_count+=1

    new_data = pd.DataFrame(new_data, columns=["ItemID"] + stats_columns)
    df_out = df.merge(new_data, on="ItemID", how="left")
    print(f"Преобразование выполнено, нет изображений у {nan_count} товаров из {len(df)} ({nan_count/len(df):.2%})")

    return df_out

In [15]:
df_train_num_img_features = extract_img_features(df_train_num, TRAIN_DIR)

Extracting image features:   0%|          | 671/197198 [01:50<9:00:17,  6.06it/s] 


KeyboardInterrupt: 

In [None]:
df_test_num_img_features = extract_img_features(df_test_num, TRAIN_DIR)

Extracting image features:   0%|          | 0/197198 [00:00<?, ?it/s]

Преобразование выполнено, нет изображений у 0 товаров из 197198 (0.00%)





In [None]:
df_train_num_img_features

Unnamed: 0,resolution,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,...,img_g_mean,img_b_mean,img_r_std,img_g_std,img_b_std,img_gray_mean,img_gray_std,img_sharpness,img_white_ratio,img_black_ratio
0,0,6.0,4.0,4.0,3.0,32.0,3.0,6.0,0.0,688.436773,...,198.876035,206.924128,80.92887,64.562206,62.761524,196.160332,63.01397,458.803291,0.337574,0.001431
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,663.157297,...,,,,,,,,,,
2,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,606.573197,...,,,,,,,,,,
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,856.755162,...,,,,,,,,,,
4,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,822.274833,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197193,0,4.0,1.0,0.0,6.0,60.0,76.0,29.0,9.0,1013.750244,...,,,,,,,,,,
197194,0,0.0,0.0,0.0,1.0,31.0,33.0,26.0,12.0,1177.025024,...,,,,,,,,,,
197195,0,1.0,1.0,1.0,0.0,27.0,10.0,15.0,0.0,1004.280743,...,,,,,,,,,,
197196,1,55.0,30.0,27.0,21.0,72.0,208.0,32.0,1.0,724.818970,...,,,,,,,,,,


Сохранение

In [16]:
df_train_num_img_features.to_parquet(DATA_DIR / "df_train_img_features.parquet")
df_test_num_img_features.to_parquet(DATA_DIR / "df_test_img_features.parquet")

NameError: name 'df_train_num_img_features' is not defined

Загрузка

In [None]:
train_img_emb = pd.read_parquet(DATA_DIR / "df_train_img_features.parquet")
test_img_emb = pd.read_parquet(DATA_DIR / "df_test_img_features.parquet")

# Добавление z-оценок по цене и длине описания.

## Методы для улучшения качетсва.

In [12]:
# === Групповые статистики ===
def compute_group_stats(train_df):
    stats = {}
    grouped = train_df.groupby("CommercialTypeName4")
    for col in ["PriceDiscounted", "desc_len"]:
        med = grouped[col].median()
        iqr = grouped[col].quantile(0.75) - grouped[col].quantile(0.25)
        stats[col] = {"med": med, "iqr": iqr}
    return stats

def apply_group_stats(df, stats):
    for col, d in stats.items():
        med = df["CommercialTypeName4"].map(d["med"]).fillna(df[col].median())
        iqr = df["CommercialTypeName4"].map(d["iqr"]).fillna(df[col].quantile(0.75)-df[col].quantile(0.25))
        df[f"{col}_z"] = (df[col] - med) / (iqr + 1e-6)
    return df


In [13]:
# == Длина описания ==
def get_desc_length(df, id_col="id"):
    """
    Возвращает DataFrame со столбцом длины описания.

    """
    desc_len = df.copy()
    desc_len["desc_len"] = desc_len['description'].str.len()
    return desc_len['desc_len']

## Построим модель и обучим ее исходя из нашего бейзлайна.

In [17]:
def df_extend(df_full, df_num, df_text, embeddings, emb_img, df_type: str ='train', group_stats=None):
    # Разделим данные

    # Возьмем категориальные признаки
    cat_cols = ["brand_name", "CommercialTypeName4"]
    cat_data = df_text[cat_cols].astype(str)

    # Добавим длину описания
    desc_len_df = get_desc_length(df_text, id_col="id")
    df_num = df_num.merge(desc_len_df, on="id", how="left")
    df_full = df_full.merge(desc_len_df, on="id", how="left")

    if df_type == 'train':
        # ====== Train/Val Data ======
        # 1) Трейн/тест сплит
        y = df_num["resolution"].astype(int).values
        data_train_num, data_val_num, y_train, y_val, train_cat, val_cat, embeddings_train, embeddings_val, emb_img_train, emb_img_val \
        = train_test_split(
            df_num, y, cat_data, embeddings, emb_img, test_size=0.21, stratify=y, random_state=41
        )

        # 2) Формируем полные датафреймы
        data_train_full = data_train_num.merge(train_cat, on='id', how='left')
        data_val_full = data_val_num.merge(val_cat, on='id', how='left')

        # 3) Получаем групповые статистики (z оценки и прочее)
        group_stats = compute_group_stats(data_train_full)
        
        # 4) Добавляем групповые статистики к данным
        data_train_full = apply_group_stats(data_train_full, group_stats)
        data_val_full = apply_group_stats(data_val_full, group_stats)
        
        # 5) После всего мы создаем числовые датафреймы
        data_train_num = data_train_full.drop(columns=cat_cols)
        data_val_num = data_val_full.drop(columns=cat_cols)

        return data_train_num, data_val_num, y_train, y_val, \
                train_cat, val_cat, embeddings_train, embeddings_val, emb_img_train, emb_img_val, group_stats
    
    elif df_type == 'test':
        assert group_stats is not None, "Для теста нужно передать обученные group_stats"

        # Применяем GroupStats
        data_test_full = df_num.merge(cat_data, on='id', how='left')
        df_num = apply_group_stats(data_test_full, group_stats)
        df_num.drop(columns=cat_cols, inplace=True)

        return df_num, cat_data

### Организуем пайплайн из методов.

Сначала трейн и валидация.

In [18]:
# Получаем данные для тренировки
data_train_num, data_val_num, y_train, y_val, train_cat, val_cat, embeddings_train, embeddings_val, emb_img_train, emb_img_val, group_stats \
    = df_extend(df_train_full, df_train_num, df_train_text, embeddings, 
                emb_img=train_img_emb, 
                df_type='train'
)

Потом тест.

In [19]:
data_test_num, cat_data = df_extend(df_test_full, df_test_num, df_test_text, embeddings_test, test_img_emb,
                                    df_type='test',
                                    group_stats=group_stats)

#### Посмотрим все ли так

In [20]:
data_test_num.sample(5)

Unnamed: 0_level_0,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,...,is_rating_exists,has_full_data_90d,missing_orders_only_90d,is_item_count,return_rate_90d,fake_return_rate_90d,avg_order_value_90d,desc_len,PriceDiscounted_z,desc_len_z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
223492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,820.805133,1370,...,0,1,0,1,0.0,0.0,0.428551,581,0.269282,0.059908
339917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,553.2709,121,...,0,1,0,1,0.0,0.0,1.104002,39,-0.551144,-0.076233
506097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,990.496314,2,...,0,1,0,1,0.0,0.0,0.002721,597,0.703949,0.632124
274365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,909.33741,246,...,0,1,0,1,0.0,0.0,0.311667,588,-0.930488,0.666667
2746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1216.978435,0,...,0,0,1,0,0.0,0.0,176.210258,563,0.330893,-1.0


In [21]:
data_val_num.sample(10)

Unnamed: 0_level_0,resolution,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,...,is_rating_exists,has_full_data_90d,missing_orders_only_90d,is_item_count,return_rate_90d,fake_return_rate_90d,avg_order_value_90d,desc_len,PriceDiscounted_z,desc_len_z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
263910,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,579.063493,...,0,1,0,1,0.0,0.0,1.853516,48,-0.1105818,-0.1879195
165046,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,795.146472,...,0,1,0,1,0.0,0.0,0.685534,129,-0.0003341717,-0.16
283070,0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,618.141237,...,1,1,0,1,0.0,0.0,0.140849,560,-60331790.0,-3000000.0
247413,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,700.825771,...,0,1,0,1,0.0,0.0,10.440262,580,-0.7599815,0.1037975
313567,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,725.649953,...,0,1,0,1,0.0,0.0,6.376993,922,2.470277,1.545952
112383,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,748.077909,...,0,1,0,1,0.0,0.0,0.771682,155,1.994021,0.1639344
230796,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,637.194375,...,0,1,0,1,0.0,0.0,0.080509,554,0.07818386,0.8907104
443286,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,689.057197,...,0,1,0,1,0.0,0.0,0.175609,0,-0.3658898,-0.3037736
373519,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1127.564889,...,0,1,0,1,0.0,0.0,25.222913,574,-0.7785422,-0.6857143
134464,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,670.544551,...,1,1,0,1,0.0,0.0,0.170691,0,0.6544056,-0.1183971


# Построим модель.

**PCA for text embeddings**

In [None]:
from sklearn.decomposition import PCA

pca_text = PCA(n_components=189, random_state=27)
pca_image = PCA(n_components=64, random_state=27)

In [23]:
# text
embeddings_train_p = pca_text.fit_transform(embeddings_train)
embeddings_val_p = pca_text.transform(embeddings_val)
embeddings_test_p = pca_text.transform(embeddings_test)

In [24]:
# images
emb_img_train_p = pca_image.fit_transform(emb_img_train)
emb_img_val_p = pca_image.transform(emb_img_val)
emb_img_test_p = pca_image.transform(test_img_emb)

In [None]:
with open('pca/pca_text.pkl', 'wb') as le_dump_file:
    pickle.dump(pca_text, le_dump_file)
with open('pca/pca_image.pkl', 'wb') as le_dump_file:
    pickle.dump(pca_image, le_dump_file)

In [None]:
import pickle
def load_pcas(dir='img_emb'):
    """
    retrun:
    """
    with open('pca/pca_text.pkl', 'rb') as le_dump_file:
        pca_text = pickle.load(le_dump_file)
    with open('pca/pca_image.pkl', 'rb') as le_dump_file:
        pca_image = pickle.load(le_dump_file)

    return pca_text, pca_image

pca_text, pca_image = load_pcas()

### Трейн и валидация.

In [25]:
X_train_num = data_train_num.drop(columns=['resolution', 'ItemID'])
X_val_num = data_val_num.drop(columns=['resolution', 'ItemID'])

# Теперь объединяем: эмбеддинги + изображения + числовые
X_train_num = np.concatenate([embeddings_train_p, emb_img_train_p, X_train_num], axis=1)  # (N, D + num_features)
X_val_num = np.concatenate([embeddings_val_p, emb_img_val_p, X_val_num], axis=1)  # (N, D + num_features)

In [26]:
# Возьмем категориальные признаки
cat_cols = ["brand_name", "CommercialTypeName4"]

In [27]:
# --- Объединяем все в один датафрейм
X_train = pd.concat(
    [pd.DataFrame(X_train_num), train_cat.reset_index(drop=True)], axis=1
)
X_val = pd.concat(
    [pd.DataFrame(X_val_num), val_cat.reset_index(drop=True)], axis=1
)

In [28]:
# --- Создаем пулы для catboost

# Категориальные признаки теперь — последние len(cat_cols) колонок
cat_features_idx = list(range(X_train_num.shape[1], X_train_num.shape[1] + len(cat_cols)))

train_pool = Pool(X_train, label=y_train, cat_features=cat_features_idx)
val_pool = Pool(X_val, label=y_val, cat_features=cat_features_idx)


In [31]:
# Модель
model = CatBoostClassifier(
    iterations=800,
    depth=12,
    learning_rate=0.035,
    eval_metric="F1",
    random_seed=63,
    od_type="Iter",
    bagging_temperature = 2,
    random_strength = 2,
    l2_leaf_reg = 4,
    od_wait=50,
    task_type="GPU" if torch.cuda.is_available() else "CPU"
)

model.fit(train_pool, eval_set=val_pool,
    verbose=100,
    use_best_model=True,      # Использовать лучшую модель по валидации
    plot=True                 # Построить график обучения
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6758233	test: 0.6798140	best: 0.6798140 (0)	total: 448ms	remaining: 5m 57s
100:	learn: 0.8227537	test: 0.7672565	best: 0.7674961 (99)	total: 41.6s	remaining: 4m 48s
200:	learn: 0.8726557	test: 0.7791103	best: 0.7791103 (199)	total: 1m 20s	remaining: 3m 58s
300:	learn: 0.9076710	test: 0.7890580	best: 0.7894534 (296)	total: 1m 57s	remaining: 3m 15s
400:	learn: 0.9289022	test: 0.7950788	best: 0.7960008 (397)	total: 2m 34s	remaining: 2m 33s
500:	learn: 0.9419374	test: 0.7971571	best: 0.7971571 (488)	total: 3m 10s	remaining: 1m 53s
600:	learn: 0.9517927	test: 0.8010742	best: 0.8010753 (593)	total: 3m 46s	remaining: 1m 15s
bestTest = 0.8013816926
bestIteration = 601
Shrink model to first 602 iterations.


<catboost.core.CatBoostClassifier at 0x1f8701f9110>

In [32]:
# Предсказания
y_pred = model.predict(val_pool)
print("\n=== Classification report ===")
print(classification_report(y_val, y_pred, digits=4))


=== Classification report ===
              precision    recall  f1-score   support

           0     0.9832    0.9901    0.9867     38671
           1     0.8453    0.7618    0.8014      2741

    accuracy                         0.9750     41412
   macro avg     0.9143    0.8759    0.8940     41412
weighted avg     0.9741    0.9750    0.9744     41412



### Тест

In [33]:
# Возьмем числовные данные
num_data_test = data_test_num.drop(columns='ItemID').values
# Теперь объединяем: эмбеддинги + числовые
X_test_num = np.concatenate([embeddings_test_p, emb_img_test_p, num_data_test], axis=1)  # (N, D + num_features)

In [34]:
# Объединяем в датафрейм для CatBoost
X_test = pd.concat([pd.DataFrame(X_test_num), cat_data.reset_index(drop=True)], axis=1)

test_pool = Pool(X_test, cat_features=cat_features_idx)  # cat_features_idx те же, что для валидации

y_pred_test = model.predict(test_pool)

In [35]:
test_predictions = model.predict(test_pool)

submission = pd.DataFrame({
    'id': df_test.index,
    'prediction': test_predictions
})

submission.to_csv('submission.csv', index=False)


print(f"Создан файл submission.csv с {len(submission)} предсказаниями")
print(f"Распределение предсказаний:")
print(submission['prediction'].value_counts())
print()

Создан файл submission.csv с 31391 предсказаниями
Распределение предсказаний:
prediction
0    29586
1     1805
Name: count, dtype: int64



## Сохранение модели

In [None]:
model.save_model('catboost_model.cbm')