In [1]:
import matplotlib.pyplot as plt
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
from catboost import CatBoostClassifier, Pool

import warnings
warnings.filterwarnings('ignore')

# Загрузка данных с нормализацией и прочими вещами.

In [8]:
# Путь к папке с датасетом
DATA_DIR = os.path.join('..', 'data')
TRAIN_DATA_DIR = os.path.join('..', 'data', 'train.csv')
TEST_DATA_DIR = os.path.join('..', 'data', 'test.csv')

# Путь к эмбеддингам текста.
TEXT_EMB_DIR = os.path.join('..', 'notebooks/text_emb')
IMG_EMB_DIR = os.path.join('..', 'notebooks/img_emb')

Датасеты и эмбеддинги.

In [9]:
# Обработка данных
import sys
sys.path.append(str(Path.cwd().parent))
from scripts import data_preprocess

In [11]:
# Загрузка
df_train = pd.read_csv(TRAIN_DATA_DIR, index_col='id')
df_test = pd.read_csv(TEST_DATA_DIR, index_col='id')

In [12]:
# TEXT
embeddings, embeddings_test = data_preprocess.load_text_embeddings(dir=TEXT_EMB_DIR)
# IMAGES
train_img_emb, test_img_emb = data_preprocess.load_img_embeddings(dir=IMG_EMB_DIR)

Нормализуем

In [13]:
import pandas as pd
import numpy as np

def normalize_seller_features(df, group_col='SellerID', features=None):
    """
    Нормализует признаки продавцов, беря максимальные значения для каждого продавца
    
    Parameters:
    df - DataFrame с данными
    group_col - колонка для группировки (SellerID)
    features - список признаков для нормализации
    """
    if features is None:
        features = ['seller_time_alive', 'GmvTotal90', 'ExemplarAcceptedCountTotal90', 
                   'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal90']
    
    # Создаем копию датафрейма
    result_df = df.copy()
    
    # Для каждого признака находим максимальное значение по SellerID
    for feature in features:
        max_values = df.groupby(group_col)[feature].transform('max')
        result_df[feature] = max_values
    
    return result_df

In [14]:
def prepare_seller_features(train_df, test_df, group_col='SellerID', features=None):
    """
    Подготавливает признаки продавцов для train и test, используя train данные
    для общих продавцов и test данные для новых продавцов
    """
    if features is None:
        features = ['seller_time_alive', 'GmvTotal90', 'ExemplarAcceptedCountTotal90', 
                   'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal90']
    
    # Нормализуем train данные
    train_normalized = normalize_seller_features(train_df, group_col, features)
    
    # Для test данных используем два подхода:
    test_normalized = test_df.copy()
    
    # Находим общих продавцов между train и test
    common_sellers = set(train_df[group_col].unique()) & set(test_df[group_col].unique())
    new_sellers = set(test_df[group_col].unique()) - set(train_df[group_col].unique())
    
    print(f"Общих продавцов: {len(common_sellers)}")
    print(f"Новых продавцов в test: {len(new_sellers)}")
    
    # Для общих продавцов берем значения из train (максимальные исторические значения)
    seller_max_values = train_df.groupby(group_col)[features].max()
    
    for feature in features:
        # Для общих продавцов используем значения из train
        mask_common = test_normalized[group_col].isin(common_sellers)
        test_normalized.loc[mask_common, feature] = test_normalized.loc[mask_common, group_col].map(seller_max_values[feature])
        
        # Для новых продавцов берем максимальные значения из test
        mask_new = test_normalized[group_col].isin(new_sellers)
        if mask_new.any():
            new_seller_max = test_df.groupby(group_col)[feature].max()
            test_normalized.loc[mask_new, feature] = test_normalized.loc[mask_new, group_col].map(new_seller_max)
    
    return train_normalized, test_normalized

# Применение
train_processed, test_processed = prepare_seller_features(df_train, df_test)

Общих продавцов: 1789
Новых продавцов в test: 1342


Загружаем.

In [15]:
# Обрабатываем и получаем данные
df_train_num, df_train_text = data_preprocess.clean_data(train_processed)
df_test_num, df_test_text = data_preprocess.clean_data(test_processed, type='test')

**Есть ли изображение у товара.**

In [16]:
# Путь к папке с картинками
IMG_DIR = Path("C:/Users/vds/Work/Programming Stuff/ecup/data/images")
IMG_TRAIN_DIR = Path(IMG_DIR / "train")
IMG_TEST_DIR = Path(IMG_DIR / "test")

In [17]:
def has_img_col(df, img_dir):
    # Создаем словарь: ItemID -> has_img
    img_dict = {}
    unique_ids = df['ItemID'].unique()
    
    for img_id in tqdm(unique_ids, desc="Checking images"):
        path = img_dir / f"{int(img_id)}.png"
        img_dict[img_id] = 1 if path.exists() else 0
    
    # Создаем копию и добавляем колонку через map
    new_df = df.copy()
    new_df['has_img'] = df['ItemID'].map(img_dict)
    
    return new_df

In [18]:
df_train_num = has_img_col(df_train_num, IMG_TRAIN_DIR)
df_test_num = has_img_col(df_test_num, IMG_TEST_DIR)

Checking images: 100%|██████████| 197198/197198 [00:21<00:00, 9363.72it/s] 
Checking images: 100%|██████████| 31391/31391 [00:03<00:00, 9443.28it/s] 


**Формируем полный датафрейм.**

In [19]:
df_train_full = df_train_num.merge(df_train_text.drop(columns=['ItemID', 'resolution']), how='left', on='id')
df_test_full = df_test_num.merge(df_test_text.drop(columns=['ItemID']), how='left', on='id')

Добавляем z-оценки + разделение на train/val

In [21]:
# === Групповые статистики ===
def compute_group_stats(train_df):
    stats = {}
    grouped = train_df.groupby("CommercialTypeName4")
    for col in ["PriceDiscounted", "desc_len"]:
        med = grouped[col].median()
        iqr = grouped[col].quantile(0.75) - grouped[col].quantile(0.25)
        stats[col] = {"med": med, "iqr": iqr}
    return stats

def apply_group_stats(df, stats):
    for col, d in stats.items():
        med = df["CommercialTypeName4"].map(d["med"]).fillna(df[col].median())
        iqr = df["CommercialTypeName4"].map(d["iqr"]).fillna(df[col].quantile(0.75)-df[col].quantile(0.25))
        df[f"{col}_z"] = (df[col] - med) / (iqr + 1e-6)
    return df

# == Длина описания ==
def get_desc_length(df, id_col="id"):
    """
    Возвращает DataFrame со столбцом длины описания.

    """
    desc_len = df.copy()
    desc_len["desc_len"] = desc_len['description'].str.len()
    return desc_len['desc_len']

def df_extend(df_full, df_num, df_text, embeddings, emb_img, df_type: str ='train', group_stats=None):
    # Разделим данные

    # Возьмем категориальные признаки
    cat_cols = ["brand_name", "CommercialTypeName4"]
    cat_data = df_text[cat_cols].astype(str)

    # Добавим длину описания
    desc_len_df = get_desc_length(df_text, id_col="id")
    df_num = df_num.merge(desc_len_df, on="id", how="left")
    df_full = df_full.merge(desc_len_df, on="id", how="left")

    if df_type == 'train':
        # ====== Train/Val Data ======
        # 1) Трейн/тест сплит
        y = df_num["resolution"].astype(int).values
        data_train_num, data_val_num, y_train, y_val, train_cat, val_cat, embeddings_train, embeddings_val, emb_img_train, emb_img_val \
        = train_test_split(
            df_num, y, cat_data, embeddings, emb_img, test_size=0.25, stratify=y, random_state=42
        )

        # 2) Формируем полные датафреймы
        data_train_full = data_train_num.merge(train_cat, on='id', how='left')
        data_val_full = data_val_num.merge(val_cat, on='id', how='left')

        # 3) Получаем групповые статистики (z оценки и прочее)
        group_stats = compute_group_stats(data_train_full)
        
        # 4) Добавляем групповые статистики к данным
        data_train_full = apply_group_stats(data_train_full, group_stats)
        data_val_full = apply_group_stats(data_val_full, group_stats)
        
        # 5) После всего мы создаем числовые датафреймы
        data_train_num = data_train_full.drop(columns=cat_cols)
        data_val_num = data_val_full.drop(columns=cat_cols)

        return data_train_num, data_val_num, y_train, y_val, \
                train_cat, val_cat, embeddings_train, embeddings_val, emb_img_train, emb_img_val, group_stats
    
    elif df_type == 'test':
        assert group_stats is not None, "Для теста нужно передать обученные group_stats"

        # Применяем GroupStats
        data_test_full = df_num.merge(cat_data, on='id', how='left')
        df_num = apply_group_stats(data_test_full, group_stats)
        df_num.drop(columns=cat_cols, inplace=True)

        return df_num, cat_data

### Организуем пайплайн из методов.

Сначала трейн и валидация.

In [22]:
# Получаем данные для тренировки
data_train_num, data_val_num, y_train, y_val, cat_train, cat_val, embeddings_train, embeddings_val, emb_img_train, emb_img_val, group_stats \
    = df_extend(df_train_full, df_train_num, df_train_text, embeddings, 
                emb_img=train_img_emb, 
                df_type='train'
)

In [23]:
data_test_num, cat_test = df_extend(df_test_full, df_test_num, df_test_text, embeddings_test, test_img_emb,
                                    df_type='test',
                                    group_stats=group_stats)

# Создание нейросети.

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### Подготовка датасета.

Создаем словари.

In [25]:
vocab_brand = set(list(cat_train['brand_name'].replace('', '<unk>').unique()) + ['<unk>'])
vocab_type = set(list(cat_train['CommercialTypeName4'].replace('', '<unk>').unique()) + ['<unk>'])
print(f'Размер словаря vocab_brand: {len(vocab_brand)}')
print(f'Размер словаря vocab_type: {len(vocab_type)}')

Размер словаря vocab_brand: 3640
Размер словаря vocab_type: 617


Создаем удобные словари для индексов

In [26]:
brand2ind = {brand: i for i, brand in enumerate(vocab_brand)}
ind2brand = {i: brand for brand, i in brand2ind.items()}

type2ind = {tip: i for i, tip in enumerate(vocab_type)}
ind2type = {i: tip for tip, i in type2ind.items()}

In [27]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import pickle

class ModuleDataset(Dataset):
    def __init__(self, data_num, data_cat, text_embs, img_embs, y=None, mode: str = 'train'):
        """
        type: one of the types in ['train', 'val', 'test']
        """
        # Has image
        self.has_img = data_num['has_img'].values

        if mode != 'test':
            cols_to_drop = ['resolution','ItemID','has_img']
        else:
            cols_to_drop = ['ItemID','has_img']
        self.data_num = data_num.drop(columns=cols_to_drop)

        # Num data
        self.scaler = StandardScaler()
        if mode == 'train':
            self.data_num = self.scaler.fit_transform(self.data_num)
            with open('scaler.pkl', 'wb') as le_dump_file:
                pickle.dump(self.scaler, le_dump_file)
        else:
            with open('scaler.pkl', 'rb') as le_dump_file:
                self.scaler = pickle.load(le_dump_file)
            self.data_num = self.scaler.transform(self.data_num)

        # Cat data
        self.data_cat = data_cat
        self.brand_unk_ind = brand2ind['<unk>']
        self.type_unk_ind = type2ind['<unk>']

        # resolution
        self.y = y

        # text embeddings and image
        self.text_embs = text_embs
        self.img_embs = img_embs.values

        
    def __len__(self):
        return len(self.data_num)

    def __getitem__(self, idx):
        """
        idx: id товара в таблице
        """
        brand_id = brand2ind.get(self.data_cat['brand_name'].iloc[idx], self.brand_unk_ind)
        type_id  = type2ind.get(self.data_cat['CommercialTypeName4'].iloc[idx], self.type_unk_ind)
        sample = {
            "num": torch.tensor(self.data_num[idx], dtype=torch.float32),
            "brand": torch.tensor(brand_id, dtype=torch.long),
            "type": torch.tensor(type_id, dtype=torch.long),
            "text_emb": torch.tensor(self.text_embs[idx], dtype=torch.float32),
            "img_emb": torch.tensor(self.img_embs[idx], dtype=torch.float32),
            "has_img": torch.tensor(self.has_img[idx], dtype=torch.float32)
        }
        if self.y is not None:
            sample["label"] = torch.tensor(self.y[idx], dtype=torch.float32) 
        return sample

### Создаем датасеты.

In [None]:
train_dataset = ModuleDataset(data_train_num, cat_train, embeddings_train, emb_img_train, y=y_train, mode='train')
eval_dataset = ModuleDataset(data_val_num, cat_val, embeddings_val, emb_img_val, y=y_val, mode='val')
test_dataset = ModuleDataset(data_test_num, cat_test, embeddings_test, test_img_emb, mode='test')

In [None]:
batch_size = 64
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=batch_size)

test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

### Функция для обучения.

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def evaluate(model, dataloader, criterion):
    """
    Evaluate model on validation dataloader.
    Метрики: loss, F1 (по положительному классу), precision, recall.
    """
    model.eval()
    all_preds, all_labels = [], []
    running_loss, total_samples = 0.0, 0

    with torch.no_grad():
        for batch in dataloader:
            num = batch['num'].float().to(device)
            brand = batch['brand'].to(device)
            type_ = batch['type'].to(device)
            text_emb = batch['text_emb'].float().to(device)
            img_emb = batch['img_emb'].to(device)
            has_img = batch['has_img'].float().to(device)
            labels = batch['label'].long().to(device)

            logits = model(num, brand, type_, text_emb, img_emb, has_img)
            loss = criterion(logits.view(-1), labels.float())

            running_loss += loss.item() * labels.size(0)
            total_samples += labels.size(0)

            preds = (torch.sigmoid(logits) > 0.5).long().cpu()
            all_preds.append(preds)
            all_labels.append(labels.cpu())

    # Конкатенируем предсказания и таргеты
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    avg_loss = running_loss / total_samples
    f1 = f1_score(all_labels, all_preds, pos_label=1, average="binary")
    precision = precision_score(all_labels, all_preds, pos_label=1, average="binary")
    recall = recall_score(all_labels, all_preds, pos_label=1, average="binary")

    return avg_loss, f1, precision, recall

In [None]:
def train_one_epoch(model, dataloader, criterion, optimizer):
    """
    Тренировка одной эпохи.
    Метрики: loss, F1 (по положительному классу).
    """
    model.train()
    running_loss, total_samples = 0.0, 0
    all_preds, all_labels = [], []

    for batch in dataloader:
        num = batch['num'].float().to(device)
        brand = batch['brand'].to(device)
        type_ = batch['type'].to(device)
        text_emb = batch['text_emb'].float().to(device)
        img_emb = batch['img_emb'].to(device)
        has_img = batch['has_img'].float().to(device)
        labels = batch['label'].long().to(device)

        optimizer.zero_grad()
        logits = model(num, brand, type_, text_emb, img_emb, has_img)
        loss = criterion(logits.view(-1), labels.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * labels.size(0)
        total_samples += labels.size(0)

        preds = (torch.sigmoid(logits) > 0.5).long().cpu()
        all_preds.append(preds)
        all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    avg_loss = running_loss / total_samples
    f1 = f1_score(all_labels, all_preds, pos_label=1, average="binary")

    return avg_loss, f1


In [None]:
def train(train_loader, val_loader, model, optimizer, criterion, num_epochs=10):
    """
    Полный цикл обучения.
    Метрики:
    - train: loss, F1 (по классу 1)
    - val: loss, F1, precision, recall (по классу 1)
    """
    model = model.to(device)
    history = {
        'train_loss': [], 'train_f1': [],
        'val_loss': [], 'val_f1': [], 'val_precision': [], 'val_recall': []
    }

    best_f1 = 0.0
    os.makedirs("checkpoints", exist_ok=True)

    epoch_pbar = tqdm(range(num_epochs), desc='Epochs', position=0, leave=True)
    for epoch in epoch_pbar:
        # Train
        train_loss, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer)
        # Validation
        val_loss, val_f1, val_prec, val_rec = evaluate(model, val_loader, criterion)

        # Save metrics
        history['train_loss'].append(train_loss)
        history['train_f1'].append(train_f1)
        history['val_loss'].append(val_loss)
        history['val_f1'].append(val_f1)
        history['val_precision'].append(val_prec)
        history['val_recall'].append(val_rec)

        # Save best model by F1
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, os.path.join("checkpoints", 'best_model.pt'))
            tqdm.write(f"Val F1: {val_f1:.4f} (saved)")

        # Logs
        tqdm.write(
            f"Epoch {epoch+1}/{num_epochs} | "
            f"Train: loss {train_loss:.4f}, f1 {train_f1:.4f} | "
            f"Val: loss {val_loss:.4f}, f1 {val_f1:.4f}, prec {val_prec:.4f}, rec {val_rec:.4f}"
        )

    return history, model

## Модель и обучение.

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedMultimodalNet(nn.Module):
    def __init__(self,
                 num_num_features,        # число числовых (табличных) фич
                 num_brands,              # число уникальных брендов
                 num_types,               # число уникальных типов
                 text_emb_dim=384,        # размер готовых текстовых эмбеддингов
                 img_emb_dim=512,         # размер готовых эмбеддингов изображений
                 proj_dim=128,            # размер проекции для каждой модальности
                 fusion_hidden=256,
                 num_classes=1,           # 1 -> бинар (BCEWithLogits), >1 -> multi-class (CrossEntropy)
                 p_drop_modality=0.4):
        super().__init__()
        self.p_drop_modality = p_drop_modality
        self.num_classes = num_classes

        # === 1) числовые признаки -> proj_dim
        self.num_proj = nn.Sequential(
            nn.Linear(num_num_features, proj_dim),
            nn.ReLU(),
            nn.BatchNorm1d(proj_dim),
            nn.Dropout(0.1)
        )

        #  === 2) Категориальные признаки 
        emb_dim_cat = proj_dim // 2  # размер эмбеддинга категориального признака
        self.brand_emb = nn.Embedding(num_brands, emb_dim_cat)
        self.com_type_emb = nn.Embedding(num_types, emb_dim_cat)

        # после конкатенации эмбеддингов -> proj_dim
        self.cat_proj = nn.Sequential(
            nn.Linear(2 * emb_dim_cat, proj_dim),
            nn.ReLU(),
            nn.LayerNorm(proj_dim),
            nn.Dropout(0.1)
        )

        #  === 3) Текстовые эмбеддинги -> proj_dim
        self.text_proj = nn.Sequential(
            nn.Linear(text_emb_dim, proj_dim),
            nn.ReLU(),
            nn.LayerNorm(proj_dim),
            nn.Dropout(0.1)
        )

        #  === 4)эмбеддинги изображений -> proj_dim
        # resnet
        # ...
        self.img_proj = nn.Sequential(
            nn.Linear(img_emb_dim, proj_dim),
            nn.ReLU(),
            nn.LayerNorm(proj_dim),
            nn.Dropout(0.1)
        )

        # Гейты (по сути скалярные веса, но реализуем векторно)
        self.gate_num = nn.Sequential(nn.Linear(proj_dim, proj_dim), nn.Sigmoid())
        self.gate_cat = nn.Sequential(nn.Linear(proj_dim, proj_dim), nn.Sigmoid())
        self.gate_text = nn.Sequential(nn.Linear(proj_dim, proj_dim), nn.Sigmoid())
        self.gate_img  = nn.Sequential(nn.Linear(proj_dim, proj_dim), nn.Sigmoid())

        # Fusion + classifier
        self.fusion = nn.Sequential(
            nn.Linear(proj_dim * 4, fusion_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(fusion_hidden, fusion_hidden // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(fusion_hidden // 2, num_classes)
        )

    def modality_dropout(self, x, p):
        if not self.training or p <= 0:
            return x
        if torch.rand(1).item() < p:
            return torch.zeros_like(x)
        return x

    def forward(self, num_x, brand_x, comtype_x, text_emb, img_emb, has_img=None):
        """
        num_x:     [B, num_num_features] — числовые признаки
        brand_x:   [B] индексы брендов
        comtype_x: [B] индексы типов
        text_emb:  [B, text_emb_dim] эмбеддинги текста
        img_emb:   [B, img_emb_dim] эмбеддинги картинок
        has_img:   [B] (0/1), маска наличия изображения
        """

        # 1. Числовые
        num_h = self.num_proj(num_x)

        # 2. Категориальные
        brand_h = self.brand_emb(brand_x)
        comtype_h = self.com_type_emb(comtype_x)
        cat_h = torch.cat([brand_h, comtype_h], dim=1)
        cat_h = self.cat_proj(cat_h)

        # 3. Текст
        text_h = self.text_proj(text_emb)

        # 4. Изображения (+ маска)
        img_h = self.img_proj(img_emb)
        if has_img is not None:
            mask = has_img.view(-1,1).to(img_h.dtype)
            img_h = img_h * mask
        
        # Modality dropout
        num_h  = self.modality_dropout(num_h,  self.p_drop_modality)
        cat_h  = self.modality_dropout(cat_h,  self.p_drop_modality)
        text_h = self.modality_dropout(text_h, self.p_drop_modality)
        img_h  = self.modality_dropout(img_h,  self.p_drop_modality)


        # Gating
        num_out  = self.gate_num(num_h) * num_h
        cat_out  = self.gate_cat(cat_h) * cat_h
        text_out = self.gate_text(text_h) * text_h
        img_out  = self.gate_img(img_h) * img_h

        # Fuse
        fused = torch.cat([num_out, cat_out, text_out, img_out], dim=1)  # [B, proj_dim*4]
        logits = self.fusion(fused)
        return logits

## Обучаем.

In [7]:
nums = data_train_num.shape[1] - 3

NameError: name 'data_train_num' is not defined

In [32]:
# Задаем параметры для обучения
model = GatedMultimodalNet(nums, len(vocab_brand), len(vocab_type)).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [33]:
# Обучаем
history, last_model = train(train_dataloader, eval_dataloader, model, optimizer, criterion, num_epochs=16)

Epochs:   6%|▋         | 1/16 [00:35<08:48, 35.24s/it]

Val F1: 0.7261 (saved)
Epoch 1/16 | Train: loss 0.1730, f1 0.3656 | Val: loss 178.9041, f1 0.7261, prec 0.7389, rec 0.7138


Epochs:  12%|█▎        | 2/16 [01:11<08:22, 35.87s/it]

Val F1: 0.7557 (saved)
Epoch 2/16 | Train: loss 0.1203, f1 0.6072 | Val: loss 191.5159, f1 0.7557, prec 0.7563, rec 0.7551


Epochs:  19%|█▉        | 3/16 [01:50<08:06, 37.43s/it]

Val F1: 0.7686 (saved)
Epoch 3/16 | Train: loss 0.1095, f1 0.6469 | Val: loss 254.5736, f1 0.7686, prec 0.7713, rec 0.7659


Epochs:  25%|██▌       | 4/16 [02:31<07:42, 38.54s/it]

Val F1: 0.7819 (saved)
Epoch 4/16 | Train: loss 0.1029, f1 0.6698 | Val: loss 258.8522, f1 0.7819, prec 0.7772, rec 0.7867


Epochs:  31%|███▏      | 5/16 [03:05<06:48, 37.10s/it]

Epoch 5/16 | Train: loss 0.0985, f1 0.6923 | Val: loss 247.3919, f1 0.7752, prec 0.8084, rec 0.7447


Epochs:  38%|███▊      | 6/16 [03:40<06:02, 36.25s/it]

Val F1: 0.7867 (saved)
Epoch 6/16 | Train: loss 0.0940, f1 0.7026 | Val: loss 311.2771, f1 0.7867, prec 0.7954, rec 0.7781


Epochs:  44%|████▍     | 7/16 [04:14<05:20, 35.65s/it]

Val F1: 0.7891 (saved)
Epoch 7/16 | Train: loss 0.0922, f1 0.7132 | Val: loss 455.8851, f1 0.7891, prec 0.7922, rec 0.7861


Epochs:  50%|█████     | 8/16 [04:49<04:42, 35.25s/it]

Val F1: 0.7956 (saved)
Epoch 8/16 | Train: loss 0.0888, f1 0.7255 | Val: loss 473.5703, f1 0.7956, prec 0.8034, rec 0.7879


Epochs:  56%|█████▋    | 9/16 [05:23<04:04, 34.99s/it]

Val F1: 0.7966 (saved)
Epoch 9/16 | Train: loss 0.0861, f1 0.7349 | Val: loss 454.0543, f1 0.7966, prec 0.7986, rec 0.7947


Epochs:  62%|██████▎   | 10/16 [05:57<03:28, 34.67s/it]

Val F1: 0.7985 (saved)
Epoch 10/16 | Train: loss 0.0840, f1 0.7416 | Val: loss 320.5532, f1 0.7985, prec 0.7890, rec 0.8082


Epochs:  69%|██████▉   | 11/16 [06:31<02:52, 34.47s/it]

Val F1: 0.8067 (saved)
Epoch 11/16 | Train: loss 0.0803, f1 0.7569 | Val: loss 467.8629, f1 0.8067, prec 0.7959, rec 0.8177


Epochs:  75%|███████▌  | 12/16 [07:04<02:16, 34.17s/it]

Epoch 12/16 | Train: loss 0.0801, f1 0.7551 | Val: loss 420.4906, f1 0.8040, prec 0.8054, rec 0.8026


Epochs:  81%|████████▏ | 13/16 [07:39<01:42, 34.16s/it]

Val F1: 0.8088 (saved)
Epoch 13/16 | Train: loss 0.0783, f1 0.7591 | Val: loss 576.9255, f1 0.8088, prec 0.8092, rec 0.8085


Epochs:  88%|████████▊ | 14/16 [08:13<01:08, 34.13s/it]

Epoch 14/16 | Train: loss 0.0764, f1 0.7693 | Val: loss 574.9660, f1 0.8007, prec 0.7856, rec 0.8164


Epochs:  94%|█████████▍| 15/16 [08:47<00:34, 34.13s/it]

Epoch 15/16 | Train: loss 0.0745, f1 0.7756 | Val: loss 518.9133, f1 0.8055, prec 0.8121, rec 0.7990


Epochs: 100%|██████████| 16/16 [09:21<00:00, 35.10s/it]

Val F1: 0.8097 (saved)
Epoch 16/16 | Train: loss 0.0746, f1 0.7768 | Val: loss 500.4735, f1 0.8097, prec 0.8076, rec 0.8118





## Загрузка модели и предикт.

In [34]:
def load_best_model(model, checkpoint_path, device="cuda"):
    """
    Загружает лучшую сохранённую модель.
    """
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)
    model.eval()
    return model

In [35]:
def predict(model, dataloader, device="cuda"):
    """
    Получает предсказания на тестовом датасете.
    """
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            num = batch["num"].float().to(device)
            brand = batch["brand"].long().to(device)
            type_ = batch["type"].long().to(device)
            text_emb = batch["text_emb"].float().to(device)
            img_emb = batch["img_emb"].float().to(device)
            has_img = batch["has_img"].float().to(device)

            logits = model(num, brand, type_, text_emb, img_emb, has_img)
            prob = torch.sigmoid(logits).view(-1)      # бинарная классификация
            pred = (prob > 0.5).long()                 # 0 или 1
            preds.extend(pred.cpu().numpy())

    return preds

In [36]:
# Загружаем лучшую модель
best_model = load_best_model(model, "checkpoints/best_model.pt", device)

# Предсказания
test_preds = predict(best_model, test_dataloader, device)

# Сохраняем сабмишн
submission = pd.DataFrame({
    "id": df_test.index,
    "prediction": test_preds
})

submission.to_csv("submission.csv", index=False)

print(f"Создан файл submission.csv с {len(submission)} предсказаниями")
print("Распределение предсказаний:")
print(submission['prediction'].value_counts())

Predicting: 100%|██████████| 491/491 [00:03<00:00, 138.37it/s]


Создан файл submission.csv с 31391 предсказаниями
Распределение предсказаний:
prediction
0    29310
1     2081
Name: count, dtype: int64
