In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from PIL import Image
from transformers import AutoImageProcessor, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import glob
from tqdm import tqdm
warnings.filterwarnings("ignore")

KeyboardInterrupt: 

In [2]:
# Load the tabular train/test splits from the Kaggle input directory.
train = pd.read_parquet("/kaggle/input/vseros-avito-stage1/train_dataset.parquet")
test = pd.read_parquet("/kaggle/input/vseros-avito-stage1/test_dataset.parquet")
# Regression target is the natural log of the price; later cells map predictions back with np.exp.
train['target'] = np.log(train['price_TARGET'])

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/vseros-avito-stage1/train_dataset.parquet'

In [4]:
train['target']

0        10.839581
1        12.180755
2        15.796650
3        13.880362
4        10.896739
           ...    
69995    13.691080
69996    13.208541
69997    12.581079
69998    12.587928
69999    13.921671
Name: target, Length: 70000, dtype: float64

In [5]:
len(os.listdir("/kaggle/input/vseros-avito-stage1/AvitoAuto/АвтоПрайс/test_images"))

98075

In [6]:
len(os.listdir("/kaggle/input/vseros-avito-stage1/AvitoAuto/АвтоПрайс/train_images"))

273873

# EMBEDDINGS

In [38]:
# Load the pretrained ViT image preprocessor and backbone used to embed listing photos.
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True)
vit_model = AutoModel.from_pretrained("google/vit-base-patch16-224")
# Swap the pooler head for a no-op; the dataset class in this notebook only reads last_hidden_state.
del vit_model.pooler
vit_model.pooler = torch.nn.Identity()
# Prefer the GPU when one is visible, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit_model.to(device)
# Put the module in evaluation mode (last expression, so the notebook also prints the module repr).
vit_model.eval()

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): ViTOutput(
          (d

In [39]:
# Locations of the listing photos for the train and test splits.
train_image_dir = '/kaggle/input/vseros-avito-stage1/AvitoAuto/АвтоПрайс/train_images'
test_image_dir = '/kaggle/input/vseros-avito-stage1/AvitoAuto/АвтоПрайс/test_images'

print(f"Train images dir exists: {os.path.exists(train_image_dir)}")
print(f"Test images dir exists: {os.path.exists(test_image_dir)}")

# Check which files are present in the directories (only *.jpg files are counted here)
train_files = glob.glob(os.path.join(train_image_dir, "*.jpg"))
test_files = glob.glob(os.path.join(test_image_dir, "*.jpg"))
print(f"Found {len(train_files)} train images, {len(test_files)} test images")

Train images dir exists: True
Test images dir exists: True
Found 273873 train images, 98075 test images


In [40]:
class CarImageDataset(Dataset):
    """Dataset that yields one averaged ViT CLS embedding per listing ID.

    File names inside ``folder`` must start with the integer listing ID
    followed by an underscore (``<ID>_...``); every file sharing an ID is
    grouped under that ID.

    ``__getitem__`` relies on the module-level ``processor``, ``vit_model``
    and ``device`` objects, which must exist before items are requested.

    Args:
        folder: Directory that contains the image files.
        train: Stored as ``self.train``. It does not change what
            ``__getitem__`` returns: both splits yield ``(ID, embedding)``.
    """

    def __init__(self, folder, train=True):
        self.folder = folder
        self.image_paths = []
        self.ID2Images = {}
        self.train = train

        # Sorted listing keeps the per-ID image order deterministic.
        for img in sorted(os.listdir(folder)):
            path = os.path.join(folder, img)
            listing_id = int(img.split("_")[0])
            self.image_paths.append(path)
            self.ID2Images.setdefault(listing_id, []).append(path)

        self.ids = sorted(self.ID2Images.keys())

    def __len__(self):
        """Number of distinct listing IDs (not the number of image files)."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(ID, embedding)`` for the ``idx``-th listing.

        The embedding is the mean over the listing's images of the CLS token
        taken from ``vit_model``'s ``last_hidden_state``, as a NumPy array.
        """
        ID = self.ids[idx]

        images = []
        for image_path in self.ID2Images[ID]:
            # convert() returns an independent image, so the file handle can
            # be released immediately.
            with Image.open(image_path) as img:
                images.append(img.convert("RGB"))

        processed = processor(images, return_tensors="pt")
        processed = {k: v.to(device) for k, v in processed.items()}

        with torch.inference_mode():
            outputs = vit_model(**processed)
            cls_embed = outputs.last_hidden_state[:, 0, :]

        # The previous train-only branch looked the target up in ``train``
        # twice per item and discarded the result; both branches returned the
        # same tuple, so a single return is enough.
        return ID, torch.mean(cls_embed, dim=0).detach().cpu().numpy()

In [41]:
dataset = CarImageDataset(train_image_dir, train=True)
data_test = CarImageDataset(test_image_dir, train=False)

In [None]:
import torch

def rmse(y_true, y_pred):
    """Root-mean-squared error between two tensors, returned as a Python float."""
    squared_error = (y_true - y_pred).pow(2)
    return squared_error.mean().sqrt().item()

def mape(y_true, y_pred, eps=1e-8):
    """Mean absolute percentage error in percent; ``eps`` is added to the denominator."""
    relative_error = (y_true - y_pred) / (y_true + eps)
    percent = relative_error.abs().mean() * 100
    return percent.item()

def median_ape(y_true, y_pred, eps=1e-8):
    """Median absolute percentage error in percent, using ``torch.median``."""
    relative_error = (y_true - y_pred) / (y_true + eps)
    percent_errors = relative_error.abs() * 100
    return percent_errors.median().item()

In [None]:
device

In [None]:
from torch.utils.data import DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR

# train/val split: hold out 10% of the listings for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same listings end up in validation on every run;
# an unseeded random_split makes validation scores incomparable across runs.
split_generator = torch.Generator().manual_seed(52)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# collate_fn is expected to be defined in an earlier cell.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

num_epochs = 3
steps_per_epoch = len(train_loader)
total_steps = num_epochs * steps_per_epoch

# model (EmbedAggregator is expected to be defined in an earlier cell)
model = EmbedAggregator(embed_dim=768, hidden_dim=512)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# T_max is expressed in optimiser steps (epochs * batches), so the cosine
# schedule only completes if scheduler.step() is called once per batch.
# NOTE(review): confirm the training loop steps the scheduler per batch.
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=1e-7)
loss_fn = nn.MSELoss()  # MSE on log targets, i.e. optimises RMSE in log space

In [None]:
print("ЧЕКПОИНТ")

In [None]:
model.to(device)
vit_model.to(device);

In [None]:
# best_median_ape = float("inf")
# best_model_path = "best_model.pt"

# for epoch in range(3):
#     # ---- TRAIN ----
#     model.train()
#     train_loss = 0
#     pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]", leave=False)
#     for IDs, embeds, log_targets, targets in pbar:
#         embeds = embeds.to(device)
#         log_targets = log_targets.to(device)

#         preds = model(embeds)  # предсказание в log-пространстве
#         loss = loss_fn(preds, log_targets)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         train_loss += loss.item()
#         pbar.set_postfix({"loss": f"{loss.item():.4f}"})

#     scheduler.step()

#     # ---- VALID ----
#     model.eval()
#     val_preds, val_targets = [], []
#     val_loss = 0
#     pbar = tqdm(val_loader, desc=f"Epoch {epoch+1} [Valid]", leave=False)
#     with torch.no_grad():
#         for IDs, embeds, log_targets, targets in pbar:
#             embeds = embeds.to(device)
#             log_targets = log_targets.to(device)

#             preds = model(embeds)
#             loss = loss_fn(preds, log_targets)
#             val_loss += loss.item()

#             # переводим обратно из log → exp
#             preds_real = torch.exp(preds.cpu())
#             targets_real = targets

#             val_preds.append(preds_real)
#             val_targets.append(targets_real)

#             pbar.set_postfix({"val_loss": f"{loss.item():.4f}"})
    
#     val_preds = torch.cat(val_preds)
#     val_targets = torch.cat(val_targets)

#     # метрики
#     val_rmse = rmse(val_targets, val_preds)
#     val_mape = mape(val_targets, val_preds)
#     val_median_ape = median_ape(val_targets, val_preds)

#     print(f"\nEpoch {epoch+1}")
#     print(f"  Train loss (log-space): {train_loss/len(train_loader):.4f}")
#     print(f"  Val loss (log-space): {val_loss/len(val_loader):.4f}")
#     print(f"  Val RMSE: {val_rmse:.2f}")
#     print(f"  Val MAPE: {val_mape:.2f}%")
#     print(f"  Val MedianAPE: {val_median_ape:.2f}%")

#     # ---- SAVE BEST MODEL ----
#     if val_median_ape < best_median_ape:
#         best_median_ape = val_median_ape
#         torch.save(model.state_dict(), best_model_path)
#         print(f"  ✅ New best model saved! MedianAPE = {best_median_ape:.2f}%")

In [None]:
model.load_state_dict(torch.load("/kaggle/working/best_model.pt", map_location=device))

In [None]:
# Rebuild the datasets over the full image folders for embedding extraction.
# Note: this rebinds train_dataset / train_loader, which earlier referred to the 90% training split.
train_dataset = CarImageDataset(train_image_dir, train=True)
# shuffle=False so batches are produced in the dataset's own order.
train_loader = DataLoader(train_dataset, batch_size=80, shuffle=False, collate_fn=collate_fn)

test_dataset = CarImageDataset(test_image_dir, train=False)
# The test loader uses its own collate function (collate_fn_test, defined outside the visible cells).
test_loader = DataLoader(test_dataset, batch_size=80, shuffle=False, collate_fn=collate_fn_test)

In [None]:
def extract_all_embeds(dataloader, vit_model, device):
    """Gather the per-ID embeddings already present in ``dataloader`` batches.

    No forward pass is run here: each batch is expected to carry ready-made
    embeddings, which are flattened to 1-D and collected on the CPU.

    Args:
        dataloader: Iterable of batches shaped ``(IDs, embeds, *extras)``.
            ``embeds`` is iterated item by item; each item may be a tensor or
            anything ``torch.as_tensor`` accepts (for example a NumPy array).
            All flattened items must have the same length.
        vit_model: Module that is switched to eval mode; it is not called.
        device: Accepted for backward compatibility. Embeddings are gathered
            on the CPU, so no device transfer is performed.

    Returns:
        Tuple ``(all_ids, all_embeds)``. ``all_ids`` is a list of the IDs in
        iteration order. ``all_embeds`` is a tensor with one *column* per ID,
        i.e. shape ``[flattened_embed_dim, num_ids]``.
    """
    vit_model.eval()
    all_ids = []
    all_embeds = []

    with torch.no_grad():
        for IDs, embeds, *rest in tqdm(dataloader):
            # The previous version moved every item to `device` and straight
            # back to the CPU; converting in place gives the same values
            # without the round trip and also accepts NumPy inputs.
            all_embeds.extend(
                torch.as_tensor(embed).cpu().flatten() for embed in embeds
            )
            all_ids.extend(IDs)

    # Stack along dim=1 so that each column corresponds to one ID.
    all_embeds = torch.stack(all_embeds, dim=1)
    return all_ids, all_embeds

In [None]:
def extract_final_embeds(dataloader, model, device):
    """
    Run every batch through ``model`` and collect its outputs per ID.

    Each batch is unpacked as ``(IDs, embeds, *extras)``; the extras are ignored.

    Returns:
        - all_ids: list of IDs in iteration order
        - all_embeds: torch.Tensor with the model outputs concatenated along dim 0
    """
    model.eval()
    collected_ids, collected_outputs = [], []

    with torch.no_grad():
        for IDs, embeds, *_ignored in tqdm(dataloader, desc="Extracting final embeddings"):
            # Forward pass on the target device, results moved back to the CPU.
            batch_output = model(embeds.to(device)).cpu()
            collected_ids.extend(IDs)
            collected_outputs.append(batch_output)

    return collected_ids, torch.cat(collected_outputs, dim=0)

In [None]:
train_ids, train_agg_embeds = extract_all_embeds(train_loader, model, device)

In [None]:
test_ids, test_agg_embeds = extract_all_embeds(test_loader, model, device)

In [None]:
train_agg_embeds.shape

In [None]:
test_agg_embeds.shape

In [None]:
# TRAIN
# extract_all_embeds stacks along dim=1, so rows are embedding dimensions and columns are IDs.
train_embeds_cut = train_agg_embeds[:768, :]  # keep the first 768 rows (embedding dimensions)
# Transpose to one row per ID and name the columns emb_0 .. emb_767.
train_emb_df = pd.DataFrame(train_embeds_cut.T, columns=[f"emb_{i}" for i in range(768)])
# Captured before the ID column is added; reused later to build numeric_features.
emb_cols = train_emb_df.columns
train_emb_df["ID"] = train_ids
# Left merge keeps every row of train; rows whose ID has no embedding get NaN in the emb_* columns.
train_df = train.merge(train_emb_df, on="ID", how="left")

In [None]:
# TEST: same layout as for train — rows of test_agg_embeds are embedding dimensions, columns are IDs.
test_embeds_cut = test_agg_embeds[:768, :]
test_emb_df = pd.DataFrame(test_embeds_cut.T, columns=[f"emb_{i}" for i in range(768)])
test_emb_df["ID"] = test_ids
# Left merge keeps every row of test; rows whose ID has no embedding get NaN in the emb_* columns.
test_df = test.merge(test_emb_df, on="ID", how="left")

In [None]:
train_df

In [None]:
test_df

In [None]:
train = train_df.copy()
test = test_df.copy()

# PREDICT

In [None]:
# Columns passed to CatBoost as categorical features (extended with further columns in a later cell).
cat_features = ['equipment', 'body_type', 'drive_type', 'engine_type', 'doors_number', 'color', 'pts', 'diski', 'audiosistema', 'electropodemniki', 'fary', 'salon', 'upravlenie_klimatom',
                'usilitel_rul', 'steering_wheel', 'crashes_count', 'owners_count']
# Numeric inputs: three tabular columns plus every image-embedding column (emb_*).
numeric_features = ['mileage', 'latitude', 'longitude'] + np.array(emb_cols).tolist()

# Columns whose cells hold several labels each; they are one-hot encoded in a later cell.
listed_features = ['aktivnaya_bezopasnost_mult', 'audiosistema_mult', 'shini_i_diski_mult', 'electroprivod_mult', 'fary_mult', 'multimedia_navigacia_mult', 'obogrev_mult', 'pamyat_nastroek_mult', 'podushki_bezopasnosti_mult',
                   'pomosh_pri_vozhdenii_mult', 'protivoygonnaya_sistema_mult', 'salon_mult', 'upravlenie_klimatom_mult']

# Kept as one-element lists so they can be used in `col not in ...` membership checks.
target = ['price_TARGET']
ID = ['ID']

In [None]:
from tqdm import tqdm

In [None]:
import pandas as pd

def _one_hot_list_column(frame, column):
    """Return one-hot columns (one row per original index) for a list-valued column."""
    # Unroll every list into one row per label.
    exploded = frame[[column]].explode(column)
    dummies = pd.get_dummies(exploded[column], prefix=column)
    # Fold the exploded rows back onto the original index.
    return dummies.groupby(exploded.index).max()


def _join_dummies(frame, dummies):
    """Join ``dummies`` onto ``frame``, replacing same-named columns left by an earlier run."""
    stale = [name for name in dummies.columns if name in frame.columns]
    return frame.drop(columns=stale).join(dummies)


# Collect every distinct label present in train and in test
labels = {col: set().union(*train[col]) for col in listed_features}
test_labels = {col: set().union(*test[col]) for col in listed_features}

for col in listed_features:
    train_dummies = _one_hot_list_column(train, col)
    # Encoding the two frames independently can give them different dummy
    # columns when a label occurs in only one split. Align test to the
    # columns learned from train: labels unseen in train are dropped, labels
    # missing from test become all-zero columns with the train dtype.
    test_dummies = (
        _one_hot_list_column(test, col)
        .reindex(columns=train_dummies.columns, fill_value=0)
        .astype(train_dummies.dtypes.to_dict())
    )
    # Replacing stale columns keeps this cell safe to re-run.
    train = _join_dummies(train, train_dummies)
    test = _join_dummies(test, test_dummies)

In [None]:
for i, lbs in enumerate(labels):
    print(lbs, labels[lbs] == test_labels[lbs])

In [None]:
# Every test column that is not the target, a list-valued feature, a numeric feature, an ID,
# or already categorical is appended to cat_features — this includes the one-hot columns
# joined onto test in the preceding cell.
for col in test.columns:
    if col not in target and col not in listed_features and col not in numeric_features and col not in cat_features and col not in ID:
        cat_features.append(col)

## CATBOOST

In [None]:
!pip install -U scikit-learn

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
!pip install -U "scikit-learn<1.5"

In [None]:
!pip install -U catboost

In [None]:
from catboost import CatBoostRegressor
import numpy as np

# Replace missing categorical values with the literal string "NaN" in both frames.
test[cat_features] = test[cat_features].fillna("NaN")
train[cat_features] = train[cat_features].fillna("NaN")

# Recompute the log-price target on the merged training frame.
train['target'] = np.log(train['price_TARGET'])

class MedianAPE:
    """Metric object exposing ``is_max_optimal`` / ``evaluate`` / ``get_final_error``.

    The score is the median absolute percentage error expressed as a fraction.
    """

    @staticmethod
    def get_median_ape(y_true, y_pred, eps=1e-8):
        """Median of ``|y_pred - y_true| / max(|y_true|, eps)``."""
        actual = np.asarray(y_true)
        predicted = np.asarray(y_pred)
        denominator = np.maximum(np.abs(actual), eps)
        return np.median(np.abs(predicted - actual) / denominator)

    def is_max_optimal(self):
        """Lower MdAPE is better, so the metric is not maximised."""
        return False

    def evaluate(self, approxes, target, weight):
        """Score the single prediction vector in ``approxes``; ``weight`` is not used."""
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)
        metric_value = self.get_median_ape(np.array(target), np.array(predictions))
        # (metric value, weight)
        return metric_value, 1

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged."""
        return error


# Feature matrix (categorical + numeric columns) and log-price target.
X = train[cat_features + numeric_features]
y = train['target']

# Manual, seeded 90/10 shuffle split over row positions.
indices = np.arange(len(train))
np.random.seed(52)
np.random.shuffle(indices)

split = int(0.9 * len(indices))
train_idx, val_idx = indices[:split], indices[split:]

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=52, shuffle=True)

# Note: this rebinds `model`, which earlier cells used for the EmbedAggregator network.
model = CatBoostRegressor(
    iterations=7500,
    learning_rate=0.03,
    depth=10,
    cat_features=cat_features,
    loss_function="RMSE",
    eval_metric="RMSE",
    verbose=100,
    random_seed=42,
    task_type="GPU"
)

# RMSE is optimised on the log target; cat_features is passed both here and to the constructor.
model.fit(X=X_train, y=y_train, eval_set=(X_val, y_val),
          cat_features=cat_features, use_best_model=True)


def median_APE(y_true, y_pred, eps=1e-8):
    """Median absolute percentage error (as a fraction); ``|y_true|`` is floored at ``eps``."""
    actual, predicted = np.asarray(y_true), np.asarray(y_pred)
    safe_denominator = np.maximum(np.abs(actual), eps)
    absolute_error = np.abs(predicted - actual)
    return np.median(absolute_error / safe_denominator)

# Predictions are in log space; np.exp maps them back to prices.
y_pred = np.exp(model.predict(X_val))
# Note: y_val is rebound from the log target to the raw prices of the same validation rows.
y_val = train.loc[X_val.index, "price_TARGET"]
score = median_APE(y_val, y_pred)
# "LB" is computed as 1 / (1 + MedianAPE).
print(f"Median_APE: {score}\n" +
      f"LB: {1 / (1 + score)}\n" +
      f"MAE: {mean_absolute_error(y_val, y_pred)}\n" +
      f"Mean_APE: {mean_absolute_percentage_error(y_val, y_pred)}")

In [None]:
# Build the submission frame: test IDs with price predictions mapped back from log space.
submission = pd.DataFrame({"ID": test["ID"],
              "target": np.exp(model.predict(test[cat_features + numeric_features]))})

In [None]:
submission.to_csv("baseline.csv", index=False)

In [None]:
submission

# OPTUNA

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import numpy as np
import optuna
from optuna.samplers import RandomSampler

# Data preprocessing: fill missing categoricals with the string "NaN" and rebuild the log-price target
test[cat_features] = test[cat_features].fillna("NaN")
train[cat_features] = train[cat_features].fillna("NaN")
train['target'] = np.log(train['price_TARGET'])

# Метрика MedianAPE
class MedianAPE:
    """Median absolute percentage error (fraction) wrapped as a custom metric object."""

    @staticmethod
    def get_median_ape(y_true, y_pred, eps=1e-8):
        """Return the median of the absolute errors divided by ``|y_true|`` floored at ``eps``."""
        truth = np.asarray(y_true)
        estimate = np.asarray(y_pred)
        ratios = np.abs(estimate - truth) / np.maximum(np.abs(truth), eps)
        return np.median(ratios)

    def is_max_optimal(self):
        """The metric is minimised."""
        return False

    def evaluate(self, approxes, target, weight):
        """Compute the metric for the single prediction vector in ``approxes``.

        ``weight`` is accepted but not used; the returned weight is always 1.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        value = self.get_median_ape(np.array(target), np.array(approxes[0]))
        return value, 1

    def get_final_error(self, error, weight):
        """Pass the error through unchanged."""
        return error

def median_APE(y_true, y_pred, eps=1e-8):
    """Median of ``|y_pred - y_true| / max(|y_true|, eps)``, returned as a fraction."""
    truth = np.asarray(y_true)
    estimate = np.asarray(y_pred)
    ratios = np.abs(estimate - truth) / np.maximum(np.abs(truth), eps)
    return np.median(ratios)

# Data split: seeded 90/10 train/validation split on the log-price target
X = train[cat_features + numeric_features]
y = train['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=52, shuffle=True)

In [None]:
# Функция для оптимизации
def objective(trial):
    """Optuna objective: fit one CatBoost configuration and score it on the validation split.

    Reads ``X_train``, ``y_train``, ``X_val``, ``y_val``, ``train`` and
    ``cat_features`` from module scope.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Median APE on raw prices for the validation rows (lower is better),
        or ``float('inf')`` when training or prediction raises.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 7500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 11),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'loss_function': 'RMSE',
        'eval_metric': MedianAPE(),
        'cat_features': cat_features,
        'verbose': False,
        'random_seed': 42,
        'task_type': 'GPU',
        # "early_stopping_rounds": 250
    }

    # CatBoost documents min_data_in_leaf as usable only with the Depthwise
    # and Lossguide growing policies, so it is sampled only for those. This
    # avoids spending trials on configurations the library may reject.
    if params['grow_policy'] != 'SymmetricTree':
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 250)

    model = CatBoostRegressor(**params)

    try:
        model.fit(
            X=X_train, y=y_train,
            eval_set=(X_val, y_val),
            early_stopping_rounds=100,
            verbose=False
        )

        # Predict in log space, map back to prices and score against raw prices.
        y_pred = np.exp(model.predict(X_val))
        y_true = train.loc[X_val.index, "price_TARGET"]
        score = median_APE(y_true, y_pred)

    except Exception as e:
        # Best effort: a failed configuration is reported and given the worst
        # possible score so the study keeps running.
        print(f"Ошибка при обучении: {e}")
        score = float('inf')

    return score

# Study: minimise the objective with a seeded random sampler
study = optuna.create_study(
    direction='minimize',
    sampler=RandomSampler(seed=16)
)
study.optimize(objective, n_trials=600)

# Best parameters
print("Лучшие параметры:")
for key, value in study.best_params.items():
    print(f"{key}: {value}")
print(f"Лучший Median_APE: {study.best_value}")

In [None]:
study.best_params

In [None]:
# Hard-coded CatBoost hyper-parameters used for the final fit below.
# NOTE(review): presumably copied from an earlier study.best_params — confirm the provenance.
pars = {'iterations': 4773,
 'learning_rate': 0.023540714865270384,
 'depth': 11,
 'l2_leaf_reg': 4.339528598041656,
 'random_strength': 1.4767698885793084,
 'bagging_temperature': 0.5228168517505888,
 'border_count': 186,
 'grow_policy': 'Depthwise',
 'min_data_in_leaf': 76}

In [None]:
# Same values as `pars` except iterations=1242.
# NOTE(review): not referenced by any other visible cell — confirm whether it is still needed.
pars_redacted = {'iterations': 1242,
 'learning_rate': 0.023540714865270384,
 'depth': 11,
 'l2_leaf_reg': 4.339528598041656,
 'random_strength': 1.4767698885793084,
 'bagging_temperature': 0.5228168517505888,
 'border_count': 186,
 'grow_policy': 'Depthwise',
 'min_data_in_leaf': 76}

In [None]:
# Final training with the best parameters (the hard-coded `pars` dict)
best_model = CatBoostRegressor(
    **pars,
    cat_features=cat_features,
    loss_function="RMSE",
    eval_metric=MedianAPE(),
    verbose=100,
    random_seed=42,
    task_type="GPU"
)

# The eval metric is computed on (X_val, y_val), where y_val is the log-price target.
best_model.fit(
    X=X_train, y=y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    early_stopping_rounds=250
)

In [None]:
# Final evaluation: map log-space predictions back to prices and compare with raw validation prices
y_pred = np.exp(best_model.predict(X_val))
y_true = train.loc[X_val.index, "price_TARGET"]
final_score = median_APE(y_true, y_pred)

print(f"\nФинальный Median_APE: {final_score}")
# The "LB score" is computed as 1 / (1 + MedianAPE).
print(f"LB score: {1 / (1 + final_score)}")

In [None]:
# Build the submission frame from the tuned model: test IDs with predictions mapped back from log space.
submission = pd.DataFrame({"ID": test["ID"],
              "target": np.exp(best_model.predict(test[cat_features + numeric_features]))})

In [None]:
submission.to_csv("opt.csv", index=False)

In [None]:
submission