# Генерация эмбеддингов изображений

## Общее описание
Ноутбук выполняет обработку изображений товаров для получения векторных представлений с использованием предобученной ViT-модели. Процесс включает:

1. Извлечение изображений из архивов
2. Пакетную обработку изображений
3. Сохранение эмбеддингов
4. Объединение результатов

## Особенности реализации

### Пакетная обработка
| **Аспект** | **Реализация** | **Преимущество** |
|------------|----------------|------------------|
| Размер батча | 20,000 изображений на файл эмбеддингов (на GPU подаются мини-батчи по 256) | Ограничение потребления RAM и памяти GPU |
| Стратегия | Последовательная обработка батчей | Минимизация перегрузки памяти |
| Валидация | Попытка открыть каждое изображение; нечитаемые заменяются нулевым тензором и помечаются `is_valid = 0` | Возможность отфильтровать некорректные эмбеддинги |

## Дальнейшие шаги
- Объединение всех батчей в единый датасет
- Нормализация эмбеддингов
- Интеграция с текстовыми признаками

In [None]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from transformers import AutoFeatureExtractor, AutoModel
import torchvision.transforms as T
import torch
import zipfile
from pathlib import Path
from torch.utils.data import DataLoader
import pickle
import gc
from tqdm.auto import tqdm
tqdm.pandas(desc='Tokenizing rows')

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Extract the images from the archives, one dataset part at a time.

# Directory holding the archives of the current part; they are unpacked in place.
p = Path('data/train/part_2')

for f in p.glob('*.zip'):
    with zipfile.ZipFile(f, 'r') as archive:
        # Reuse `p` so the globbed directory and the extraction target cannot drift apart.
        archive.extractall(p)
        # Report the real destination: nothing is unpacked into a per-archive folder.
        print(f"Extracted contents from '{f.name}' to '{p}' directory.")

# Get embeddings

In [None]:
# Load the pretrained model used to extract image embeddings.

model_ckpt = "nateraw/vit-base-beans"
# Fall back to the CPU when no CUDA device is available so the cell still runs.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt).to(device)
# Width of the model's hidden states, as reported by its config.
hidden_dim = model.config.hidden_size

Some weights of ViTModel were not initialized from the model checkpoint at nateraw/vit-base-beans and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Process the images part by part: set the path of the dataset and of its photos.

# Parquet file of the part to process.
dataset_path = 'data/test/test_part_0002.snappy.parquet'
# Folder searched for the '<image_id>.jpg' files of that part.
image_folder_path = 'data/test'

In [4]:
# Preprocessing pipeline applied to every PIL image before it is fed to the model.
transformation_chain = T.Compose(
    [
        # Resize using the extractor's configured height, then center-crop a
        # square of that same side, so the size follows the checkpoint config.
        T.Resize(extractor.size["height"]),
        T.CenterCrop(extractor.size["height"]),
        T.ToTensor(),
        # Normalize with the mean/std stored in the extractor.
        T.Normalize(mean=extractor.image_mean, std=extractor.image_std),
    ]
)

In [None]:
# Functions that compute and store the image embeddings.

def _load_image_batch(image_ids, image_dir, image_size):
    """Load and preprocess the images of one batch.

    Returns a tuple ``(images_tensor, loaded_ids, failed_ids)``. The tensor has
    one row per id in ``image_ids``; rows of images that could not be read or
    transformed stay all-zero so row positions keep matching ``image_ids``.
    """
    images_tensor = torch.zeros((len(image_ids), 3, image_size, image_size))
    loaded_ids = []
    failed_ids = []

    for i, image_id in enumerate(tqdm(image_ids, leave=False)):
        img_path = image_dir / f'{image_id}.jpg'
        try:
            with Image.open(img_path) as im:
                images_tensor[i] = transformation_chain(im.convert("RGB"))
        except Exception:
            # Deliberately best-effort: one missing or corrupt file must not
            # abort a long run. The failure is recorded instead of hidden.
            failed_ids.append(image_id)
        else:
            loaded_ids.append(image_id)

    return images_tensor, loaded_ids, failed_ids


def _embed_images(images_tensor, device, inference_batch_size):
    """Run the model over ``images_tensor`` and return the embeddings on the CPU.

    The embedding of an image is the first token of ``last_hidden_state``.
    """
    data_loader = DataLoader(
        images_tensor,
        batch_size=inference_batch_size,
        shuffle=False,  # keep row order aligned with the image ids
        num_workers=2,
        pin_memory=True,
    )

    outputs = []
    with torch.no_grad():
        for batch_data in data_loader:
            batch_data = batch_data.to(device, non_blocking=True)
            outputs.append(model(pixel_values=batch_data).last_hidden_state[:, 0].cpu())

    return torch.cat(outputs)


def image_dataset(dataset_path, image_path, batch_size=20000, inference_batch_size=256,
                  output_prefix='data/2_test_image_embedding',
                  valid_indices_path='2_test_valid_indices'):
    """Compute embeddings for every unique image referenced by a dataset part.

    The unique ids from the ``base_title_image`` and ``cand_title_image``
    columns are processed in chunks of ``batch_size``. Each chunk is written to
    ``f'{output_prefix}_{batch_idx:03}.pkl'`` as a pickled DataFrame with one
    row per id, in id order. Images that fail to load get the embedding of an
    all-zero input. The ids that loaded successfully are pickled as a list to
    ``valid_indices_path``.

    Relies on the module-level ``model``, ``extractor`` and
    ``transformation_chain``.

    Args:
        dataset_path: Parquet file holding the two image-id columns.
        image_path: Folder containing the ``<image_id>.jpg`` files.
        batch_size: Number of images per output file (held in RAM at once).
        inference_batch_size: Number of images per forward pass.
        output_prefix: Path prefix of the per-batch embedding files.
        valid_indices_path: Output path of the list of loadable image ids.

    Raises:
        ValueError: If ``batch_size`` or ``inference_batch_size`` is below 1.
    """
    if batch_size < 1 or inference_batch_size < 1:
        raise ValueError("batch_size and inference_batch_size must be positive")

    dataset = pd.read_parquet(dataset_path)
    all_images_id = pd.concat([dataset['base_title_image'], dataset['cand_title_image']]).unique()

    batches = [all_images_id[i:i + batch_size] for i in range(0, all_images_id.shape[0], batch_size)]
    image_size = extractor.size["height"]
    device = model.device
    image_dir = Path(image_path)

    valid_indices = []

    for batch_idx, batch in enumerate(batches, start=1):
        print(f"\n🔄 Батч {batch_idx} из {len(batches)}")

        images_tensor, loaded_ids, failed_ids = _load_image_batch(batch, image_dir, image_size)
        valid_indices.extend(loaded_ids)
        if failed_ids:
            print(f"Skipped {len(failed_ids)} unreadable image(s) in batch {batch_idx}, "
                  f"e.g. {failed_ids[0]}")

        embeddings_tensor = _embed_images(images_tensor, device, inference_batch_size)
        # The input tensor can only be released once nothing iterates over it any more.
        del images_tensor

        emb_df = pd.DataFrame(embeddings_tensor.numpy())

        output_file = f'{output_prefix}_{batch_idx:03}.pkl'
        with open(output_file, 'wb') as f:
            pickle.dump(emb_df, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Cleanup before the next large allocation.
        del emb_df, embeddings_tensor
        torch.cuda.empty_cache()
        gc.collect()

        print(f"✅ Готово: {output_file}")

    print(f"valid indices {len(valid_indices)}")
    with open(valid_indices_path, 'wb') as f:
        pickle.dump(valid_indices, f, protocol=pickle.HIGHEST_PROTOCOL)

        


In [6]:
# Run the embedding extraction for the configured dataset part and image folder.
image_dataset(dataset_path, image_folder_path)


🔄 Батч 1 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_001.pkl

🔄 Батч 2 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_002.pkl

🔄 Батч 3 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_003.pkl

🔄 Батч 4 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_004.pkl

🔄 Батч 5 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_005.pkl

🔄 Батч 6 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_006.pkl

🔄 Батч 7 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_007.pkl

🔄 Батч 8 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_008.pkl

🔄 Батч 9 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_009.pkl

🔄 Батч 10 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_010.pkl

🔄 Батч 11 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_011.pkl

🔄 Батч 12 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_012.pkl

🔄 Батч 13 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_013.pkl

🔄 Батч 14 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_014.pkl

🔄 Батч 15 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_015.pkl

🔄 Батч 16 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_016.pkl

🔄 Батч 17 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_017.pkl

🔄 Батч 18 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_018.pkl

🔄 Батч 19 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_019.pkl

🔄 Батч 20 из 21


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_020.pkl

🔄 Батч 21 из 21


  0%|          | 0/3845 [00:00<?, ?it/s]

✅ Готово: data/2_test_image_embedding_021.pkl
valid indices 403841


# Объединение эмбеддингов в датасет


In [4]:
def dataset_part(directory, dataset_path):
    """Assemble the image-embedding table of one dataset part.

    ``directory`` must contain the per-batch embedding pickles (``*.pkl``) and
    one additional pickled list with the ids of the images that loaded
    correctly. Every regular, non-hidden file that does not end in ``.pkl`` is
    treated as that list.

    The embedding files carry no image ids, so rows are matched to ids purely
    by position. Files are therefore read in sorted name order, which equals
    batch order for zero-padded names such as ``..._001.pkl``.

    Args:
        directory: Folder with the embedding pickles and the valid-id list.
        dataset_path: Parquet file with the ``base_title_image`` and
            ``cand_title_image`` columns the embeddings were computed from.

    Returns:
        DataFrame with an ``id`` column, the embedding columns and an
        ``is_valid`` column (1 if the id is in the valid-id list, else 0).

    Raises:
        FileNotFoundError: If the directory holds no valid-id list.
        ValueError: If the number of embedding rows differs from the number of
            unique image ids, i.e. the positional match would be wrong.
    """
    directory = Path(directory)

    dataset = pd.read_parquet(dataset_path)
    # Same id order as the one used when the embeddings were produced.
    all_images_id = pd.concat([dataset['base_title_image'], dataset['cand_title_image']]).unique()

    embedding_parts = []
    is_valid = None
    # Directory listings come back in arbitrary order; sort for a deterministic batch order.
    for file in sorted(directory.iterdir(), key=lambda path: path.name):
        # Skip sub-directories and hidden files (e.g. notebook checkpoints).
        if not file.is_file() or file.name.startswith('.'):
            continue
        # NOTE: pickle executes arbitrary code on load; only read files written by this pipeline.
        with open(file, 'rb') as f:
            payload = pickle.load(f)
        if file.name.endswith(".pkl"):
            embedding_parts.append(payload)
        else:
            is_valid = payload

    if is_valid is None:
        raise FileNotFoundError(f"No valid-id list found in '{directory}'")

    # A single concat avoids re-copying the accumulated frame for every file.
    full_data = pd.concat(embedding_parts, ignore_index=True) if embedding_parts else pd.DataFrame()
    if len(full_data) != len(all_images_id):
        raise ValueError(
            f"Found {len(full_data)} embedding rows in '{directory}' "
            f"but {len(all_images_id)} image ids in '{dataset_path}'"
        )

    all_data = pd.concat([pd.DataFrame({"id": all_images_id}), full_data], axis=1)
    is_valid_boolean_mask = all_data['id'].isin(is_valid)
    all_data["is_valid"] = np.where(is_valid_boolean_mask, 1, 0)
    print(f"Part done: len {len(all_data)}")

    return all_data

            

In [3]:
# Build the embedding table of every train part and merge them into one dataset.

data1 = dataset_part("data/image_embedding/part_1", 'data/train/part_1/train_part_0001.snappy.parquet')
data2 = dataset_part("data/image_embedding/part_2", 'data/train/part_2/train_part_0002.snappy.parquet')
# Merge pairwise first, then merge the two halves.
data_mid1 = pd.concat([data1, data2], ignore_index=True)

data3 = dataset_part("data/image_embedding/part_3", 'data/train/part_3/train_part_0003.snappy.parquet')
data4 = dataset_part("data/image_embedding/part_4", 'data/train/part_4/train_part_0004.snappy.parquet')
data_mid2 = pd.concat([data3, data4], ignore_index=True)

# ignore_index=True renumbers the rows 0..N-1 across all four parts.
final_dataset = pd.concat([data_mid1, data_mid2], ignore_index=True)



Part done: len 651056
Part done: len 654188
Part done: len 665349
Part done: len 480275


In [4]:
final_dataset

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,759,760,761,762,763,764,765,766,767,is_valid
0,40c72f08e0bb10b55e0605781481df2b5557b094aee695...,0.150431,-0.007837,-0.375743,0.185125,0.157627,0.026153,0.036855,-0.187436,0.023010,...,-0.039751,-0.162987,-0.110698,-0.009659,-0.140307,-0.046654,-0.096303,0.115212,-0.088266,1
1,181549e281126b799e54980db0b194918479e0db9be2ab...,0.020112,-0.172492,-0.054350,-0.069442,-0.223897,-0.038433,0.178443,0.143185,-0.192779,...,-0.018214,-0.212182,-0.201924,-0.121836,-0.055999,0.036406,-0.120529,0.130749,-0.033510,1
2,39ea6660f757f965e92f5d64fa70f6b52a5a1067b64e4e...,0.046536,-0.050100,-0.328599,-0.017658,0.033470,0.071916,-0.264820,-0.045898,0.104968,...,0.038837,-0.031744,-0.380836,0.144741,0.117165,-0.033304,0.213437,-0.267944,0.181924,1
3,1db6f93576bc904cf2c79b2c9783123648c7b7b9d83d2a...,0.045596,-0.369093,0.255815,-0.155695,-0.228253,-0.016100,-0.070344,0.100765,-0.118112,...,0.122302,0.124594,-0.131536,0.142997,0.365038,0.013494,-0.057187,0.001802,-0.139450,1
4,79bd9fea45264fa0fb4eaa33c227d3d16646ffd25432ae...,0.257884,0.081383,-0.095372,-0.189768,-0.328686,-0.057056,0.151580,0.089236,-0.192927,...,0.144585,0.313195,0.267563,0.020021,0.098842,-0.256244,0.221498,-0.067899,0.482283,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450863,0a78093cc0d94d33904b64d218b6dceffc7b0091323389...,-0.090081,-0.282590,0.247564,-0.009835,0.000436,-0.182356,-0.220152,-0.188813,0.057753,...,0.015113,-0.239260,-0.103945,-0.125620,0.110807,-0.035248,0.199342,-0.194876,0.075555,1
2450864,c75dbb144761c5a56422a4a0db7f3bda3cabc06cbe858e...,-0.152420,-0.117375,0.027629,-0.190627,-0.538336,0.109655,0.113162,-0.196852,0.057608,...,0.078428,0.316647,-0.016991,0.133258,-0.139889,0.039521,-0.098749,0.347209,0.004086,1
2450865,4ec42c9b1f9fecc1850357bda3b4f69378d7c93c4a6a05...,0.054741,-0.030969,-0.051611,-0.332978,-0.395662,-0.295773,-0.283911,-0.050584,-0.074790,...,0.072147,0.370626,-0.258408,-0.142357,0.192198,-0.255237,-0.133809,0.242901,0.202796,1
2450866,3099d8c28a8686fa9ae41269be64eed3cb2a6708ce0571...,-0.061228,-0.102187,0.208102,-0.159028,-0.094588,0.049488,-0.289952,-0.130007,0.278366,...,-0.003306,0.014440,-0.081698,0.242919,-0.103588,-0.012039,0.036888,0.219381,0.078352,1


In [6]:
# dataset_part fills the flag with 0/1 only, so int8 is enough to store it.
final_dataset['is_valid'] = final_dataset['is_valid'].astype("int8")

In [8]:
# Persist the merged train embeddings; the row index is not written to the file.
final_dataset.to_parquet('image_dataset.parquet', index=False)

# Для test выборки

In [None]:
# Same procedure for the test split: build both parts, then merge them.
# NOTE: this rebinds data1, data2 and final_dataset from the train cells.
data1 = dataset_part("data/test_image/part_1", 'data/test/test_part_0001.snappy.parquet')
data2 = dataset_part("data/test_image/part_2", 'data/test/test_part_0002.snappy.parquet')

final_dataset = pd.concat([data1, data2], ignore_index=True)


Part done: len 403831
Part done: len 403845


In [6]:
final_dataset

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,759,760,761,762,763,764,765,766,767,is_valid
0,cfc47375008a976fda632ffdd07d5d5218adc8f9b41a13...,-0.053470,-0.376845,0.029403,0.138460,0.081305,0.143291,-0.384077,0.290037,-0.103400,...,0.090915,-0.091706,-0.209423,-0.106779,0.148734,0.135629,0.077363,0.143927,0.349552,1
1,9efcb5c59529a7a95f393842ca482d14f989e478a602bf...,0.139880,-0.030384,-0.171219,0.354083,-0.237094,0.250550,0.013904,0.092169,0.255578,...,0.060941,0.173254,-0.456635,-0.081369,-0.561777,0.128867,0.299771,0.286959,-0.044755,1
2,434f5a021df1726564bb6176ca0fc2a5cba252a0af036a...,0.273737,0.118470,-0.308663,-0.089739,-0.470641,0.309041,0.293790,0.226620,0.063794,...,0.128215,0.080217,0.005760,0.260156,-0.187692,0.006975,-0.028788,-0.345603,-0.091642,1
3,e0662ceaf20ca2eaa48528688e8888bad68bf207c5a22d...,-0.221435,0.103587,-0.232625,0.066806,-0.090547,0.209647,0.057388,-0.192186,-0.033715,...,0.044297,-0.086333,0.037652,0.282149,-0.261861,-0.078956,-0.176428,-0.255011,-0.229893,1
4,7a2e5d0dd1a293feb77bf8c17659df6f3e9217059c6e8f...,0.138452,-0.148682,-0.021246,-0.028641,0.029578,-0.066481,-0.059650,0.054874,0.070609,...,0.045735,0.072206,0.174371,-0.312135,0.493590,-0.130459,0.121607,0.105870,-0.104576,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807671,989e74f5a7bf6d3755483ce4cf6a1029eb346d78c377fc...,0.151940,-0.145680,-0.309811,0.059615,-0.381078,0.218436,-0.341084,-0.010255,0.062423,...,0.047976,-0.161501,0.009639,0.219280,0.214207,-0.177815,0.013626,-0.133731,-0.000886,1
807672,d2f92e219d383b88d9d7d0055bb849e41deab388d5e9a4...,-0.119062,0.103118,-0.086583,-0.002577,-0.237752,0.143954,-0.138608,0.316920,-0.344482,...,-0.043116,0.147337,0.121185,0.216853,0.083281,-0.256460,-0.219148,0.046884,0.240852,1
807673,d0cfa504ff12aaaa21bd1e5b77304092397405eac307e6...,0.030072,-0.515179,0.056804,-0.011981,-0.167763,0.051317,-0.007666,0.070439,-0.033652,...,0.024333,-0.050354,0.061658,0.223499,0.249443,0.021944,-0.097513,-0.366735,0.261194,1
807674,8a5a2dd89eff32af20a94b5b91eb6874cf9c3b56d4b3c3...,0.055032,-0.009986,0.341586,-0.403560,-0.059526,0.149128,0.166626,-0.123529,0.176064,...,0.072517,0.038852,-0.034587,0.169669,0.019322,-0.031343,-0.501698,0.111718,0.357918,1


In [None]:
# Persist the merged test embeddings; the row index is not written to the file.
final_dataset.to_parquet('test_image_dataset.parquet', index=False)

: 