In [1]:
from pathlib import Path
from optimum.exporters.onnx import export
from transformers import Blip2ForConditionalGeneration, OPTForCausalLM, Blip2Processor, Blip2Model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Count every .npy file found recursively under ../storage without building a list.
sum(1 for npy_path in Path('../storage').rglob('*.npy'))

20342

In [3]:
# Count every .jpg file found recursively under ../storage without building a list.
sum(1 for jpg_path in Path('../storage').rglob('*.jpg'))

20342

# Сохраняем части модели в отдельные папки

In [2]:
# Load the processor and the full BLIP-2 model for one checkpoint id,
# keeping downloaded files under ../weights and placing the model on CUDA.
CHECKPOINT = 'Salesforce/blip2-flan-t5-xl-coco'
CACHE_DIR = '../weights'

processor = Blip2Processor.from_pretrained(CHECKPOINT, cache_dir=CACHE_DIR)
model = Blip2ForConditionalGeneration.from_pretrained(
    CHECKPOINT,
    cache_dir=CACHE_DIR,
    device_map='cuda',
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:40<00:00, 20.47s/it]


In [5]:
# Persist the whole model (with its processor) and, separately, the language
# and vision sub-models, each via its own save_pretrained call.
for component, target_dir in (
    (model, '../weights/blip2_t5/model'),
    (processor, '../weights/blip2_t5/model'),
    (model.language_model, '../weights/blip2_t5/text_model'),
    (model.vision_model, '../weights/blip2_t5/vision_model'),
):
    component.save_pretrained(target_dir)

# Подгружаем модель

In [2]:
# Reload the model and processor from the local directory; both read the same path.
LOCAL_MODEL_DIR = '../weights/blip2_t5/model/'

model = Blip2ForConditionalGeneration.from_pretrained(LOCAL_MODEL_DIR, device_map='cuda')
processor = Blip2Processor.from_pretrained(LOCAL_MODEL_DIR, cache_dir='../weights')

Loading checkpoint shards: 100%|██████████| 4/4 [01:43<00:00, 25.79s/it]


# Тестирование выходов и тесты для конвертации

In [6]:
import io

from PIL import Image
import requests

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png'

# Bound the wait and reject HTTP error responses up front, so a stalled server
# or an error page is reported as a network failure rather than as a PIL
# "cannot identify image file" error (or an indefinitely hanging cell).
response = requests.get(img_url, timeout=30)
response.raise_for_status()
# response.content is the fully downloaded, content-decoded body; the raw
# socket stream used previously is not decoded by requests.
image = Image.open(io.BytesIO(response.content)).convert('RGB')

# Other prompts that were tried here:
#   "qwerqwer"
#   "This is an image of a Singapore with a beautifull view on a river and architecture"
inputs = processor(image, "Question: which city is this? Answer:", return_tensors="pt")

In [7]:
import torch

# Run only the vision encoder on the preprocessed image, without tracking gradients.
pixel_values = inputs['pixel_values'].cuda()
with torch.no_grad():
    out = model.vision_model(pixel_values)

In [8]:
# Display `out` as the cell result; at this point in the file it holds the
# return value of model.vision_model(...) assigned in the preceding cell.
out

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.6173, -0.3679, -0.5741,  ...,  0.4800, -0.3569, -0.2696],
         [-0.0477, -1.0861,  0.5750,  ..., -0.1060,  0.6890,  0.5087],
         [ 0.0362, -0.1837,  0.1834,  ..., -0.9065, -0.3407,  0.5649],
         ...,
         [-1.0153,  0.2360, -0.9590,  ...,  0.2437, -1.2497,  0.0564],
         [ 0.0139, -0.6889,  0.4561,  ..., -0.3212,  0.3912,  0.7078],
         [-0.0230, -1.0312,  0.5580,  ..., -0.0162,  0.7475,  0.6912]]],
       device='cuda:0'), pooler_output=tensor([[-1.4233, -0.7922, -1.3463,  ...,  0.4727, -0.6312, -0.6605]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [4]:
# Display the processor output built from the test image and prompt.
# NOTE(review): this prints the full tensors; showing only the keys and
# shapes would keep the notebook output much smaller.
inputs

{'pixel_values': tensor([[[[ 0.5289,  0.5289,  0.5581,  ..., -0.2740, -0.3032, -0.2886],
          [ 0.5435,  0.5435,  0.5581,  ..., -0.3032, -0.3178, -0.3032],
          [ 0.5727,  0.5435,  0.5435,  ..., -0.3178, -0.3908, -0.3178],
          ...,
          [-0.5660, -0.5660, -0.5660,  ..., -0.8288, -0.7266, -0.8726],
          [-0.5952, -0.5806, -0.6244,  ..., -0.8580, -0.8580, -0.9018],
          [-0.6828, -0.6244, -0.5222,  ..., -0.9018, -0.9310, -0.9018]],

         [[ 0.6792,  0.6792,  0.7092,  ...,  0.3040,  0.2740,  0.3040],
          [ 0.6792,  0.6942,  0.7092,  ...,  0.3040,  0.2890,  0.3040],
          [ 0.6942,  0.6942,  0.6942,  ...,  0.2890,  0.2139,  0.3040],
          ...,
          [-0.7466, -0.7466, -0.7466,  ..., -0.6565, -0.5365, -0.6865],
          [-0.7766, -0.7916, -0.8216,  ..., -0.6865, -0.6715, -0.7166],
          [-0.8066, -0.7766, -0.6115,  ..., -0.7466, -0.7766, -0.7316]],

         [[ 1.0794,  1.0936,  1.1078,  ...,  1.2358,  1.1789,  1.2358],
          [ 1

In [5]:
import torch

# Arguments for one full forward pass. The [:, :-1] slices drop the last
# position of the prompt ids and mask (presumably the end-of-sequence token
# appended by the tokenizer; TODO confirm). The decoder is fed the single id 0.
forward_kwargs = dict(
    pixel_values=inputs['pixel_values'].cuda(),
    input_ids=inputs['input_ids'][:, :-1].cuda(),
    attention_mask=inputs['attention_mask'][:, :-1].cuda(),
    decoder_input_ids=torch.LongTensor([[0]]).cuda(),
    output_hidden_states=True,
)

with torch.no_grad():
    out = model(**forward_kwargs)

torch.Size([1, 677, 1408])


In [6]:
# Display the vision-encoder part of the full forward pass stored in `out`.
out.vision_outputs

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.6173, -0.3679, -0.5741,  ...,  0.4800, -0.3569, -0.2696],
         [-0.0477, -1.0861,  0.5750,  ..., -0.1060,  0.6890,  0.5087],
         [ 0.0362, -0.1837,  0.1834,  ..., -0.9065, -0.3407,  0.5649],
         ...,
         [-1.0153,  0.2360, -0.9590,  ...,  0.2437, -1.2497,  0.0564],
         [ 0.0139, -0.6889,  0.4561,  ..., -0.3212,  0.3912,  0.7078],
         [-0.0230, -1.0312,  0.5580,  ..., -0.0162,  0.7475,  0.6912]]],
       device='cuda:0'), pooler_output=tensor([[-1.4233, -0.7922, -1.3463,  ...,  0.4727, -0.6312, -0.6605]],
       device='cuda:0'), hidden_states=(tensor([[[ 0.7310,  0.2028, -0.1178,  ...,  0.3475, -0.1317, -0.3480],
         [ 0.3935,  0.9222, -0.0331,  ...,  0.0096,  0.0846, -0.3556],
         [ 0.3084,  1.4161, -0.1383,  ..., -0.0839,  0.1264, -0.1469],
         ...,
         [-0.4875, -0.2557,  0.1107,  ..., -0.0981,  0.1253,  1.3917],
         [-0.2723, -0.4330,  0.1945,  ...,  0.0883, -0.0205,  1.1

In [7]:
# Generate token ids from the image plus the prompt, then decode them to text.
# The [:, :-1] slices drop the last position of the prompt ids and mask
# (presumably the tokenizer's end-of-sequence token; TODO confirm).
generation_inputs = {
    'pixel_values': inputs['pixel_values'].cuda(),
    'input_ids': inputs['input_ids'][:, :-1].cuda(),
    'attention_mask': inputs['attention_mask'][:, :-1].cuda(),
}
generated_ids = model.generate(**generation_inputs)
processor.batch_decode(generated_ids)

tensor([[[ 22.5159,  21.3575, -10.0504,  ...,  13.3765,  24.7830,  32.7040],
         [  7.9274, -11.4760,  27.8471,  ...,   6.5680,   3.5965,  10.9894],
         [-18.7217,   3.9934,   3.5138,  ..., -10.5042, -18.3754, -55.2503],
         ...,
         [  8.1875,  -2.3125,  -1.1719,  ...,   9.2500,   1.4766, -13.6875],
         [ -4.6250, -10.2500,  -0.2617,  ...,  22.1250,   7.4062,  21.2500],
         [ -2.2969,   1.5234,  -2.7812,  ..., -15.4375,  -2.5156,  -3.8906]]],
       device='cuda:0')
{}
{'inputs_embeds': tensor([[[ 22.5159,  21.3575, -10.0504,  ...,  13.3765,  24.7830,  32.7040],
         [  7.9274, -11.4760,  27.8471,  ...,   6.5680,   3.5965,  10.9894],
         [-18.7217,   3.9934,   3.5138,  ..., -10.5042, -18.3754, -55.2503],
         ...,
         [  8.1875,  -2.3125,  -1.1719,  ...,   9.2500,   1.4766, -13.6875],
         [ -4.6250, -10.2500,  -0.2617,  ...,  22.1250,   7.4062,  21.2500],
         [ -2.2969,   1.5234,  -2.7812,  ..., -15.4375,  -2.5156,  -3.8906]]],



['<pad> a city is this?</s>']

# Проверка поиска по картинкам

In [42]:
import io

from PIL import Image
import requests


def fetch_rgb_image(url, timeout=30):
    """Download the image at `url` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) address of the image file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not respond within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Decode from the fully downloaded, content-decoded body rather than the
    # raw socket stream, which requests does not decode.
    return Image.open(io.BytesIO(response.content)).convert('RGB')


img_url = 'https://mykaleidoscope.ru/x/uploads/posts/2022-10/1666307531_34-mykaleidoscope-ru-p-ustavshii-chelovek-za-kompyuterom-vkontakt-36.jpg'
image1 = fetch_rgb_image(img_url)
inputs1 = processor(image1, "", return_tensors="pt")

# Alternative second images that were tried:
# img_url = 'https://abrakadabra.fun/uploads/posts/2022-02/1643966530_3-abrakadabra-fun-p-paren-za-kompom-4.jpg'
# img_url = 'https://amelie-style.ru/wp-content/uploads/2023/03/1647535676_54-kartinkin-net-p-lyudi-za-kompyuterom-kartinki-57.jpg'
img_url = 'https://babyzzz.ru/wp-content/uploads/2019/08/post_5d642dcdb8002.jpg'
image2 = fetch_rgb_image(img_url)
inputs2 = processor(image2, "", return_tensors="pt")

In [58]:
# Shape of the last entry in the decoder hidden states for the first image.
# NOTE(review): `out1` is assigned in the cell that follows this one in file
# order, so this cell fails on a fresh kernel run top to bottom.
out1.language_model_outputs.decoder_hidden_states[-1].shape

torch.Size([1, 1, 2048])

In [43]:
import torch


def _blip2_forward(batch):
    """Run the full model on one processor batch, feeding the decoder the single id 0."""
    return model(
        pixel_values=batch['pixel_values'].cuda(),
        input_ids=batch['input_ids'].cuda(),
        attention_mask=batch['attention_mask'].cuda(),
        decoder_input_ids=torch.LongTensor([[0]]).cuda(),
        output_hidden_states=True,
    )


# Same forward pass for both images, first image first, with gradients disabled.
with torch.no_grad():
    out1 = _blip2_forward(inputs1)
    out2 = _blip2_forward(inputs2)

In [61]:
import numpy as np

# Cosine similarity between the Q-Former pooled outputs of the two images.
# Each row is L2-normalized on its own (axis=-1) so the product stays a true
# cosine-similarity matrix for any number of rows; one global norm over the
# whole array is only correct when there is exactly one row.
features1 = out1.qformer_outputs.pooler_output.cpu().detach().numpy()
features1 = features1 / np.linalg.norm(features1, axis=-1, keepdims=True)

features2 = out2.qformer_outputs.pooler_output.cpu().detach().numpy()
features2 = features2 / np.linalg.norm(features2, axis=-1, keepdims=True)
features1 @ features2.T

array([[0.9612946]], dtype=float32)

In [62]:
import numpy as np

# Cosine similarity between the vision-encoder pooled outputs of the two images.
# Each row is L2-normalized on its own (axis=-1) so the product stays a true
# cosine-similarity matrix for any number of rows; one global norm over the
# whole array is only correct when there is exactly one row.
features1 = out1.vision_outputs.pooler_output.cpu().detach().numpy()
features1 = features1 / np.linalg.norm(features1, axis=-1, keepdims=True)

features2 = out2.vision_outputs.pooler_output.cpu().detach().numpy()
features2 = features2 / np.linalg.norm(features2, axis=-1, keepdims=True)
features1 @ features2.T  # Well, obviously the vision encoder is better

array([[0.3540915]], dtype=float32)

In [63]:
import numpy as np

# Cosine similarity between the first-position vectors of the language model's
# last encoder hidden state for the two images.
# Each row is L2-normalized on its own (axis=-1) so the product stays a true
# cosine-similarity matrix for any number of rows; one global norm over the
# whole array is only correct when there is exactly one row.
features1 = out1.language_model_outputs.encoder_last_hidden_state[:, 0, :].cpu().detach().numpy()
features1 = features1 / np.linalg.norm(features1, axis=-1, keepdims=True)

features2 = out2.language_model_outputs.encoder_last_hidden_state[:, 0, :].cpu().detach().numpy()
features2 = features2 / np.linalg.norm(features2, axis=-1, keepdims=True)
features1 @ features2.T

array([[0.96974516]], dtype=float32)

In [64]:
import numpy as np

# Cosine similarity between the last decoder hidden states of the two images.
# [0] selects the first batch item, leaving one row per decoder position.
# Each row is L2-normalized on its own (axis=-1) so the product stays a true
# cosine-similarity matrix for any number of rows; one global norm over the
# whole array is only correct when there is exactly one row.
features1 = out1.language_model_outputs.decoder_hidden_states[-1][0].cpu().detach().numpy()
features1 = features1 / np.linalg.norm(features1, axis=-1, keepdims=True)

features2 = out2.language_model_outputs.decoder_hidden_states[-1][0].cpu().detach().numpy()
features2 = features2 / np.linalg.norm(features2, axis=-1, keepdims=True)
features1 @ features2.T

array([[0.9988814]], dtype=float32)