# BLIP2 baseline

This notebook is based on [salesforce-lavis](https://github.com/salesforce/LAVIS). <br>
BLIP-2 models are too large to load naively, so I use memory-saving techniques such as `init_empty_weights`. <br>
To finish the submission within the 9-hour limit, the beam width of the decoder's beam search is reduced to 3.

In [13]:
# locally downloaded salesforce-lavis
#!pip install salesforce-lavis --no-index --find-links=file:///kaggle/input/lavis-pip/
#!pip install salesforce-lavis

In [14]:
# Loading local weight files requires a modified version of salesforce-lavis, so uninstall the stock package first.
#!pip uninstall -y salesforce-lavis

In [15]:
# and install modified salesforce-lavis
#!pip install salesforce-lavis --no-index --find-links=file:///kaggle/input/lavis-mod-wheel/salesforce_lavis-1.0.0.dev1-py3-none-any.whl

In [5]:
import gc
import os
import sys
from pathlib import Path
from typing import Dict

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn as nn
from accelerate import init_empty_weights
from lavis.models import load_model, load_preprocess, load_model_and_preprocess
from lavis.models import load_model_and_preprocess
from lavis.models.blip2_models.blip2_opt import Blip2OPT
from lavis.processors import load_processor
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


#sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
#from sentence_transformers import SentenceTransformer, models

In [6]:
# these helper functions are based on the following repository. 
# https://github.com/FrancescoSaverioZuppichini/Loading-huge-PyTorch-models-with-linear-memory-consumption/blob/main/README.md
def get_keys_to_submodule(model: nn.Module) -> Dict[str, nn.Module]:
    """Map every fully qualified parameter name to the module that owns it.

    Args:
        model: Root module to inspect.

    Returns:
        A dict whose keys are dotted parameter names as seen from ``model``
        (e.g. ``"encoder.layer.weight"``, or just ``"weight"`` for a parameter
        registered directly on ``model``) and whose values are the submodules
        on which that parameter is registered.
    """
    keys_to_submodule: Dict[str, nn.Module] = {}
    for submodule_name, submodule in model.named_modules():
        # recurse=False yields only the parameters registered directly on this
        # submodule, so descendants are not re-walked for every ancestor.
        for param_name, _ in submodule.named_parameters(recurse=False):
            # The root module has an empty name and therefore needs no prefix.
            key = f"{submodule_name}.{param_name}" if submodule_name else param_name
            keys_to_submodule[key] = submodule
    return keys_to_submodule


def load_state_dict_with_low_memory(model: nn.Module, state_dict: Dict[str, torch.Tensor]):
    """Populate ``model`` from ``state_dict`` one tensor at a time.

    The model is first moved to the ``meta`` device, then each parameter and
    buffer that has an entry in ``state_dict`` is replaced by that entry, cast
    to the dtype the model declared for it.

    Args:
        model: Module to populate; it is modified in place.
        state_dict: Mapping from fully qualified tensor names to tensors.

    Notes:
        * Loaded parameters are created with ``requires_grad=False``.
        * Names missing from ``state_dict`` are skipped silently; those tensors
          stay on the ``meta`` device.
    """
    model.to(torch.device("meta"))

    # Parameters: swap each meta placeholder for the stored tensor.
    for key, submodule in get_keys_to_submodule(model).items():
        val = state_dict.get(key)
        if val is None:
            continue
        param_name = key.split('.')[-1]
        param_dtype = getattr(submodule, param_name).dtype
        new_val = torch.nn.Parameter(val.to(param_dtype), requires_grad=False)
        setattr(submodule, param_name, new_val)

    # Buffers: ``model.to("meta")`` moved these as well, so restore every
    # buffer that the state dict provides; otherwise they would stay on meta.
    for submodule_name, submodule in model.named_modules():
        # list() snapshots the buffers before they are reassigned below.
        for buffer_name, buffer in list(submodule.named_buffers(recurse=False)):
            key = f"{submodule_name}.{buffer_name}" if submodule_name else buffer_name
            val = state_dict.get(key)
            if val is not None:
                setattr(submodule, buffer_name, val.to(buffer.dtype))

In [7]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Root directory of the competition data, relative to this notebook.
comp_path = Path('../data/')

In [10]:
# Build the BLIP-2 (OPT-2.7b) architecture inside accelerate's
# `init_empty_weights` context, the technique the introduction mentions for
# loading this very large model. The real weights are loaded from a local
# checkpoint in a later cell.
# NOTE: the following cells use `my_model`, so it has to be defined here for a
# fresh "Restart & Run All". The image processors are built separately below,
# so nothing else needs to be loaded in this cell.
with init_empty_weights():
    my_model = Blip2OPT(opt_model="facebook/opt-2.7b")

  0%|          | 0.00/1.89G [00:00<?, ?B/s]

In [8]:
class DictWrapper:
    """Expose a plain dict through attribute access plus a ``get`` method.

    ``wrapper.key`` returns ``d["key"]`` and ``wrapper.get(key, default)``
    behaves like ``dict.get``. The wrapped dict itself is available as
    ``wrapper.dict``.
    """

    def __init__(self, d):
        self.dict = d

    def __getattr__(self, name):
        # __getattr__ only runs after normal attribute lookup has failed.
        # If even ``dict`` is missing (e.g. on an instance created without
        # __init__ by copy/pickle), fail fast instead of recursing forever.
        if name == "dict":
            raise AttributeError(name)
        try:
            return self.dict[name]
        except KeyError:
            # Attribute protocol: hasattr()/getattr(obj, name, default) only
            # recognise AttributeError, not KeyError.
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    def get(self, name, default_val=None):
        """Return the value stored under ``name``, or ``default_val`` if absent."""
        return self.dict.get(name, default_val)

# Processor settings, written out by hand: one entry per image processor
# (train / eval) and a single shared entry for the caption text processor.
dict_tr = {"name": "blip_image_train", "image_size": 224}
dict_ev = {"name": "blip_image_eval", "image_size": 224}
dict_t = {"name": "blip_caption"}

# Each leaf is wrapped in DictWrapper so it offers both attribute access and
# `.get(...)` (presumably what `load_preprocess` expects of a config node -
# TODO confirm against the installed LAVIS version).
config = {
    "vis_processor": {"train": DictWrapper(dict_tr), "eval": DictWrapper(dict_ev)},
    "text_processor": {"train": DictWrapper(dict_t), "eval": DictWrapper(dict_t)},
}

# Only the first element of the result is kept: the image processors.
vis_processors = load_preprocess(config)[0]

In [9]:
# Local checkpoint holding the BLIP-2 state dict.
WEIGHTS_PATH = "/kaggle/input/blip2-pretrained-opt27b-sdpth/blip2_pretrained_opt2.7b_sd.pth"

# map_location pins the loaded tensors to `device`, so the model's weights end
# up where the input images are sent during inference. It also lets a
# checkpoint saved on CUDA load on a CPU-only machine.
load_state_dict_with_low_memory(my_model, torch.load(WEIGHTS_PATH, map_location=device))
my_model.eval()
gc.collect()

26

In [10]:
# Caption every image in the competition folder, one image at a time.
image_dir = comp_path / 'images'

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the submission reproducible across runs and machines.
images = sorted(os.listdir(image_dir))

pred_prompt_list = []
for image_name in images:
    # The context manager closes the underlying file as soon as the pixels
    # have been copied into the converted RGB image.
    with Image.open(image_dir / image_name) as raw_image:
        image = raw_image.convert('RGB')
    # Preprocess, add a batch dimension, and move to the inference device.
    image = vis_processors["eval"](image).unsqueeze(0).to(device)
    # Beam width 3 keeps the total runtime within the submission limit.
    pred_prompt = my_model.generate({"image": image}, num_beams=3)
    pred_prompt_list.append(pred_prompt[0])

In [11]:
# Captioning is finished: drop the notebook's reference to the captioning
# model and run a garbage-collection pass before the next model is created.
del my_model
gc.collect()

141

In [12]:
# Encode the predicted captions with the locally stored MiniLM sentence
# encoder, then flatten the result into one long 1-D vector so it can be used
# as a single submission column.
st_model = SentenceTransformer('/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2')
caption_vectors = st_model.encode(pred_prompt_list, batch_size=256)
prompt_embeddings = caption_vectors.flatten()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Image id = file name without its extension. Path.stem strips only the final
# suffix, so names that contain extra dots keep distinct ids.
imgIds = [Path(i).stem for i in images]

# Number of values per image embedding; it must match the width of the
# sentence-encoder output that is flattened into the submission.
EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

# One row id per (image, embedding index) pair, image-major, so the ids line up
# with the flattened embedding array: "<imgId>_0", "<imgId>_1", ...
imgId_eId = [f"{img_id}_{e_id}" for img_id in imgIds for e_id in eIds]

In [14]:
# Single-column frame: one embedding value per "<imgId>_<eId>" row id, with the
# index named so that it is written out as the `imgId_eId` column.
submission = pd.DataFrame({'val': prompt_embeddings}, index=imgId_eId).rename_axis('imgId_eId')

In [15]:
# Write the submission file into the current working directory.
submission.to_csv(path_or_buf='submission.csv')