https://github.com/openai/CLIP/issues/175 요거 참고해서 배치작업중...

In [57]:
import os, sys
from os.path import join as pathjoin
from typing import *
from dataclasses import dataclass

import numpy as np
import pandas as pd
import PIL
import requests

import torch, torchvision
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoProcessor, AutoModel, AutoTokenizer, AutoImageProcessor

In [58]:
processor = AutoProcessor.from_pretrained("koclip/koclip-base-pt")
model = AutoModel.from_pretrained("koclip/koclip-base-pt")
tokenizer = AutoTokenizer.from_pretrained("koclip/koclip-base-pt")
image_processor = AutoImageProcessor.from_pretrained("koclip/koclip-base-pt")

In [3]:
configs = {
    'train_path': '/kovar-vol/kovar/dataset/train.json',
    'test_path': '/kovar-vol/kovar/dataset/test.json',
    'image_path': '/kovar-vol/images/',
}

In [47]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = PIL.Image.open(requests.get(url, stream=True).raw)
ToTensor = torchvision.transforms.ToTensor()
image_tensor = ToTensor(image)
print('original image size: ', image_tensor.shape)
text = ["강아지와 강아지 주인", "쳇바퀴를 달리는 햄스터", "자동차"]

# inputs = processor(
#     text=text,
#     images=image, 
#     return_tensors="pt", # could also be "pt" 
#     padding=True
# )
img = []

for k, v in inputs.items():
  print(k, v.shape)


outputs = model(**inputs)
print(outputs.logits_per_image, outputs.logits_per_text)
probs = torch.nn.functional.softmax(outputs.logits_per_image, dim=1)

for idx, prob in sorted(enumerate(*probs), key=lambda x: x[1], reverse=True):
    print(text[idx], prob)

original image size:  torch.Size([3, 480, 640])
input_ids torch.Size([3, 7])
token_type_ids torch.Size([3, 7])
attention_mask torch.Size([3, 7])
pixel_values torch.Size([1, 3, 224, 224])
tensor([[-2.9678, -0.1690,  0.2926]], grad_fn=<PermuteBackward0>) tensor([[-2.9678],
        [-0.1690],
        [ 0.2926]], grad_fn=<MulBackward0>)
자동차 tensor(0.5993, grad_fn=<UnbindBackward0>)
쳇바퀴를 달리는 햄스터 tensor(0.3777, grad_fn=<UnbindBackward0>)
강아지와 강아지 주인 tensor(0.0230, grad_fn=<UnbindBackward0>)


In [11]:
@dataclass
class MultimodalDataset(Dataset):
    dataset_path: str = None
    image_path: str = None
    transform: torchvision.transforms = None
    
    def __post_init__(self) -> None:
        ## set up train/test/image paths
        self.dataset_path = configs['train_path'] if self.dataset_path is None else self.dataset_path
        self.image_path = configs['image_path'] if self.image_path is None else self.image_path
        self.dataset = pd.read_json(self.dataset_path, lines=True)
        
        return None

        
    def __len__(self) -> int:
        return len(self.dataset)
    
    
    def _return_hypothesises(self, data_sample: pd.Series) -> List[str]:
        obs1, obs2 = data_sample[5], data_sample[6]
        hyp0, hyp1, hyp2 = data_sample[12], data_sample[13], data_sample[14]
        
        return [obs1 +  " " +hyp0 + " " + obs2, obs1 + " " + hyp1 + " " + obs2, obs1 + " " + hyp2 + " " + obs2]
        
        
    def __getitem__(self, idx) -> Tuple[List[str], PIL.Image.Image, int]:
        data_sample = self.dataset.iloc[idx, :]
        
        ## Get image and transform
        image_id = data_sample[4]
        image = PIL.Image.open(pathjoin(self.image_path, image_id[:3], f"{image_id}.jpg"))
        image = PIL.ImageOps.exif_transpose(image)  ## prevent Image rotation by default camera settings
        if self.transform:
            image = self.transform(image)
        
        ## Get Texts
        hyps_list = self._return_hypothesises(data_sample)
        
        inputs = processor(text=hyps_list,
                           images=image,
                           return_tensors='pt',
                           padding='max_length',
                           max_length=70)
        
        ## TODO: labels to be dealt with
        # label = data_sample[-1]
        
        return inputs   ## inputs are dictionary {input_ids: torch.Tensor (shape: num_hyps, padded sequence length), 
                        ##                        token_type_ids: torch.Tensor (shape: num_hyps, padded sequence length),
                        ##                        attention_mask: torch.Tensor (shape: num_hyps, padded sequence length),
                        ##                        pixel_values: torch.Tensor (shape: 1, color channels, height, width)
                        ##                       }
                        ## ?? pixel values의 첫번째 차원이 아마 배치 수인 것 같은데, 이거 squeeze하면 안받아준다....
                        ## ?? 라벨은 어떻게 처리하지??? visualBERT의 경우에는 inputs dict에서 label도 받았는데...

In [50]:
resize_and_normalize = torchvision.transforms.Compose([torchvision.transforms.Resize((224,224)),
                                                          torchvision.transforms.ToTensor(),
                                                          torchvision.transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]), ## mean & std value are convention calculated from ImageNet.
                                                          torchvision.transforms.ToPILImage(),
                                                        ])

multimodal_dataset = MultimodalDataset(dataset_path=configs['test_path'], transform=resize_and_normalize)




it = enumerate(multimodal_dataset)

i, inputs1 = next(it)
# print(i, inputs1)
i, inputs2 = next(it)
# print(i, inputs2)

for k, v in inputs1.items():
  print(k, v.shape)
  
for k, v in inputs2.items():
  print(k, v.shape)

for k, v in inputs2.items():
  inputs1[k] = torch.stack((inputs1[k], v),axis=0)

for k, v in inputs1.items():
  print(k, v.shape)



inputs1['pixel_values'] = inputs1['pixel_values'].squeeze()
for k, v in inputs1.items():
  print(k, v.shape)
  
  
# outputs = model(**inputs1)

input_ids torch.Size([3, 70])
token_type_ids torch.Size([3, 70])
attention_mask torch.Size([3, 70])
pixel_values torch.Size([1, 3, 224, 224])
input_ids torch.Size([3, 70])
token_type_ids torch.Size([3, 70])
attention_mask torch.Size([3, 70])
pixel_values torch.Size([1, 3, 224, 224])
input_ids torch.Size([2, 3, 70])
token_type_ids torch.Size([2, 3, 70])
attention_mask torch.Size([2, 3, 70])
pixel_values torch.Size([2, 1, 3, 224, 224])
input_ids torch.Size([2, 3, 70])
token_type_ids torch.Size([2, 3, 70])
attention_mask torch.Size([2, 3, 70])
pixel_values torch.Size([2, 3, 224, 224])


In [38]:
inputs1['pixel_values'].squeeze().shape

torch.Size([2, 3, 224, 224])

In [34]:
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [None]:
## TODO
# def collate_for_multiple_choice(self, features_list: List[Dict]) -> dict:
#     return None
    

multimodal_dataloader = DataLoader(multimodal_dataset, batch_size=1, shuffle=True)

for i, batch in enumerate(multimodal_dataloader):
    print(i, batch)
    model(**batch)

0 {'input_ids': tensor([[[    0, 16221,  2069,  1511,  2073,  3651,  7285,  1408,   553,  1485,
           2138,   571,  2088,  1513,  2359,  2062,    18,  4836,  2138,  1163,
           3611,  2031,  2073,  6435,  2200,  6339,  2116,  2112,  7306,  7953,
           2031,  2069,  6263,  3670,  2371,  2062,    18,  3651,  2116,  1408,
            553,  1485,  2170,  1418,  3031, 26481,  2138,     3,    18,     2,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
         [    0, 16221,  2069,  1511,  2073,  3651,  7285,  1408,   553,  1485,
           2138,   571,  2088,  1513,  2359,  2062,    18,   636,  1570,  1891,
           3651,  2116,  6989,  2205,  2318,  1123,  2069,     3,    18,  3651,
           2116,  1408,   553,  1485,  2170,  1418,  3031, 26481,  2138,     3,
             18,     2,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,

ValueError: too many values to unpack (expected 2)

In [None]:
batch['input_ids'].shape

torch.Size([1, 3, 70])

In [None]:
a, b, c, d = 1, 2, 3, 4, 5

ValueError: too many values to unpack (expected 4)