In [2]:
import os, sys
from os.path import join as pathjoin
from typing import *
from dataclasses import dataclass

import numpy as np
import pandas as pd
import PIL
import requests

import torch, torchvision
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoProcessor, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
processor = AutoProcessor.from_pretrained("koclip/koclip-base-pt")
model = AutoModel.from_pretrained("koclip/koclip-base-pt")

In [4]:
configs = {
    'train_path': '/kovar-vol/kovar/dataset/train.json',
    'test_path': '/kovar-vol/kovar/dataset/test.json',
    'image_path': '/kovar-vol/images/',
}

In [5]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = PIL.Image.open(requests.get(url, stream=True).raw)
ToTensor = torchvision.transforms.ToTensor()
image_tensor = ToTensor(image)
print(image_tensor.shape)
text = ["강아지와 강아지 주인", "쳇바퀴를 달리는 햄스터", "자동차"]

inputs = processor(
    text=text,
    images=image, 
    return_tensors="pt", # could also be "pt" 
    padding=True
)

outputs = model(**inputs)
print(outputs.logits_per_image, outputs.logits_per_text)
probs = torch.nn.functional.softmax(outputs.logits_per_image, dim=1)

for idx, prob in sorted(enumerate(*probs), key=lambda x: x[1], reverse=True):
    print(text[idx], prob)

torch.Size([3, 480, 640])
tensor([[-2.9678, -0.1690,  0.2926]], grad_fn=<PermuteBackward0>) tensor([[-2.9678],
        [-0.1690],
        [ 0.2926]], grad_fn=<MulBackward0>)
자동차 tensor(0.5993, grad_fn=<UnbindBackward0>)
쳇바퀴를 달리는 햄스터 tensor(0.3777, grad_fn=<UnbindBackward0>)
강아지와 강아지 주인 tensor(0.0230, grad_fn=<UnbindBackward0>)


In [6]:
@dataclass
class MultimodalDataset(Dataset):
    dataset_path: str = None
    image_path: str = None
    transform: torchvision.transforms = None
    
    def __post_init__(self) -> None:
        ## set up train/test/image paths
        self.dataset_path = configs['train_path'] if self.dataset_path is None else self.dataset_path
        self.image_path = configs['image_path'] if self.image_path is None else self.image_path
        self.dataset = pd.read_json(self.dataset_path, lines=True)
        
        return None

        
    def __len__(self) -> int:
        return len(self.dataset)
    
    
    def _return_hypothesises(self, data_sample: pd.Series) -> List[str]:
        obs1, obs2 = data_sample[5], data_sample[6]
        hyp0, hyp1, hyp2 = data_sample[12], data_sample[13], data_sample[14]
        
        return [obs1 +  " " +hyp0 + " " + obs2, obs1 + " " + hyp1 + " " + obs2, obs1 + " " + hyp2 + " " + obs2]
        
        
    def __getitem__(self, idx) -> Tuple[List[str], PIL.Image.Image, int]:
        data_sample = self.dataset.iloc[idx, :]
        
        image_id = data_sample[4]
        image = PIL.Image.open(pathjoin(self.image_path, image_id[:3], f"{image_id}.jpg"))
        image = PIL.ImageOps.exif_transpose(image)  ## prevent Image rotation by default camera settings
        if self.transform:
            image = self.transform(image)
        
        hyps_list = self._return_hypothesises(data_sample)
        
        label = data_sample[-1]
        
        return (hyps_list, image, label)

In [11]:
tensorize_and_normalize = torchvision.transforms.Compose([torchvision.transforms.Resize((224,224)),
                                                          torchvision.transforms.ToTensor(),
                                                          torchvision.transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]), ## mean & std value are convention calculated from ImageNet.
                                                        ])
ToPIL = torchvision.transforms.ToPILImage()

multimodal_dataset = MultimodalDataset(transform=tensorize_and_normalize)



num_correct = 0
for i, data_sample in enumerate(multimodal_dataset):

  texts, image, label = data_sample
  image = ToPIL(image)  ## CLIP got image input as PIL image, not tensor.
  
  inputs = processor(text=text,
                    images=image,
                    return_tensors='pt',
                    padding=True)
  
  outputs = model(**inputs)
  probs = outputs.logits_per_image.softmax(dim=1).squeeze()
  print(probs.shape)
  answer_idx = probs.argmax()
  
  if label == answer_idx:
    num_correct += 1
    print(f'✅ {i}th example {probs[answer_idx]}')
  else:
    print(f'❌ {i}th example {probs[answer_idx]}')

torch.Size([3])
❌ 0th example 0.5975189805030823
torch.Size([3])
✅ 1th example 0.45149415731430054
torch.Size([3])
❌ 2th example 0.6078193783760071
torch.Size([3])
❌ 3th example 0.5417043566703796
torch.Size([3])
❌ 4th example 0.6420307159423828
torch.Size([3])
❌ 5th example 0.5485196709632874
torch.Size([3])
✅ 6th example 0.5678942799568176
torch.Size([3])
❌ 7th example 0.4474325180053711
torch.Size([3])
✅ 8th example 0.6533774137496948
torch.Size([3])
✅ 9th example 0.5800740718841553
torch.Size([3])
❌ 10th example 0.5512664914131165
torch.Size([3])
✅ 11th example 0.551891028881073
torch.Size([3])
❌ 12th example 0.6265960931777954
torch.Size([3])
✅ 13th example 0.564146101474762
torch.Size([3])
❌ 14th example 0.41352543234825134
torch.Size([3])
✅ 15th example 0.4808609187602997
torch.Size([3])
✅ 16th example 0.6455770134925842
torch.Size([3])
❌ 17th example 0.5769298076629639
torch.Size([3])
❌ 18th example 0.5098403692245483
torch.Size([3])
❌ 19th example 0.5633478760719299
torch.Size

In [12]:
num_correct / len(multimodal_dataset)

0.34116371521503436