https://github.com/openai/CLIP/issues/175 요거 참고해서 배치작업중...

In [46]:
import os, sys
from os.path import join as pathjoin
from typing import *
from dataclasses import dataclass

import numpy as np
import pandas as pd
from PIL import Image
import requests

import torch, torchvision
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoProcessor, AutoModel, AutoTokenizer, AutoImageProcessor

In [47]:
processor = AutoProcessor.from_pretrained("koclip/koclip-base-pt")
model = AutoModel.from_pretrained("koclip/koclip-base-pt")
tokenizer = AutoTokenizer.from_pretrained("koclip/koclip-base-pt")
image_processor = AutoImageProcessor.from_pretrained("koclip/koclip-base-pt")

In [48]:
configs = {
    'train_path': '/kovar-vol/kovar/dataset/train.json',
    'test_path': '/kovar-vol/kovar/dataset/test.json',
    'image_path': '/kovar-vol/images/',
}

In [49]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = PIL.Image.open(requests.get(url, stream=True).raw)
ToTensor = torchvision.transforms.ToTensor()
image_tensor = ToTensor(image)
print('original image size: ', image_tensor.shape)
text = ["강아지와 강아지 주인", "쳇바퀴를 달리는 햄스터", "자동차"]

inputs = processor(
    text=text,
    images=image, 
    return_tensors="pt", # could also be "pt" 
    padding=True
)

image_features = image_processor(image)
print(image_features['pixel_values'][0].shape)
imgs = []

for k, v in inputs.items():
  print(k, v.shape)


outputs = model(**inputs)
print(outputs.logits_per_image, outputs.logits_per_text)
probs = torch.nn.functional.softmax(outputs.logits_per_image, dim=1)

for idx, prob in sorted(enumerate(*probs), key=lambda x: x[1], reverse=True):
    print(text[idx], prob)

original image size:  torch.Size([3, 480, 640])
(3, 224, 224)
input_ids torch.Size([3, 7])
token_type_ids torch.Size([3, 7])
attention_mask torch.Size([3, 7])
pixel_values torch.Size([1, 3, 224, 224])
tensor([[-2.9678, -0.1690,  0.2926]], grad_fn=<PermuteBackward0>) tensor([[-2.9678],
        [-0.1690],
        [ 0.2926]], grad_fn=<MulBackward0>)
자동차 tensor(0.5993, grad_fn=<UnbindBackward0>)
쳇바퀴를 달리는 햄스터 tensor(0.3777, grad_fn=<UnbindBackward0>)
강아지와 강아지 주인 tensor(0.0230, grad_fn=<UnbindBackward0>)


In [50]:
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [51]:
@dataclass
class MultimodalDataset(Dataset):
    dataset_path: str = None
    image_path: str = None
    transform: torchvision.transforms = None
    
    def __post_init__(self) -> None:
        ## set up train/test/image paths
        self.dataset_path = configs['train_path'] if self.dataset_path is None else self.dataset_path
        self.image_path = configs['image_path'] if self.image_path is None else self.image_path
        self.dataset = pd.read_json(self.dataset_path, lines=True)
        
        return None

        
    def __len__(self) -> int:
        return len(self.dataset)
    
    
    def _return_hypothesises(self, data_sample: pd.Series) -> List[str]:
        obs1, obs2 = data_sample[5], data_sample[6]
        hyp0, hyp1, hyp2 = data_sample[12], data_sample[13], data_sample[14]
        
        return [obs1 +  " " +hyp0 + " " + obs2, obs1 + " " + hyp1 + " " + obs2, obs1 + " " + hyp2 + " " + obs2]
        
        
    def __getitem__(self, idx) -> Tuple[List[str], PIL.Image.Image, int]:
        data_sample = self.dataset.iloc[idx, :]
        
        ## Get image and transform
        image_id = data_sample[4]
        image = Image.open(pathjoin(self.image_path, image_id[:3], f"{image_id}.jpg"))
        image = ImageOps.exif_transpose(image)  ## prevent Image rotation by default camera settings
        if self.transform:
            image = self.transform(image)
        
        ## Get Texts
        hyps_list = self._return_hypothesises(data_sample)
        
        inputs = processor(text=hyps_list,
                           images=image,
                           return_tensors='pt',
                           padding='max_length',
                           max_length=70)
        
        ## TODO: labels to be dealt with
        # label = data_sample[-1]
        
        return inputs   ## inputs are dictionary {input_ids: torch.Tensor (shape: num_hyps, padded sequence length), 
                        ##                        token_type_ids: torch.Tensor (shape: num_hyps, padded sequence length),
                        ##                        attention_mask: torch.Tensor (shape: num_hyps, padded sequence length),
                        ##                        pixel_values: torch.Tensor (shape: 1, color channels, height, width)
                        ##                       }
                        ## ?? pixel values의 첫번째 차원이 아마 배치 수인 것 같은데, 이거 squeeze하면 안받아준다....
                        ## ?? 라벨은 어떻게 처리하지??? visualBERT의 경우에는 inputs dict에서 label도 받았는데...

In [53]:
paths = {
    ## dataset & image path
    "train_path": "/kovar-vol/kovar/dataset/train.json",
    "test_path": "/kovar-vol/kovar/dataset/test.json",
    "image_path": "/kovar-vol/images",
}



class JsonToDataset:
    '''
    Convert json files of train/test set to dataset object of hugginface datasets
    - add 'image_path'and 'input_prompt' columns to original DataFrame
    - 'image_path' : abs_path for each images
    - 'input_prompt' : list of str, ['{obs1}[sep]{hyp0}[sep]{obs2}', '{obs1}[sep]{hyp1}[sep]{obs2}', ...]
    '''
    def __init__(self):
        self.tokenizer = processor.tokenizer
        # # Not used now
        # self.max_seq_length = 0

    def __call__(self, json_path: str) -> Dataset:
        df = pd.read_json(json_path, lines=True)
        df["image_path"] = df["CLUE1"].map(self._get_image_paths)
        df["input_prompt"] = df.apply(lambda row: self._format_texts(row), axis=1)
        dataset = Dataset.from_pandas(df)
        return dataset

    def _get_image_paths(self, image_id: str) -> str:
        path = os.path.join(paths["image_path"], image_id[:3], f"{image_id}.jpg")
        return path

    def _format_texts(self, row: pd.DataFrame) -> List[str]:
        sep_token = self.tokenizer.sep_token
        obs1 = row["OBS1"]
        obs2 = row["OBS2"]
        hyps = row.loc["hyp0":"hyp2"]
        text_list = list()
        for hyp in hyps.values:
            prompt_format = sep_token.join([obs1, hyp, obs2])
            # self.max_seq_length = max(len(prompt_format), self.max_seq_length)
            text_list.append(prompt_format)
        return text_list

    # def get_max_seq_length(self):
    #     return self.max_seq_length


class MultimodalDataset:
    '''
    Dataset for KoVAR task based on koCLIP
    - return image, text, label when __getitem__ is called
    - image : an image (PIL.Image.Image)
    - text : List of str, several choices of hypothesis and observations
    - label : the index of sentence that contains a plausible hypothesis in the 'text'.
    '''
    def __init__(self, dataset: Dataset):
        self.tokenizer = processor.tokenizer

        self.image_paths = dataset["image_path"]
        self.texts = dataset["input_prompt"]
        self.labels = dataset["label"]

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> tuple:
        image = self._load_image(idx)
        text = self._load_texts(idx)
        label = self._load_label(idx)
        return image, text, label

    def _load_image(self, idx: int) -> Image.Image:
        path = self.image_paths[idx]
        image = Image.open(path)
        return image

    def _load_texts(self, idx: int) -> List[str]:
        return self.texts[idx]

    def _load_label(self, idx: int) -> int:
        return self.labels[idx]


def collate_fn(examples: List[tuple]):
    """
    example[0] = images
    example[1] = texts
    example[2] = labels
    """
    examples = list(filter(lambda x: x is not None, examples))
    

    # make labels
    num_hyp = 3  # temperally fixed
    labels_idx = [example[2] for example in examples]
    labels = np.zeros((len(examples), num_hyp))
    for i, label in enumerate(labels_idx):
        labels[i][label] = 1
    print(labels)
    
    # make list of dicts
    inputs = []   # inputs: List[dict]
    lengths = []  # lengths: List[int]
    for example in examples:
        input = processor(images=example[0], text=example[1], return_tensors='pt', padding=True )
        lengths.append(input["input_ids"].shape[1])  # 3 X N
        inputs.append(input)
    
    # dynamic padding in batch
    max_length = max(lengths)
    for idx, input in enumerate(inputs):
        length = lengths[idx]
        num_pad = max_length - length

        pad_token_id = processor.tokenizer.pad_token_id

        input['input_ids'] = np.pad(input['input_ids'], ((0, 0), (0, num_pad)), 'constant', constant_values=pad_token_id)
        input['token_type_ids'] = np.pad(input['token_type_ids'], ((0, 0), (0, num_pad)), 'constant', constant_values=0)
        input['attention_mask'] = np.pad(input['attention_mask'], ((0, 0), (0, num_pad)), 'constant', constant_values=0)
    
    # merge to 1 dict
    padded_inputs = dict()
    keys = ['pixel_values','input_ids', 'attention_mask', 'token_type_ids']
    for key in keys:
        if key == 'pixel_values':
            padded_inputs[key] = torch.stack([input[key][0] for input in inputs])
        else:
            padded_inputs[key] = torch.tensor([input[key] for input in inputs])


    return padded_inputs, labels


def get_data_loader(dataset, batch_size):
    data_loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
    )
    return data_loader


if __name__ == "__main__":
    get_dataset = JsonToDataset()
    train_set = get_dataset(paths["train_path"])
    test_set = get_dataset(paths["test_path"])

    # max_seq_length = get_dataset.get_max_seq_length()
    batch_size = 4

    train_set = MultimodalDataset(train_set)
    test_set = MultimodalDataset(test_set)

    # print(train_set.__getitem__(1))

    train_loader = get_data_loader(train_set, batch_size)
    test_loader = get_data_loader(test_set, batch_size)

    for batch in train_loader:
        print(batch)
        break

AttributeError: type object 'Dataset' has no attribute 'from_pandas'

In [None]:
inputs1['pixel_values'].squeeze().shape

torch.Size([2, 3, 224, 224])

In [None]:
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [36]:
## TODO
# def collate_for_multiple_choice(self, features_list: List[Dict]) -> dict:
#     return None

resize_and_normalize = torchvision.transforms.Compose([torchvision.transforms.Resize((224,224)),
                                                          torchvision.transforms.ToTensor(),
                                                          torchvision.transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]), ## mean & std value are convention calculated from ImageNet.
                                                          torchvision.transforms.ToPILImage(),
                                                        ])

multimodal_dataset = MultimodalDataset(dataset_path=configs['test_path'], transform=resize_and_normalize)   

multimodal_dataloader = DataLoader(multimodal_dataset, batch_size=2, shuffle=True)

for i, batch in enumerate(multimodal_dataloader):
    # print(i, batch)
    for k, v in batch.items():
      print(k, v.shape)
    
    for text_keys in ['input_ids', 'token_type_ids', 'attention_mask']: ## preprocess batch
      batch[text_keys] = batch[text_keys].reshape(-1, batch[text_keys].shape[2])  ## batch x num_hyps x sequence lenght -> (batch * num_hyps) x sequence length
    batch['pixel_values'] = batch['pixel_values'].squeeze()
      
    break

input_ids torch.Size([2, 3, 70])
token_type_ids torch.Size([2, 3, 70])
attention_mask torch.Size([2, 3, 70])
pixel_values torch.Size([2, 1, 3, 224, 224])


In [39]:
for k, v in batch.items():
    print(k, v.shape)
    
output = model(**batch)

input_ids torch.Size([6, 70])
token_type_ids torch.Size([6, 70])
attention_mask torch.Size([6, 70])
pixel_values torch.Size([2, 3, 224, 224])


In [43]:
output['logits_per_image']

tensor([[-1.2338, -0.8873, -1.0910,  0.2682, -0.3743, -0.0649],
        [-1.5100, -1.0999, -1.3481,  0.2801, -0.4611, -0.0920]],
       grad_fn=<PermuteBackward0>)

In [None]:
it = enumerate(multimodal_dataset)

i, inputs1 = next(it)
# print(i, inputs1)
i, inputs2 = next(it)
# print(i, inputs2)

for k, v in inputs1.items():
  print(k, v.shape)
  
for k, v in inputs2.items():
  print(k, v.shape)

for k, v in inputs2.items():
  inputs1[k] = torch.stack((inputs1[k], v),axis=0)

for k, v in inputs1.items():
  print(k, v.shape)



inputs1['pixel_values'] = inputs1['pixel_values'].squeeze()
for k, v in inputs1.items():
  print(k, v.shape)
  
  
# outputs = model(**inputs1)

NameError: name 'multimodal_dataset' is not defined

In [None]:
batch['input_ids'].shape

torch.Size([1, 3, 70])

In [21]:
import torch
import torch.nn.functional as F
from transformers import AutoProcessor, AutoModel
from preprocess import JsonToDataset, MultimodalDataset, get_data_loader

# paths
paths = {
    ## dataset & image path
    "train_path": "/kovar-vol/kovar/dataset/train.json",
    "test_path": "/kovar-vol/kovar/dataset/test.json",
    "image_path": "/kovar-vol/images",
    }   

model_checkpoint = "koclip/koclip-base-pt"
processor = AutoProcessor.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)


if __name__ == "__main__":
    
    get_dataset = JsonToDataset()
    # train_set = get_dataset(paths["train_path"])
    test_set = get_dataset(paths["test_path"])

    batch_size = 8

    test_set = MultimodalDataset(test_set)
    test_loader = get_data_loader(test_set, batch_size)

    num_correct = 0
    for batched_inputs, labels in test_loader:
        outputs = model(**batched_inputs)
        logits_per_image = outputs['logits_per_image']
        
        ## 1) get proper outputs from 4*12 tensor by proper indexing
        indices = torch.arange(logits_per_image.shape[1]).reshape(batch_size,3)      ## indices: torch.tensor([[0, 1, 2],[3,4,5],[6,7,8],[9,10,11]])
        logits_per_image = torch.gather(input=logits_per_image,dim= 1, index = indices)

        # print(logits_per_image)
        
        ## 2) convert logits_per_image outputs into one-hot like 
        one_hot_outputs = F.one_hot(logits_per_image.argmax(dim=1)).detach().numpy()
        break
        
    #     ## 3) calculate the right outpus comparing to labels
    #     num_correct_in_batch = (one_hot_outputs == labels).all(axis=1).sum()
    #     print(num_correct)
    #     num_correct += num_correct_in_batch
    #     print(num_correct_in_batch, num_correct)
        
    
    # print(f'accuracy: {num_correct / len(test_loader)}')
        
            

In [26]:
batched_inputs['input_ids'].shape

torch.Size([24, 68])