In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

import os
import json

from models import Showo, MAGVITv2
from training.prompting_utils import UniversalPrompting
from transformers import AutoTokenizer
from models.clip_encoder import CLIPVisionTower
from transformers import CLIPImageProcessor
from llava.llava import conversation as conversation_lib

# Make the "phi1.5" template the library-wide default conversation; the dataset code below
# copies `conversation_lib.default_conversation` when it builds prompts.
conversation_lib.default_conversation = conversation_lib.conv_templates["phi1.5"]

  from .autonotebook import tqdm as notebook_tqdm


[2025-02-03 20:53:06,135] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
from omegaconf import DictConfig, ListConfig, OmegaConf
# Load the Show-o demo configuration (CLIP-ViT variant) from the repo-relative YAML file.
config = OmegaConf.load('configs/showo_demo_w_clip_vit.yaml')
# Device setup: prefer the first CUDA GPU, but fall back to CPU so the notebook still runs
# (slowly) on a machine without a usable GPU instead of failing on the first `.to(device)`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Model / tokenizer setup. `config` and `device` are defined in the previous cell.

# Show-o tokenizer (left-padded) loaded from the LLM path in the config.
# Author's note: the LLM is 'microsoft/phi-1_5'.
tokenizer = AutoTokenizer.from_pretrained(config.model.showo.llm_model_path, padding_side ="left")
# UniversalPrompting wraps the tokenizer and is given the extra special tokens listed below.
# NOTE(review): ignore_id=-100 is presumably the label value excluded from the loss — confirm
# in training.prompting_utils.UniversalPrompting.
uni_prompting = UniversalPrompting(tokenizer, max_text_len=config.dataset.preprocessing.max_seq_length,
                                       special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),
                                       ignore_id=-100, cond_dropout_prob=config.training.cond_dropout_prob)

# MAGVIT-v2 model. "vq" refers to the quantizer (the load log reports a look-up free quantizer
# with an 8192-entry codebook), not to visual question answering.
# It is frozen (no gradients) and switched to eval mode.
vq_model = MAGVITv2
vq_model = vq_model.from_pretrained(config.model.vq_model.vq_model_name).to(device)
vq_model.requires_grad_(False)
vq_model.eval()

# CLIP-ViT vision tower and its matching image processor, both loaded from config.clip_path.
# NOTE(review): unlike vq_model, the vision tower is not explicitly frozen or put in eval mode
# here — confirm whether that is intended.
vision_tower_name =config.clip_path
vision_tower = CLIPVisionTower(vision_tower_name).to(device)
clip_image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)

# Show-o model, loaded from the pretrained checkpoint named in the config and moved to `device`.
model = Showo.from_pretrained(config.model.showo.pretrained_model_path).to(device)

# Sampling parameters and the system prompt.
temperature = 0.8  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 1  # retain only the top_k most likely tokens, clamp others to have 0 probability
# NOTE(review): SYSTEM_PROMPT_LEN is hard-coded; presumably the token count of SYSTEM_PROMPT
# under this tokenizer — re-check it if the prompt text or the tokenizer changes.
SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant. " \
                "The assistant gives helpful, detailed, and polite answers to the user's questions."
SYSTEM_PROMPT_LEN = 28

Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192


The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.
  if self.w_clip_vit:


attention implementation:  sdpa


In [4]:
class PersonalizedMMUDataset(Dataset):
    """Image + question/answer pairs for one personalised concept (multimodal understanding).

    Records are read from ``<data_root>/training_data/<concept_name>.json``. Each record is
    expected to look like::

        {
            "messages": [
                {"content": "<image>How would you describe <BaGu>'s attire?", "role": "user"},
                {"content": "The image does not provide enough information ...", "role": "assistant"}
            ],
            "images": ["/home/arc/MulBench/two_concept/concept/train/BaGu/6.png"]
        }

    Args:
        data_root: Dataset root directory.
        concept_name: Concept identifier; selects which JSON file is loaded.
        clip_image_processor: Callable invoked as ``processor(image, return_tensors="pt")``;
            its result is indexed with ``['pixel_values'][0]``.

    Side effects:
        Constructing the dataset sets ``conversation_lib.default_conversation`` to the
        ``"phi1.5"`` template.
    """

    def __init__(
        self,
        data_root: str,
        concept_name: str,
        clip_image_processor,
    ):
        self.data_root = data_root
        self.concept_name = concept_name
        self.clip_image_processor = clip_image_processor

        # Prompts built in __getitem__ start from this default template.
        conversation_lib.default_conversation = conversation_lib.conv_templates["phi1.5"]

        annotation_file = os.path.join(data_root, f"training_data/{concept_name}.json")
        with open(annotation_file) as handle:
            self.conversations = json.load(handle)

    def __len__(self):
        """Number of question/answer records loaded from the JSON file."""
        return len(self.conversations)

    def __getitem__(self, idx):
        """Return the processed image plus the question, answer and both prompt variants.

        Only the first entry of ``"images"`` and the first two ``"messages"`` are used.
        """
        record = self.conversations[idx]

        # Image: first listed path, forced to RGB, then run through the CLIP processor.
        rgb_image = Image.open(record["images"][0]).convert("RGB")
        processed = self.clip_image_processor(rgb_image, return_tensors="pt")
        pixel_values = processed['pixel_values'][0]

        # Text: the "<image>" placeholder is stripped from the user turn.
        turns = record["messages"]
        question = turns[0]["content"].replace("<image>", "")
        answer = turns[1]["content"]

        # Render the prompt twice from a private copy of the default template:
        # once with only the user turn, once with the assistant answer appended.
        dialogue = conversation_lib.default_conversation.copy()
        dialogue.append_message(dialogue.roles[0], question)
        prompt_w_o_answer = dialogue.get_prompt()
        dialogue.append_message(dialogue.roles[1], answer)
        prompt_w_answer = dialogue.get_prompt()

        sample = {}
        sample["images"] = pixel_values  # author's note: [3, 336, 336] tensor on cpu
        sample["question"] = question  # e.g. Could you confirm if this is <dunpai> in the photo?
        sample["answer"] = answer  # e.g. I can confirm that this is not <dunpai> in the photo.
        # e.g. " USER: <question> ASSISTANT: <answer><|endoftext|>"
        sample["prompt_w_answer"] = prompt_w_answer
        # e.g. " USER: <question>"
        sample["prompt_w_o_answer"] = prompt_w_o_answer
        return sample

In [5]:
# Personalised MMU training set for the "dunpai" concept.
# NOTE(review): the dataset root is an absolute, machine-specific path.
train_dataset = PersonalizedMMUDataset(
    data_root='/home/arc/full_mcdata',
    concept_name='dunpai',
    clip_image_processor=clip_image_processor,
)

In [6]:
# Sanity check: fetch the first sample (loads the image and builds both prompt variants).
train_dataset[0]

{'images': tensor([[[ 0.9668,  0.9668,  0.9668,  ..., -0.1426, -0.1572, -0.1572],
          [ 0.9668,  0.9668,  0.9668,  ..., -0.1426, -0.1572, -0.1572],
          [ 0.9814,  0.9814,  0.9814,  ..., -0.1280, -0.1426, -0.1426],
          ...,
          [-1.2229, -1.2229, -1.2229,  ..., -1.4711, -1.4565, -1.4419],
          [-1.2229, -1.2229, -1.2229,  ..., -1.4419, -1.4273, -1.4127],
          [-1.2229, -1.2229, -1.2083,  ..., -1.4419, -1.4273, -1.4127]],
 
         [[ 1.0544,  1.0544,  1.0544,  ...,  0.0638,  0.0488,  0.0488],
          [ 1.0544,  1.0544,  1.0544,  ...,  0.0638,  0.0488,  0.0488],
          [ 1.0694,  1.0694,  1.0694,  ...,  0.0789,  0.0638,  0.0638],
          ...,
          [-1.2718, -1.2718, -1.2718,  ..., -1.4369, -1.4219, -1.4069],
          [-1.2718, -1.2718, -1.2718,  ..., -1.4069, -1.3919, -1.3769],
          [-1.2718, -1.2718, -1.2568,  ..., -1.4069, -1.3919, -1.3769]],
 
         [[ 1.0083,  1.0083,  1.0083,  ...,  0.2688,  0.2546,  0.2546],
          [ 1.0083

In [7]:
from torchvision import transforms


def image_transform(sample, resolution=256):
    """Replace ``sample["images"]`` (a PIL image) with a normalised square tensor.

    The image is resized with bicubic interpolation, centre-cropped to
    ``resolution x resolution``, converted to a tensor, and normalised per channel with
    mean 0.5 and std 0.5.

    Args:
        sample: Dict holding a PIL image under the ``"images"`` key.
        resolution: Target size passed to the resize and used for both crop dimensions.

    Returns:
        The same ``sample`` dict (mutated in place) with ``"images"`` replaced by the tensor.
    """
    pipeline = transforms.Compose([
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop((resolution, resolution)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ])
    sample["images"] = pipeline(sample["images"])
    return sample

# Root directory of the dataset.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_root = '/home/arc/full_mcdata'
# test_concepts.json is indexed by concept name in PersonalizedT2IDataset
# (data_w_cases[concept_name]); the value is then used as a sub-directory of data_root.
data_w_cases_path = os.path.join(data_root, "test_concepts.json")
with open(data_w_cases_path, "r") as f:
    data_w_cases = json.load(f)

class PersonalizedT2IDataset(Dataset):
    """Text-to-image training set for a single personalised concept.

    Images are collected from ``<data_root>/<case_type>/concept/train/<concept_name>``, where
    ``case_type`` is looked up in the module-level ``data_w_cases`` mapping. Files whose name
    ends in ``png``/``jpg``/``jpeg`` and does not contain ``"mask"`` are kept.

    Args:
        data_root: Dataset root directory.
        concept_name: Concept identifier; must be a key of ``data_w_cases``.

    Raises:
        KeyError: If ``concept_name`` is not present in ``data_w_cases``.
        AssertionError: If the directory does not contain exactly 10 matching images.
    """

    def __init__(
        self,
        data_root: str,
        concept_name: str,
    ):
        self.data_root = data_root
        self.concept_name = concept_name

        case_type = data_w_cases[concept_name]
        training_img_dir_path = os.path.join(data_root, case_type, "concept/train", concept_name)
        # os.listdir returns entries in arbitrary order; sorting makes the index -> image
        # mapping reproducible across runs and machines.
        self.img_paths = [
            os.path.join(training_img_dir_path, img)
            for img in sorted(os.listdir(training_img_dir_path))
            if img.endswith(('png', 'jpg', 'jpeg')) and "mask" not in img
        ]

        # The message must read self.img_paths: the bare name `img_paths` does not exist, so a
        # failed check would otherwise surface as NameError instead of this AssertionError.
        assert len(self.img_paths) == 10, (
            f"Expected 10 images for mcllava dataset, found {len(self.img_paths)}"
        )

    def __len__(self):
        """Number of training images found for the concept."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return ``{"condition": <prompt>, "images": <tensor>}`` for the image at ``idx``."""
        img_path = self.img_paths[idx]
        img = Image.open(img_path).convert("RGB")
        item = {
            "condition": f"A photo of <{self.concept_name}>.",
            "images": img,  # PIL image here; image_transform replaces it with a tensor
        }
        # Default resolution is 256, so the result is a [3, 256, 256] tensor.
        item = image_transform(item)
        return item

In [8]:
# Loader over the "dunpai" text-to-image set. batch_size=10 matches the image count that
# PersonalizedT2IDataset asserts, so one shuffled batch covers the whole dataset.
# Loading runs in the main process (num_workers=0).
t2i_loader = DataLoader(
    PersonalizedT2IDataset(data_root, "dunpai"),
    batch_size=10,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)

In [9]:
# Pull exactly one batch for inspection. next(iter(...)) states that intent directly and
# raises StopIteration on an empty loader instead of silently leaving `b` undefined.
batch = next(iter(t2i_loader))
b = batch  # short alias kept so cells that refer to `b` keep working
    

In [10]:
# Standalone dataset instance for index-based inspection (separate from the instance that
# t2i_loader wraps).
t2i_dataset = PersonalizedT2IDataset(data_root, "dunpai")

In [11]:
# Check the shape of the first transformed image tensor.
t2i_dataset[0]["images"].shape

torch.Size([3, 256, 256])

In [6]:
from pdata import get_personalized_mmudataloader

# Build the personalised MMU dataloader through the project helper imported from `pdata`.
# NOTE(review): this cell's execution count (In [6]) repeats an earlier one and the following
# counts skip numbers, so these cells were run out of order or after a kernel restart —
# re-verify the notebook with Restart Kernel -> Run All.
mmu_loader = get_personalized_mmudataloader('/home/arc/full_mcdata', "dunpai", tokenizer, batch_size=5, num_workers=10)

In [7]:
# Materialise every batch of the loader up front so individual batches can be indexed.
list_mmu_dataset = list(mmu_loader)

In [9]:
# First MMU batch, kept for inspection.
batch = list_mmu_dataset[0]

In [13]:
# Labels of the first sample in the batch.
# NOTE(review): the displayed tensor starts with a long run of id 50295 (presumably padding)
# rather than the ignore_id=-100 passed to UniversalPrompting — confirm these positions are
# excluded from the loss downstream.
batch["labels"][0]

tensor([50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
        50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
        50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
        50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
        50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
        50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295,
        10705,  8808,  8643,    25,  1279,    67,   403, 49712,    29,   468,
          257,  7209,   290, 40551, 11743,    13, 50256])