In [2]:
from pdata import PersonalizedMMUDataset, PersonalizedT2IDataset, get_personalized_mmudataloader

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from PIL import Image

from models import Showo, MAGVITv2, get_mask_chedule
from training.prompting_utils import UniversalPrompting, create_attention_mask_for_mmu, create_attention_mask_for_mmu_vit
from training.utils import get_config, flatten_omega_conf, mask_or_random_replace_tokens, AverageMeter
from transformers import AutoTokenizer
from models.clip_encoder import CLIPVisionTower
from transformers import CLIPImageProcessor
from llava.llava import conversation as conversation_lib

# Make the "phi1.5" template the default conversation used by the llava conversation module.
conversation_lib.default_conversation = conversation_lib.conv_templates["phi1.5"]

import os
from omegaconf import DictConfig, ListConfig, OmegaConf
# Load the demo configuration; later cells read model paths, preprocessing and
# training options from it.
config = OmegaConf.load('configs/showo_demo.yaml')
# Device setup: prefer the first CUDA GPU, but fall back to CPU so the notebook
# still runs (slowly) on a machine without CUDA instead of failing at `.to(device)`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[2025-02-03 21:10:47,840] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# All settings below come from `config`, loaded earlier from 'configs/showo_demo.yaml'.

# --- Tokenizer and universal prompting --------------------------------------
# LLM tokenizer (original note: 'microsoft/phi-1_5'), padded on the left.
tokenizer = AutoTokenizer.from_pretrained(
    config.model.showo.llm_model_path,
    padding_side="left",
)
# UniversalPrompting wraps the tokenizer and is given the task/modality special tokens.
uni_prompting = UniversalPrompting(
    tokenizer,
    max_text_len=config.dataset.preprocessing.max_seq_length,
    special_tokens=(
        "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
        "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>",
    ),
    ignore_id=-100,
    cond_dropout_prob=config.training.cond_dropout_prob,
)

# --- MAGVIT-v2 image tokenizer (used for t2i) -------------------------------
vq_model = MAGVITv2.from_pretrained(config.model.vq_model.vq_model_name).to(device)
# Freezing / eval mode are currently disabled:
# vq_model.requires_grad_(False)
# vq_model.eval()

# --- CLIP-ViT vision tower (MMU only), currently disabled -------------------
# vision_tower_name =config.clip_path
# vision_tower = CLIPVisionTower(vision_tower_name).to(device)
# clip_image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)

# --- Show-o model ------------------------------------------------------------
model = Showo.from_pretrained(config.model.showo.pretrained_model_path)
model = model.to(device)
# model.eval()

# --- Sampling parameters and prompt constants --------------------------------
# temperature: 1.0 = no change, < 1.0 = less random, > 1.0 = more random predictions
temperature = 0.8
# top_k: keep only the top_k most likely tokens, clamp the others to 0 probability
top_k = 1
LLAVA_SYSTEM_PROMPT = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions."
)
LLAVA_SYSTEM_PROMPT_LEN = 28

Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192


The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.
  if self.w_clip_vit:


attention implementation:  sdpa


In [4]:
data_root = "/home/arc/full_mcdata"
concept = "dunpai"

# Placeholder tokens for the concept: one identifier token plus 16 numbered tokens.
new_tokens = [f"<{concept}>"] + [f"<token_{i}>" for i in range(16)]
# add_tokens returns how many tokens were actually new, so re-running this cell adds 0.
num_added_tokens = tokenizer.add_tokens(new_tokens)

# Grow the embedding table by the number of tokens actually added instead of
# resizing it to len(tokenizer). len(tokenizer) only counts text + special tokens,
# whereas the model vocabulary described in this notebook also holds the VQ
# codebook entries and the mask token; resizing to len(tokenizer) would therefore
# shrink the table and discard those rows.
current_num_embeddings = model.showo.get_input_embeddings().weight.shape[0]
target_num_embeddings = max(current_num_embeddings + num_added_tokens, len(tokenizer))
model.showo.resize_token_embeddings(target_num_embeddings)
# NOTE(review): the tokenizer assigns the new tokens the ids immediately after the
# previous text vocabulary, which is the id range this notebook uses for image
# codes (`image_tokens + len(tokenizer)`), and `len(tokenizer)` itself has now
# moved. The pretrained image-code / mask rows are NOT relocated here — TODO
# confirm how the id layout should be reconciled before training.

# Ids of the placeholder tokens, in the same order as `new_tokens`.
placeholder_token_ids = tokenizer.convert_tokens_to_ids(new_tokens)

In [5]:
# Masking schedule named in the training config; "cosine" is used when the key is absent.
mask_schedule = get_mask_chedule(
    config.training.get("mask_schedule", "cosine"),
)
# Id of the mask token, as exposed by the Show-o model.
mask_id = model.mask_token_id

  mask_id = model.mask_token_id


In [6]:

from lightning.pytorch.utilities import CombinedLoader

# Text-to-image flow: personalized dataset for `concept`, shuffled, batches of 5.
t2i_dataset = PersonalizedT2IDataset(data_root, concept)
t2i_dataloader = DataLoader(
    t2i_dataset,
    batch_size=5,
    shuffle=True,
    num_workers=10,
    pin_memory=True,
)

# Multimodal-understanding flow for the same concept, built by the project helper.
mmu_dataloader = get_personalized_mmudataloader(
    data_root,
    concept,
    tokenizer,
    batch_size=5,
    num_workers=10,
)

# Iterate both flows together in "max_size_cycle" mode.
iterables = dict(mmu_flow=mmu_dataloader, t2i_flow=t2i_dataloader)
combined_dataloader = CombinedLoader(iterables, mode="max_size_cycle")


In [7]:
# vocab size = 58498 = 50295  llm vocabsize
#                    + 10     <|soi|> <|eoi|> <|sov|> <|eov|> <|t2i|> <|mmu|> <|t2v|> <|v2v|> <|lvg|> <|pad|>
#                    + 8192   vq model codebook size
#                    + 1      mask token (token id == 58497)
from typing import Union


# NOTE: this bare expression is not the last statement of the cell, so the notebook
# does not display it. The mapping in the comment below is kept as a reference
# (presumably pasted from an earlier run — verify if the special tokens change).
uni_prompting.sptids_dict
# {'<|soi|>': tensor([50296]),
#  '<|eoi|>': tensor([50297]),
#  '<|sov|>': tensor([50298]),
#  '<|eov|>': tensor([50299]),
#  '<|t2i|>': tensor([50300]),
#  '<|mmu|>': tensor([50301]),
#  '<|t2v|>': tensor([50302]),
#  '<|v2v|>': tensor([50303]),
#  '<|lvg|>': tensor([50304]),
#  '<|sot|>': tensor([50256]),
#  '<|eot|>': tensor([50256]),
#  '<|pad|>': tensor([50295])}

# uni_prompting.text_tokenizer == tokenizer
def prepare_inputs_and_labels(
        pixel_values_or_image_ids: Union[torch.FloatTensor, torch.LongTensor],
        texts: Union[str, list],
        min_masking_rate: float = 0.0,
        is_train: bool = True,
):
    """Build masked text-to-image inputs and labels for one batch.

    Uses the notebook-level ``vq_model``, ``uni_prompting``, ``mask_id``,
    ``mask_schedule`` and ``config``.

    Args:
        pixel_values_or_image_ids: Batch passed to ``vq_model.get_code``.
        texts: Prompt(s) forwarded unchanged to ``uni_prompting`` for the
            ``'t2i'`` task (the notebook passes the batch's ``"conditions"``).
        min_masking_rate: Accepted for interface compatibility; not used here.
        is_train: Forwarded to ``mask_or_random_replace_tokens``.

    Returns:
        Tuple ``(input_ids, labels, mask_prob, image_tokens)``: the prompted
        input ids and labels from ``uni_prompting``, the mask probability from
        ``mask_or_random_replace_tokens``, and the unmasked image tokens already
        shifted by ``len(uni_prompting.text_tokenizer)``.
    """
    # The VQ model is used only as a tokenizer here, so skip building an autograd
    # graph through it (its parameters are not frozen elsewhere in this notebook).
    with torch.no_grad():
        image_tokens = vq_model.get_code(pixel_values_or_image_ids)
    # Shift the codes past the text vocabulary so they do not collide with text ids.
    image_tokens = image_tokens + len(uni_prompting.text_tokenizer)

    # create MLM mask and labels
    input_ids, labels, _loss_weight, mask_prob = mask_or_random_replace_tokens(
        image_tokens,
        mask_id,
        config,
        mask_schedule=mask_schedule,
        is_train=is_train,
    )
    # Combine the text prompt with the masked image tokens for the t2i task.
    input_ids, _masks, labels = uni_prompting((texts, input_ids, labels), 't2i')

    return input_ids, labels, mask_prob, image_tokens

In [8]:
# Display the placeholder tokens that were added to the tokenizer for the concept.
new_tokens

['<dunpai>',
 '<token_0>',
 '<token_1>',
 '<token_2>',
 '<token_3>',
 '<token_4>',
 '<token_5>',
 '<token_6>',
 '<token_7>',
 '<token_8>',
 '<token_9>',
 '<token_10>',
 '<token_11>',
 '<token_12>',
 '<token_13>',
 '<token_14>',
 '<token_15>']

In [9]:
# Materialize every item yielded by the combined loader into a list.
# NOTE(review): the next cell only reads list_data[0]; consuming the whole loader
# may be unnecessary if nothing else needs the remaining batches.
list_data = list(combined_dataloader)    


In [10]:
# Split the first combined item (list_data[0][0], indexed by flow name) into its two flows.
one_batch_t2i, one_batch_mmu = (
    list_data[0][0][flow_name] for flow_name in ("t2i_flow", "mmu_flow")
)
# Batch size is the leading dimension of each flow's "images" entry.
one_batch_size_t2i = one_batch_t2i["images"].size(0)
one_batch_size_mmu = one_batch_mmu["images"].size(0)
print(one_batch_size_t2i, one_batch_size_mmu)

5 5


In [11]:
# t2i flow: build masked inputs/labels from the first t2i batch.
texts = one_batch_t2i["conditions"]
pixel_values = one_batch_t2i["images"].to(device)

# Padded/truncated prompt ids on the device. They are not passed to
# prepare_inputs_and_labels below, which receives the raw `texts`.
texts_ids = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)["input_ids"].to(device)

# input_ids, labels, mask_prob, image_tokens_ori = prepare_inputs_and_labels(pixel_values, texts_ids, is_train=True)
input_ids, labels, mask_prob, image_tokens_ori = prepare_inputs_and_labels(
    pixel_values,
    texts,
    is_train=True,
)

In [12]:
# Shape sanity checks for the t2i tensors. Only the last expression is displayed
# by the notebook; the trailing comments record the shapes seen for the others.
input_ids.shape # [5, 387] = [batch_size, ~ 128 + 256 = 384]
labels.shape # [5, 387]
mask_prob.shape # [5]
image_tokens_ori.shape # [5, 256]

torch.Size([5, 256])

In [13]:
from training.prompting_utils import create_attention_mask_predict_next, create_attention_mask_for_mmu

# Attention mask for the t2i sequence. The pad / start-of-image / end-of-image ids
# are read from the prompting helper's special-token table and passed as
# pad_id / soi_id / eoi_id.
attention_mask = create_attention_mask_predict_next(
    input_ids,
    rm_pad_in_image=True,
    return_inverse_mask=True,
    **{
        f"{marker}_id": int(uni_prompting.sptids_dict[f"<|{marker}|>"])
        for marker in ("pad", "soi", "eoi")
    },
)

In [14]:
# Pull images, token ids and labels out of the first personalized MMU batch.
pixel_values_mmu, input_ids_mmu, labels_mmu = (
    one_batch_mmu[field] for field in ("images", "input_ids", "labels")
)



In [15]:
from llava.llava_data_vq_unified import get_instruct_data_loader_single_gpu

# Single-GPU instruction-data loader in the "tuning" phase, with the same batch
# size and worker count as the personalized loaders.
mmu_dataloader_new = get_instruct_data_loader_single_gpu(
    tokenizer,
    phase="tuning",
    max_length=128,
    batch_size=5,
    num_workers=10,
)

Formatting llava instruction data


In [17]:
# Materialize every batch of the instruction loader into a list.
# NOTE(review): only list_data_mmu[0] is read in the following cell; loading the
# whole loader may be slow and memory-heavy if the rest is not needed.
list_data_mmu = list(mmu_dataloader_new)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [21]:
# First batch of the instruction loader, used by the cells below.
a_batch = list_data_mmu[0]

In [25]:
# Shape checks for the instruction batch; only the last expression is displayed.
a_batch["labels"].shape
a_batch["input_ids"].shape

torch.Size([5, 100])

In [38]:
# Rebind the MMU working variables to the instruction batch (images, ids, labels).
pixel_values_mmu, input_ids_mmu, labels_mmu = (
    a_batch[field] for field in ("images", "input_ids", "labels")
)

In [28]:
# Assemble the MMU training sequence:
#   <|mmu|> <|soi|> [image tokens] <|eoi|> [text ids]
# with labels set to the ignore id on every prefix / image position.
#
# NOTE: this cell rebinds `input_ids_mmu` / `labels_mmu` from their own previous
# values, so run it exactly once after the unpacking cell; re-running it would
# prepend the prefix a second time.
pixel_values_mmu = pixel_values_mmu.to(device, non_blocking=True)
input_ids_mmu = input_ids_mmu.to(device, non_blocking=True)

# The VQ codes end up as long token ids below, so no autograd graph is needed
# for the tokenizer forward pass.
with torch.no_grad():
    image_tokens_mmu = vq_model.get_code(pixel_values_mmu)
# Shift the codes past the text vocabulary so they do not collide with text ids.
image_tokens_mmu = image_tokens_mmu + len(uni_prompting.text_tokenizer)


def _constant_column(value, num_rows):
    """Return a (num_rows, 1) long tensor on `device` filled with `value`."""
    return torch.full((num_rows, 1), int(value), dtype=torch.long, device=device)


_num_rows_mmu = input_ids_mmu.shape[0]

input_ids_mmu = torch.cat([
    _constant_column(uni_prompting.sptids_dict['<|mmu|>'], _num_rows_mmu),
    _constant_column(uni_prompting.sptids_dict['<|soi|>'], _num_rows_mmu),
    image_tokens_mmu,
    _constant_column(uni_prompting.sptids_dict['<|eoi|>'], _num_rows_mmu),
    input_ids_mmu,
], dim=1).long()

# Prefix and image positions carry the ignore id; only the text part keeps its labels.
labels_mmu = torch.cat([
    _constant_column(uni_prompting.ignore_id, _num_rows_mmu),
    _constant_column(uni_prompting.ignore_id, _num_rows_mmu),
    torch.full_like(image_tokens_mmu, uni_prompting.ignore_id),
    _constant_column(uni_prompting.ignore_id, _num_rows_mmu),
    labels_mmu.to(device),
], dim=1).long()

In [37]:
# Labels of the first sample with the ignored positions (id -100) removed.
decode_mmu_labels = torch.masked_select(labels_mmu[0], labels_mmu[0].ne(-100))

# tokenizer.decode(decode_mmu_labels)
input_ids_mmu[0]

# input_ids_mmu[0].shape
# Displayed value of the cell: the full batch of assembled MMU input ids.
input_ids_mmu

tensor([[50301, 50296, 50424,  ..., 50295, 50295, 50295],
        [50301, 50296, 56637,  ..., 50295, 50295, 50295],
        [50301, 50296, 54859,  ..., 50295, 50295, 50295],
        [50301, 50296, 52541,  ..., 50295, 50295, 50295],
        [50301, 50296, 52541,  ..., 50295, 50295, 50295]], device='cuda:0')