In [None]:
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import random
from copy import deepcopy
from io import BytesIO
from typing import (
    Any,
    AsyncIterable,
    Callable,
    Dict,
    Generator,
    List,
    NamedTuple,
    Optional,
    Tuple,
    Union,
)

import numpy as np
import requests
import torch
from accelerate import infer_auto_device_map, load_checkpoint_and_dispatch, init_empty_weights
from huggingface_hub import snapshot_download
from PIL import Image
from safetensors.torch import load_file

from data.transforms import ImageTransform
from data.data_utils import pil_img2rgb, add_special_tokens
from inferencer import InterleaveInferencer
from modeling.bagel import (
    BagelConfig, Bagel, Qwen2Config, Qwen2ForCausalLM, SiglipVisionConfig, SiglipVisionModel
)
from modeling.qwen2 import Qwen2Tokenizer
from modeling.bagel.qwen2_navit import NaiveCache
from modeling.autoencoder import load_ae

## Model Initialization

In [1]:
# Download the BAGEL-7B-MoT checkpoint files from the Hugging Face Hub into a
# local folder. `snapshot_download` is imported in the notebook's import cell.
save_dir = "./models/BAGEL-7B-MoT"
repo_id = "ByteDance-Seed/BAGEL-7B-MoT"
cache_dir = save_dir + "/cache"

# `local_dir_use_symlinks` and `resume_download` are intentionally not passed:
# recent huggingface_hub releases deprecate both (files are always written to
# `local_dir` and interrupted downloads always resume), and passing them only
# triggers the deprecation notice.
# Left as the last expression so the resolved download path is displayed.
snapshot_download(
    repo_id=repo_id,
    local_dir=save_dir,
    cache_dir=cache_dir,
    allow_patterns=["*.json", "*.safetensors", "*.bin", "*.py", "*.md", "*.txt"],
)


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

ae.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

ema.safetensors:   0%|          | 0.00/29.2G [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

llm_config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vit_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

'/mnt/localssd/models/BAGEL-7B-MoT'

In [None]:
# Build the BAGEL model skeleton from the checkpoint folder: configs, VAE,
# empty-weight LLM/ViT modules, tokenizer and image transforms. The LLM/ViT
# modules are created under `init_empty_weights()`, so their weights are not
# materialised in this cell.
model_path = "./models/BAGEL-7B-MoT/"  # Download from https://huggingface.co/ByteDance-Seed/BAGEL-7B-MoT

# LLM config: read the JSON shipped with the checkpoint, then override the
# fields below before the model is instantiated.
llm_config = Qwen2Config.from_json_file(os.path.join(model_path, "llm_config.json"))
llm_config.qk_norm = True
llm_config.tie_word_embeddings = False
llm_config.layer_module = "Qwen2MoTDecoderLayer"

# ViT config: read the JSON, disable `rope`, and build the vision tower with
# one hidden layer fewer than the JSON specifies.
vit_config = SiglipVisionConfig.from_json_file(os.path.join(model_path, "vit_config.json"))
vit_config.rope = False
vit_config.num_hidden_layers = vit_config.num_hidden_layers - 1

# VAE: `load_ae` returns both the autoencoder and its config from ae.safetensors.
vae_model, vae_config = load_ae(local_path=os.path.join(model_path, "ae.safetensors"))

# Bagel config: enables both visual generation and visual understanding and
# ties the three sub-configs together.
config = BagelConfig(
    visual_gen=True,
    visual_und=True,
    llm_config=llm_config, 
    vit_config=vit_config,
    vae_config=vae_config,
    vit_max_num_patch_per_side=70,
    connector_act='gelu_pytorch_tanh',
    latent_patch_size=2,
    max_latent_size=64,
)

# Instantiate the modules without real weights; the checkpoint is loaded and
# dispatched to devices in a later cell.
with init_empty_weights():
    language_model = Qwen2ForCausalLM(llm_config)
    vit_model      = SiglipVisionModel(vit_config)
    model          = Bagel(language_model, vit_model, config)
    # Swap the ViT patch-embedding conv for a linear layer (meta=True while on empty weights).
    model.vit_model.vision_model.embeddings.convert_conv2d_to_linear(vit_config, meta=True)

# Tokenizer: load from the checkpoint folder, then register the special tokens;
# `new_token_ids` is passed to the inferencer later.
tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
tokenizer, new_token_ids, _ = add_special_tokens(tokenizer)

# Image transforms for the VAE and ViT inputs.
# NOTE(review): arguments are presumably (max size, min size, stride) — confirm against ImageTransform.
vae_transform = ImageTransform(1024, 512, 16)
vit_transform = ImageTransform(980, 224, 14)

## Model Loading and Multi-GPU Inference Preparation

In [None]:
# Plan the device placement for the empty-weight model, force a handful of
# modules onto one common device, then load the EMA checkpoint and dispatch it.
max_mem_per_gpu = "40GiB"  # Modify it according to your GPU setting. On an A100, 80 GiB is sufficient to load on a single GPU.

device_map = infer_auto_device_map(
    model,
    max_memory={i: max_mem_per_gpu for i in range(torch.cuda.device_count())},
    no_split_module_classes=["Bagel", "Qwen2MoTDecoderLayer"],
)
print(device_map)

# Modules that must live on the same device as the token embeddings.
same_device_modules = [
    'language_model.model.embed_tokens',
    'time_embedder',
    'latent_pos_embed',
    'vae2llm',
    'llm2vae',
    'connector',
    'vit_pos_embed'
]


def _closest_mapped_device(mapping, module_name):
    """Return the device mapped to `module_name` or to its nearest mapped ancestor.

    The device map may only contain a parent entry (e.g. 'language_model')
    when a whole sub-tree fits on one device, so the exact key can be missing.
    Returns None when neither the module nor any ancestor (including the root
    key '') is present.
    """
    name = module_name
    while True:
        if name in mapping:
            return mapping[name]
        if not name:
            return None
        name = name.rpartition(".")[0]


if torch.cuda.device_count() == 1:
    # Single GPU: pin every listed module, adding entries that are missing.
    first_device = device_map.get(same_device_modules[0], "cuda:0")
    for k in same_device_modules:
        if k in device_map:
            device_map[k] = first_device
        else:
            device_map[k] = "cuda:0"
else:
    # Multi GPU: only re-pin modules that already have their own entry. Resolve
    # the reference device through ancestors so a missing exact key does not
    # write `None` into the device map.
    first_device = _closest_mapped_device(device_map, same_device_modules[0])
    if first_device is not None:
        for k in same_device_modules:
            if k in device_map:
                device_map[k] = first_device

# Thanks @onion-liu: https://github.com/ByteDance-Seed/Bagel/pull/8
model = load_checkpoint_and_dispatch(
    model,
    checkpoint=os.path.join(model_path, "ema.safetensors"),
    device_map=device_map,
    offload_buffers=True,
    dtype=torch.bfloat16,
    force_hooks=True,
    offload_folder="/tmp/offload"
)

model = model.eval()
print('Model loaded')

## Inferencer Preparing 

In [None]:
# Bundle the loaded model, VAE, tokenizer and image transforms into the
# inference helper used by every demo cell below.
# `InterleaveInferencer` is imported in the notebook's import cell.
inferencer = InterleaveInferencer(
    model=model,
    vae_model=vae_model,
    tokenizer=tokenizer,
    vae_transform=vae_transform,
    vit_transform=vit_transform,
    new_token_ids=new_token_ids,
)

In [None]:
# Seed every RNG the demos rely on (Python, NumPy, PyTorch CPU/CUDA) and put
# cuDNN in deterministic mode so repeated runs are comparable.
# `random` and `numpy` are imported in the notebook's import cell.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    # manual_seed_all seeds every GPU, including the current one, so a
    # separate torch.cuda.manual_seed call is not needed.
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

**About Inference Hyperparameters:**
- **`cfg_text_scale`:** Controls how strongly the model follows the text prompt. `1.0` disables text guidance. Typical range: `4.0–8.0`.
- **`cfg_img_scale`:** Controls how much the model preserves input image details. `1.0` disables image guidance. Typical range: `1.0–2.0`.
- **`cfg_interval`:** Fraction of denoising steps where CFG is applied. Later steps can skip CFG to reduce computation. Typical: `[0.4, 1.0]`.
- **`timestep_shift`:** Shifts the distribution of denoising steps. Higher values allocate more steps at the start (affects layout); lower values allocate more at the end (improves details).
- **`num_timesteps`:** Total denoising steps. Typical: `50`.
- **`cfg_renorm_min`:** Minimum value for CFG-Renorm. `1.0` disables renorm. Typical: `0`.
- **`cfg_renorm_type`:** CFG-Renorm method:  
  - `global`: Normalize over all tokens and channels (default for T2I).
  - `channel`: Normalize across channels for each token.
  - `text_channel`: Like `channel`, but only applies to text condition (good for editing, may cause blur).
- **If edited images appear blurry, try `global` CFG-Renorm, decrease `cfg_renorm_min`, or decrease the CFG scales (`cfg_text_scale` / `cfg_img_scale`).**


# 1 - Bagel - basic capabilities

## Image Generation

In [None]:
# Sampling settings for plain text-to-image generation. There is no input
# image here, so cfg_img_scale stays at 1.0.
inference_hyper = {
    "cfg_text_scale": 4.0,
    "cfg_img_scale": 1.0,
    "cfg_interval": [0.4, 1.0],
    "timestep_shift": 3.0,
    "num_timesteps": 50,
    "cfg_renorm_min": 0.0,
    "cfg_renorm_type": "global",
}

In [None]:
# Text-to-image: echo the prompt, generate, and show the resulting image.
prompt = "A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."

print(prompt, '-' * 10, sep='\n')
output_dict = inferencer(text=prompt, **inference_hyper)
display(output_dict['image'])

## Image Generation with Think

In [None]:
# Sampling settings for text-to-image generation with a thinking pass:
# the text-to-image settings plus the text-decoding options.
inference_hyper = {
    "max_think_token_n": 1000,
    "do_sample": False,
    # "text_temperature": 0.3,
    "cfg_text_scale": 4.0,
    "cfg_img_scale": 1.0,
    "cfg_interval": [0.4, 1.0],
    "timestep_shift": 3.0,
    "num_timesteps": 50,
    "cfg_renorm_min": 0.0,
    "cfg_renorm_type": "global",
}

In [None]:
# Text-to-image with think=True: print the returned text, then show the image.
prompt = 'a car made of small cars'

print(prompt, '-' * 10, sep='\n')
output_dict = inferencer(text=prompt, think=True, **inference_hyper)
print(output_dict['text'])
display(output_dict['image'])

## Editing

In [None]:
# Sampling settings for image editing: image guidance is on (cfg_img_scale=2.0),
# CFG is applied over the whole interval, and renorm uses `text_channel`.
inference_hyper = {
    "cfg_text_scale": 4.0,
    "cfg_img_scale": 2.0,
    "cfg_interval": [0.0, 1.0],
    "timestep_shift": 3.0,
    "num_timesteps": 50,
    "cfg_renorm_min": 0.0,
    "cfg_renorm_type": "text_channel",
}

In [None]:
# Image editing: show the source photo and the instruction, then the edited result.
image = Image.open('test_images/women.jpg')
prompt = 'She boards a modern subway, quietly reading a folded newspaper, wearing the same clothes.'

display(image)
print(prompt, '-'*10, sep='\n')
output_dict = inferencer(image=image, text=prompt, **inference_hyper)
display(output_dict['image'])

## Edit with Think

In [None]:
# Sampling settings for image editing with a thinking pass:
# the editing settings plus the text-decoding options.
inference_hyper = {
    "max_think_token_n": 1000,
    "do_sample": False,
    # "text_temperature": 0.3,
    "cfg_text_scale": 4.0,
    "cfg_img_scale": 2.0,
    "cfg_interval": [0.0, 1.0],
    "timestep_shift": 3.0,
    "num_timesteps": 50,
    "cfg_renorm_min": 0.0,
    "cfg_renorm_type": "text_channel",
}

In [None]:
# Image editing with think=True: show the source image and the instruction,
# print the returned text, then show the edited image.
image = Image.open('test_images/octupusy.jpg')
prompt = 'Could you display the sculpture that takes after this design?'

display(image)
# Echo the instruction like every other demo cell does, so the rendered
# output records what was asked.
print(prompt)
print('-'*10)
output_dict = inferencer(image=image, text=prompt, think=True, **inference_hyper)
print(output_dict['text'])
display(output_dict['image'])

## Understanding

In [None]:
# Text-decoding settings for understanding (text-only output) requests.
inference_hyper = {
    "max_think_token_n": 1000,
    "do_sample": False,
    # "text_temperature": 0.3,
}

In [None]:
# Image understanding: ask a question about the picture and print the text answer.
image = Image.open('test_images/meme.jpg')
prompt = "Can someone explain what’s funny about this meme??"

display(image)
print(prompt, '-'*10, sep='\n')
output_dict = inferencer(image=image, text=prompt, understanding_output=True, **inference_hyper)
print(output_dict['text'])

# 2 - Exploring interleaved capabilities

## 2.1 - Basic non-reasoning / reasoning inference with text only

In [1]:
# Text-only question, answered as text (think is not enabled)
inference_hyper = {
    "max_think_token_n": 1000,
    "do_sample": False,
    # "text_temperature": 0.3,
}

prompt = "what is 2 + 2 x 4"

print(prompt, '-'*10, sep='\n')
output_dict = inferencer(text=prompt, understanding_output=True, **inference_hyper)
print(output_dict['text'])

what is 2 + 2 x 4
----------


NameError: name 'inferencer' is not defined

In [None]:
# Text-only question, answered as text with think=True
inference_hyper = {
    "max_think_token_n": 1000,
    "do_sample": False,
    # "text_temperature": 0.3,
}

prompt = "what is 2 + 2 x 4"

print(prompt, '-'*10, sep='\n')
output_dict = inferencer(text=prompt, understanding_output=True, think=True, **inference_hyper)
print(output_dict['text'])

## 2.2 - Interleaved Inputs - Single Output

### 2.2.1 - Text output - Multiple images

In [None]:
##################################################
# Text output via interleave_inference - text-only input (think not enabled)
##################################################
inference_hyper = {
    "max_think_token_n": 1000,
    "do_sample": False,
    # "text_temperature": 0.3,
    "understanding_output": True,
}

prompt = "what is 2 + 2 x 4"

print(prompt, '-'*10, sep='\n')
input_list = [prompt]
output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

In [None]:
##################################################
# Text output - two images followed by one question
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
)

# Only the two images that are actually sent to the model are opened.
image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')

prompt = "can you help me tell what the difference between the two images is?"

print(prompt)
print('-'*10)
input_list = [image1, image2, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

In [None]:
##################################################
# Text output - three images followed by one question
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
)

image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')
image3 = Image.open('test_images/women.jpg')

prompt = "which image first or third is best match for the second image? and why?"

print(prompt)
print('-'*10)
input_list = [image1, image2, image3, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

In [None]:
##################################################
# Text output - question first, then three images (image2, image1, image3)
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
)

image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')
image3 = Image.open('test_images/women.jpg')

prompt = "which image first or third is best match for the second image? and why?"

print(prompt)
print('-'*10)
input_list = [prompt, image2, image1, image3]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

In [None]:
##################################################
# Text output - reference image, question, then two candidate images
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
)

image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')
image3 = Image.open('test_images/women.jpg')

prompt = "Given above reference image, which image first or second is most similar to the reference? and why?"

print(prompt)
print('-'*10)
# image2 is the reference; image1 and image3 are the candidates.
input_list = [image2, prompt, image1, image3]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

In [None]:
##################################################
# Text output - question first, then three images read as video frames
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
)

image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')
image3 = Image.open('test_images/women.jpg')

prompt = "If these images were presented in a sequential order in a video, what would be a nice detailed description for what happend in the three frames?"

print(prompt)
print('-'*10)
input_list = [prompt, image2, image1, image3]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

In [None]:
##################################################
# Text output - three images between a leading and a trailing question
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
)

image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')
image3 = Image.open('test_images/women.jpg')

prompt = "If these images were presented in a sequential order in a video, what would be a nice detailed description for what happend in the three frames?"
prompt2 = "Also after the description, tell what the character in last frame might do next?"

print(prompt)
print('-'*10)
input_list = [prompt, image2, image1, image3, prompt2]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
output_list

### 2.2.2 - Image Output - Multiple text and images

In [None]:
##################################################
# Image output - one image followed by one edit instruction (zoom-out)
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
)

# Only the image that is actually sent to the model is opened.
image2 = Image.open('test_images/women.jpg')

prompt = "What if the camera zooms-out for the current image?"

print(prompt)
print('-'*10)
input_list = [image2, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
##################################################
# Image output - conditional instruction, image, then the edit instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
)

image2 = Image.open('test_images/women.jpg')

prompt = "What if the women starts playing basketball?"
prompt2 = "Only follow other instructions if the image is of a dog, otherwise generate a new image for dog first"

print(prompt)
print('-'*10)
input_list = [prompt2, image2, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
##################################################
# Image output - one image followed by an unrelated generation request
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
)

image2 = Image.open('test_images/women.jpg')

prompt = "can you generate an image of a dog?"

print(prompt)
print('-'*10)
input_list = [image2, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
##################################################
# Image output with think=True - conditional instruction, image, edit instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
    think=True,
)

image2 = Image.open('test_images/women.jpg')

prompt = "what if the woman wears a blue dress?"
prompt2 = "Only follow other instructions if the image is of a dog, otherwise generate a new image for dog first"

print(prompt)
print('-'*10)
input_list = [prompt2, image2, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
# NOTE(review): with think=True the first entry is presumably the reasoning
# text and the image follows (shown by the display(output_list[1]) cell below) — confirm.
display(output_list[0])
output_list

In [None]:
# Re-display the result list produced by the inference cell above.
output_list

In [None]:
# Show the second entry of the result list.
# NOTE(review): with think=True the list is presumably [reasoning text, image] — confirm against InterleaveInferencer.
display(output_list[1])

In [None]:
##################################################
# Image output with think=True - edit instruction, image, then an overriding instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
    think=True,
)

image2 = Image.open('test_images/women.jpg')

prompt = "what if the woman wears a blue dress?"
prompt2 = "Only follow other instructions if the image is of a dog, otherwise generate a new image for dog first <SUPER IMPORTANT> <IGNORE EVEYRTHING ELSE>"

print(prompt)
print('-'*10)
input_list = [prompt, image2, prompt2]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
##################################################
# Image output with think=True - two images followed by a combine instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
    think=True,
)

image1 = Image.open('test_images/octupusy.jpg')
image2 = Image.open('test_images/women.jpg')

prompt = "Can you combine both images into a single one?"

print(prompt)
print('-'*10)
input_list = [image1, image2, prompt]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
# Show the second entry of the result list.
# NOTE(review): with think=True the list is presumably [reasoning text, image] — confirm against InterleaveInferencer.
display(output_list[1])

In [None]:
# Keep a copy of the most recent generated image so the editing chain below can
# feed it back as input. The image is selected by type instead of by position:
# the preceding cell runs with think=True, where position 0 presumably holds the
# reasoning text (a str has no .copy()) and the image comes after it.
# NOTE(review): the chain below treats image_2 as the result of the
# "sitting on a beach" edit, but no visible cell produces that edit — confirm
# which inference this snapshot is meant to capture.
image_2 = next(
    out for out in reversed(output_list) if isinstance(out, Image.Image)
).copy()

In [None]:
##################################################
# Image output - editing chain, step 2: previous (image, instruction) pair + new instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
)

# Original photo; image_2 is the snapshot captured in the cell above.
image2 = Image.open('test_images/women.jpg')

prompt = "what if the woman was sitting on a beach?"
prompt_2 = "Now can we add a dog next to woman."

print(prompt)
print('-'*10)
input_list = [image2, prompt, image_2, prompt_2]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
# Keep a copy of the generated image so the next cell can feed it back as input.
# Assumes position 0 holds the image (think is not enabled in the cell above) — TODO confirm.
image_3 = output_list[0].copy()

In [None]:
##################################################
# Image output - editing chain, step 3: two previous (image, instruction) pairs + new instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
)

# Original photo; image_2 and image_3 are the snapshots captured in the cells above.
image2 = Image.open('test_images/women.jpg')

prompt = "what if the woman was sitting on a beach?"
prompt_2 = "Now can we add a dog next to woman."
prompt_3 = "Now make the color of her eyes blue and make it into a high resolution photo"

print(prompt)
print('-'*10)
input_list = [image2, prompt, image_2, prompt_2, image_3, prompt_3]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
# Keep a copy of the generated image so the next cell can feed it back as input.
# Assumes position 0 holds the image (think is not enabled in the cell above) — TODO confirm.
image_4 = output_list[0].copy()

In [None]:
##################################################
# Image output - editing chain, step 4: three previous (image, instruction) pairs + new instruction
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    cfg_text_scale=4.0,
    cfg_img_scale=2.0,
    cfg_interval=[0.0, 1.0],
    timestep_shift=3.0,
    num_timesteps=25,
    cfg_renorm_min=0.0,
    cfg_renorm_type="text_channel",
)

# Original photo; image_2, image_3 and image_4 are the snapshots captured in the cells above.
image2 = Image.open('test_images/women.jpg')

prompt = "what if the woman was sitting on a beach?"
prompt_2 = "Now can we add a dog next to woman."
prompt_3 = "Now make the color of her eyes blue and make it into a high resolution photo"
prompt_4 = "Now make the woman standing upright"

print(prompt)
print('-'*10)
input_list = [image2, prompt, image_2, prompt_2, image_3, prompt_3, image_4, prompt_4]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

In [None]:
##################################################
# Text output with think=True - four images followed by one question about the sequence
##################################################
inference_hyper = dict(
    max_think_token_n=1000,
    do_sample=False,
    understanding_output=True,
    think=True,
)

# Original photo; image_2, image_3 and image_4 are the snapshots captured in the cells above.
image2 = Image.open('test_images/women.jpg')

# Alternative questions for the same image sequence; swap the last entry of
# input_list (and the print below) to try another one.
prompt_4 = "Now remove the woman"
prompt_5 = "Describe in detail the final image. Provide a detailed caption"
prompt_6 = "provide a detailed description for the video if the provided sequence of images was a video"

# Echo the question that is actually sent to the model.
print(prompt_6)
print('-'*10)
input_list = [image2, image_2, image_3, image_4, prompt_6]

output_list = inferencer.interleave_inference(input_list, **inference_hyper)
display(output_list[0])
output_list

# Debug