In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
import os
import sys
sys.path.append('.')

from pathlib import Path


In [None]:
# Folder of the CONFORM checkout inside Google Drive (modify this depending on your folders).
conform_path = 'MyDrive/conform/CONFORM'
env_path = Path('/content/drive', conform_path)

# Make the checkout importable; the membership test keeps re-runs from adding duplicates.
env_dir = str(env_path)
if env_dir not in sys.path:
    sys.path.append(env_dir)

In [None]:
# Environment setup: system libraries via apt, then the Python dependencies via pip.
# Known issue: pip reports warnings/errors in this cell
# (ERROR: pip's dependency resolver does not currently take into account all the packages that are installed),
# but the model still works afterwards.

# system packages
!apt-get update
!apt-get install -y libgmp-dev libgnutls30 libidn2-0 libjpeg-dev libtiff-dev libwebp-dev liblz4-dev libbz2-dev libarchive-dev libyaml-dev libzstd-dev

# python packages (everything except torch/torchvision/torchaudio is pinned to an exact version)
!pip install torch torchvision torchaudio accelerate==0.28.0 anyio==4.3.0 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 \
            arrow==1.3.0 asttokens==2.4.1 async-lru==2.0.4 attrs==23.2.0 babel==2.14.0 beautifulsoup4==4.12.3 \
            bleach==6.1.0 cffi==1.16.0 comm==0.2.2 debugpy==1.8.1 decorator==5.1.1 defusedxml==0.7.1 diffusers==0.21.4 \
            executing==2.0.1 fastjsonschema==2.19.1 fqdn==1.5.1 fsspec==2024.3.1 h11==0.14.0 httpcore==1.0.5 httpx==0.27.0 \
            huggingface-hub==0.22.2 importlib-metadata==7.1.0 ipykernel==6.29.4 ipython==8.23.0 ipywidgets==8.1.2 \
            isoduration==20.11.0 jedi==0.19.1 joblib==1.3.2 json5==0.9.24 jsonpointer==2.4 jsonschema==4.21.1 \
            jsonschema-specifications==2023.12.1 jupyter==1.0.0 jupyter-client==8.6.1 jupyter-console==6.6.3 \
            jupyter-core==5.7.2 jupyter-events==0.10.0 jupyter-lsp==2.2.4 jupyter-server==2.13.0 jupyter-server-terminals==0.5.3 \
            jupyterlab==4.1.5 jupyterlab-pygments==0.3.0 jupyterlab-server==2.25.4 jupyterlab-widgets==3.0.10 \
            matplotlib-inline==0.1.6 mistune==3.0.2 nbclient==0.10.0 nbconvert==7.16.3 nbformat==5.10.4 nest-asyncio==1.6.0 \
            notebook==7.1.2 notebook-shim==0.2.4 overrides==7.7.0 packaging==24.0 pandocfilters==1.5.1 parso==0.8.3 \
            pexpect==4.9.0 platformdirs==4.2.0 prometheus-client==0.20.0 prompt-toolkit==3.0.43 psutil==5.9.8 ptyprocess==0.7.0 \
            pure-eval==0.2.2 pycparser==2.22 pygments==2.17.2 python-dateutil==2.9.0.post0 python-json-logger==2.0.7 \
            pytorch-metric-learning==2.5.0 pyzmq==25.1.2 qtconsole==5.5.1 qtpy==2.4.1 referencing==0.34.0 regex==2023.12.25 \
            rfc3339-validator==0.1.4 rfc3986-validator==0.1.1 rpds-py==0.18.0 safetensors==0.4.2 scikit-learn==1.4.1.post1 \
            scipy==1.13.0 send2trash==1.8.2 six==1.16.0 sniffio==1.3.1 soupsieve==2.5 stack-data==0.6.3 terminado==0.18.1 \
            threadpoolctl==3.4.0 tinycss2==1.2.1 tokenizers==0.15.2 tornado==6.4 tqdm==4.66.2 traitlets==5.14.2 \
            transformers==4.39.3 types-python-dateutil==2.9.0.20240316 uri-template==1.3.0 wcwidth==0.2.13 webcolors==1.13 \
            webencodings==0.5.1 websocket-client==1.7.0 widgetsnbextension==4.0.10 zipp==3.18.1


# Initialize Model

In [None]:
import torch
from diffusers import StableDiffusionPipeline
from pipeline_conform import ConformPipeline

# Hub id of the Stable Diffusion v1.5 checkpoint shared by both pipelines.
# NOTE(review): confirm this repo id still resolves on the Hugging Face Hub.
pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"

# Stock Stable Diffusion pipeline, loaded in half precision and moved to the GPU.
sd_pipeline = StableDiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    torch_dtype=torch.float16,
)
sd_pipeline = sd_pipeline.to("cuda")

# The CONFORM pipeline is built from the very same component objects, so the
# two pipelines share weights rather than loading the checkpoint twice.
shared_components = (
    "vae",
    "text_encoder",
    "tokenizer",
    "unet",
    "scheduler",
    "safety_checker",
    "feature_extractor",
)
pipeline = ConformPipeline(
    **{name: getattr(sd_pipeline, name) for name in shared_components}
)

# Hyperparameters

In [None]:

# Settings shared by the CONFORM run and the plain Stable Diffusion baseline below.
num_inference_steps = 20 # Number of denoising steps to run the model
guidance_scale = 7.5 # Guidance scale for diffusion (passed to both pipelines)
attn_res = (16, 16) # Resolution of the attention map to apply CONFORM
steps_to_save_attention_maps = list(range(num_inference_steps)) # Save attention maps at every step
max_iter_to_alter = 30 # Step at which to stop updating the latents; NOTE(review): larger than num_inference_steps (20) - presumably means "never stop early", confirm
refinement_steps = 20 # Number of refinement steps
scale_factor = 20 # Scale factor for the optimization step
iterative_refinement_steps = [0, 1, 3, 5, 10, 20] # Steps at which iterative refinement runs; NOTE(review): 20 is outside range(num_inference_steps) - presumably ignored, confirm
do_smoothing = True # Apply smoothing to the attention maps
smoothing_sigma = 0.5 # Sigma for the smoothing kernel
smoothing_kernel_size = 3 # Kernel size for the smoothing kernel
temperature = 0.5 # Temperature for the contrastive loss
softmax_normalize = False # Normalize the attention maps; NOTE(review): how this differs from softmax_normalize_attention_maps is not visible here
softmax_normalize_attention_maps = False # Normalize the attention maps (see note on softmax_normalize)
add_previous_attention_maps = True # Add previous attention maps to the loss calculation
previous_attention_map_anchor_step = None # Use a specific step as the previous attention map (None = no fixed anchor step given)
loss_fn = "ntxent" # Loss function to use
seed = 4913 # Seed for the generation (used for both the CONFORM and the baseline generator)
# seed = 4812 # Alternative seed for the generation


# Prompt

In [None]:
# prompt = "cat under the table"
#--------------------------------------------------------------------------------------------------------------------------------------------------------
prompt = "car on the right of a bike"

## Indices

In [None]:
# Tokenize the prompt and print a {position: token} table; the positions are
# what the token groups below refer to.
ids = pipeline.tokenizer(prompt).input_ids
tokens = pipeline.tokenizer.convert_ids_to_tokens(ids)
indices = dict(enumerate(tokens))
print(indices)

## Token Groups

In [None]:
# Previous grouping, kept for reference:
# token_groups = [
#     [1],
#     [2, 4]
# ]
#--------------------------------------------------------------------------------------------------------------------------------------------------------
# Groups of token indices passed to the pipeline as `token_groups`; further down,
# every token in a group is given the same bounding box.
# Presumably [1, 4] = 'car' + 'right' and [7] = 'bike' for the current prompt -
# verify against the index table printed by the Indices cell.
token_groups = [
    [1, 4],
    [7]
]

In [None]:
# # Define bounding boxes for tokens [x_min, y_min, x_max, y_max] origin point is top left of the image
# # bounding_boxes = {
# #     1: [0.1, 0.5, 0.4, 0.9],  # Bounding box for "cat"
# #     4: [0.1, 0.1, 0.9, 0.4],  # Bounding box for "table"
# # }

# bounding_boxes = {
#     1: [0.1, 0.1, 0.4, 0.9],  # Bounding box for "cat"
#     4: [0.1, 0.1, 0.4, 0.9],  # Bounding box for "location"
#     7: [0.4, 0.1, 0.9, 0.9],  # Bounding box for "dog"
# }

In [None]:
# img_size = (16,16)

# for token_idx, bbox in bounding_boxes.items():
#   for coord, size in zip(bbox, img_size*2):
#     print(int(coord * size))

In [None]:

# %cd /content/drive/MyDrive/DL/Diffusion-SpaceTime-Attn-main/attention_optimization/stable-diffusion
%cd /content/drive/MyDrive/conform/CONFORM/LayoutTransformer

!pip install --upgrade pip==23.0.1

# 1) (Optional) for torch 1.11.0 + CUDA 11.3
# !pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# 2) Install pinned versions
!pip install numpy==1.19.2

# 3) Install environment_replicate.yml pip deps
!pip install albumentations==0.4.3 diffusers opencv-python==4.1.2.30 pudb==2019.2 \
  invisible-watermark imageio==2.9.0 imageio-ffmpeg==0.4.2 \
  pytorch-lightning==1.4.2 omegaconf==2.1.1 test-tube>=0.7.5 \
  streamlit>=0.73.1 einops==0.3.0 torch-fidelity==0.3.0 \
  transformers==4.19.2 torchmetrics==0.6.0 kornia==0.6

# 4) Install Taming Transformers & CLIP from git-------------------------
# !pip install git+https://github.com/CompVis/taming-transformers.git@master
# !pip install git+https://github.com/openai/CLIP.git@main

# 5) Additional packages
!pip install bounding-box==0.1.3 fairseq==0.12.2 spacy==3.5.1 nltk==3.8.1 inflect==6.0.2
!python -m spacy download en_core_web_sm

# 6) Install local stable-diffusion package-------------------------
# !pip install -e .


In [None]:
# Run the LayoutTransformer inference script from inside its folder.
%cd /content/drive/MyDrive/conform/CONFORM/LayoutTransformer

import nltk
# NLTK corpora downloaded before the inference script is invoked.
nltk.download('wordnet')  # Download WordNet
nltk.download('stopwords')  # Download Stopwords

# !python inference/inference_coco.py --sentence 'The silver bed was situated to the right of the white couch.'

#--------------------------------------------------------------------------------------------------------------------------------------------------------
# NOTE(review): this sentence ('car is the right of a cup') differs from the diffusion
# prompt set in the Prompt cell ("car on the right of a bike") - confirm this is intentional.
!python inference/inference_coco.py --sentence 'car is the right of a cup'


In [None]:
# Sample input data: one normalized (x, y) center per token group, in token_groups order.
# NOTE(review): presumably copied by hand from the LayoutTransformer output - confirm
# and update whenever the sentence changes.
position1 = (0.386, 0.441)  # Center for the first token group
position2 = (0.550, 0.620)  # Center for the second token group

# List of positions to be assigned to the token groups
positions_list = [position1, position2]

# Example grouping from an earlier prompt ("cat on the left of a dog"):
# # Token groups
# token_groups = [
#     [1, 4],  # 'cat' and 'left' share the same box
#     [7]       # 'dog' has its own box
# ]

# Distance from the center to each box edge (normalized units) used to create bounding boxes
box_offset = 0.2

# Build a square box around a center point, clipped to the unit square
def create_bounding_box(center, offset=0.2):
    """Return ``[xmin, ymin, xmax, ymax]`` for a box centered on ``center``.

    Args:
        center: ``(x, y)`` pair of the box center.
        offset: Distance from the center to each edge of the box.

    Returns:
        A four-element list; the minima are floored at 0 and the maxima are
        capped at 1.
    """
    cx, cy = center
    lower = [max(cx - offset, 0), max(cy - offset, 0)]
    upper = [min(cx + offset, 1), min(cy + offset, 1)]
    return lower + upper

# Create bounding boxes for each token group
bounding_boxes = {}

# indices = {
#     0: '<|startoftext|>',
#     1: 'cat</w>',
#     2: 'on</w>',
#     3: 'the</w>',
#     4: 'left</w>',
#     5: 'of</w>',
#     6: 'a</w>',
#     7: 'dog</w>',
#     8: '<|endoftext|>',
# }

# Positions for each word (normalized x, y)
positions = {}

# Fail early with a clear message rather than a bare IndexError half-way through
# the loop. Extra positions beyond the number of groups are simply ignored.
if len(positions_list) < len(token_groups):
    raise ValueError(
        f"Need one position per token group: got {len(positions_list)} "
        f"position(s) for {len(token_groups)} group(s)."
    )

# Single pass: each group gets its own position and ONE box object shared by all
# of its tokens. Using the group's position directly (instead of looking it up
# again through the group's first token) stays correct when a token appears in
# more than one group, and an empty group is simply skipped.
for group, position in zip(token_groups, positions_list):
    bounding_box = create_bounding_box(position, box_offset)
    for token_id in group:
        positions[token_id] = position
        bounding_boxes[token_id] = bounding_box

print("Positions:", positions)

print("Bounding boxes:", bounding_boxes)


%cd /content/drive/MyDrive/conform/CONFORM/

## CONFORM Output

In [None]:
# Run the CONFORM pipeline with the hyperparameters defined above.
# It returns the generated images together with the saved attention maps.
# The CUDA generator is seeded with `seed`; the baseline run further down uses
# the same seed.
images, attention_maps = pipeline(
    prompt=prompt,
    token_groups=token_groups,
    bounding_boxes=bounding_boxes,
    guidance_scale=guidance_scale,
    generator=torch.Generator("cuda").manual_seed(seed),
    num_inference_steps=num_inference_steps,
    max_iter_to_alter=max_iter_to_alter,
    attn_res=attn_res,
    scale_factor=scale_factor,
    iterative_refinement_steps=iterative_refinement_steps,
    steps_to_save_attention_maps=steps_to_save_attention_maps,
    do_smoothing=do_smoothing,
    smoothing_sigma=smoothing_sigma,
    smoothing_kernel_size=smoothing_kernel_size,
    temperature=temperature,
    refinement_steps=refinement_steps,
    softmax_normalize=softmax_normalize,
    softmax_normalize_attention_maps=softmax_normalize_attention_maps,
    add_previous_attention_maps=add_previous_attention_maps,
    previous_attention_map_anchor_step=previous_attention_map_anchor_step,
    loss_fn=loss_fn,
)

In [None]:
images[0]

## SD Output

In [None]:
# Baseline: plain Stable Diffusion with the same prompt, guidance scale, step
# count and seed as the CONFORM run, for a side-by-side comparison.
out = sd_pipeline(
    prompt=prompt,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    generator=torch.Generator("cuda").manual_seed(seed),
)

In [None]:
out.images[0]

In [None]:
attention_maps[0]

In [None]:
# Attention maps of the first image at the final denoising step.
# The step is derived from num_inference_steps instead of a hard-coded 19, so
# this cell keeps working when the step count changes (attention maps are saved
# for every step in range(num_inference_steps)).
last_step = num_inference_steps - 1
attn_map = attention_maps[0][last_step]

print(attn_map.shape)


In [None]:
import matplotlib.pyplot as plt

# Choose a token index
# token_idx = 76  # Change this to visualize other tokens
# attn_map_for_token = attn_map[:, :, token_idx]

# plt.figure(figsize=(6, 6))
# plt.title(f"Attention map for token {token_idx}")
# plt.imshow(attn_map_for_token, cmap="plasma")
# plt.colorbar()
# plt.show()


In [None]:
# Attention map of the first object token (index 1 in the tokenized prompt).
token_idx = 1
attn_map_for_token = attn_map[:, :, token_idx]

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f"Attention map for token index {token_idx} (first object)")
heatmap = ax.imshow(attn_map_for_token, cmap="plasma")
fig.colorbar(heatmap, ax=ax)
plt.show()


In [None]:
# Attention map of the location token (index 4 in the tokenized prompt).
token_idx = 4
attn_map_for_token = attn_map[:, :, token_idx]

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f"Attention map for token index {token_idx} (location)")
heatmap = ax.imshow(attn_map_for_token, cmap="plasma")
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Attention map of the second object token (index 7 in the tokenized prompt).
token_idx = 7
attn_map_for_token = attn_map[:, :, token_idx]

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f"Attention map for token index {token_idx} (second object)")
heatmap = ax.imshow(attn_map_for_token, cmap="plasma")
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Visualize the token corresponding to "under"
# token_idx = 2
# token_idx = 4
# attn_map_for_token = attn_map[:, :, token_idx]

# plt.figure(figsize=(6, 6))
# plt.title(f"Attention map for token index {token_idx}")
# plt.imshow(attn_map_for_token, cmap="plasma")
# plt.colorbar()
# plt.show()

In [None]:
# Function to create bounding box mask
def create_bbox_mask(bbox, grid_size):
    """Build a mask over a 2-D grid that highlights a normalized bounding box.

    Args:
        bbox: ``[x_min, y_min, x_max, y_max]`` in normalized coordinates
            (fractions of the grid width / height).
        grid_size: ``(height, width)`` of the grid, e.g. the ``.shape`` of a
            2-D attention map.

    Returns:
        A ``torch.Tensor`` of shape ``grid_size`` filled with 1, with the cells
        inside the box set to 10.
    """
    height, width = grid_size
    x_min, y_min, x_max, y_max = bbox
    # x runs along columns (width) and y along rows (height). Scaling in the
    # order of ``grid_size`` itself would swap them on non-square grids.
    col_start, col_end = int(x_min * width), int(x_max * width)
    row_start, row_end = int(y_min * height), int(y_max * height)

    mask = torch.ones(grid_size)
    mask[row_start:row_end, col_start:col_end] = 10
    return mask

def show_attention_with_bbox(attn_map, token_idx, bounding_boxes):
    """Plot one token's attention map with its bounding box overlaid.

    Args:
        attn_map: Attention tensor indexed as ``attn_map[:, :, token_idx]``.
        token_idx: Index of the token to visualize; must be a key of
            ``bounding_boxes``.
        bounding_boxes: Mapping from token index to a normalized
            ``[x_min, y_min, x_max, y_max]`` box.

    Returns:
        ``(token_attention, bbox, bbox_mask)``: the token's attention map as a
        NumPy array, its bounding box, and the mask that was overlaid.
    """
    token_attention = attn_map[:, :, token_idx].cpu().numpy()

    # Create bounding box mask
    bbox = bounding_boxes[token_idx]
    bbox_mask = create_bbox_mask(bbox, token_attention.shape)

    # Plot the attention map
    plt.figure(figsize=(6, 6))
    plt.title(f"Attention map with bounding box for token {token_idx} ")
    plt.imshow(token_attention, cmap="plasma", alpha=0.8)
    plt.colorbar(label="Attention")

    # Overlay the bounding box mask with transparency
    plt.imshow(bbox_mask, cmap="gray", alpha=0.3)
    plt.show()
    return token_attention, bbox, bbox_mask


# Attention map with bounding box for the first object token.
# (The *_cat / *_table variable names are kept from an earlier prompt so that
# any later cell relying on them keeps working.)
token_idx = 1
attn_map_for_token, bbox_cat, bbox_mask_cat = show_attention_with_bbox(
    attn_map, token_idx, bounding_boxes
)

# Attention map with bounding box for the second object token.
token_idx = 7
attn_map_for_token, bbox_table, bbox_mask_table = show_attention_with_bbox(
    attn_map, token_idx, bounding_boxes
)


In [None]:

# # %cd /content/drive/MyDrive/DL/Diffusion-SpaceTime-Attn-main/attention_optimization/stable-diffusion
# %cd /content/drive/MyDrive/conform/CONFORM/LayoutTransformer

# !pip install --upgrade pip==23.0.1

# # 1) (Optional) for torch 1.11.0 + CUDA 11.3
# !pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# # 2) Install pinned versions
# !pip install numpy==1.19.2

# # 3) Install environment_replicate.yml pip deps
# !pip install albumentations==0.4.3 diffusers opencv-python==4.1.2.30 pudb==2019.2 \
#   invisible-watermark imageio==2.9.0 imageio-ffmpeg==0.4.2 \
#   pytorch-lightning==1.4.2 omegaconf==2.1.1 test-tube>=0.7.5 \
#   streamlit>=0.73.1 einops==0.3.0 torch-fidelity==0.3.0 \
#   transformers==4.19.2 torchmetrics==0.6.0 kornia==0.6

# # 4) Install Taming Transformers & CLIP from git-------------------------
# # !pip install git+https://github.com/CompVis/taming-transformers.git@master
# # !pip install git+https://github.com/openai/CLIP.git@main

# # 5) Additional packages
# !pip install bounding-box==0.1.3 fairseq==0.12.2 spacy==3.5.1 nltk==3.8.1 inflect==6.0.2
# !python -m spacy download en_core_web_sm

# # 6) Install local stable-diffusion package-------------------------
# # !pip install -e .


In [None]:
# %cd /content/drive/MyDrive/conform/CONFORM/LayoutTransformer

# import nltk
# nltk.download('wordnet')  # Download WordNet
# nltk.download('stopwords')  # Download Stopwords

# # !python inference/inference_coco.py --sentence 'The silver bed was situated to the right of the white couch.'

# !python inference/inference_coco.py --sentence 'cat on the left of a dog'


In [None]:
# # Sample input data
# position1 = (0.397, 0.432)  # Center of 'cat'
# position2 = (0.479, 0.503)  # Center of 'dog'

# # List of positions to be assigned to the token groups
# positions_list = [position1, position2]

# # # Token groups
# # token_groups = [
# #     [1, 4],  # 'cat' and 'left' share the same box
# #     [7]       # 'dog' has its own box
# # ]

# # Coordinates offset to create bounding boxes
# box_offset = 0.2

# # Function to calculate bounding box from center (x, y)
# def create_bounding_box(center, offset=0.2):
#     x_center, y_center = center
#     xmin = max(x_center - offset, 0)
#     ymin = max(y_center - offset, 0)
#     xmax = min(x_center + offset, 1)
#     ymax = min(y_center + offset, 1)
#     return [xmin, ymin, xmax, ymax]

# # Create bounding boxes for each token group
# bounding_boxes = {}

# # indices = {
# #     0: '<|startoftext|>',
# #     1: 'cat</w>',
# #     2: 'on</w>',
# #     3: 'the</w>',
# #     4: 'left</w>',
# #     5: 'of</w>',
# #     6: 'a</w>',
# #     7: 'dog</w>',
# #     8: '<|endoftext|>',
# # }

# # Positions for each word (normalized x, y)
# positions = {}

# for idx, group in enumerate(token_groups):
#     position = positions_list[idx]  # Get the corresponding position for the group
#     for token_id in group:
#         positions[token_id] = position

# print("Positions:", positions)

# # Assign bounding boxes based on the positions and token groups
# for group in token_groups:
#     # Get the first token's position in the group
#     first_token_id = group[0]
#     position = positions[first_token_id]
#     # Create bounding box
#     bounding_box = create_bounding_box(position, box_offset)
#     for token_id in group:
#         bounding_boxes[token_id] = bounding_box

# print("Bounding boxes:", bounding_boxes)

In [None]:
# Set up detrex: fetch its git submodules, then install the bundled detectron2
# checkout in editable mode.
%cd /content/drive/MyDrive/conform/CONFORM/detrex
!git submodule init
!git submodule update

!python -m pip install -e detectron2

In [None]:
# Editable install of the package in the current working directory.
# NOTE: relies on the working directory left by the previous cell's %cd (the
# detrex folder) - run that cell first.
!pip install -e .

In [None]:

%cd /content/drive/MyDrive/conform/CONFORM/detrex

# download pretrained DAB-DETR model
!wget https://github.com/IDEA-Research/detrex-storage/releases/download/v0.1.0/dab_detr_r50_50ep.pth

# download pretrained DINO model
!wget https://github.com/IDEA-Research/detrex-storage/releases/download/v0.2.1/dino_r50_4scale_12ep.pth

# download the demo image
!wget https://github.com/IDEA-Research/detrex-storage/releases/download/v0.2.1/idea.jpg




In [None]:
# Run the detrex demo script with the DAB-DETR config and checkpoint on the
# downloaded demo image; the result is written to ./demo_output.jpg.
%cd /content/drive/MyDrive/conform/CONFORM/detrex/
!python demo/demo.py --config-file projects/dab_detr/configs/dab_detr_r50_50ep.py \
                    --input "./idea.jpg" \
                    --output "./demo_output.jpg" \
                    --opts train.init_checkpoint="./dab_detr_r50_50ep.pth"




In [None]:
# Run the same demo script with the DINO config and checkpoint.
# NOTE(review): this also writes to ./demo_output.jpg, so when run from the same
# directory it overwrites the DAB-DETR result - use a different --output name
# if both results should be kept.
!python demo/demo.py --config-file projects/dino/configs/dino_r50_4scale_12ep.py \
                    --input "./idea.jpg" \
                    --output "./demo_output.jpg" \
                    --opts train.init_checkpoint="./dino_r50_4scale_12ep.pth"

In [None]:
%cd /content/drive/MyDrive/conform/CONFORM/detrex/detectron2

# !pip install -e .
import argparse
import glob
import multiprocessing as mp
import sys
import os
from tqdm import tqdm

print("current working directory:", os.getcwd(), "\n")
print("\n".join(sys.path))

# sys.path.insert(0, "/content/drive/MyDrive/conform/CONFORM/detrex")
from predictors import VisualizationDemo
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import LazyConfig, instantiate
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger

import torch

import pickle as pkl
