In [1]:
%load_ext autoreload
%autoreload 2

import torch

from text3d2video.rendering import render_depth_map
from text3d2video.utilities.video_util import pil_frames_to_clip
# Turn autograd off globally for the rest of the kernel session; the cells
# visible in this notebook only render and sample, none of them call backward.
# As the last expression of the cell, the returned object is echoed as output.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7b1e1acafc40>

In [2]:
from text3d2video.pipelines.generative_rendering_pipeline import (
    GenerativeRenderingPipeline
)
from text3d2video.pipelines.pipeline_utils import load_pipeline

# Intended device and precision for the run.
# NOTE(review): neither `device` nor `dtype` is passed to load_pipeline below,
# so they have no visible effect on how the pipeline is loaded in this cell.
# `device` is also rebound to the plain string "cuda" in the mesh-loading cell.
device = torch.device("cuda")
dtype = torch.float16
# Model identifiers handed to load_pipeline (presumably Hugging Face Hub repo
# ids for the base Stable Diffusion model and the depth ControlNet — confirm).
sd_repo = "runwayml/stable-diffusion-v1-5"
controlnet_repo = "lllyasviel/control_v11f1p_sd15_depth"
# Build the GenerativeRenderingPipeline; `pipe` is called later in the notebook.
pipe = load_pipeline(GenerativeRenderingPipeline, sd_repo, controlnet_repo)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
from text3d2video.util import read_obj_with_uvs
from text3d2video.utilities.camera_placement import (
    cam_view_prompt,
    turntable_extrinsics,
)
from pytorch3d.renderer import FoVPerspectiveCameras

# Static-mesh input: load the mesh together with its UV data.
# `device` is defined once here and reused for every object created in this
# cell, so switching device only requires changing this single line.
device = "cuda"
mesh_path = "data/meshes/cat_statue.obj"
mesh, verts_uvs, faces_uvs = read_obj_with_uvs(mesh_path, device=device)

# Turntable cameras: one camera per frame at a fixed distance from the mesh.
# torch.linspace includes both endpoints, so the first (0) and last (360)
# angle are both present in `angles`.
# NOTE(review): if the angles are degrees of azimuth, the first and last frame
# show the same view — confirm this duplication is intended.
n_frames = 10
dist = 6.2
angles = torch.linspace(0, 360, n_frames)
R, T = turntable_extrinsics(dists=dist, angles=angles)
frame_cams = FoVPerspectiveCameras(R=R, T=T, device=device, fov=10)

# Repeat the single mesh so there is one copy per camera / frame.
frame_meshes = mesh.extend(len(frame_cams))

# One prompt suffix per frame, derived from that frame's camera angle.
prompt_suffixes = [f", {cam_view_prompt(a, 0)} view" for a in angles]

In [71]:
# Animated input: frames from the "catwalk_180:latest" animation artifact.
# NOTE(review): this cell was originally labelled "Handstand", which does not
# match the artifact tag loaded below — confirm which animation is intended.
from text3d2video.artifacts.anim_artifact import AnimationArtifact

anim = AnimationArtifact.from_wandb_artifact_tag("catwalk_180:latest")

# Seven evenly spaced frame numbers from 50 to 62, i.e. every second frame
# (the printed tensor is [50, 52, 54, 56, 58, 60, 62]).
frame_indices = torch.linspace(50, 62, 7).long()
print(frame_indices)

# Rebinds frame_cams, frame_meshes, verts_uvs and faces_uvs: when the notebook
# is run top to bottom, these replace the values produced by the turntable
# setup cell, so every later cell works on the animation frames instead.
frame_cams, frame_meshes = anim.load_frames(frame_indices)
verts_uvs, faces_uvs = anim.uv_data()

# Drop the per-frame view suffixes that the turntable cell computed.
prompt_suffixes = None

tensor([50, 52, 54, 56, 58, 60, 62])


In [72]:
# Text prompt passed to the pipeline call later in the notebook.
prompt = "Deadpool, blank background"
# Seed used for the main noise generator in the generation cell.
seed = 0
# Depth maps of the current frames as seen from the current cameras; they are
# previewed in the next cell and reused in the final comparison grid.
depths = render_depth_map(frame_meshes, frame_cams)

In [73]:
from text3d2video.utilities.ipython_utils import display_vid

# Preview the depth maps as a video clip to sanity-check the cameras and
# meshes before running the (much slower) generation cell.
display_vid(pil_frames_to_clip(depths))

In [33]:
# Call GR
from pathlib import Path

from torch import Tensor
from text3d2video.noise_initialization import (
    FixedNoiseInitializer,
    RandomNoiseInitializer,
    UVNoiseInitializer,
)
from text3d2video.pipelines.generative_rendering_pipeline import (
    GenerativeRenderingConfig,
)
from text3d2video.utilities.logging import GrLogger

# Dotted paths of the `attn1` modules handed to the pipeline configuration via
# `module_paths`. Only the mid block and the up blocks are active; the
# down-block entries are kept commented out so they can be re-enabled quickly.
module_paths = [
    # "down_blocks.0.attentions.0.transformer_blocks.0.attn1",
    # "down_blocks.0.attentions.1.transformer_blocks.0.attn1",
    # "down_blocks.1.attentions.0.transformer_blocks.0.attn1",
    # "down_blocks.1.attentions.1.transformer_blocks.0.attn1",
    # "down_blocks.2.attentions.0.transformer_blocks.0.attn1",
    # "down_blocks.2.attentions.1.transformer_blocks.0.attn1",
    "mid_block.attentions.0.transformer_blocks.0.attn1",
    "up_blocks.1.attentions.0.transformer_blocks.0.attn1",
    "up_blocks.1.attentions.1.transformer_blocks.0.attn1",
    "up_blocks.1.attentions.2.transformer_blocks.0.attn1",
    "up_blocks.2.attentions.0.transformer_blocks.0.attn1",
    "up_blocks.2.attentions.1.transformer_blocks.0.attn1",
    "up_blocks.2.attentions.2.transformer_blocks.0.attn1",
    "up_blocks.3.attentions.0.transformer_blocks.0.attn1",
    "up_blocks.3.attentions.1.transformer_blocks.0.attn1",
    "up_blocks.3.attentions.2.transformer_blocks.0.attn1",
]

# Settings for this run, passed to the pipeline as `conf`.
# NOTE(review): the numeric values are experiment choices; their exact meaning
# is defined by GenerativeRenderingConfig and is not visible in this notebook.
# `num_inference_steps=15` matches the 15-step progress bar in the saved output.
gr_config = GenerativeRenderingConfig(
    do_pre_attn_injection=True,
    do_post_attn_injection=True,
    feature_blend_alpha=0.8,
    attend_to_self_kv=False,
    mean_features_weight=0.5,
    chunk_size=5,
    num_keyframes=1,
    num_inference_steps=15,
    guidance_scale=15,
    controlnet_conditioning_scale=1.0,
    module_paths=module_paths,
)

# Main CUDA random generator, seeded from the notebook-level `seed`.
generator = torch.Generator(device="cuda")
generator.manual_seed(seed)

# Separate CUDA generator passed to the pipeline as `kf_generator`. It is
# always seeded with the literal 0, so changing `seed` does not affect it.
kf_generator = torch.Generator(device="cuda")
kf_generator.manual_seed(0)

# Logger backed by "data.h5": existing data is deleted and the file is opened
# for writing before the `enabled` flag is set to False.
# NOTE(review): presumably nothing is recorded while `enabled` is False, and no
# matching close call is visible in this notebook — confirm both against GrLogger.
logger = GrLogger(Path("data.h5"), n_save_times=10, n_save_frames=10)
logger.delete_data()
logger.open_write()
logger.enabled = False

# Noise initialisation strategy for the run. FixedNoiseInitializer() and
# RandomNoiseInitializer() (both imported at the top of this cell) can be
# swapped in here; only one initializer is constructed so the choice is
# explicit instead of depending on which of several assignments comes last.
noise_initializer = UVNoiseInitializer(noise_texture_res=120)

# Keyframe indices: positions inside frame_cams / frame_meshes (a later cell
# selects them with frame_cams[indices]), not animation frame numbers.
# "front" keyframe.
# NOTE(review): 18 presumably referred to the front-facing frame of a longer
# sequence than the ones loaded in this notebook — pick the matching position
# for the frames that are currently loaded.
indices = Tensor([18]).long()

# Fail fast, before the expensive pipeline call, when a keyframe index does
# not exist in the currently loaded frames.
n_loaded_frames = len(frame_cams)
assert int(indices.max()) < n_loaded_frames, (
    f"keyframe index {int(indices.max())} is out of range: only "
    f"{n_loaded_frames} frames are loaded "
    f"(valid indices are 0..{n_loaded_frames - 1})"
)

# Run the generative rendering pipeline on the current frames.
# Positional arguments: the prompt, the per-frame meshes and cameras, and the
# UV data. Everything else is passed by keyword from the objects built above.
# The result is kept in `video_frames` and turned into clips by later cells.
video_frames = pipe(
    prompt,
    frame_meshes,
    frame_cams,
    verts_uvs,
    faces_uvs,
    conf=gr_config,
    noise_initializer=noise_initializer,
    generator=generator,
    kf_generator=kf_generator,
    logger=logger,
    prompt_suffixes=prompt_suffixes,
    kf_indices=indices,
)

100%|██████████| 15/15 [00:40<00:00,  2.72s/it]


In [34]:
from text3d2video.rendering import render_rgb_uv_map

# Select the cameras and meshes of the keyframes chosen in the generation
# cell; `indices` must therefore be valid positions in frame_cams/frame_meshes.
kf_cams = frame_cams[indices]
kf_meshes = frame_meshes[indices]

# Render the UV data of the keyframes as RGB images and preview them as a clip.
kf_uvs = render_rgb_uv_map(kf_meshes, kf_cams, verts_uvs, faces_uvs)
display_vid(pil_frames_to_clip(kf_uvs))

In [35]:

from text3d2video.utilities.video_comparison import display_vids


# Keyframe UV renders as a clip at an explicit 3 fps; the generated frames use
# the default frame rate of pil_frames_to_clip.
kf_vid = pil_frames_to_clip(kf_uvs, fps=3)
video = pil_frames_to_clip(video_frames)

# Show the keyframe clip and the generated video together.
display_vids([kf_vid, video])

In [28]:
from text3d2video.utilities.ipython_utils import display_vid
from text3d2video.utilities.video_comparison import video_grid

# Comparison of the depth conditioning and the generated result.
depth_clip = pil_frames_to_clip(depths)
generated_clip = pil_frames_to_clip(video_frames)
comparison_vids = [
    (depth_clip, "depth"),
    (generated_clip, "Generative Rendering"),
]

# Split the (clip, label) pairs into two parallel lists, keeping their order.
clips = [pair[0] for pair in comparison_vids]
labels = [pair[1] for pair in comparison_vids]

# The clips are passed as one inner list with one label per clip
# (presumably a single grid row with labelled columns — confirm in video_grid).
comparison_vid = video_grid([clips], x_labels=labels)
display_vid(comparison_vid)