In [1]:
%load_ext autoreload
%autoreload 2

import torch
import text3d2video.wandb_util as wu
from text3d2video.artifacts.vertex_atributes_artifact import VertAttributesArtifact
from text3d2video.artifacts.animation_artifact import AnimationArtifact
from text3d2video.ipython_utils import display_ims

In [2]:
# Artifact references (name:version) for the two inputs of this notebook
features_3d_artifact_tag = "deadpool-3d-features:v6"
animation_tag = "backflip:latest"

# Fetch each artifact through the project's wandb helper and wrap it in its
# typed artifact class in a single expression
features_3d = VertAttributesArtifact.from_wandb_artifact(
    wu.get_artifact(features_3d_artifact_tag)
)
animation = AnimationArtifact.from_wandb_artifact(wu.get_artifact(animation_tag))

# Features looked up through the lineage of the 3D-features artifact
mv_features = features_3d.get_mv_features_from_lineage()

In [3]:
from diffusers import ControlNetModel
from text3d2video.cross_frame_attn import CrossFrameAttnProcessor
from text3d2video.pipelines.my_pipeline import MyPipeline

# Shared runtime configuration: both models below are loaded with the same
# precision (`dtype`) and moved to the same `device`.
dtype = torch.float16
sd_repo = "runwayml/stable-diffusion-v1-5"
controlnet_repo = "lllyasviel/sd-controlnet-depth"
device = torch.device("cuda")

# Use the shared `dtype` rather than a second hard-coded torch.float16, so that
# changing the precision above cannot leave the ControlNet and the pipeline in
# different dtypes.
controlnet = ControlNetModel.from_pretrained(controlnet_repo, torch_dtype=dtype).to(
    device
)

# Build the project's pipeline around the Stable Diffusion checkpoint and the
# ControlNet loaded above.
pipe = MyPipeline.from_pretrained(sd_repo, controlnet=controlnet, torch_dtype=dtype).to(
    device
)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [15]:
from pathlib import Path
from scripts.generate_video import render_feature_images, render_feature_images_batched
from text3d2video.ipython_utils import display_frames_as_video
from text3d2video.rendering import render_depth_map
from text3d2video.util import front_camera

# --- generation settings -------------------------------------------------
prompt = "Deadpool"
seed = 1
num_steps = 30
do_feature_injection = True
frame_indices = [1, 2, 3]

# --- depth maps ----------------------------------------------------------
# Load the selected animation frames and render their depth maps from the
# camera returned by front_camera().
camera = front_camera()
frames = animation.load_frames(frame_indices)
depth_maps = render_depth_map(frames, camera)

In [5]:
def _disk_dict_key_values(key):
    """Return the values recorded under `key` in the features disk dict."""
    return features_3d.get_features_disk_dict().key_values(key)


# Values available for the "layer" and "timestep" keys of the stored features
layers = _disk_dict_key_values("layer")
timesteps = _disk_dict_key_values("timestep")

In [6]:
from math import sqrt
from text3d2video.rendering import rasterize_vertex_features

# Same two calls as in the depth-map cell: build the front camera and load the
# frames selected by `frame_indices` from the animation artifact.
camera = front_camera()
frames = animation.load_frames(frame_indices)
# Bare expression: shows the repr of the loaded frames as the cell output.
frames

<pytorch3d.structures.meshes.Meshes at 0x7dcaa4bcd760>

In [27]:
from pytorch3d.structures import join_meshes_as_batch

# Join three references to the same `frames` batch into one larger batch
frames_expanded = join_meshes_as_batch([frames for _ in range(3)])
# Bare expression: shows the size of the joined batch as the cell output
len(frames_expanded)

9

In [29]:
# Derive the timestep list from `num_steps` instead of repeating the literal 30,
# so it cannot drift from the `num_inference_steps` passed to the pipeline.
# NOTE: this rebinds `timesteps`, replacing the values previously read from the
# features disk dict.
# NOTE(review): assumes render_feature_images expects step indices 0..num_steps-1
# here — confirm against its implementation.
timesteps = list(range(num_steps))

# render feature images (gradient tracking disabled while rendering)
with torch.no_grad():
    all_feature_images = render_feature_images(
        features_3d,
        mv_features,
        animation,
        frame_indices,
        timesteps=timesteps,
        layers=[layers[0]],  # only the first available layer is rendered
    )

30it [00:01, 16.69it/s]


In [21]:
# Attention Processor setup
# A single processor instance is configured and installed on the pipeline's UNet.
attn_processor = CrossFrameAttnProcessor(unet_chunk_size=2, pipe=pipe)
attn_processor.feature_images_multidict = all_feature_images
attn_processor.do_cross_frame_attn = True
# Honour the notebook-level `do_feature_injection` switch rather than a
# hard-coded True, so toggling the setting actually changes this run.
attn_processor.do_feature_injection = do_feature_injection
attn_processor.feature_blend_alpha = 1.0
pipe.unet.set_attn_processor(attn_processor)

# run pipeline
# One prompt per selected frame.
prompts = [prompt] * len(frame_indices)
# Create the generator on the same `device` the models were moved to, instead of
# repeating the "cuda" literal, and seed it so the run is reproducible.
generator = torch.Generator(device=device)
generator.manual_seed(seed)
images = pipe(prompts, depth_maps, generator=generator, num_inference_steps=num_steps)

100%|██████████| 31/31 [00:10<00:00,  3.03it/s]


In [22]:
# Write the generated frames to outs/vid.mp4 (relative to the working directory);
# judging by the helper's name it presumably also displays the video inline.
display_frames_as_video(images, Path("outs/vid.mp4"))

Moviepy - Building video outs/vid.mp4.
Moviepy - Writing video outs/vid.mp4



                                                  

Moviepy - Done !
Moviepy - video ready outs/vid.mp4


