In [2]:
%load_ext autoreload
%autoreload 2

import torch
from text3d2video.artifacts.anim_artifact import AnimationArtifact

# This notebook only runs pipelines for inference, so autograd is switched off
# globally. The trailing semicolon keeps IPython from echoing the returned
# object's repr, which contains a memory address that differs on every run.
torch.set_grad_enabled(False);

<torch.autograd.grad_mode.set_grad_enabled at 0x7bfc74712130>

In [3]:
from text3d2video.pipelines.generative_rendering_pipeline import GrPipeline
from text3d2video.pipelines.new_gr_pipeline import GrPipelineNew
from text3d2video.pipelines.pipeline_utils import load_pipeline
from text3d2video.pipelines.texturing_pipeline import TexturingPipeline

# Load the new generative-rendering pipeline; the two pipelines below are built
# from its sub-modules instead of being loaded separately.
gr_pipe: GrPipelineNew = load_pipeline(GrPipelineNew)

# Sub-modules of `gr_pipe` that are handed to the other pipelines. Insertion
# order matters: TexturingPipeline receives them positionally in this order.
_shared_modules = {
    "vae": gr_pipe.vae,
    "text_encoder": gr_pipe.text_encoder,
    "tokenizer": gr_pipe.tokenizer,
    "unet": gr_pipe.unet,
    "scheduler": gr_pipe.scheduler,
    "controlnet": gr_pipe.controlnet,
}

# Positional: vae, text_encoder, tokenizer, unet, scheduler, controlnet.
texturing_pipe: TexturingPipeline = TexturingPipeline(*_shared_modules.values())

# Keyword form of the same six modules for the older GR pipeline.
old_gr_pipe = GrPipeline(**_shared_modules)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [45]:
# Artifact tags that have been tried as the *target* animation. They are kept as
# data for reference; only `tgt_tag` below decides what is actually loaded.
KNOWN_TGT_TAGS = (
    "mma_20:latest",
    "rumba_20:latest",
    "mv_helmet:latest",
    "catwalk_180_20:latest",
    "ymca_20:latest",
    "mv_helmet_25:latest",
    "mv_cat_statue_22:latest",
    "owl_zoom:latest",
    "rumba_zoom_20:latest",
    "rumba_zoom_side:latest",
    "mv_lion:latest",
    "lion_zoom:latest",
)
tgt_tag = "lion_zoom:latest"

anim_art = AnimationArtifact.from_wandb_artifact_tag(tgt_tag)
tgt_seq = anim_art.read_anim_seq()

# Artifact tags that have been tried as the *source* animation; again only
# `src_tag` is used for loading.
KNOWN_SRC_TAGS = (
    "catwalk_180_src:latest",
    "ymca_zoom_out:latest",
    "mv_helmet:latest",
    "mv_cat_statue:latest",
    "rumba_zoom_src:latest",
    "owl_zoom_src:latest",
    "rumba_zoom_out_src:latest",
    "lion_zoom_out_src:latest",
)
src_tag = "lion_zoom_out_src:latest"

src_seq = AnimationArtifact.from_wandb_artifact_tag(src_tag).read_anim_seq()

In [46]:
from text3d2video.rendering import render_texture
from text3d2video.util import chw_to_hwc
from text3d2video.utilities.testing_utils import checkerboard_img
from text3d2video.utilities.video_comparison import display_vid, video_grid
from text3d2video.utilities.video_util import pil_frames_to_clip

# Text prompt passed to every pipeline call in this notebook.
prompt = "Lion"
# Source animation to show beside the target in the inputs preview.
input_src_anim = src_seq

def inputs_video(prompt, src_seq, tgt_seq):
    """Build a one-row preview grid of the input animations.

    Both sequences are rendered with the same checkerboard image as their UV
    texture.

    Args:
        prompt: Text used as the row label of the grid.
        src_seq: Source animation sequence, or None to leave the "src" column out.
        tgt_seq: Target animation sequence; always rendered as the "tgt" column.

    Returns:
        The result of `video_grid` for the single row of clips.
    """
    checker = chw_to_hwc(
        checkerboard_img(return_type="pt", res=500, square_size=30).cuda()
    )

    def render_frames(seq):
        # Render `seq` with the checkerboard texture, asking for PIL output.
        return render_texture(
            seq.meshes,
            seq.cams,
            checker,
            seq.verts_uvs,
            seq.faces_uvs,
            return_pil=True,
        )

    clips = {}

    if src_seq is not None:
        # The source clip uses pil_frames_to_clip's default frame rate.
        clips["src"] = pil_frames_to_clip(render_frames(src_seq))

    # The target clip is fixed at 12 fps.
    clips["tgt"] = pil_frames_to_clip(render_frames(tgt_seq), fps=12)

    return video_grid(
        [list(clips.values())],
        x_labels=list(clips.keys()),
        padding_mode="slow_down",
        y_labels=[prompt],
    )

# Preview the selected inputs. Passing `input_src_anim` (instead of `src_seq`
# directly) means setting `input_src_anim` to None hides the source column.
display_vid(inputs_video(prompt, input_src_anim, tgt_seq))

In [47]:
from text3d2video.pipelines.generative_rendering_pipeline import (
    GenerativeRenderingConfig,
)

# Settings for the run with the older generative-rendering pipeline.
gr_config = GenerativeRenderingConfig(
    num_inference_steps=15,
    guidance_scale=7.5,
    controlnet_conditioning_scale=1.0,
    chunk_size=5,
    do_pre_attn_injection=True,
    do_post_attn_injection=True,
    attend_to_self_kv=False,
    mean_features_weight=0.5,
    feature_blend_alpha=0.7,
)

# Two separately seeded CUDA generators: one passed as `generator`, the other
# as `kf_generator`.
seed = 0
generator = torch.Generator(device="cuda").manual_seed(seed)
kf_generator = torch.Generator(device="cuda").manual_seed(1)

# NOTE(review): the result is stored as `out_src` although the pipeline is run
# on `tgt_seq` — confirm this is intended.
out_src = old_gr_pipe(
    prompt,
    tgt_seq,
    num_keyframes=3,
    conf=gr_config,
    generator=generator,
    kf_generator=kf_generator,
)

[70, 126, 260]
260


100%|██████████| 15/15 [00:47<00:00,  3.17s/it]


In [50]:
# Play back the frames currently held in `out_src` at 12 fps.
out_src_clip = pil_frames_to_clip(out_src.images, fps=12)
display_vid(out_src_clip)

In [51]:
from text3d2video.pipelines.texturing_pipeline import TexturingConfig

# Settings for the texturing pipeline run.
texgen_config = TexturingConfig(
    uv_res=1400,
    num_inference_steps=15,
    do_text_and_texture_resampling=True,
    use_prev_clean_tex=True,
    use_update_masks=True,
    # NOTE(review): keyword spelling ("referecnce") kept exactly as written;
    # confirm it matches the TexturingConfig field name.
    use_referecnce_kvs=True,
)

generator = torch.Generator(device="cuda").manual_seed(0)

# Run texturing on the source sequence. `out_src` is also assigned by the
# earlier GR cell; after this cell it holds the texturing result.
out_src = texturing_pipe(prompt, src_seq, conf=texgen_config, generator=generator)

100%|██████████| 15/15 [00:18<00:00,  1.24s/it]


In [52]:
from text3d2video.utilities.video_comparison import display_vids

# Left: RGB UV-map renders of the source sequence (blank title).
uv_vid = pil_frames_to_clip(src_seq.render_rgb_uv_maps())
# Right: the frames currently held in `out_src`.
vid = pil_frames_to_clip(out_src.images)

display_vids(
    [uv_vid, vid],
    titles=[" ", "TexGen"],
    padding_mode="slow_down",
)

In [53]:
from text3d2video.backprojection import (
    aggregate_views_uv_texture,
    compute_texel_projections,
)
import torchvision.transforms.functional as TF

# Resolution passed to both the projection and the aggregation step.
tex_res = 1000

projections = compute_texel_projections(
    src_seq.meshes,
    src_seq.cams,
    src_seq.verts_uvs,
    src_seq.faces_uvs,
    tex_res,
)

# Convert every generated frame to a tensor, stack them, move the stack to the
# GPU, and aggregate the views into a single UV `texture`.
feature_maps = torch.stack(list(map(TF.to_tensor, out_src.images))).cuda()
texture = aggregate_views_uv_texture(feature_maps, tex_res, projections)

In [54]:
# Render the target animation with the global `texture`, returning PIL frames.
renders = render_texture(
    tgt_seq.meshes, tgt_seq.cams, texture, tgt_seq.verts_uvs, tgt_seq.faces_uvs,
    return_pil=True,
)

In [55]:
# Show the textured target renders as a clip.
tgt_render_clip = pil_frames_to_clip(renders)
display_vid(tgt_render_clip, title="Tgt Renders")

In [61]:
from torch import Generator
from text3d2video.pipelines.generative_rendering_pipeline import (
    GenerativeRenderingConfig,
)

# Rebinds `gr_config` (also defined for the earlier GR run) with the settings
# used for the new pipeline.
gr_config = GenerativeRenderingConfig(
    num_inference_steps=15,
    guidance_scale=7.5,
    feature_blend_alpha=0.8,
    do_pre_attn_injection=True,
    do_post_attn_injection=True,
)

# Fresh CUDA generator seeded with 0 for this run.
generator = Generator(device="cuda").manual_seed(0)

# Positional arguments: prompt, target sequence, source sequence, the latents
# of `out_src`, and the config. The back-projected `texture` is passed as the
# initial texture.
out_tgt = gr_pipe(
    prompt,
    tgt_seq,
    src_seq,
    out_src.latents,
    gr_config,
    initial_texture=texture,
    texture_noise_level=0.2,
    multi_res_textures=True,
    generator=generator,
)

[[0], [1], [2, 3]]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


100%|██████████| 13/13 [00:46<00:00,  3.59s/it]


In [62]:
src_vid = pil_frames_to_clip(out_src.images)
tgt_vid = pil_frames_to_clip(out_tgt.images)

# Label for the pipeline that produced `out_src`; this is set by hand, so
# update it if `out_src` comes from the GR cell instead of the texturing cell.
src_method = "TexGen"
# NOTE(review): the gr_pipe call in this notebook passes multi_res_textures=True
# while this flag is False — confirm which title is intended.
with_multires_feature_textures = False

tgt_title = "tgt (multires)" if with_multires_feature_textures else "tgt"

display_vids(
    [src_vid, tgt_vid],
    titles=[f"src ({src_method})", tgt_title],
    padding_mode="slow_down",
)