In [1]:
%load_ext autoreload
%autoreload 2


from text3d2video.artifacts.anim_artifact import AnimationArtifact
from torch import Tensor
import torchvision.transforms.functional as TF
import torch

In [2]:
from PIL import Image

from text3d2video.utilities.ipython_utils import display_vid
from text3d2video.utilities.video_util import pil_frames_to_clip

# Demo latents: 5 frames of 4-channel 64x64 Gaussian noise.
# Drawn from a dedicated fixed-seed generator so the rendered clip is identical
# on every Restart & Run All, without touching the global RNG state.
noises = torch.randn(5, 4, 64, 64, generator=torch.Generator().manual_seed(0))

def noise_imgs(noises: Tensor, res: int = 512, clip: float = 3.0) -> list:
    """Convert a batch of noise latents into PIL images for visual inspection.

    The first three channels of each frame are shown as RGB. Values are mapped
    linearly from ``[-clip, clip]`` to ``[0, 1]`` (zero becomes mid-gray) and
    saturate outside that range, so the image conversion never receives
    out-of-range floats.

    Args:
        noises: Batch of latents; iterated over its first dimension, each item
            indexed as (channels, height, width).
        res: Side length in pixels of the square output images.
        clip: Absolute noise value mapped to full black / full white.
            Must be positive.

    Returns:
        A list with one ``res`` x ``res`` PIL image per frame.

    Raises:
        ValueError: If ``clip`` is not positive.
    """
    if clip <= 0:
        raise ValueError(f"clip must be positive, got {clip}")

    images = []
    for noise in noises:
        # Work on a detached float32 CPU copy so half-precision / GPU /
        # grad-tracking latents are all handled the same way.
        rgb = noise[0:3, :, :].detach().float().cpu()

        # to_pil_image expects float inputs in [0, 1]; raw Gaussian noise is
        # mostly outside that range and would overflow in the uint8 conversion.
        rgb = (rgb / (2.0 * clip) + 0.5).clamp(0.0, 1.0)

        noise_pil = TF.to_pil_image(rgb)
        # Nearest-neighbour upscaling keeps individual latent pixels crisp.
        noise_pil = noise_pil.resize((res, res), Image.NEAREST)
        images.append(noise_pil)
    return images

# Turn the demo latents into frames and preview them as a clip.
noise_frames = noise_imgs(noises)
display_vid(pil_frames_to_clip(noise_frames))

In [3]:
from text3d2video.noise_initialization import RandomNoiseInitializer
from torch import Generator

# Seeded generator on the GPU so the preview is reproducible.
device = 'cuda'
gen = Generator(device=device).manual_seed(0)

# Draw initial noise with the random initializer.
# (first argument is presumably the number of frames -- TODO confirm)
random_noise = RandomNoiseInitializer()
noise = random_noise.initial_noise(4, generator=gen, device=device)

# Preview the result as a clip.
noise_frames = noise_imgs(noise)
display_vid(pil_frames_to_clip(noise_frames))

In [4]:
from text3d2video.noise_initialization import FixedNoiseInitializer, RandomNoiseInitializer
from torch import Generator

# Seeded generator on the GPU so the preview is reproducible.
device = 'cuda'
gen = Generator(device=device).manual_seed(0)

# Draw initial noise with the fixed initializer.
# NOTE: the variable is still called `random_noise`, but here it holds a
# FixedNoiseInitializer.
random_noise = FixedNoiseInitializer()
noise = random_noise.initial_noise(4, generator=gen, device=device)

# Preview the result as a clip.
noise_frames = noise_imgs(noise)
display_vid(pil_frames_to_clip(noise_frames))

In [76]:
from text3d2video.noise_initialization import (
    FixedNoiseInitializer,
    RandomNoiseInitializer,
    UVNoiseInitializer,
)
from torch import Generator

# Load the animation artifact and select `n_images` frames from it.
n_images = 20
animation = AnimationArtifact.from_wandb_artifact_tag("mixamo-human_orth_pan:latest")
frame_indices = animation.frame_indices(n_images)
cameras, meshes = animation.load_frames(frame_indices)
verts_uvs, faces_uvs = animation.uv_data()

# Seeded generator on the GPU so the preview is reproducible.
device = "cuda"
gen = Generator(device=device).manual_seed(0)

# Draw initial noise with the UV initializer.
# NOTE: the variable is still called `random_noise`, but here it holds a
# UVNoiseInitializer.
random_noise = UVNoiseInitializer(noise_texture_res=50)
noise = random_noise.initial_noise(
    meshes,
    cameras,
    verts_uvs,
    faces_uvs,
    generator=gen,
)

# Preview the result as a clip.
noise_frames = noise_imgs(noise)
display_vid(pil_frames_to_clip(noise_frames))

In [73]:
from einops import rearrange
from text3d2video.backprojection import make_repeated_uv_texture
from pytorch3d.renderer import RasterizationSettings, MeshRasterizer

from text3d2video.rendering import TextureShader


# Manual, step-by-step construction of UV-space noise: sample a noise texture,
# render it onto the animated meshes at latent resolution, and fill every
# pixel not covered by the mesh with a shared background noise.
#
# NOTE: relies on `meshes`, `cameras`, `verts_uvs` and `faces_uvs` from the
# cell that loads the animation; run that cell first.
device = "cuda"
dtype = torch.float16
texture_res = 45
latent_res = 64  # side length of the rendered latent noise maps

# Dedicated seeded generator (matching the other noise cells) so re-running
# this cell reproduces the same noise.
generator = torch.Generator(device=device).manual_seed(0)

# sample noise uv_map: one 4-channel Gaussian sample per texel
noise_uv_map = torch.randn(
    texture_res,
    texture_res,
    4,
    device=device,
    generator=generator,
)

# create noisy texture, repeated for every frame
n_frames = len(meshes)
noise_texture = make_repeated_uv_texture(noise_uv_map, faces_uvs, verts_uvs, n_frames)
# nearest sampling so texels are not blended together when looked up
noise_texture.sampling_mode = "nearest"
# NOTE: this overwrites the textures of the shared `meshes` object in place.
meshes.textures = noise_texture

# rasterize at latent resolution, keeping only the closest face per pixel
raster_settings = RasterizationSettings(
    image_size=latent_res,
    faces_per_pixel=1,
    bin_size=0,
)
rasterizer = MeshRasterizer(raster_settings=raster_settings)
fragments = rasterizer(meshes, cameras=cameras)

# render the noise texture
shader = TextureShader()
noise_renders = shader(fragments, meshes)
noise_renders = noise_renders.to(device=device, dtype=dtype)

# sample background noise once ...
bg_noise = torch.randn(
    4,
    latent_res,
    latent_res,
    generator=generator,
    device=device,
    dtype=dtype,
)

# ... and share it across all frames (expand creates a view, not a copy)
background_noise = bg_noise.expand(n_frames, -1, -1, -1)

# Earlier compositing attempt; not used by the preview below, but kept because
# these are notebook globals that other cells may read.
latents_mask = (noise_renders > 0).float()
latents = noise_renders + background_noise
latents = latents.to(device, dtype=dtype)

# Foreground mask: pix_to_face is -1 where no face covers the pixel, and face
# index 0 is a valid face, so the test must be `>= 0` (with `> 0`, pixels
# showing face 0 were treated as background).
masks = fragments.pix_to_face >= 0
masks = rearrange(masks, "N H W 1 -> N 1 H W")

# background noise only where there is no mesh, rendered noise elsewhere
multiplied = ~masks * background_noise + noise_renders

display_vid(pil_frames_to_clip(noise_imgs(multiplied)))