In [1]:
%load_ext autoreload
%autoreload 2

from omegaconf import OmegaConf
import wandb_util.wandb_util as wbu
from scripts.wandb_experiments.benchmark import BenchmarkConfig
import multiprocessing as mp
# Force the "spawn" start method for any worker processes created from this
# kernel; force=True overrides a start method that was already set.
mp.set_start_method("spawn", force=True)

In [11]:
from scripts.wandb_experiments.benchmark import GeometryAndPrompts
from text3d2video.experiment_analysis import print_table
from text3d2video.util import concat_lists

# Scene groups. Each GeometryAndPrompts takes, in order: an animation artifact
# tag, a texturing artifact tag and a list of prompts (column names as shown by
# the table printed at the end of this cell).
# NOTE(review): the trailing `1` is presumably the number of seeds per prompt --
# the printed table lists a single seed (0) for every row. Confirm against the
# GeometryAndPrompts definition.

lion_zoom = GeometryAndPrompts(
    "lion_zoom:latest",
    "lion_zoom_out_src:latest",
    ["Lion"],
    1,
)

rumba_zoom = GeometryAndPrompts(
    "rumba_zoom_20:latest",
    "rumba_zoom_out_src:latest",
    ["Deadpool", "Stormtrooper"],
    1,
)

rumba = GeometryAndPrompts(
    "rumba_20:latest",
    "human_T_front:latest",
    ["Stormtrooper"],
    1,
)

turn_around = GeometryAndPrompts(
    "catwalk_180_20:latest",
    "human_mv:latest",
    ["Deadpool", "Stormtrooper"],
    1,
)

cat_statue = GeometryAndPrompts(
    "mv_cat_statue_25:latest",
    "mv_cat_statue:latest",
    ["Metalic Cat Statue"],
    1,
)

handstand = GeometryAndPrompts(
    "handstand_20:latest",
    "human_mv:latest",
    ["Deadpool"],
    1,
)

flair = GeometryAndPrompts(
    "flair_20:latest",
    "human_mv:latest",
    ["Deadpool"],
    1,
)

mma = GeometryAndPrompts(
    "mma_20:latest",
    "human_mv:latest",
    ["Deadpool"],
    1,
)

# Only these four groups take part in this benchmark run; cat_statue, handstand,
# flair and mma are defined above but deliberately left out of the list.
geometries_and_prompts = [lion_zoom, turn_around, rumba_zoom, rumba]

# Flatten every group into its individual scenes.
scenes = concat_lists(group.to_scenes() for group in geometries_and_prompts)

print_table([scene.tabulate_row() for scene in scenes])

animation_tag          texturing_tag              prompt          seed
---------------------  -------------------------  ------------  ------
lion_zoom:latest       lion_zoom_out_src:latest   Lion               0
catwalk_180_20:latest  human_mv:latest            Deadpool           0
catwalk_180_20:latest  human_mv:latest            Stormtrooper       0
rumba_zoom_20:latest   rumba_zoom_out_src:latest  Deadpool           0
rumba_zoom_20:latest   rumba_zoom_out_src:latest  Stormtrooper       0
rumba_20:latest        human_T_front:latest       Stormtrooper       0


In [12]:
from scripts.wandb_experiments.benchmark import Method
from scripts.wandb_runs.run_generative_rendering import (
    RunGenerativeRenderingConfig,
    run_generative_rendering,
)
from scripts.wandb_runs.run_grtex import RunGrTexConfig, run_gr_tex
from text3d2video.pipelines.generative_rendering_pipeline import (
    GenerativeRenderingConfig,
)
from text3d2video.pipelines.pipeline_utils import ModelConfig
from text3d2video.utilities.omegaconf_util import get_import_path

# Base configs for the four compared methods. prompt / animation_tag (and
# extr_tag for the RunGrTexConfig variants) are left as empty strings here.

# GR: generative rendering with the default GenerativeRenderingConfig.
gr = OmegaConf.structured(
    RunGenerativeRenderingConfig(
        prompt="",
        animation_tag="",
        generative_rendering=GenerativeRenderingConfig(),
        model=ModelConfig(),
        texture_tag=None,
    )
)

# ControlNet: same run function as GR, but with both attention injections
# switched off and a single keyframe.
controlnet = OmegaConf.structured(
    RunGenerativeRenderingConfig(
        prompt="",
        animation_tag="",
        generative_rendering=GenerativeRenderingConfig(
            do_pre_attn_injection=False,
            do_post_attn_injection=False,
        ),
        num_keyframes=1,
        model=ModelConfig(),
        texture_tag=None,
    )
)

# GR-Tex: RunGrTexConfig with default generative rendering, multi-resolution
# textures and start_noise_level=0.2.
grtex = OmegaConf.structured(
    RunGrTexConfig(
        prompt="",
        animation_tag="",
        extr_tag="",
        generative_rendering=GenerativeRenderingConfig(),
        multires_textures=True,
        start_noise_level=0.2,
        model=ModelConfig(),
    )
)

# TexGen: RunGrTexConfig with both attention injections switched off and
# start_noise_level=1.0.
render = OmegaConf.structured(
    RunGrTexConfig(
        prompt="",
        animation_tag="",
        extr_tag="",
        generative_rendering=GenerativeRenderingConfig(
            do_pre_attn_injection=False,
            do_post_attn_injection=False,
        ),
        multires_textures=True,
        start_noise_level=1.0,
        model=ModelConfig(),
    )
)

# (display name, run function, base config) for every benchmarked method.
methods = [
    Method(label, get_import_path(entry_point), base_config)
    for label, entry_point, base_config in (
        ("GR", run_generative_rendering, gr),
        ("ControlNet", run_generative_rendering, controlnet),
        ("TexGen", run_gr_tex, render),
        ("GR-Tex", run_gr_tex, grtex),
    )
]

print_table([method.tabulate_row() for method in methods])

name        fun                       config
----------  ------------------------  -------------------------------------------------------
GR          run_generative_rendering  prompt: ''
                                      animation_tag: ''
                                      generative_rendering:
                                        do_pre_attn_injection: true
                                        do_post_attn_injection: true
                                        attend_to_self_kv: true
                                        mean_features_weight: 0.5
                                        chunk_size: 5
                                        num_inference_steps: 15
                                        guidance_scale: 7.5
                                        controlnet_conditioning_scale: 1.0
                                        feature_blend_alpha: 0.7
                                      model:
                                        sd_repo: runwayml/stable-d

In [13]:
from scripts.wandb_experiments.benchmark import benchmark

# Combine the scenes and methods into one structured OmegaConf benchmark config.
config = OmegaConf.structured(BenchmarkConfig(scenes, methods))

# Result of calling benchmark() on that config, kept for inspection.
spec = benchmark(config)

In [15]:
from scripts.wandb_experiments.benchmark import benchmark

# Sync the benchmark under the experiment name "benchmark_final"; the same name
# is used further down to fetch the logged runs.
wbu.sync_experiment(
    benchmark,
    config,
    "benchmark_final",
)

Experiment: https://wandb.ai/romeu/diffusion-3D-features/groups/benchmark_final/workspace
Experiment up-to-date!


In [7]:
from scripts.wandb_experiments.benchmark import split_runs

# Experiment whose logged runs are analysed in the rest of the notebook.
exp_name = "benchmark_final"

# Print the experiment URL for reference.
print(wbu.get_exp_url(exp_name))

# Fetch the logged runs and split them into texture runs and video runs.
runs = wbu.get_logged_runs(exp_name)
(texture_runs, video_gen_runs) = split_runs(runs)

https://wandb.ai/romeu/diffusion-3D-features/groups/benchmark_final/workspace


In [8]:
from text3d2video.util import group_into_array, map_array


def scene_key(run):
    """Return the scene identifier of a run.

    The key is built from the run's config as
    "<animation_tag>-<prompt>-<seed>".
    """
    cfg = OmegaConf.create(run.config)
    animation_tag, prompt, seed = cfg.animation_tag, cfg.prompt, cfg.seed
    return f"{animation_tag}-{prompt}-{seed}"

def method_key(run):
    """Return the method identifier of a run: the part of run.name before the
    first underscore (the whole name when it contains no underscore)."""
    prefix, _sep, _rest = run.name.partition("_")
    return prefix

# Group the video runs by method key, then by scene key. labels[0] is later used
# as the list of method names (one per row of the grouped array).
runs_grouped, labels = group_into_array(
    video_gen_runs,
    [method_key, scene_key],
)

In [9]:
from text3d2video.experiment_analysis import VideoTraces
# Build one VideoTraces per grouped run (with a progress bar).
run_data = map_array(
    runs_grouped,
    VideoTraces.from_run,
    pbar=True,
)

 40%|████      | 8/20 [00:09<00:13,  1.09s/it][34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m:   20 of 20 files downloaded.  
[34m[1mwandb[0m:   20 of 20 files downloaded.  m[1mwandb[0m: \ 1 of 20 files downloaded...
[34m[1mwandb[0m:   20 of 20 files downloaded.  4m[1mwandb[0m: \ 1 of 20 files downloaded...
[34m[1mwandb[0m:   20 of 20 files downloaded.  4m[1mwandb[0m: \ 1 of 20 files downloaded...
[34m[1mwandb[0m:   20 of 20 files downloaded.  4m[1mwandb[0m: \ 1 of 20 files downloaded...
[34m[1mwandb[0m:   20 of 20 files downloaded.  4m[1mwandb[0m: \ 1 of 20 files downloaded...
100%|██████████| 20/20 [00:27<00:00,  1.35s/it]


In [18]:
from text3d2video.clip_metrics import CLIPMetrics

# Single CLIPMetrics instance, passed to compute_clip_metrics() for every run below.
clip = CLIPMetrics()

In [19]:
# Run compute_clip_metrics on every entry. The call returns None per entry (see
# the echoed array of None); NOTE(review): the results are presumably stored on
# each object, since a later cell reads .prompt_fidelity / .frame_consistency.
map_array(run_data, lambda traces: traces.compute_clip_metrics(clip))

array([[None, None, None, None, None],
       [None, None, None, None, None],
       [None, None, None, None, None],
       [None, None, None, None, None]], dtype=object)

In [39]:
# Run compute_uv_mse on every entry (with a progress bar). The call returns None
# per entry; NOTE(review): the value is presumably stored on each object, since
# a later cell reads .uv_mse.
map_array(
    run_data,
    lambda traces: traces.compute_uv_mse(),
    pbar=True,
)

  2%|▎         | 1/40 [00:13<08:58, 13.81s/it]

0.0516132228076458


  5%|▌         | 2/40 [00:27<08:42, 13.75s/it]

0.04011274874210358


  8%|▊         | 3/40 [00:41<08:28, 13.73s/it]

0.05748780071735382


 10%|█         | 4/40 [00:54<08:14, 13.73s/it]

0.0458565428853035


 12%|█▎        | 5/40 [00:59<06:01, 10.33s/it]

0.05508970096707344


 15%|█▌        | 6/40 [01:05<05:00,  8.85s/it]

0.07886349409818649


 18%|█▊        | 7/40 [01:18<05:44, 10.44s/it]

0.06059413030743599


 20%|██        | 8/40 [01:32<06:07, 11.48s/it]

0.13232377171516418


 22%|██▎       | 9/40 [01:46<06:17, 12.18s/it]

0.07149535417556763


 25%|██▌       | 10/40 [02:00<06:19, 12.66s/it]

0.1696711629629135


 28%|██▊       | 11/40 [02:13<06:16, 12.99s/it]

0.013597488403320312


 30%|███       | 12/40 [02:27<06:10, 13.22s/it]

0.007339696399867535


 32%|███▎      | 13/40 [02:41<06:01, 13.38s/it]

0.012097934260964394


 35%|███▌      | 14/40 [02:55<05:50, 13.49s/it]

0.008098277263343334


 38%|███▊      | 15/40 [02:59<04:27, 10.72s/it]

0.009871121495962143


 40%|████      | 16/40 [03:05<03:42,  9.29s/it]

0.00637022452428937


 42%|████▎     | 17/40 [03:19<04:04, 10.63s/it]

0.012787974439561367


 45%|████▌     | 18/40 [03:32<04:14, 11.56s/it]

0.03135968744754791


 48%|████▊     | 19/40 [03:46<04:16, 12.21s/it]

0.013332098722457886


 50%|█████     | 20/40 [04:00<04:13, 12.66s/it]

0.02291904389858246


 52%|█████▎    | 21/40 [04:13<04:06, 12.97s/it]

0.004425348248332739


 55%|█████▌    | 22/40 [04:27<03:57, 13.20s/it]

0.004755567759275436


 57%|█████▊    | 23/40 [04:41<03:47, 13.36s/it]

0.004812519997358322


 60%|██████    | 24/40 [04:55<03:35, 13.47s/it]

0.004487528000026941


 62%|██████▎   | 25/40 [04:59<02:40, 10.71s/it]

0.008707506582140923


 65%|██████▌   | 26/40 [05:05<02:10,  9.30s/it]

0.006749477703124285


 68%|██████▊   | 27/40 [05:19<02:18, 10.62s/it]

0.003926441073417664


 70%|███████   | 28/40 [05:32<02:18, 11.55s/it]

0.04082449898123741


 72%|███████▎  | 29/40 [05:46<02:14, 12.20s/it]

0.013465940952301025


 75%|███████▌  | 30/40 [06:00<02:06, 12.66s/it]

0.041865862905979156


 78%|███████▊  | 31/40 [06:14<01:56, 12.98s/it]

0.0016954108141362667


 80%|████████  | 32/40 [06:27<01:45, 13.20s/it]

0.002244109520688653


 82%|████████▎ | 33/40 [06:41<01:33, 13.36s/it]

0.002419163705781102


 85%|████████▌ | 34/40 [06:55<01:20, 13.47s/it]

0.002162572694942355


 88%|████████▊ | 35/40 [06:59<00:53, 10.72s/it]

0.0020596079993993044


 90%|█████████ | 36/40 [07:05<00:37,  9.30s/it]

0.0022999371867626905


 92%|█████████▎| 37/40 [07:19<00:31, 10.62s/it]

0.0018303162651136518


 95%|█████████▌| 38/40 [07:32<00:23, 11.55s/it]

0.013464352115988731


 98%|█████████▊| 39/40 [07:46<00:12, 12.20s/it]

0.0038527720607817173


100%|██████████| 40/40 [08:00<00:00, 12.01s/it]

0.008159784600138664





array([[None, None, None, None, None, None, None, None, None, None],
       [None, None, None, None, None, None, None, None, None, None],
       [None, None, None, None, None, None, None, None, None, None],
       [None, None, None, None, None, None, None, None, None, None]],
      dtype=object)

In [40]:
# Pull each metric attribute out of every entry of run_data, giving one array
# per metric with the same layout as run_data.
prompt_fidelities, frame_consistencies, uv_mses = (
    map_array(run_data, lambda traces: getattr(traces, metric))
    for metric in ("prompt_fidelity", "frame_consistency", "uv_mse")
)

In [45]:
from text3d2video.util import index_list


# Row order of the final table, by method key, and the display name used for a
# method where it differs from its key. Doing this by name (instead of a
# positional permutation plus "rename the last row") keeps the labels attached
# to the right numbers if the order or number of methods in labels[0] changes.
# NOTE(review): "GR-Tex" is the key inferred from the `methods` list and from
# the printed table -- confirm it matches the prefix of the logged run names.
method_order = ["ControlNet", "GR", "TexGen", "GR-Tex"]
display_names = {"GR-Tex": "Ours"}

rows = []
for i, method in enumerate(labels[0]):
    # Average each metric over row i of its array (one row per method).
    row = {
        "Method": method,
        "Prompt Fidelity ($\\uparrow$)": prompt_fidelities[i, :].mean(),
        "Frame Consistency ($\\uparrow$)": frame_consistencies[i, :].mean(),
        "UV MSE ($\\downarrow$)": uv_mses[i, :].mean(),
    }

    # Format floats as fixed four-decimal strings.
    # NOTE(review): the printed tables show "0.004" and "0.308", so the table
    # printer appears to re-parse numeric strings and drop trailing zeros. If
    # so, that has to be addressed in print_table / print_latex_table, not here.
    rows.append(
        {k: f"{v:.4f}" if isinstance(v, float) else v for k, v in row.items()}
    )


def _table_position(row):
    """Sort key: index in method_order; unknown methods keep their relative
    order and go last."""
    name = row["Method"]
    return method_order.index(name) if name in method_order else len(method_order)


# sorted() is stable, so rows with the same position keep their original order.
rows = sorted(rows, key=_table_position)
for row in rows:
    row["Method"] = display_names.get(row["Method"], row["Method"])

print_table(rows)

Method        Prompt Fidelity ($\uparrow$)    Frame Consistency ($\uparrow$)    UV MSE ($\downarrow$)
----------  ------------------------------  --------------------------------  -----------------------
ControlNet                          0.3029                            0.9135                   0.0763
GR                                  0.3062                            0.9742                   0.0138
TexGen                              0.2955                            0.9649                   0.004
Ours                                0.308                             0.9659                   0.0134


In [47]:
import tabulate

from text3d2video.experiment_analysis import print_latex_table

# Print the same summary rows as a LaTeX tabular.
print_latex_table(rows)

\begin{tabular}{crrr}
\hline
   Method   &   Prompt Fidelity ($\uparrow$) &   Frame Consistency ($\uparrow$) &   UV MSE ($\downarrow$) \\
\hline
 ControlNet &                         0.3029 &                           0.9135 &                  0.0763 \\
     GR     &                         0.3062 &                           0.9742 &                  0.0138 \\
   TexGen   &                         0.2955 &                           0.9649 &                  0.004  \\
    Ours    &                         0.308  &                           0.9659 &                  0.0134 \\
\hline
\end{tabular}


In [32]:
from text3d2video.utilities.video_comparison import video_grid
from text3d2video.utilities.video_util import pil_frames_to_clip

from pathlib import Path

# Destination of the comparison video. The parent directory is created first so
# the export does not fail when "outs/" does not exist yet.
video_path = "outs/benchmark.mp4"
Path(video_path).parent.mkdir(parents=True, exist_ok=True)

# One clip per entry of run_data, built from that entry's frames.
clips_grid = map_array(run_data, lambda d: pil_frames_to_clip(d.frames))

# Arrange the clips in a grid, with the method names as row labels.
vid = video_grid(clips_grid, y_labels=labels[0])
vid.write_videofile(video_path)

Moviepy - Building video outs/benchmark.mp4.
Moviepy - Writing video outs/benchmark.mp4



                                                            

Moviepy - Done !
Moviepy - video ready outs/benchmark.mp4
