In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [8]:
!ls

app.py				  inference.py	requirements.txt  untitled.txt
checkpoints			  models	results		  uploads
demo_FCVG.py			  output_fcvg	run.log		  utils
demo.sh				  outputs	tmp_upscale_run
downtoupconditioned.py		  pipeline	untitled1.txt
efficient_video_interpolation.py  README.md	Untitled.ipynb


In [4]:
# test_unet_controlnext_no_vae.py
import torch
import torch.nn.functional as F
from pprint import pprint

# adjust these imports to match your repo filenames / class names
from models.controlnext_vid_svd import ControlNeXtSDVModelFast as ControlModel  # or ControlNeXtSDVModel
from models.unet_spatio_temporal_condition_controlnext import (
    UNetSpatioTemporalConditionControlNeXtModel,
)

def main(device="cuda" if torch.cuda.is_available() else "cpu"):
    device = torch.device(device)
    torch.manual_seed(0)

    # -------------------------
    # 1) Instantiate Control model
    # -------------------------
    print("Instantiating ControlNeXt SVD model...")
    control = ControlModel(
        time_embed_dim=256,
        in_channels=[128, 128],
        out_channels=[128, 256],
        groups=[4, 8],
    ).to(device)

    # Put in eval mode (no grads)
    control.eval()

    # -------------------------
    # 2) Instantiate UNet
    # -------------------------
    print("Instantiating UNet...")
    unet = UNetSpatioTemporalConditionControlNeXtModel(
        sample_size=64,               # spatial size used in your UNet config (does not have to match input exactly)
        in_channels=8,                # final UNet expects e.g. 8 channels (latent channels)
        out_channels=4,
        block_out_channels=(320, 640, 1280, 1280),
        addition_time_embed_dim=256,
        projection_class_embeddings_input_dim=768,
        layers_per_block=2,
        cross_attention_dim=1024,
        transformer_layers_per_block=1,
        num_attention_heads=(5, 10, 10, 20),
    ).to(device)
    unet.eval()

    # -------------------------
    # 3) Create dummy control condition (images) for ControlNeXt
    # -------------------------
    # ControlNeXt expects input shape: [batch, frames, channels=3, H, W]
    B = 1
    frames_control = 4
    Hc = 256
    Wc = 256
    control_imgs = torch.randn(B, frames_control, 3, Hc, Wc, device=device)  # values in approx N(0,1)
    # If your control expects pixel range [0,1] or [-1,1], scale accordingly:
    # control_imgs = (control_imgs - control_imgs.min()) / (control_imgs.max() - control_imgs.min())

    timestep = 10  # arbitrary integer timestep
    with torch.no_grad():
        print("Running control model forward...")
        ctrl_out = control(control_imgs, timestep)  # expected dict with 'output' and 'scale'
    print("Control model output keys:", list(ctrl_out.keys()))
    # ctrl_out['output'] expected shape: [B * frames, C_ctrl, h_ctrl, w_ctrl] OR [B, C, h, w] depending on implementation
    pprint({k: (v.shape if isinstance(v, torch.Tensor) else v) for k, v in ctrl_out.items()})

    # -------------------------
    # 4) Create dummy UNet inputs
    # -------------------------
    # UNet forward expects:
    #   sample: [batch, frames, channels, H, W]
    #   timestep: scalar or tensor
    #   encoder_hidden_states: [batch, seq_len, cross_attn_dim]  (here seq_len usually 1)
    #   added_time_ids: a small tensor (we'll use 3 numbers like pipeline does)
    B_unet = 1
    frames_unet = 4
    C_unet_in = 8          # must match unet.config.in_channels (8 by default)
    H_unet = 64
    W_unet = 64
    sample = torch.randn(B_unet, frames_unet, C_unet_in, H_unet, W_unet, device=device, dtype=torch.float32)

    # encoder_hidden_states: use a dummy CLIP-like embedding shape [B, seq_len, cross_attn_dim]
    cross_attn_dim = 1024
    encoder_hidden_states = torch.randn(B_unet, 1, cross_attn_dim, device=device, dtype=torch.float32)

    # added_time_ids: mimic pipeline's small vector [fps, motion_bucket_id, noise_aug_strength]
    added_time_ids = torch.tensor([[ (frames_unet - 1), 127, 0.02 ]], device=device, dtype=torch.float32)

    # -------------------------
    # 5) Convert ctrl_out to UNet-compatible conditional_controls
    #    The UNet you provided expects conditional_controls as a dict:
    #       {"output": [B_eff * frames_in, Cc, h, w], "scale": [B_eff * frames_in, 1,1,1]}
    #    The pipeline previously built such a dict; we'll do the same transformation here.
    # -------------------------
    def make_conditional_controls_from_ctrl_out(ctrl_out, frames_in, device, dtype, do_cf=False):
        out = ctrl_out["output"]  # often [B*frames, Cc, h, w] or [B, Cc, h, w]
        scale = ctrl_out.get("scale", 1.0)

        # If ctrl_out output is [B, Cc, h, w] (no time dim), expand to frames:
        if out.ndim == 4:  # [B, Cc, h, w]
            B_eff = out.shape[0]
            out = out.unsqueeze(1).repeat(1, frames_in, 1, 1, 1)  # [B, frames, Cc, h, w]
            out = out.reshape(B_eff * frames_in, out.shape[2], out.shape[3], out.shape[4])
        elif out.ndim == 5:  # if control returned [B, frames, Cc, h, w]
            B_eff = out.shape[0]
            out = out.reshape(B_eff * frames_in, out.shape[2], out.shape[3], out.shape[4])
        elif out.ndim == 4 and out.shape[0] == B * frames_control:
            # already flattened [B*frames, C, h, w] -> fine
            pass
        # else assume already [B*frames, C, h, w]

        # prepare scale tensor
        if isinstance(scale, torch.Tensor):
            s = scale.to(device=device, dtype=dtype)
            if s.ndim == 0:
                s_val = float(s.item())
                scale_per_batch = torch.full((out.shape[0] // frames_in, 1, 1, 1), s_val, device=device, dtype=dtype)
            else:
                # if vector, try reshape
                scale_per_batch = s.view(-1, 1, 1, 1).to(device=device, dtype=dtype)
        else:
            scale_per_batch = torch.full((out.shape[0] // frames_in, 1, 1, 1), float(scale), device=device, dtype=dtype)

        # duplicate for frames and flatten to match out
        scale_flat = scale_per_batch.repeat_interleave(frames_in, dim=0)  # [B_eff*frames,1,1,1]

        # If classifier-free guidance present we would zero unconditional half earlier; skipping for this smoke test.
        return {"output": out.to(device=device, dtype=dtype), "scale": scale_flat.to(device=device, dtype=dtype)}

    conditional_controls = make_conditional_controls_from_ctrl_out(ctrl_out, frames_unet, device, sample.dtype, do_cf=False)
    print("Conditional controls prepared shapes:", conditional_controls["output"].shape, conditional_controls["scale"].shape)

    # -------------------------
    # 6) Call UNet forward with conditional_controls (not using pipeline)
    # -------------------------
    print("Calling UNet forward (this may be the heaviest op)...")
    with torch.no_grad():
        out = unet(
            sample=sample,
            timestep=timestep,
            encoder_hidden_states=encoder_hidden_states,
            down_block_additional_residuals=None,
            mid_block_additional_residual=None,
            conditional_controls=conditional_controls,
            return_dict=True,
            added_time_ids=added_time_ids,
            image_only_indicator=None,
            control_weight=1.0,
            control=None,          # we are passing control via conditional_controls dict (the UNet code you pasted handles that)
            control_scale=1.0,
        )
    # out is a dataclass UNetSpatioTemporalConditionOutput or tuple
    if isinstance(out, tuple):
        sample_out = out[0]
    else:
        sample_out = out.sample

    print("UNet forward completed.")
    print("UNet output shape (batch, frames, channels, H, W):", sample_out.shape)


if __name__ == "__main__":
    main()


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Instantiating ControlNeXt SVD model...
Instantiating UNet...
Running control model forward...
DEBUG: initial sample.shape = torch.Size([1, 4, 3, 256, 256])
DEBUG: sample.shape before unpack = torch.Size([4, 3, 256, 256])


  deprecate("scale", "1.0.0", deprecation_message)


Control model output keys: ['output', 'scale']
{'output': torch.Size([1, 320, 32, 32]), 'scale': 1.0}
Conditional controls prepared shapes: torch.Size([4, 320, 32, 32]) torch.Size([4, 1, 1, 1])
Calling UNet forward (this may be the heaviest op)...
UNet forward completed.
UNet output shape (batch, frames, channels, H, W): torch.Size([1, 4, 4, 64, 64])


In [2]:
%cd FCVG

/home/ie643_visionforge/venv/Untitled_Folder/FCVG


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
!find ~ -name "frame_0020.png"

/home/ie643_visionforge/venv/Untitled_Folder/FCVG/tmp_upscale_run/frames/frame_0020.png
/home/ie643_visionforge/venv/Untitled_Folder/test_frames/frame_0020.png


In [1]:
from huggingface_hub import login

In [None]:
HF_TOKEN = "HF token"   # paste your new token here
login(HF_TOKEN)


In [None]:
login(token="HF token")

KeyboardInterrupt: 

In [3]:
from huggingface_hub import whoami
print(whoami())

{'type': 'user', 'id': '68de2b1c64887e13537f682b', 'name': 'AdarshKorde', 'fullname': 'Adarsh Korde', 'isPro': False, 'avatarUrl': '/avatars/35a96cd72f6909ab2a4af92ad063e7ea.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'IE', 'role': 'fineGrained', 'createdAt': '2025-11-27T12:29:20.687Z', 'fineGrained': {'canReadGatedRepos': True, 'global': [], 'scoped': [{'entity': {'_id': '68de2b1c64887e13537f682b', 'type': 'user', 'name': 'AdarshKorde'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.serverless.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'user.webhooks.write', 'collection.read', 'collection.write']}]}}}}


In [5]:
!python demo_FCVG.py --image1_path /home/ie643_visionforge/venv/Untitled_Folder/frame_0020.png --image2_path /home/ie643_visionforge/venv/Untitled_Folder/frame_0021.png

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Loading Matching Model...
  self.load_state_dict(torch.load(str(path)), strict=False)
  @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
Loading pipeline components...: 100%|█████████████| 5/5 [00:00<00:00, 63.97it/s]
  deprecate("scale", "1.0.0", deprecation_message)
100%|███████████████████████████████████████████| 15/15 [00:16<00:00,  1.11s/it]
Pipeline single forward pass runtime: 25.0350 sec


In [7]:
!du -ah . | sort -h | tail -n 20

1.2M	./models/resources/img1.jpg
1.2M	./models/resources/img2.jpg
5.0M	./models/resources/weights/superpoint_v1.pth
14M	./checkpoints/controlnext.safetensors
22M	./models/resources/demo_seq1.gif
56M	./.git
56M	./.git/objects
56M	./.git/objects/pack
56M	./.git/objects/pack/pack-98c620c93d15217787a56c23028b6de90b5dad2a.pack
108M	./models/resources/weights/checkpoint_GlueStick_MD.tar
108M	./models/resources/weights/checkpoint_GlueStick_MD.tar.1
129M	./checkpoints/dwpose/dw-ll_ucoco_384.onnx
207M	./checkpoints/dwpose/yolox_l.onnx
220M	./models/resources/weights
244M	./models/resources
245M	./models
335M	./checkpoints/dwpose
5.7G	./checkpoints/unet.safetensors
6.1G	./checkpoints
6.4G	.


In [22]:
pip install xformers 

Collecting xformers
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl (122.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torch==2.9.0
  Downloading torch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl (899.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.8/899.8 MB[0m [31m815.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-runtime-cu12==12.8.90
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (954 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting nvidia-nccl-cu12==2.27.5
  Downloading nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.3/322.3 MB[0

In [1]:
%cd FCVG

/home/ie643_visionforge/venv/Untitled_Folder/FCVG


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
!python demo_FCVG.py --image1_path /home/ie643_visionforge/venv/Untitled_Folder/test_frames/frame_0020.png --image2_path /home/ie643_visionforge/venv/Untitled_Folder/test_frames/frame_0021.png

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Loading Matching Model...
  self.load_state_dict(torch.load(str(path)), strict=False)
  @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
Traceback (most recent call last):
  File "/home/ie643_visionforge/venv/Untitled_Folder/FCVG/demo_FCVG.py", line 233, in <module>
    pipeline_model = TwoViewPipeline(conf).to(device).eval()
  File "/home/ie643_visionforge/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1340, in to
    return self._apply(convert)
  File "/home/ie643_visionforge/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 900, in _apply
    module._apply(fn)
  File "/home/ie643_visionforge/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 900, in _apply
    module._apply(fn)
  File "/home/ie643_visionforge/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 900, in _apply
    module._apply

In [12]:
!python demo_FCVG.py --image1_path /home/ie643_visionforge/venv/Untitled_Folder/test_frames/frame_0020.png --image2_path /home/ie643_visionforge/venv/Untitled_Folder/test_frames/frame_0021.png

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Loading Matching Model...
  self.load_state_dict(torch.load(str(path)), strict=False)
  @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
Loading pipeline components...: 100%|█████████████| 5/5 [00:00<00:00, 39.79it/s]
  deprecate("scale", "1.0.0", deprecation_message)
  7%|██▉                                         | 1/15 [00:06<01:36,  6.89s/it]
Traceback (most recent call last):
  File "/home/ie643_visionforge/venv/Untitled_Folder/FCVG/demo_FCVG.py", line 310, in <module>
    video_frames = pipeline(
  File "/home/ie643_visionforge/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/home/ie643_visionforge/venv/Untitled_Folder/FCVG/pipeline/pipeline_FCVG.py", line 662, in __call__
    noise_pred = self.multidiffusion_step(latents, t,
  File "/home/ie643_visionforge/venv/lib/python3.

In [6]:
pip uninstall -y xformers

Found existing installation: xformers 0.0.33.post1
Uninstalling xformers-0.0.33.post1:
  Successfully uninstalled xformers-0.0.33.post1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch, torchvision
print(torch.__version__)
print(torchvision.__version__)

2.5.1+cu121
0.20.1+cu121


In [16]:
import sys
print(sys.executable)

/home/ie643_visionforge/venv/bin/python


In [20]:
!source /home/ie643_visionforge/venv/bin/activate

In [23]:
!pip uninstall -y torch torchvision torchaudio
!pip cache purge

Found existing installation: torch 2.9.0
Uninstalling torch-2.9.0:
  Successfully uninstalled torch-2.9.0
Found existing installation: torchvision 0.22.1+cu118
Uninstalling torchvision-0.22.1+cu118:
  Successfully uninstalled torchvision-0.22.1+cu118
Found existing installation: torchaudio 2.7.1+cu118
Uninstalling torchaudio-2.7.1+cu118:
  Successfully uninstalled torchaudio-2.7.1+cu118
Files removed: 96


In [24]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl (780.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m868.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp310-cp310-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cusparse_c

      Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.8.93
  Attempting uninstall: nvidia-cuda-cupti-cu12
    Found existing installation: nvidia-cuda-cupti-cu12 12.8.90
    Uninstalling nvidia-cuda-cupti-cu12-12.8.90:
      Successfully uninstalled nvidia-cuda-cupti-cu12-12.8.90
  Attempting uninstall: nvidia-cublas-cu12
    Found existing installation: nvidia-cublas-cu12 12.8.4.1
    Uninstalling nvidia-cublas-cu12-12.8.4.1:
      Successfully uninstalled nvidia-cublas-cu12-12.8.4.1
  Attempting uninstall: nvidia-cusolver-cu12
    Found existing installation: nvidia-cusolver-cu12 11.7.3.90
    Uninstalling nvidia-cusolver-cu12-11.7.3.90:
      Successfully uninstalled nvidia-cusolver-cu12-11.7.3.90
  Attempting uninstall: nvidia-cudnn-cu12
    Found existing installation: nvidia-cudnn-cu12 9.10.2.21
    Uninstalling nvidia-cudnn-cu12-9.10.2.21:
      Successfully uninstalled nvidia-cudnn-cu12-9.10.2.21
Successfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.

In [None]:
 !tar -czf main.tar.gz /home/ie643_visionforge/venv/Untitled_Folder

tar: Removing leading `/' from member names
tar: /home/ie643_visionforge/venv/Untitled_Folder/main.tar.gz: file changed as we read it


In [3]:
ls /home/ie643_visionforge/venv/

 [0m[01;34mbin[0m/   [01;35mforcasting_downsampled.mp4[0m   [01;34minclude[0m/   [01;36mlib64[0m@       [01;34mshare[0m/
 [01;34metc[0m/  'ie-project (3).ipynb'        [01;34mlib[0m/       pyvenv.cfg   [01;34mUntitled_Folder[0m/
