**Prepare Environment & Download Models**

In [None]:
# @title

!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 numpy opencv-python==4.9.0.80 pillow tqdm

!pip install -r https://raw.githubusercontent.com/Isi-dev/animate-x/main/requirements.txt

!pip install onnxruntime-gpu

!pip install pynvml
!pip install kornia


!git clone https://github.com/Isi-dev/animate-x.git
%cd animate-x


!mkdir -p checkpoints

# Download DWPose models
!wget -O checkpoints/dw-ll_ucoco_384.onnx "https://huggingface.co/Isi99999/UniAnimate_and_Animate-X_Models/resolve/main/dw-ll_ucoco_384.onnx"
!wget -O checkpoints/yolox_l.onnx "https://huggingface.co/Isi99999/UniAnimate_and_Animate-X_Models/resolve/main/yolox_l.onnx"

# Download Animate-X, Embedder & AutoEncoder models
!wget -O checkpoints/animate-x_ckpt.pth "https://huggingface.co/Isi99999/UniAnimate_and_Animate-X_Models/resolve/main/animate-x_ckpt.pth"
!wget -O checkpoints/open_clip_pytorch_model.bin "https://huggingface.co/Isi99999/UniAnimate_and_Animate-X_Models/resolve/main/open_clip_pytorch_model.bin"
!wget -O checkpoints/v2-1_512-ema-pruned.ckpt "https://huggingface.co/Isi99999/UniAnimate_and_Animate-X_Models/resolve/main/v2-1_512-ema-pruned.ckpt"

print("✅ All models are ready!")


Collecting torch==2.3.1
  Downloading torch-2.3.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision==0.18.1
  Downloading torchvision-0.18.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.3.1
  Downloading torchaudio-2.3.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting opencv-python==4.9.0.80
  Downloading opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from

**Load Inputs & Generate DWPose Data**

In [None]:
# @title
import os
import glob
import cv2
import ipywidgets as widgets
from google.colab import files
from IPython.display import display, Video, HTML
import PIL.Image
from base64 import b64encode
from IPython.display import clear_output


# Working directories used by the upload, pose-extraction and inference steps.
for _subdir in ("videos", "images", "saved_pkl", "saved_pose", "saved_frames"):
    os.makedirs(os.path.join("data", _subdir), exist_ok=True)


# Single-file upload controls for the reference image and the driving video.
upload_image = widgets.FileUpload(
    accept="image/*",
    multiple=False,
)
upload_video = widgets.FileUpload(
    accept="video/*",
    multiple=False,
)

# Button that starts pose extraction, plus the area its messages are shown in.
run_button = widgets.Button(
    description="Extract DWPose",
    button_style="success",
)
output_display = widgets.Output()

# Show each upload control directly beneath its label.
print("Upload Reference Image:")
display(upload_image)
print("\nUpload Driving Video:")
display(upload_video)



def save_uploaded_files(upload_widget, save_dir):
    """Write the first uploaded file of a FileUpload widget into ``save_dir``.

    Two layouts of ``upload_widget.value`` are accepted:

    * a mapping ``{filename: {"content": <bytes>, ...}}`` (ipywidgets 7.x);
    * a sequence of dicts that each carry ``"name"`` and ``"content"`` keys
      (ipywidgets 8.x).

    Args:
        upload_widget: Object exposing the uploaded files through ``.value``.
        save_dir: Directory the file is written to; created if it is missing.

    Returns:
        The path of the written file, or ``None`` when nothing was uploaded.
    """
    uploads = upload_widget.value
    if not uploads:
        return None

    if hasattr(uploads, "items"):
        # Mapping layout: the key is the file name.
        filename, file_info = next(iter(uploads.items()))
    else:
        # Sequence layout: the name is stored inside each entry.
        file_info = uploads[0]
        filename = file_info["name"]

    # The name comes from the client; keep only its last component so it can
    # never point outside save_dir.
    safe_name = os.path.basename(filename.replace("\\", "/"))

    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, safe_name)
    with open(file_path, 'wb') as f:
        f.write(file_info['content'])
    return file_path


def on_run_button_click(b):
    clear_output(wait=True)

    with output_display:
        output_display.clear_output()


        image_path = save_uploaded_files(upload_image, "data/images")
        video_path = save_uploaded_files(upload_video, "data/videos")

        if not video_path:
            print("❌ No driving video uploaded!")
            return

        print(f"✅ Reference Image Saved: {image_path}")
        print(f"✅ Driving Video Saved: {video_path}")
        print("🔄 Running DWPose extraction...")

        def check_and_install(package):
          try:
              __import__(package)
          except ImportError:
              print(f"🔍 {package} not found. Installing...")
              import subprocess
              import sys
              subprocess.check_call([sys.executable, "-m", "pip", "install", package])
              print(f"✅ {package} installed successfully.")


        check_and_install("onnxruntime")


        !python process_data.py --source_video_paths data/videos --saved_pose_dir data/saved_pkl --saved_pose data/saved_pose --saved_frame_dir data/saved_frames

        print("✅ DWPose data extraction complete!")

        # Process Extracted Frames
        # frame_paths = sorted(glob.glob("data/saved_pose/*/*.jpg"))  # Include subdirectories

        # if not frame_paths:
        #     print("❌ No extracted frames found. Ensure process_data.py ran successfully.")
        # else:
        #
        #
        #     first_frame = cv2.imread(frame_paths[0])
        #     if first_frame is None:
        #         print("❌ First frame is blank. Video cannot be generated.")
        #     else:
        #         print("First frame loaded correctly.")

        #
        #     video_output_path = "dwpose_output.mp4"
        #     heightV, widthV = first_frame.shape[:2]
        #     print(f"width of video: {widthV}")
        #     print(f"height of video: {heightV}")
        #     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        #     video = cv2.VideoWriter(video_output_path, fourcc, 30, (widthV, heightV))

        #     for frame_path in frame_paths:
        #         img = cv2.imread(frame_path, cv2.IMREAD_UNCHANGED)
        #         if img is None:
        #             print(f"❌ Skipping blank frame: {frame_path}")
        #             continue  # Skip blank frames

        #         if img.shape[2] == 4:  # Convert RGBA to BGR if needed
        #             print("Converting frames to BGR...")
        #             img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

        #         video.write(img)

        #     video.release()
        #     print("✅ Video generation complete.")

        #
        #     print("✅ Extracted DWPose video:")
        #     display(Video(video_output_path, embed=True))


# Invoke the extraction callback whenever the button is clicked.
run_button.on_click(on_run_button_click)

# Render the button together with the output area the callback prints into.
display(run_button, output_display)


Upload Reference Image:


FileUpload(value={}, accept='image/*', description='Upload')


Upload Driving Video:


FileUpload(value={}, accept='video/*', description='Upload')

Button(button_style='success', description='Extract DWPose', style=ButtonStyle())

Output()

**Set Inference Parameters & Run**

In [None]:
# @title
import os
import yaml
import glob
import IPython.display as display
import ipywidgets as widgets
from IPython.display import clear_output


def run_inference(_):

    clear_output(wait=True)

    import glob

    max_frames = max_frames_input.value
    resolution = [int(x) for x in resolution_input.value.split(",")]
    round_val = round_input.value
    ddim_timesteps = ddim_timesteps_input.value
    seed = seed_input.value
    save_fps = save_fps_input.value
    n = frame_skip_input.value


    image_files = sorted(glob.glob("data/images/*"))
    saved_pose_folders = sorted(glob.glob("data/saved_pose/*"))
    saved_frame_folders = sorted(glob.glob("data/saved_frames/*"))
    saved_pkl_files = sorted(glob.glob("data/saved_pkl/*.pkl"))


    test_list_path = []
    for img_file, pose_folder, frame_folder, pkl_file in zip(image_files, saved_pose_folders, saved_frame_folders, saved_pkl_files):
        test_list_path.append([n, img_file, pose_folder, frame_folder, pkl_file, seed])


    config = {
        "max_frames": max_frames,
        "resolution": resolution,
        "round": round_val,
        "ddim_timesteps": ddim_timesteps,
        "seed": seed,
        "save_fps": save_fps,
        "test_list_path": test_list_path,
        "log_dir": "results",
        "test_model": "checkpoints/animate-x_ckpt.pth",
        "partial_keys": [["image", "local_image", "dwpose", "pose_embeddings"]],
        "TASK_TYPE": "inference_animate_x_entrance",
        "use_fp16": True,
        "guide_scale": 2.5,
        "vit_resolution": [224, 224],
        "batch_size": 1,
        "latent_random_ref": True,
        "chunk_size": 2,
        "decoder_bs": 2,
        "scale": 8,
        "use_fps_condition": False,
        "embedder": {
            "type": "FrozenOpenCLIPTextVisualEmbedder",
            "layer": "penultimate",
            "pretrained": "checkpoints/open_clip_pytorch_model.bin"
        },
        "auto_encoder": {
            "type": "AutoencoderKL",
            "ddconfig": {
                "double_z": True,
                "z_channels": 4,
                "resolution": 256,
                "in_channels": 3,
                "out_ch": 3,
                "ch": 128,
                "ch_mult": [1, 2, 4, 4],
                "num_res_blocks": 2,
                "attn_resolutions": [],
                "dropout": 0.0,
                "video_kernel_size": [3, 1, 1]
            },
            "embed_dim": 4,
            "pretrained": "checkpoints/v2-1_512-ema-pruned.ckpt"
        },
        "UNet": {
            "type": "UNetSD_Animate_X",
            "config": None,
            "in_dim": 4,
            "num": 0,
            "no_hand": True,
            "dim": 320,
            "y_dim": 1024,
            "context_dim": 1024,
            "out_dim": 4,
            "dim_mult": [1, 2, 4, 4],
            "num_heads": 8,
            "head_dim": 64,
            "num_res_blocks": 2,
            "dropout": 0.1,
            "temporal_attention": True,
            "num_tokens": 4,
            "temporal_attn_times": 1,
            "use_checkpoint": True,
            "use_fps_condition": False,
            "use_sim_mask": False,
            "seq_len": max_frames + 1
        },
        "video_compositions": ["image", "local_image", "dwpose", "randomref", "randomref_pose", "pose_embedding"],
        "Diffusion": {
            "type": "DiffusionDDIM",
            "schedule": "linear_sd",
            "schedule_param": {
                "num_timesteps": 1000,
                "init_beta": 0.00085,
                "last_beta": 0.0120,
                "zero_terminal_snr": True
            },
            "mean_type": "v",
            "loss_type": "mse",
            "var_type": "fixed_small",
            "rescale_timesteps": False,
            "noise_strength": 0.1
        },
        "use_DiffusionDPM": False,
        "CPU_CLIP_VAE": True
    }


    config_path = "configs/Animate_X_infer.yaml"
    with open(config_path, "w") as file:
        yaml.dump(config, file, default_flow_style=False, sort_keys=False)

    print(f"✅ Configuration file updated: {config_path}")


    !python inference.py --cfg configs/Animate_X_infer.yaml

    import os
    import glob
    from IPython.display import display as displayVid, Video as outVid


    video_folder = "results/Animate_X_infer/"


    video_files = glob.glob(os.path.join(video_folder, "*.mp4"))

    if video_files:
        latest_video = max(video_files, key=os.path.getctime)
        print(f"Displaying video: {latest_video}")
        displayVid(outVid(latest_video, embed=True))
    else:
        print("❌ No video found in results/Animate_X_infer/")


# Tunable inference settings; run_inference reads each widget's .value.
max_frames_input = widgets.IntText(
    value=32,
    description="Max Frames:",
)
resolution_input = widgets.Text(
    value="512,768",
    description="Resolution:",
)
round_input = widgets.IntText(
    value=1,
    description="Round:",
)
ddim_timesteps_input = widgets.IntText(
    value=30,
    description="DDIM Steps:",
)
seed_input = widgets.IntText(
    value=13,
    description="Seed:",
)
save_fps_input = widgets.FloatText(
    value=8.0,
    description="FPS:",
    step=0.5,
)
frame_skip_input = widgets.IntText(
    value=1,
    description="Frame Skip (n):",
)


# Button that starts inference with the settings above.
run_button = widgets.Button(
    description="Run Inference",
    button_style="danger",
)
run_button.on_click(run_inference)


# Stack the inputs and the button vertically and render them.
widgets_box = widgets.VBox(
    [
        max_frames_input,
        resolution_input,
        round_input,
        ddim_timesteps_input,
        seed_input,
        save_fps_input,
        frame_skip_input,
        run_button,
    ]
)
display.display(widgets_box)


✅ Configuration file updated: configs/Animate_X_infer.yaml
For Windows users, we explicitly import registry function inference_animate_x_entrance !!!
[2025-02-10 15:38:58,765] INFO: {'__name__': 'Config: VideoLDM Decoder', 'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5], 'max_words': 1000, 'num_workers': 8, 'prefetch_factor': 2, 'resolution': [512, 768], 'vit_out_dim': 1024, 'vit_resolution': [224, 224], 'depth_clamp': 10.0, 'misc_size': 384, 'depth_std': 20.0, 'save_fps': 8.0, 'frame_lens': [32, 32, 32, 1], 'sample_fps': [4], 'vid_dataset': {'type': 'VideoBaseDataset', 'data_list': [], 'max_words': 1000, 'resolution': [448, 256]}, 'img_dataset': {'type': 'ImageBaseDataset', 'data_list': ['laion_400m'], 'max_words': 1000, 'resolution': [448, 256]}, 'batch_sizes': {'1': 256, '4': 4, '8': 4, '16': 4}, 'Diffusion': {'type': 'DiffusionDDIM', 'schedule': 'linear_sd', 'schedule_param': {'num_timesteps': 1000, 'init_beta': 0.00085, 'last_beta': 0.012, 'zero_terminal_snr': True}, 'mean_type': 