In [1]:
import os
from regex import F
import yaml
import torch
from torch.utils.data import Dataset
from torchvision.io import read_video

class V2VBenchDataset(Dataset):
    """Video-editing samples described by a V2VBench ``config.yaml``.

    The config is flattened at construction time: every entry of a video's
    ``edit`` list becomes one sample, so a video with several edits appears
    several times, each time paired with a different target prompt.
    """

    def __init__(self, config_path, videos_dir):
        """
        Args:
            config_path (str): Path to config.yaml file
            videos_dir (str): Directory containing video files; each video is
                looked up as ``<videos_dir>/<video_id>.mp4``
        """
        self.videos_dir = videos_dir

        # Load config file. The encoding is pinned so prompts decode the same
        # way regardless of the machine's locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            # An empty YAML document parses to None; treat it as an empty config
            # instead of failing later with an opaque TypeError.
            self.config = yaml.safe_load(f) or {}

        # Flatten the data structure: one sample per (video, edit) pair
        self.samples = []
        for video_data in self.config.get('data') or []:
            video_id = video_data['video_id']
            source_prompt = video_data['prompt']
            video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")

            # A video whose 'edit' list is absent or null contributes no samples
            # rather than aborting construction of the whole dataset.
            for edit in video_data.get('edit') or []:
                self.samples.append({
                    'video_path': video_path,
                    'video_id': video_id,
                    'source_prompt': source_prompt,
                    'target_prompt': edit['prompt'],
                    # Optional per-edit metadata defaults to an empty string
                    'src_words': edit.get('src_words', ''),
                    'tgt_words': edit.get('tgt_words', ''),
                    'edit_type': edit.get('type', '')
                })

    def __len__(self):
        """Return the number of (video, edit) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Decode the video of sample ``idx`` and return it with its prompts.

        Args:
            idx (int): Index into the flattened sample list.

        Returns:
            dict: ``'video'`` holds the decoded frames as a float32 tensor
            scaled by 1/255, in the layout ``read_video`` produces
            ((T, H, W, C) with its default output format); the remaining keys
            (``video_id``, ``source_prompt``, ``target_prompt``, ``src_words``,
            ``tgt_words``, ``edit_type``) are copied from the stored sample.
        """
        sample = self.samples[idx]

        # read_video returns (frames, audio, info); only the frames are used.
        # Note: the file is decoded on every access, so a video that has
        # several edits is read once per edit.
        video, _, _ = read_video(sample['video_path'], pts_unit='sec')

        # Convert video to float and normalize to [0, 1]
        # (assumes 8-bit frames, which is what read_video yields)
        video = video.float() / 255.0

        return {
            'video': video,
            'video_id': sample['video_id'],
            'source_prompt': sample['source_prompt'],
            'target_prompt': sample['target_prompt'],
            'src_words': sample['src_words'],
            'tgt_words': sample['tgt_words'],
            'edit_type': sample['edit_type']
        }

# Example usage: walk the benchmark one sample at a time and print its metadata
if __name__ == "__main__":
    videos_dir = "/data/chx/V2VBench/videos"
    config_path = "/data/chx/V2VBench/config.yaml"

    dataset = V2VBenchDataset(config_path, videos_dir)

    # Wrap the dataset in a DataLoader: single-sample batches, config order,
    # four worker processes decoding videos
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=4
    )

    # Report every batch, one field per line, followed by a separator line
    for batch in dataloader:
        print(
            f"Batch size: {len(batch['video'])}",
            f"Video ID: {batch['video_id']}",
            f"Video shape: {batch['video'].shape}",
            f"Source prompt: {batch['source_prompt']}",
            f"Target prompt: {batch['target_prompt']}",
            "---",
            sep="\n",
        )

Batch size: 1
Video ID: ['aurora']
Video shape: torch.Size([1, 24, 512, 512, 3])
Source prompt: ['green and blue aurora paints the night sky over mountain silhouettes']
Target prompt: ['red and yellow aurora paints the night sky over mountain silhouettes']
---
Batch size: 1
Video ID: ['aurora']
Video shape: torch.Size([1, 24, 512, 512, 3])
Source prompt: ['green and blue aurora paints the night sky over mountain silhouettes']
Target prompt: ['white and pure cloud paints the night sky over mountain silhouettes']
---
Batch size: 1
Video ID: ['aurora']
Video shape: torch.Size([1, 24, 512, 512, 3])
Source prompt: ['green and blue aurora paints the night sky over mountain silhouettes']
Target prompt: ['green and blue aurora paints the night sky over mountain silhouettes, van gogh starry night style']
---
Batch size: 1
Video ID: ['beam']
Video shape: torch.Size([1, 24, 512, 512, 3])
Source prompt: ['symmetric neon light tunnel with blue and purple hues']
Target prompt: ['symmetric neon light

In [1]:
# Edit the V2VBench "car-turn" clip: the source prompt describes a grey car, the target prompt a red sports-car.
# NOTE(review): as recorded below, this run stops with `SystemExit: 2` right after the argparse usage text.
# The only difference from the next cell, which succeeds, is the trailing `--dataset "V2VBench"`, so that
# flag is presumably not accepted by edit_video.py — confirm, then drop the flag or this cell.
%run edit_video.py \
    --dit-weight "/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt" \
    --video-size 512 512 \
    --video-length 25 \
    --infer-steps 30 \
    --prompt "a grey car navigates a curvy road surrounded by green grass trees and mountains." \
    --target-prompt "a red sports-car navigates a curvy road surrounded by green grass trees and mountains." \
    --inject 5 \
    --seed 42 \
    --embedded-cfg-scale 2 \
    --flow-shift 7.0 \
    --flow-reverse \
    --use-cpu-offload \
    --use-fp8 \
    --save-path ./results/rf-solver_2order \
    --inverse-video-path "/data/chx/V2VBench/videos/car-turn.mp4" \
    --dataset "V2VBench"

  from .autonotebook import tqdm as notebook_tqdm


detect you are not use the latest yunchang. Please install yunchang>=0.4.0


usage: edit_video.py [-h] [--model {HYVideo-T/2,HYVideo-T/2-cfgdistill}]
                     [--latent-channels LATENT_CHANNELS]
                     [--precision {fp16,bf16,fp32}] [--rope-theta ROPE_THETA]
                     [--vae {884-16c-hy}] [--vae-precision {fp16,bf16,fp32}]
                     [--vae-tiling] [--text-encoder {clipL,llm}]
                     [--text-encoder-precision {fp16,bf16,fp32}]
                     [--text-states-dim TEXT_STATES_DIM] [--text-len TEXT_LEN]
                     [--tokenizer {clipL,llm}]
                     [--prompt-template {dit-llm-encode,dit-llm-encode-video}]
                     [--prompt-template-video {dit-llm-encode,dit-llm-encode-video}]
                     [--hidden-state-skip-layer HIDDEN_STATE_SKIP_LAYER]
                     [--apply-final-norm] [--text-encoder-2 {clipL,llm}]
                     [--text-encoder-precision-2 {fp16,bf16,fp32}]
                     [--text-states-dim-2 TEXT_STATES_DIM_2]
                     

SystemExit: 2

In [1]:
# Edit the V2VBench "car-turn" clip (512x512, 25 frames, 30 steps): grey car -> red sports-car.
# The recorded log shows two 30-step passes — timesteps rising 0 -> 1000, then falling 1000 -> 0
# (presumably inversion followed by editing; confirm in edit_video.py) — and a sample saved under
# ./results/rf-solver_2order.
%run edit_video.py \
    --dit-weight "/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt" \
    --video-size 512 512 \
    --video-length 25 \
    --infer-steps 30 \
    --prompt "a grey car navigates a curvy road surrounded by green grass trees and mountains." \
    --target-prompt "a red sports-car navigates a curvy road surrounded by green grass trees and mountains." \
    --inject 5 \
    --seed 42 \
    --embedded-cfg-scale 2 \
    --flow-shift 7.0 \
    --flow-reverse \
    --use-cpu-offload \
    --use-fp8 \
    --save-path ./results/rf-solver_2order \
    --inverse-video-path "/data/chx/V2VBench/videos/car-turn.mp4"

  from .autonotebook import tqdm as notebook_tqdm
[32m2025-04-03 12:39:28.995[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mfrom_pretrained[0m:[36m155[0m - [1mGot text-to-video model root path: ckpts[0m
[32m2025-04-03 12:39:28.996[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mfrom_pretrained[0m:[36m190[0m - [1mBuilding model...[0m


detect you are not use the latest yunchang. Please install yunchang>=0.4.0
{'apply_final_norm': False,
 'batch_size': 1,
 'cfg_scale': 1.0,
 'denoise_type': 'flow',
 'disable_autocast': False,
 'dit_weight': '/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt',
 'embedded_cfg_scale': 2.0,
 'feature_path': 'feature',
 'flow_reverse': True,
 'flow_shift': 7.0,
 'flow_solver': 'euler',
 'hidden_state_skip_layer': 2,
 'infer_steps': 30,
 'inject': 5,
 'inverse_video_path': '/data/chx/V2VBench/videos/car-turn.mp4',
 'latent_channels': 16,
 'linear_schedule_end': 25,
 'load_key': 'module',
 'model': 'HYVideo-T/2-cfgdistill',
 'model_base': 'ckpts',
 'model_resolution': '540p',
 'name_suffix': '',
 'neg_prompt': None,
 'num_videos': 1,
 'precision': 'bf16',
 'prompt': 'a grey car navigates a curvy road surrounded by green grass trees '
           'and mountains.',
 'prompt_template': 'dit-llm-encode',
 'prompt_template_video': 'dit-llm-encode

  fp8_map = torch.load(fp8_map_path, map_location=lambda storage, loc: storage)
[32m2025-04-03 12:41:45.434[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mload_state_dict[0m:[36m345[0m - [1mLoading torch model /home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt...[0m
  state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
[32m2025-04-03 12:43:30.702[0m | [1mINFO    [0m | [36mhyvideo.vae[0m:[36mload_vae[0m:[36m29[0m - [1mLoading 3D VAE model (884-16c-hy) from: ./ckpts/hunyuan-video-t2v-720p/vae[0m


loading vae


  ckpt = torch.load(vae_ckpt, map_location=vae.device)
[32m2025-04-03 12:43:38.271[0m | [1mINFO    [0m | [36mhyvideo.vae[0m:[36mload_vae[0m:[36m55[0m - [1mVAE to dtype: torch.float16[0m
[32m2025-04-03 12:43:38.276[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m28[0m - [1mLoading text encoder model (llm) from: ./ckpts/text_encoder[0m
Loading checkpoint shards: 100%|██████████| 4/4 [02:33<00:00, 38.44s/it]
[32m2025-04-03 12:46:18.665[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m50[0m - [1mText encoder to dtype: torch.float16[0m
[32m2025-04-03 12:46:18.670[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_tokenizer[0m:[36m64[0m - [1mLoading tokenizer (llm) from: ./ckpts/text_encoder[0m
[32m2025-04-03 12:46:19.395[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m28[0m - [1mLoading text encoder model (clipL) from: ./ckpts/text_en

Enable sequential CPU offload.
Updated args:
{'apply_final_norm': False,
 'batch_size': 1,
 'cfg_scale': 1.0,
 'denoise_type': 'flow',
 'disable_autocast': False,
 'dit_weight': '/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt',
 'embedded_cfg_scale': 2.0,
 'feature_path': 'feature',
 'flow_reverse': True,
 'flow_shift': 7.0,
 'flow_solver': 'euler',
 'hidden_state_skip_layer': 2,
 'infer_steps': 30,
 'inject': 5,
 'inverse_video_path': '/data/chx/V2VBench/videos/car-turn.mp4',
 'latent_channels': 16,
 'linear_schedule_end': 25,
 'load_key': 'module',
 'model': 'HYVideo-T/2-cfgdistill',
 'model_base': 'ckpts',
 'model_resolution': '540p',
 'name_suffix': '',
 'neg_prompt': None,
 'num_videos': 1,
 'precision': 'bf16',
 'prompt': 'a grey car navigates a curvy road surrounded by green grass trees '
           'and mountains.',
 'prompt_template': 'dit-llm-encode',
 'prompt_template_video': 'dit-llm-encode-video',
 'reproduce': False,


[32m2025-04-03 12:46:27.260[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m605[0m - [1mInput (height, width, video_length) = (512, 512, 25)[0m
[32m2025-04-03 12:46:27.689[0m | [34m[1mDEBUG   [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m665[0m - [34m[1m
                        height: 512
                         width: 512
                  video_length: 25
                        prompt: ['a grey car navigates a curvy road surrounded by green grass trees and mountains.']
                    neg_prompt: ['Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion']
                          seed: 42
                   infer_steps: 30
         num_videos_per_prompt: 1
                guidance_scale: 1.0
                      n_tokens: 7168
                    flow_shift: 7.0
       embedded_guidance_scale: 2.0[0m


cuda:0
cuda:0
sigmas_reverse
sigmas: tensor([0.0000, 0.1944, 0.3333, 0.4375, 0.5185, 0.5833, 0.6364, 0.6806, 0.7179,
        0.7500, 0.7778, 0.8021, 0.8235, 0.8426, 0.8596, 0.8750, 0.8889, 0.9015,
        0.9130, 0.9236, 0.9333, 0.9423, 0.9506, 0.9583, 0.9655, 0.9722, 0.9785,
        0.9844, 0.9899, 0.9951, 1.0000])
Using inversed latents
Step 0: t = 0.0
Step 1: t = 194.44444274902344
Step 2: t = 333.3333435058594
Step 3: t = 437.5000305175781
Step 4: t = 518.5184936523438
Step 5: t = 583.3333740234375
Step 6: t = 636.3637084960938
Step 7: t = 680.5556030273438
Step 8: t = 717.94873046875
Step 9: t = 750.0
Step 10: t = 777.77783203125
Step 11: t = 802.0833129882812
Step 12: t = 823.5294189453125
Step 13: t = 842.5925903320312
Step 14: t = 859.6491088867188
Step 15: t = 875.0
Step 16: t = 888.888916015625
Step 17: t = 901.51513671875
Step 18: t = 913.04345703125
Step 19: t = 923.611083984375
Step 20: t = 933.333251953125
Step 21: t = 942.3078002929688
Step 22: t = 950.6172485351562
Step

  0%|          | 0/30 [00:00<?, ?it/s]

Step 0: t_curr = 0.0, t_prev = 194.44444274902344
t:1


  3%|▎         | 1/30 [00:03<01:41,  3.49s/it]

Step 1: t_curr = 194.44444274902344, t_prev = 333.3333435058594
t:2


  7%|▋         | 2/30 [00:06<01:33,  3.32s/it]

Step 2: t_curr = 333.3333435058594, t_prev = 437.5000305175781
t:3


 10%|█         | 3/30 [00:09<01:28,  3.26s/it]

Step 3: t_curr = 437.5000305175781, t_prev = 518.5184936523438
t:4


 13%|█▎        | 4/30 [00:13<01:24,  3.23s/it]

Step 4: t_curr = 518.5184936523438, t_prev = 583.3333740234375
t:5


 17%|█▋        | 5/30 [00:16<01:20,  3.21s/it]

Step 5: t_curr = 583.3333740234375, t_prev = 636.3637084960938
t:6


 20%|██        | 6/30 [00:19<01:17,  3.21s/it]

Step 6: t_curr = 636.3637084960938, t_prev = 680.5556030273438
t:7


 23%|██▎       | 7/30 [00:22<01:13,  3.20s/it]

Step 7: t_curr = 680.5556030273438, t_prev = 717.94873046875
t:8


 27%|██▋       | 8/30 [00:25<01:10,  3.19s/it]

Step 8: t_curr = 717.94873046875, t_prev = 750.0
t:9


 30%|███       | 9/30 [00:28<01:06,  3.19s/it]

Step 9: t_curr = 750.0, t_prev = 777.77783203125
t:10


 33%|███▎      | 10/30 [00:32<01:03,  3.19s/it]

Step 10: t_curr = 777.77783203125, t_prev = 802.0833129882812
t:11


 37%|███▋      | 11/30 [00:35<01:00,  3.20s/it]

Step 11: t_curr = 802.0833129882812, t_prev = 823.5294189453125
t:12


 40%|████      | 12/30 [00:38<00:57,  3.20s/it]

Step 12: t_curr = 823.5294189453125, t_prev = 842.5925903320312
t:13


 43%|████▎     | 13/30 [00:41<00:54,  3.20s/it]

Step 13: t_curr = 842.5925903320312, t_prev = 859.6491088867188
t:14


 47%|████▋     | 14/30 [00:44<00:51,  3.19s/it]

Step 14: t_curr = 859.6491088867188, t_prev = 875.0
t:15


 50%|█████     | 15/30 [00:48<00:47,  3.19s/it]

Step 15: t_curr = 875.0, t_prev = 888.888916015625
t:16


 53%|█████▎    | 16/30 [00:51<00:44,  3.19s/it]

Step 16: t_curr = 888.888916015625, t_prev = 901.51513671875
t:17


 57%|█████▋    | 17/30 [00:54<00:41,  3.19s/it]

Step 17: t_curr = 901.51513671875, t_prev = 913.04345703125
t:18


 60%|██████    | 18/30 [00:57<00:38,  3.20s/it]

Step 18: t_curr = 913.04345703125, t_prev = 923.611083984375
t:19


 63%|██████▎   | 19/30 [01:00<00:35,  3.20s/it]

Step 19: t_curr = 923.611083984375, t_prev = 933.333251953125
t:20


 67%|██████▋   | 20/30 [01:04<00:31,  3.20s/it]

Step 20: t_curr = 933.333251953125, t_prev = 942.3078002929688
t:21


 70%|███████   | 21/30 [01:07<00:28,  3.20s/it]

Step 21: t_curr = 942.3078002929688, t_prev = 950.6172485351562
t:22


 73%|███████▎  | 22/30 [01:10<00:25,  3.20s/it]

Step 22: t_curr = 950.6172485351562, t_prev = 958.3333740234375
t:23


 77%|███████▋  | 23/30 [01:13<00:22,  3.20s/it]

Step 23: t_curr = 958.3333740234375, t_prev = 965.5172119140625
t:24


 80%|████████  | 24/30 [01:16<00:19,  3.20s/it]

Step 24: t_curr = 965.5172119140625, t_prev = 972.22216796875
t:25


 83%|████████▎ | 25/30 [01:20<00:16,  3.21s/it]

Step 25: t_curr = 972.22216796875, t_prev = 978.49462890625
t:26


 87%|████████▋ | 26/30 [01:24<00:14,  3.62s/it]

Step 26: t_curr = 978.49462890625, t_prev = 984.375
t:27


 90%|█████████ | 27/30 [01:29<00:11,  3.84s/it]

Step 27: t_curr = 984.375, t_prev = 989.8989868164062
t:28


 93%|█████████▎| 28/30 [01:33<00:07,  3.92s/it]

Step 28: t_curr = 989.8989868164062, t_prev = 995.0980834960938
t:29


 97%|█████████▋| 29/30 [01:37<00:04,  4.07s/it]

Step 29: t_curr = 995.0980834960938, t_prev = 1000.0
t:30


100%|██████████| 30/30 [01:42<00:00,  3.40s/it]


cuda:0
cuda:0
sigmas: tensor([1.0000, 0.9951, 0.9899, 0.9844, 0.9785, 0.9722, 0.9655, 0.9583, 0.9506,
        0.9423, 0.9333, 0.9236, 0.9130, 0.9015, 0.8889, 0.8750, 0.8596, 0.8426,
        0.8235, 0.8021, 0.7778, 0.7500, 0.7179, 0.6806, 0.6364, 0.5833, 0.5185,
        0.4375, 0.3333, 0.1944, 0.0000])
Using inversed latents
Step 0: t = 1000.0
Step 1: t = 995.0980834960938
Step 2: t = 989.8989868164062
Step 3: t = 984.375
Step 4: t = 978.49462890625
Step 5: t = 972.22216796875
Step 6: t = 965.5172119140625
Step 7: t = 958.3333740234375
Step 8: t = 950.6172485351562
Step 9: t = 942.3078002929688
Step 10: t = 933.333251953125
Step 11: t = 923.611083984375
Step 12: t = 913.04345703125
Step 13: t = 901.51513671875
Step 14: t = 888.888916015625
Step 15: t = 875.0
Step 16: t = 859.6491088867188
Step 17: t = 842.5925903320312
Step 18: t = 823.5294189453125
Step 19: t = 802.0833129882812
Step 20: t = 777.77783203125
Step 21: t = 750.0
Step 22: t = 717.94873046875
Step 23: t = 680.5556030273438


  0%|          | 0/30 [00:00<?, ?it/s]

Step 0: t_curr = 1000.0, t_prev = 995.0980834960938
t:30


  3%|▎         | 1/30 [00:03<01:39,  3.41s/it]

Step 1: t_curr = 995.0980834960938, t_prev = 989.8989868164062
t:29


  7%|▋         | 2/30 [00:07<01:38,  3.52s/it]

Step 2: t_curr = 989.8989868164062, t_prev = 984.375
t:28


 10%|█         | 3/30 [00:10<01:35,  3.52s/it]

Step 3: t_curr = 984.375, t_prev = 978.49462890625
t:27


 13%|█▎        | 4/30 [00:14<01:32,  3.55s/it]

Step 4: t_curr = 978.49462890625, t_prev = 972.22216796875
t:26


 17%|█▋        | 5/30 [00:17<01:28,  3.56s/it]

Step 5: t_curr = 972.22216796875, t_prev = 965.5172119140625
t:25


 20%|██        | 6/30 [00:20<01:22,  3.43s/it]

Step 6: t_curr = 965.5172119140625, t_prev = 958.3333740234375
t:24


 23%|██▎       | 7/30 [00:24<01:17,  3.38s/it]

Step 7: t_curr = 958.3333740234375, t_prev = 950.6172485351562
t:23


 27%|██▋       | 8/30 [00:27<01:13,  3.33s/it]

Step 8: t_curr = 950.6172485351562, t_prev = 942.3078002929688
t:22


 30%|███       | 9/30 [00:30<01:09,  3.29s/it]

Step 9: t_curr = 942.3078002929688, t_prev = 933.333251953125
t:21


 33%|███▎      | 10/30 [00:33<01:05,  3.27s/it]

Step 10: t_curr = 933.333251953125, t_prev = 923.611083984375
t:20


 37%|███▋      | 11/30 [00:37<01:01,  3.25s/it]

Step 11: t_curr = 923.611083984375, t_prev = 913.04345703125
t:19


 40%|████      | 12/30 [00:40<00:58,  3.24s/it]

Step 12: t_curr = 913.04345703125, t_prev = 901.51513671875
t:18


 43%|████▎     | 13/30 [00:43<00:54,  3.23s/it]

Step 13: t_curr = 901.51513671875, t_prev = 888.888916015625
t:17


 47%|████▋     | 14/30 [00:46<00:51,  3.22s/it]

Step 14: t_curr = 888.888916015625, t_prev = 875.0
t:16


 50%|█████     | 15/30 [00:49<00:48,  3.22s/it]

Step 15: t_curr = 875.0, t_prev = 859.6491088867188
t:15


 53%|█████▎    | 16/30 [00:53<00:45,  3.22s/it]

Step 16: t_curr = 859.6491088867188, t_prev = 842.5925903320312
t:14


 57%|█████▋    | 17/30 [00:56<00:41,  3.21s/it]

Step 17: t_curr = 842.5925903320312, t_prev = 823.5294189453125
t:13


 60%|██████    | 18/30 [00:59<00:38,  3.20s/it]

Step 18: t_curr = 823.5294189453125, t_prev = 802.0833129882812
t:12


 63%|██████▎   | 19/30 [01:02<00:35,  3.21s/it]

Step 19: t_curr = 802.0833129882812, t_prev = 777.77783203125
t:11


 67%|██████▋   | 20/30 [01:05<00:32,  3.21s/it]

Step 20: t_curr = 777.77783203125, t_prev = 750.0
t:10


 70%|███████   | 21/30 [01:09<00:28,  3.21s/it]

Step 21: t_curr = 750.0, t_prev = 717.94873046875
t:9


 73%|███████▎  | 22/30 [01:12<00:25,  3.21s/it]

Step 22: t_curr = 717.94873046875, t_prev = 680.5556030273438
t:8


 77%|███████▋  | 23/30 [01:15<00:22,  3.21s/it]

Step 23: t_curr = 680.5556030273438, t_prev = 636.3637084960938
t:7


 80%|████████  | 24/30 [01:18<00:19,  3.21s/it]

Step 24: t_curr = 636.3637084960938, t_prev = 583.3333740234375
t:6


 83%|████████▎ | 25/30 [01:21<00:16,  3.22s/it]

Step 25: t_curr = 583.3333740234375, t_prev = 518.5184936523438
t:5


 87%|████████▋ | 26/30 [01:25<00:12,  3.22s/it]

Step 26: t_curr = 518.5184936523438, t_prev = 437.5000305175781
t:4


 90%|█████████ | 27/30 [01:28<00:09,  3.22s/it]

Step 27: t_curr = 437.5000305175781, t_prev = 333.3333435058594
t:3


 93%|█████████▎| 28/30 [01:31<00:06,  3.21s/it]

Step 28: t_curr = 333.3333435058594, t_prev = 194.44444274902344
t:2


 97%|█████████▋| 29/30 [01:34<00:03,  3.21s/it]

Step 29: t_curr = 194.44444274902344, t_prev = 0.0
t:1


100%|██████████| 30/30 [01:37<00:00,  3.27s/it]
[32m2025-04-03 12:50:22.694[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m709[0m - [1mSuccess, time: 235.00394535064697[0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2025-04-03 12:50:25.563[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mSample save to: ./results/rf-solver_2order/2025-04-03-12:50:22_seed42_a grey car navigates a curvy road surrounded by green grass trees and mountains..mp4[0m


In [2]:
# Same solver settings applied to a longer clip (53 frames) from the BalanceCC-rename directory:
# black swan -> flamingo. Results go to ./results rather than ./results/rf-solver_2order.
# The recorded log shows roughly 7 s per step here versus roughly 3.2 s per step for the 25-frame run.
%run edit_video.py \
    --dit-weight "/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt" \
    --video-size 512 512 \
    --video-length 53 \
    --infer-steps 30 \
    --prompt "A black swan swimming in a pond with lush greenery in the background." \
    --target-prompt "A majestic flamingo swimming in a pond with lush greenery in the background." \
    --inject 5 \
    --seed 42 \
    --embedded-cfg-scale 2 \
    --flow-shift 7.0 \
    --flow-reverse \
    --use-cpu-offload \
    --use-fp8 \
    --save-path ./results \
    --inverse-video-path "/data/chx/BalanceCC-rename/Result/Animal/blackswan.mp4"

[32m2025-04-03 12:50:25.940[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mfrom_pretrained[0m:[36m155[0m - [1mGot text-to-video model root path: ckpts[0m
[32m2025-04-03 12:50:25.941[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mfrom_pretrained[0m:[36m190[0m - [1mBuilding model...[0m


{'apply_final_norm': False,
 'batch_size': 1,
 'cfg_scale': 1.0,
 'denoise_type': 'flow',
 'disable_autocast': False,
 'dit_weight': '/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt',
 'embedded_cfg_scale': 2.0,
 'feature_path': 'feature',
 'flow_reverse': True,
 'flow_shift': 7.0,
 'flow_solver': 'euler',
 'hidden_state_skip_layer': 2,
 'infer_steps': 30,
 'inject': 5,
 'inverse_video_path': '/data/chx/BalanceCC-rename/Result/Animal/blackswan.mp4',
 'latent_channels': 16,
 'linear_schedule_end': 25,
 'load_key': 'module',
 'model': 'HYVideo-T/2-cfgdistill',
 'model_base': 'ckpts',
 'model_resolution': '540p',
 'name_suffix': '',
 'neg_prompt': None,
 'num_videos': 1,
 'precision': 'bf16',
 'prompt': 'A black swan swimming in a pond with lush greenery in the '
           'background.',
 'prompt_template': 'dit-llm-encode',
 'prompt_template_video': 'dit-llm-encode-video',
 'reproduce': False,
 'ring_degree': 1,
 'rope_theta': 256,
 

[32m2025-04-03 12:52:20.378[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mload_state_dict[0m:[36m345[0m - [1mLoading torch model /home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt...[0m
[32m2025-04-03 12:53:03.710[0m | [1mINFO    [0m | [36mhyvideo.vae[0m:[36mload_vae[0m:[36m29[0m - [1mLoading 3D VAE model (884-16c-hy) from: ./ckpts/hunyuan-video-t2v-720p/vae[0m


loading vae


[32m2025-04-03 12:53:06.936[0m | [1mINFO    [0m | [36mhyvideo.vae[0m:[36mload_vae[0m:[36m55[0m - [1mVAE to dtype: torch.float16[0m
[32m2025-04-03 12:53:06.942[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m28[0m - [1mLoading text encoder model (llm) from: ./ckpts/text_encoder[0m
Loading checkpoint shards: 100%|██████████| 4/4 [00:42<00:00, 10.56s/it]
[32m2025-04-03 12:53:54.599[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m50[0m - [1mText encoder to dtype: torch.float16[0m
[32m2025-04-03 12:53:54.602[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_tokenizer[0m:[36m64[0m - [1mLoading tokenizer (llm) from: ./ckpts/text_encoder[0m
[32m2025-04-03 12:53:55.238[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m28[0m - [1mLoading text encoder model (clipL) from: ./ckpts/text_encoder_2[0m
[32m2025-04-03 12:53:56.092[0m | [1mINFO

Enable sequential CPU offload.
Updated args:
{'apply_final_norm': False,
 'batch_size': 1,
 'cfg_scale': 1.0,
 'denoise_type': 'flow',
 'disable_autocast': False,
 'dit_weight': '/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt',
 'embedded_cfg_scale': 2.0,
 'feature_path': 'feature',
 'flow_reverse': True,
 'flow_shift': 7.0,
 'flow_solver': 'euler',
 'hidden_state_skip_layer': 2,
 'infer_steps': 30,
 'inject': 5,
 'inverse_video_path': '/data/chx/BalanceCC-rename/Result/Animal/blackswan.mp4',
 'latent_channels': 16,
 'linear_schedule_end': 25,
 'load_key': 'module',
 'model': 'HYVideo-T/2-cfgdistill',
 'model_base': 'ckpts',
 'model_resolution': '540p',
 'name_suffix': '',
 'neg_prompt': None,
 'num_videos': 1,
 'precision': 'bf16',
 'prompt': 'A black swan swimming in a pond with lush greenery in the '
           'background.',
 'prompt_template': 'dit-llm-encode',
 'prompt_template_video': 'dit-llm-encode-video',
 'reproduce': Fa

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2025-04-03 12:53:57.228[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m605[0m - [1mInput (height, width, video_length) = (512, 512, 53)[0m
[32m2025-04-03 12:53:57.435[0m | [34m[1mDEBUG   [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m665[0m - [34m[1m
                        height: 512
                         width: 512
                  video_length: 53
                        prompt: ['A black swan swimming in a pond with lush greenery in the background.']
                    neg_prompt: ['Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion']
                          s

cuda:0
cuda:0
sigmas_reverse
sigmas: tensor([0.0000, 0.1944, 0.3333, 0.4375, 0.5185, 0.5833, 0.6364, 0.6806, 0.7179,
        0.7500, 0.7778, 0.8021, 0.8235, 0.8426, 0.8596, 0.8750, 0.8889, 0.9015,
        0.9130, 0.9236, 0.9333, 0.9423, 0.9506, 0.9583, 0.9655, 0.9722, 0.9785,
        0.9844, 0.9899, 0.9951, 1.0000])
Using inversed latents
Step 0: t = 0.0
Step 1: t = 194.44444274902344
Step 2: t = 333.3333435058594
Step 3: t = 437.5000305175781
Step 4: t = 518.5184936523438
Step 5: t = 583.3333740234375
Step 6: t = 636.3637084960938
Step 7: t = 680.5556030273438
Step 8: t = 717.94873046875
Step 9: t = 750.0
Step 10: t = 777.77783203125
Step 11: t = 802.0833129882812
Step 12: t = 823.5294189453125
Step 13: t = 842.5925903320312
Step 14: t = 859.6491088867188
Step 15: t = 875.0
Step 16: t = 888.888916015625
Step 17: t = 901.51513671875
Step 18: t = 913.04345703125
Step 19: t = 923.611083984375
Step 20: t = 933.333251953125
Step 21: t = 942.3078002929688
Step 22: t = 950.6172485351562
Step

  0%|          | 0/30 [00:00<?, ?it/s]

Step 0: t_curr = 0.0, t_prev = 194.44444274902344
t:1


  3%|▎         | 1/30 [00:06<03:17,  6.81s/it]

Step 1: t_curr = 194.44444274902344, t_prev = 333.3333435058594
t:2


  7%|▋         | 2/30 [00:13<03:10,  6.81s/it]

Step 2: t_curr = 333.3333435058594, t_prev = 437.5000305175781
t:3


 10%|█         | 3/30 [00:20<03:04,  6.82s/it]

Step 3: t_curr = 437.5000305175781, t_prev = 518.5184936523438
t:4


 13%|█▎        | 4/30 [00:27<02:57,  6.82s/it]

Step 4: t_curr = 518.5184936523438, t_prev = 583.3333740234375
t:5


 17%|█▋        | 5/30 [00:34<02:50,  6.83s/it]

Step 5: t_curr = 583.3333740234375, t_prev = 636.3637084960938
t:6


 20%|██        | 6/30 [00:40<02:44,  6.84s/it]

Step 6: t_curr = 636.3637084960938, t_prev = 680.5556030273438
t:7


 23%|██▎       | 7/30 [00:47<02:37,  6.84s/it]

Step 7: t_curr = 680.5556030273438, t_prev = 717.94873046875
t:8


 27%|██▋       | 8/30 [00:54<02:30,  6.85s/it]

Step 8: t_curr = 717.94873046875, t_prev = 750.0
t:9


 30%|███       | 9/30 [01:01<02:24,  6.88s/it]

Step 9: t_curr = 750.0, t_prev = 777.77783203125
t:10


 33%|███▎      | 10/30 [01:08<02:17,  6.89s/it]

Step 10: t_curr = 777.77783203125, t_prev = 802.0833129882812
t:11


 37%|███▋      | 11/30 [01:15<02:11,  6.89s/it]

Step 11: t_curr = 802.0833129882812, t_prev = 823.5294189453125
t:12


 40%|████      | 12/30 [01:22<02:04,  6.90s/it]

Step 12: t_curr = 823.5294189453125, t_prev = 842.5925903320312
t:13


 43%|████▎     | 13/30 [01:29<01:57,  6.91s/it]

Step 13: t_curr = 842.5925903320312, t_prev = 859.6491088867188
t:14


 47%|████▋     | 14/30 [01:36<01:50,  6.91s/it]

Step 14: t_curr = 859.6491088867188, t_prev = 875.0
t:15


 50%|█████     | 15/30 [01:43<01:43,  6.91s/it]

Step 15: t_curr = 875.0, t_prev = 888.888916015625
t:16


 53%|█████▎    | 16/30 [01:50<01:36,  6.91s/it]

Step 16: t_curr = 888.888916015625, t_prev = 901.51513671875
t:17


 57%|█████▋    | 17/30 [01:56<01:30,  6.93s/it]

Step 17: t_curr = 901.51513671875, t_prev = 913.04345703125
t:18


 60%|██████    | 18/30 [02:03<01:23,  6.93s/it]

Step 18: t_curr = 913.04345703125, t_prev = 923.611083984375
t:19


 63%|██████▎   | 19/30 [02:10<01:16,  6.93s/it]

Step 19: t_curr = 923.611083984375, t_prev = 933.333251953125
t:20


 67%|██████▋   | 20/30 [02:17<01:09,  6.92s/it]

Step 20: t_curr = 933.333251953125, t_prev = 942.3078002929688
t:21


 70%|███████   | 21/30 [02:24<01:02,  6.89s/it]

Step 21: t_curr = 942.3078002929688, t_prev = 950.6172485351562
t:22


 73%|███████▎  | 22/30 [02:31<00:54,  6.87s/it]

Step 22: t_curr = 950.6172485351562, t_prev = 958.3333740234375
t:23


 77%|███████▋  | 23/30 [02:38<00:48,  6.87s/it]

Step 23: t_curr = 958.3333740234375, t_prev = 965.5172119140625
t:24


 80%|████████  | 24/30 [02:45<00:41,  6.87s/it]

Step 24: t_curr = 965.5172119140625, t_prev = 972.22216796875
t:25


 83%|████████▎ | 25/30 [02:51<00:34,  6.86s/it]

Step 25: t_curr = 972.22216796875, t_prev = 978.49462890625
t:26


 87%|████████▋ | 26/30 [03:01<00:30,  7.55s/it]

Step 26: t_curr = 978.49462890625, t_prev = 984.375
t:27


 90%|█████████ | 27/30 [03:09<00:23,  7.92s/it]

Step 27: t_curr = 984.375, t_prev = 989.8989868164062
t:28


 93%|█████████▎| 28/30 [03:18<00:16,  8.13s/it]

Step 28: t_curr = 989.8989868164062, t_prev = 995.0980834960938
t:29


 97%|█████████▋| 29/30 [03:27<00:08,  8.30s/it]

Step 29: t_curr = 995.0980834960938, t_prev = 1000.0
t:30


100%|██████████| 30/30 [03:35<00:00,  7.20s/it]


cuda:0
cuda:0
sigmas: tensor([1.0000, 0.9951, 0.9899, 0.9844, 0.9785, 0.9722, 0.9655, 0.9583, 0.9506,
        0.9423, 0.9333, 0.9236, 0.9130, 0.9015, 0.8889, 0.8750, 0.8596, 0.8426,
        0.8235, 0.8021, 0.7778, 0.7500, 0.7179, 0.6806, 0.6364, 0.5833, 0.5185,
        0.4375, 0.3333, 0.1944, 0.0000])
Using inversed latents
Step 0: t = 1000.0
Step 1: t = 995.0980834960938
Step 2: t = 989.8989868164062
Step 3: t = 984.375
Step 4: t = 978.49462890625
Step 5: t = 972.22216796875
Step 6: t = 965.5172119140625
Step 7: t = 958.3333740234375
Step 8: t = 950.6172485351562
Step 9: t = 942.3078002929688
Step 10: t = 933.333251953125
Step 11: t = 923.611083984375
Step 12: t = 913.04345703125
Step 13: t = 901.51513671875
Step 14: t = 888.888916015625
Step 15: t = 875.0
Step 16: t = 859.6491088867188
Step 17: t = 842.5925903320312
Step 18: t = 823.5294189453125
Step 19: t = 802.0833129882812
Step 20: t = 777.77783203125
Step 21: t = 750.0
Step 22: t = 717.94873046875
Step 23: t = 680.5556030273438


  0%|          | 0/30 [00:00<?, ?it/s]

Step 0: t_curr = 1000.0, t_prev = 995.0980834960938
t:30


  3%|▎         | 1/30 [00:07<03:24,  7.06s/it]

Step 1: t_curr = 995.0980834960938, t_prev = 989.8989868164062
t:29


  7%|▋         | 2/30 [00:14<03:18,  7.08s/it]

Step 2: t_curr = 989.8989868164062, t_prev = 984.375
t:28


 10%|█         | 3/30 [00:21<03:11,  7.09s/it]

Step 3: t_curr = 984.375, t_prev = 978.49462890625
t:27


 13%|█▎        | 4/30 [00:28<03:05,  7.12s/it]

Step 4: t_curr = 978.49462890625, t_prev = 972.22216796875
t:26


 17%|█▋        | 5/30 [00:35<02:59,  7.19s/it]

Step 5: t_curr = 972.22216796875, t_prev = 965.5172119140625
t:25


 20%|██        | 6/30 [00:42<02:49,  7.07s/it]

Step 6: t_curr = 965.5172119140625, t_prev = 958.3333740234375
t:24


 23%|██▎       | 7/30 [00:49<02:40,  6.99s/it]

Step 7: t_curr = 958.3333740234375, t_prev = 950.6172485351562
t:23


 27%|██▋       | 8/30 [00:56<02:32,  6.94s/it]

Step 8: t_curr = 950.6172485351562, t_prev = 942.3078002929688
t:22


 30%|███       | 9/30 [01:03<02:25,  6.91s/it]

Step 9: t_curr = 942.3078002929688, t_prev = 933.333251953125
t:21


 33%|███▎      | 10/30 [01:09<02:17,  6.88s/it]

Step 10: t_curr = 933.333251953125, t_prev = 923.611083984375
t:20


 37%|███▋      | 11/30 [01:16<02:10,  6.88s/it]

Step 11: t_curr = 923.611083984375, t_prev = 913.04345703125
t:19


 40%|████      | 12/30 [01:23<02:04,  6.89s/it]

Step 12: t_curr = 913.04345703125, t_prev = 901.51513671875
t:18


 43%|████▎     | 13/30 [01:30<01:57,  6.90s/it]

Step 13: t_curr = 901.51513671875, t_prev = 888.888916015625
t:17


 47%|████▋     | 14/30 [01:37<01:50,  6.92s/it]

Step 14: t_curr = 888.888916015625, t_prev = 875.0
t:16


 50%|█████     | 15/30 [01:44<01:44,  6.95s/it]

Step 15: t_curr = 875.0, t_prev = 859.6491088867188
t:15


 53%|█████▎    | 16/30 [01:51<01:37,  6.93s/it]

Step 16: t_curr = 859.6491088867188, t_prev = 842.5925903320312
t:14


 57%|█████▋    | 17/30 [01:58<01:29,  6.91s/it]

Step 17: t_curr = 842.5925903320312, t_prev = 823.5294189453125
t:13


 60%|██████    | 18/30 [02:05<01:22,  6.89s/it]

Step 18: t_curr = 823.5294189453125, t_prev = 802.0833129882812
t:12


 63%|██████▎   | 19/30 [02:12<01:15,  6.88s/it]

Step 19: t_curr = 802.0833129882812, t_prev = 777.77783203125
t:11


 67%|██████▋   | 20/30 [02:18<01:08,  6.87s/it]

Step 20: t_curr = 777.77783203125, t_prev = 750.0
t:10


 70%|███████   | 21/30 [02:25<01:01,  6.86s/it]

Step 21: t_curr = 750.0, t_prev = 717.94873046875
t:9


 73%|███████▎  | 22/30 [02:32<00:54,  6.86s/it]

Step 22: t_curr = 717.94873046875, t_prev = 680.5556030273438
t:8


 77%|███████▋  | 23/30 [02:39<00:48,  6.87s/it]

Step 23: t_curr = 680.5556030273438, t_prev = 636.3637084960938
t:7


 80%|████████  | 24/30 [02:46<00:41,  6.86s/it]

Step 24: t_curr = 636.3637084960938, t_prev = 583.3333740234375
t:6


 83%|████████▎ | 25/30 [02:53<00:34,  6.87s/it]

Step 25: t_curr = 583.3333740234375, t_prev = 518.5184936523438
t:5


 87%|████████▋ | 26/30 [03:00<00:27,  6.88s/it]

Step 26: t_curr = 518.5184936523438, t_prev = 437.5000305175781
t:4


 90%|█████████ | 27/30 [03:06<00:20,  6.87s/it]

Step 27: t_curr = 437.5000305175781, t_prev = 333.3333435058594
t:3


 93%|█████████▎| 28/30 [03:13<00:13,  6.87s/it]

Step 28: t_curr = 333.3333435058594, t_prev = 194.44444274902344
t:2


 97%|█████████▋| 29/30 [03:20<00:06,  6.86s/it]

Step 29: t_curr = 194.44444274902344, t_prev = 0.0
t:1


100%|██████████| 30/30 [03:27<00:00,  6.92s/it]
[32m2025-04-03 13:01:46.524[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m709[0m - [1mSuccess, time: 469.08821153640747[0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2025-04-03 13:01:49.657[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mSample save to: ./results/2025-04-03-13:01:46_seed42_A black swan swimming in a pond with lush greenery in the background..mp4[0m


In [1]:
# Run the sample_video.py script in this kernel via the IPython %run magic to
# generate one text-to-video sample. The flags request a 512x512, 25-frame
# clip with 30 inference steps and seed 42 for the prompt below, loading the
# FP8 DiT checkpoint with CPU offload enabled, and write the result under
# ./results. The exact meaning of each flag is defined by sample_video.py.
#
# NOTE(review): --dit-weight is an absolute, machine-specific path under
# /home/chx; this cell will not run elsewhere until it is pointed at a local
# copy of the checkpoint.
# NOTE(review): keep comments above the command only -- the trailing
# backslashes join the lines into a single %run invocation.
%run sample_video.py \
    --dit-weight "/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt" \
    --video-size 512 512 \
    --video-length 25 \
    --infer-steps 30 \
    --prompt "A cat walks on the grass, realistic style." \
    --seed 42 \
    --embedded-cfg-scale 6.0 \
    --flow-shift 7.0 \
    --flow-reverse \
    --use-cpu-offload \
    --use-fp8 \
    --save-path ./results

  from .autonotebook import tqdm as notebook_tqdm
[32m2025-01-14 20:33:19.204[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mfrom_pretrained[0m:[36m154[0m - [1mGot text-to-video model root path: ckpts[0m
[32m2025-01-14 20:33:19.205[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mfrom_pretrained[0m:[36m189[0m - [1mBuilding model...[0m


detect you are not use the latest yunchang. Please install yunchang>=0.4.0
Namespace(model='HYVideo-T/2-cfgdistill', latent_channels=16, precision='bf16', rope_theta=256, vae='884-16c-hy', vae_precision='fp16', vae_tiling=True, text_encoder='llm', text_encoder_precision='fp16', text_states_dim=4096, text_len=256, tokenizer='llm', prompt_template='dit-llm-encode', prompt_template_video='dit-llm-encode-video', hidden_state_skip_layer=2, apply_final_norm=False, text_encoder_2='clipL', text_encoder_precision_2='fp16', text_states_dim_2=768, tokenizer_2='clipL', text_len_2=77, denoise_type='flow', flow_shift=7.0, flow_reverse=True, flow_solver='euler', use_linear_quadratic_schedule=False, linear_schedule_end=25, model_base='ckpts', dit_weight='/home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt', model_resolution='540p', load_key='module', use_cpu_offload=True, batch_size=1, infer_steps=30, disable_autocast=False, save_path='./results', save

  fp8_map = torch.load(fp8_map_path, map_location=lambda storage, loc: storage)
[32m2025-01-14 20:35:44.555[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mload_state_dict[0m:[36m344[0m - [1mLoading torch model /home/chx/mySrc/HunyuanVideo/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt...[0m
  state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
[32m2025-01-14 20:35:58.311[0m | [1mINFO    [0m | [36mhyvideo.vae[0m:[36mload_vae[0m:[36m29[0m - [1mLoading 3D VAE model (884-16c-hy) from: ./ckpts/hunyuan-video-t2v-720p/vae[0m


loading vae


  ckpt = torch.load(vae_ckpt, map_location=vae.device)
[32m2025-01-14 20:36:02.525[0m | [1mINFO    [0m | [36mhyvideo.vae[0m:[36mload_vae[0m:[36m55[0m - [1mVAE to dtype: torch.float16[0m
[32m2025-01-14 20:36:02.534[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m28[0m - [1mLoading text encoder model (llm) from: ./ckpts/text_encoder[0m
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]
[32m2025-01-14 20:36:13.190[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m50[0m - [1mText encoder to dtype: torch.float16[0m
[32m2025-01-14 20:36:13.194[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_tokenizer[0m:[36m64[0m - [1mLoading tokenizer (llm) from: ./ckpts/text_encoder[0m
[32m2025-01-14 20:36:13.772[0m | [1mINFO    [0m | [36mhyvideo.text_encoder[0m:[36mload_text_encoder[0m:[36m28[0m - [1mLoading text encoder model (clipL) from: ./ckpts/text_en

Enable sequential CPU offload.


[32m2025-01-14 20:36:15.189[0m | [34m[1mDEBUG   [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m650[0m - [34m[1m
                        height: 512
                         width: 512
                  video_length: 25
                        prompt: ['A cat walks on the grass, realistic style.']
                    neg_prompt: ['Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion']
                          seed: 42
                   infer_steps: 30
         num_videos_per_prompt: 1
                guidance_scale: 1.0
                      n_tokens: 7168
                    flow_shift: 7.0
       embedded_guidance_scale: 6.0[0m


self._execution_device cuda:0
cuda:0
cuda:0
Step 0: t = 1000.0
Step 1: t = 995.0980834960938
Step 2: t = 989.8989868164062
Step 3: t = 984.375
Step 4: t = 978.49462890625
Step 5: t = 972.22216796875
Step 6: t = 965.5172119140625
Step 7: t = 958.3333740234375
Step 8: t = 950.6172485351562
Step 9: t = 942.3078002929688
Step 10: t = 933.333251953125
Step 11: t = 923.611083984375
Step 12: t = 913.04345703125
Step 13: t = 901.51513671875
Step 14: t = 888.888916015625
Step 15: t = 875.0
Step 16: t = 859.6491088867188
Step 17: t = 842.5925903320312
Step 18: t = 823.5294189453125
Step 19: t = 802.0833129882812
Step 20: t = 777.77783203125
Step 21: t = 750.0
Step 22: t = 717.94873046875
Step 23: t = 680.5556030273438
Step 24: t = 636.3637084960938
Step 25: t = 583.3333740234375
Step 26: t = 518.5184936523438
Step 27: t = 437.5000305175781
Step 28: t = 333.3333435058594
Step 29: t = 194.44444274902344


100%|██████████| 30/30 [00:49<00:00,  1.64s/it]
[32m2025-01-14 20:37:23.031[0m | [1mINFO    [0m | [36mhyvideo.inference[0m:[36mpredict[0m:[36m684[0m - [1mSuccess, time: 67.84050464630127[0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2025-01-14 20:37:24.919[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m57[0m - [1mSample save to: ./results/2025-01-14-20:37:23_seed42_A cat walks on the grass, realistic style..mp4[0m
