<a href="https://colab.research.google.com/github/LuthandoMaqondo/i2vgen-xl/blob/luthando-contribution/experiments.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Fri Feb 23 22:28:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.06              Driver Version: 545.29.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA Graphics Device         Off | 00000000:01:00.0  On |                  N/A |
|  0%   37C    P8              18W / 285W |   1062MiB / 16376MiB |     10%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import os
import sys

# Detect whether the notebook is running on Google Colab. Only a failed import
# means "not on Colab"; any other exception (e.g. KeyboardInterrupt) should
# propagate instead of being silently treated as a local run.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    WORKING_DIR = '/content/drive/MyDrive/Colab Notebooks'
    # Mount Google Drive so files under WORKING_DIR are accessible.
    drive.mount('/content/drive', force_remount=True)
    sys.path.insert(0, WORKING_DIR)
else:
    WORKING_DIR = '.'
    # The actual code is one level higher in the folder structure, so put the
    # parent directory on the import path. With WORKING_DIR == '.', the
    # expression below evaluates to "../".
    sys.path.insert(0, f".{WORKING_DIR}/")

In [3]:
if IN_COLAB:
    !git clone https://github.com/LuthandoMaqondo/i2vgen-xl.git
    !git checkout luthando-contribution
!pip install -r requirements.txt



In [4]:
%%writefile ./configs/t2v_train_custom.yaml

# Custom text-to-video training configuration. Everything below the
# %%writefile line is written verbatim to ./configs/t2v_train_custom.yaml.
TASK_TYPE: train_t2v_entrance
ENABLE: true
use_ema: false
num_workers: 6
# frame_lens and sample_fps both have 8 entries; presumably they are paired
# index-by-index — TODO confirm against the training code.
frame_lens: [1, 16, 16, 16, 16, 32, 32, 32]
sample_fps: [1,  8,  16, 16, 16, 8,  16, 16]
resolution: [448, 256]
vit_resolution: [224, 224]
# Video dataset: list file and video directory are given as relative paths.
vid_dataset: {
    'type': 'VideoDataset',
    'data_list': ['data/vid_list.txt', ],
    'data_dir_list': ['data/videos/', ],
    'vit_resolution': [224, 224],
    'resolution': [448, 256],
    'get_first_frame': True,
    'max_words': 1000,
}
# Image dataset: same resolution settings as the video dataset above.
img_dataset: {
    'type': 'ImageDataset',
    'data_list': ['data/img_list.txt', ],
    'data_dir_list': ['data/images', ],
    'vit_resolution': [224, 224],
    'resolution': [448, 256],
    'max_words': 1000
}
# Embedder loaded from a local weights file under models/.
embedder: {
    'type': 'FrozenOpenCLIPTtxtVisualEmbedder',
    'layer': 'penultimate',
    'vit_resolution': [224, 224],
    'pretrained': 'models/open_clip_pytorch_model.bin'
}
# UNet hyper-parameters; y_dim and context_dim are both 1024.
UNet: {
    'type': 'UNetSD_T2VBase',
    'in_dim': 4,
    'y_dim': 1024,
    'upper_len': 128,
    'context_dim': 1024,
    'out_dim': 4,
    'dim_mult': [1, 2, 4, 4],
    'num_heads': 8,
    'default_fps': 8,
    'head_dim': 64,
    'num_res_blocks': 2,
    'dropout': 0.1,
    'misc_dropout': 0.4,
    'temporal_attention': True,
    'temporal_attn_times': 1,
    'use_checkpoint': True,
    'use_fps_condition': False,
    'use_sim_mask': False
}
# Diffusion settings. The trailing "# cosine" comment on 'schedule' appears to
# record an alternative schedule name — TODO confirm it is supported.
Diffusion: {
    'type': 'DiffusionDDIM',
    'schedule': 'linear_sd', # cosine
    'schedule_param': {
        'num_timesteps': 1000,
        'init_beta': 0.00085,
        'last_beta': 0.0120,
        'zero_terminal_snr': False,
    },
    'mean_type': 'eps',
    'loss_type': 'mse',
    'var_type': 'fixed_small',
    'rescale_timesteps': False,
    'noise_strength': 0.0
}
# Keys are quoted integers; presumably batch size per number of frames
# (1, 16 and 32 also appear in frame_lens) — TODO confirm.
batch_sizes: {
    "1": 32,
    "4": 8,
    "8": 4,
    "16": 4,
    "32": 2
}
visual_train: {
    'type': 'VisualTrainTextImageToVideo',
    'partial_keys': [
        ['y', 'fps'],
    ],
    'use_offset_noise': False,
    'guide_scale': 9.0, 
}

# Pretrained-weight loading. The commented-out 'resume_checkpoint' entry is an
# alternative checkpoint path kept for reference; only the active one is used.
Pretrain: {
    'type': pretrain_specific_strategies,
    'fix_weight': False,
    'grad_scale': 0.5,
    # 'resume_checkpoint': 'workspace/model_bk/model_scope_0267000.pth',
    'resume_checkpoint': 'workspace/model_bk/text2video_pytorch_model.pth',
    'sd_keys_path': 'data/stable_diffusion_image_key_temporal_attention_x1.json',
}

chunk_size: 4
decoder_bs: 4
lr: 0.00003

# NOTE(review): Diffusion.noise_strength above is 0.0 while this top-level
# value is 0.1 — confirm which one the trainer actually reads.
noise_strength: 0.1
# classifier-free guidance
p_zero: 0.1
# NOTE(review): visual_train.guide_scale above is 9.0 while this top-level
# value is 3.0 — confirm which one applies where.
guide_scale: 3.0
num_steps: 100

use_zero_infer: True
# The trailing numbers (200, 500) look like previous/default intervals that
# were lowered for this small experiment — TODO confirm.
viz_interval: 5        # 200
save_ckp_interval: 50   # 500

# Log
log_dir: "workspace/experiments"
log_interval: 1
seed: 8888

Overwriting ./configs/t2v_train_custom.yaml


In [5]:
if IN_COLAB:
    !python preprocess.py \
        --json_file_path /content/drive/MyDrive/Colab Notebooks/datasets/Appimate/dataset.json \
        --output_dir /content/drive/MyDrive/Colab Notebooks/datasets/Appimate/train
else:
    !python preprocess.py \
        --json_file_path /home/luthando/.cache/datasets/Appimate/dataset.json \
        --output_dir /home/luthando/.cache/datasets/Appimate/train

Data preprocessing STARTED!
Data preprocessing DONE: 34 text prompts -> 34 videos


In [6]:
# Training is disabled for this run. Uncomment the line below to train with the
# configs/t2v_train_custom.yaml file written by the %%writefile cell earlier.
# !python train_net.py --cfg configs/t2v_train_custom.yaml

In [7]:
# Run inference using configs/t2v_infer.yaml. Note this is a different file
# from the configs/t2v_train_custom.yaml training config written earlier in
# this notebook.
!python inference.py --cfg configs/t2v_infer.yaml

[2024-02-23 22:28:04,106] INFO: {'__name__': 'Config: VideoLDM Decoder', 'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5], 'max_words': 1000, 'num_workers': 6, 'prefetch_factor': 2, 'resolution': [448, 256], 'vit_out_dim': 1024, 'vit_resolution': [224, 224], 'depth_clamp': 10.0, 'misc_size': 384, 'depth_std': 20.0, 'frame_lens': [1, 16, 16, 16, 16, 32, 32, 32], 'sample_fps': [1, 8, 16, 16, 16, 8, 16, 16], 'vid_dataset': {'type': 'VideoDataset', 'data_list': ['data/vid_list.txt'], 'max_words': 1000, 'resolution': [448, 256], 'data_dir_list': ['data/videos/'], 'vit_resolution': [224, 224], 'get_first_frame': True}, 'img_dataset': {'type': 'ImageDataset', 'data_list': ['data/img_list.txt'], 'max_words': 1000, 'resolution': [448, 256], 'data_dir_list': ['data/images'], 'vit_resolution': [224, 224]}, 'batch_sizes': {'1': 32, '4': 8, '8': 4, '16': 4, '32': 2}, 'Diffusion': {'type': 'DiffusionDDIM', 'schedule': 'linear_sd', 'schedule_param': {'num_timesteps': 1000, 'init_beta': 0.00085, 'last_