In [None]:
# %cd /content/drive/MyDrive/colab/gen

## Note:
The following steps include:
- Setup Environment  
    - ***If pip reports a dependency conflict involving numpy 2.x during installation, you can ignore it.***
- Restart Colab Runtime  
    - ***Important: the newly installed packages only take effect after the runtime has been restarted.***
- Prepare Code and Models  
- Inference and Display  

## Setup Environment

### show gpu info

In [None]:
# Report the GPU model with its total and free memory, then show the current
# working directory and its contents.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
!pwd
!ls

Tesla T4, 15360 MiB, 15095 MiB
/content
drive  sample_data


### check torch

In [None]:
# Display the PyTorch version available in the runtime before anything is
# installed by the cells below (shown as the cell's output value).
import torch
torch.__version__

### install packages

In [None]:
# about 0.5~1min
!pip install tensorrt==8.6.1 librosa tqdm filetype imageio opencv_python_headless scikit-image cython cuda-python imageio-ffmpeg colored polygraphy numpy==2.0.1

Collecting tensorrt==8.6.1
  Downloading tensorrt-8.6.1.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting filetype
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting colored
  Downloading colored-2.3.0-py3-none-any.whl.metadata (3.6 kB)
Collecting polygraphy
  Downloading polygraphy-0.49.18-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting numpy==2.0.1
  Downloading numpy-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading colored-2.3.0-py3-none-any.whl (18 kB)
Downloading polygraphy-0.49.18-py2.py3-no

In [None]:
# If it doesn't work, you may need to add this command:
!apt install -y libcudnn8

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  libcudnn8
0 upgraded, 1 newly installed, 0 to remove and 21 not upgraded.
Need to get 444 MB of archives.
After this operation, 1,099 MB of additional disk space will be used.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  libcudnn8 8.9.7.29-1+cuda12.2 [444 MB]
Fetched 444 MB in 5s (88.6 MB/s)
Selecting previously unselected package libcudnn8.
(Reading database ... 124926 files and directories currently installed.)
Preparing to unpack .../libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb ...
Unpacking libcudnn8 (8.9.7.29-1+cuda12.2) ...
Setting up libcudnn8 (8.9.7.29-1+cuda12.2) ...


### restart runtime

In [None]:
# !!!
# You need to restart the runtime to ensure that the newly installed environment takes effect
# !!!
# Sending signal 9 to the kernel's own process terminates it immediately, so
# nothing after this line runs. Continue with the next cell once a fresh
# kernel is available (Colab presumably reconnects on its own - TODO confirm).
import os
os.kill(os.getpid(), 9)

In [None]:
# Optional: uncomment to switch to a working folder on Google Drive before continuing.
# %cd /content/drive/MyDrive/colab/gen
# List the current directory to confirm where the following cells will run.
!ls

drive  sample_data


### check environment

In [None]:
import numpy as np
import torch
import tensorrt as trt
# One version per line, in the order numpy, torch, tensorrt.
print(np.__version__, torch.__version__, trt.__version__, sep="\n")

2.0.1
2.5.1+cu124
8.6.1


## Prepare Code and Models

### prepare code

In [None]:
# about 2s
import os
if not os.path.isdir("ditto-talkinghead"):
    !git clone https://github.com/MatthewCAlbert/ditto-talkinghead.git

%cd ditto-talkinghead
!git pull
!ls

Cloning into 'ditto-talkinghead'...
remote: Enumerating objects: 81, done.[K
remote: Counting objects: 100% (81/81), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 81 (delta 22), reused 69 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (81/81), 903.37 KiB | 2.70 MiB/s, done.
Resolving deltas: 100% (22/22), done.
/content/ditto-talkinghead
Already up to date.
core		  example	LICENSE    scripts		       stream_pipeline_online.py
environment.yaml  inference.py	README.md  stream_pipeline_offline.py


### prepare model

In [None]:
# about 1~2min
!git lfs install
if not os.path.isdir("checkpoints"):
    !git clone https://huggingface.co/digital-avatar/ditto-talkinghead checkpoints

%cd checkpoints
!git pull
!ls

%cd ..
!ls

Updated git hooks.
Git LFS initialized.
Cloning into 'checkpoints'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 53 (delta 7), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (53/53), 14.45 KiB | 870.00 KiB/s, done.
Filtering content: 100% (28/28), 4.29 GiB | 52.32 MiB/s, done.
/content/ditto-talkinghead/checkpoints
Already up to date.
ditto_cfg  ditto_onnx  ditto_trt_Ampere_Plus  LICENSE  README.md
/content/ditto-talkinghead
checkpoints  environment.yaml  inference.py  README.md	stream_pipeline_offline.py
core	     example	       LICENSE	     scripts	stream_pipeline_online.py


### check GPU architecture

In [None]:
# about 1~2min
import os
import torch

def cvt_custom_trt():
    """Run the repo's ONNX-to-TensorRT conversion script on the bundled models.

    Reads from ``./checkpoints/ditto_onnx`` (which must already exist) and
    writes to ``./checkpoints/ditto_trt_custom``, creating that directory when
    needed. The path of the grid-sample plugin library inside the ONNX
    directory is handed to the converter as its third argument.

    Returns:
        The output directory path, ``"./checkpoints/ditto_trt_custom"``.

    Raises:
        AssertionError: if the ONNX directory is missing.
    """
    # Imported lazily: the module lives in the cloned repo and is only needed here.
    from scripts.cvt_onnx_to_trt import main as convert_onnx_to_trt

    src_onnx = "./checkpoints/ditto_onnx"
    assert os.path.isdir(src_onnx)

    dst_trt = "./checkpoints/ditto_trt_custom"
    os.makedirs(dst_trt, exist_ok=True)

    plugin_lib = os.path.join(src_onnx, "libgrid_sample_3d_plugin.so")
    convert_onnx_to_trt(src_onnx, dst_trt, plugin_lib)
    return dst_trt


def download_Non_Ampere_trt():
    !pip install --upgrade --no-cache-dir gdown
    !gdown https://drive.google.com/drive/folders/1-1qnqy0D9ICgRh8iNY_22j9ieNRC0-zf?usp=sharing -O ./checkpoints/ditto_trt --folder
    trt_dir = "./checkpoints/ditto_trt"
    return trt_dir


# GPUs with a major compute capability of 8 or higher use the engines bundled
# in the checkpoints repo; anything older needs a different set.
if torch.cuda.get_device_capability()[0] >= 8:
    data_root = "./checkpoints/ditto_trt_Ampere_Plus"
else:
    # Converting locally is slow, so the pre-converted files are downloaded instead.
    # To convert instead, use: data_root = cvt_custom_trt()
    data_root = download_Non_Ampere_trt()

Retrieving folder contents
Processing file 1-6GtrU91DJvAqGBWUeC88fmW7MSCsoo9 appearance_extractor_fp16.engine
Processing file 1-EDxVdNVKN-kXH93Dph6wUKoctO3CULa blaze_face_fp16.engine
Processing file 1-ARl89o8u3vi1gcxbK66Vk0G377Vjj6j decoder_fp16.engine
Processing file 1-5QigS9HxUizMnJmJ-2tQ2e317p-06X0 face_mesh_fp16.engine
Processing file 1-KfXXUVtUpLA2azB9APNcCTEKTHGhew1 hubert_fp32.engine
Processing file 1-1xBZEBjyZB0khEBk2gKXMiwDxeYgWK- insightface_det_fp16.engine
Processing file 1-8Av3AXZtDqA7pgjB2IymHFlFzdp10Mo landmark106_fp16.engine
Processing file 1gBB5nIWW3pZ98PF9wd8yGR_aHhHxdsqF landmark203_fp16.engine
Processing file 1-9wZtSkPBi5VXC87Pa3RU0IlICgp-y32 lmdm_v0.4_hubert_fp32.engine
Processing file 1-HkpVhfH2Mbrbx3mX6eIqdB0F-vhoW17 motion_extractor_fp32.engine
Processing file 1-HOswhcFfvJEqzLVI1x3XxO0c1sYq8X1 stitch_network_fp16.engine
Processing file 1-D1OprBdD6K5upfNi4hGy-eRwvaLHOEN warp_network_fp16.engine
Retrieving folder contents completed
Building directory structure
Buil

## Inference

### run inference

In [None]:
# init, about 10s
from inference import StreamSDK, run
# To use locally converted engines, override the model dir chosen earlier:
# data_root = "./checkpoints/ditto_trt_custom"
cfg_pkl = "./checkpoints/ditto_cfg/v0.4_hubert_cfg_trt.pkl"  # pickled config handed to StreamSDK
# Echo both paths (model dir first, then config) before building the SDK.
print(data_root, cfg_pkl, sep="\n")
SDK = StreamSDK(cfg_pkl, data_root)

./checkpoints/ditto_trt
./checkpoints/ditto_cfg/v0.4_hubert_cfg_trt.pkl


In [None]:
# Takes about 1~2min with the example inputs.
source_path = "./example/image.png"   # source: image or video
audio_path = "./example/audio.wav"    # audio: .wav file
output_path = "./tmp/result.mp4"    # result: .mp4 file

run(SDK, audio_path, source_path, output_path)

max_size <class 'int'> 1920
template_n_frames <class 'int'> -1
crop_scale <class 'float'> 2.3
crop_vx_ratio <class 'int'> 0
crop_vy_ratio <class 'float'> -0.125
crop_flag_do_rot <class 'bool'> True
smo_k_s <class 'int'> 13
emo <class 'numpy.ndarray'> (600, 8)
eye_f0_mode <class 'bool'> False
ch_info <class 'dict'>
overlap_v2 <class 'int'> 10
fix_kp_cond <class 'int'> 1
fix_kp_cond_dim <class 'list'> [0, 202]
sampling_timesteps <class 'int'> 50
online_mode <class 'bool'> False
v_min_max_for_clip <class 'numpy.ndarray'> (4, 265)
smo_k_d <class 'int'> 3
N_d <class 'int'> -1
use_d_keys <class 'NoneType'> None
relative_d <class 'bool'> True
drive_eye <class 'NoneType'> None
delta_eye_arr <class 'numpy.ndarray'> (15, 63)
delta_eye_open_n <class 'int'> 0
fade_type <class 'str'> d0
fade_out_keys <class 'list'> ['exp']
flag_stitching <class 'bool'> True
overall_ctrl_info <class 'dict'> {'delta_pitch': 2}


writer: 0it [00:00, ?it/s]
dit: 0it [00:00, ?it/s][A
dit: 2it [00:00,  5.69it/s][A
dit: 3it [00:00,  4.02it/s][A
dit: 4it [00:01,  3.47it/s][A
dit: 5it [00:01,  3.24it/s][A
dit: 6it [00:02,  2.85it/s]
writer: 394it [01:38,  4.01it/s]


ffmpeg -loglevel error -y -i "./tmp/result.mp4.tmp.mp4" -i "./example/audio.wav" -map 0:v -map 1:a -c:v copy -c:a aac "./tmp/result.mp4"
./tmp/result.mp4


### display result

In [None]:
# display, about 5~10s
import glob
import os
import sys
from base64 import b64encode

from IPython.display import HTML, display

mp4_name = output_path

# Read through a context manager so the file handle is closed right away.
with open(mp4_name, 'rb') as video_file:
    mp4 = video_file.read()
# Embed the video as a base64 data URL so it plays inline without a file server.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

print('Display animation: {}'.format(mp4_name), file=sys.stderr)
display(HTML("""
  <video width=256 controls>
        <source src="%s" type="video/mp4">
  </video>
  """ % data_url))

In [None]:
# Show the GPU status (memory in use, utilisation, temperature) at this point in the notebook.
!nvidia-smi

Fri Feb 21 09:00:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   72C    P0             31W /   70W |    2950MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                