# **Sonic: Shifting Focus to Global Audio Perception in Portrait Animation**
- You can use this notebook for portrait animation from an image and an audio clip on the T4 GPU, but it is very slow. Only up to 5 seconds of audio has been tested on the T4, and it took 19 minutes to generate the video. Generating a 17-second video took about 8 minutes on the A100. The duration of the audio determines the duration of the video, and up to 45 seconds of audio has been tested on the A100.
- Github project: https://github.com/jixiaozhong/Sonic
- Notebook source: https://github.com/Isi-dev/Google-Colab_Notebooks
- Google Colab YouTube playlist: https://www.youtube.com/playlist?list=PLdi1sS5pbSYeA470Sb1wARR4OieCBIqMv

In [None]:
# @markdown # 💥1. Setup Environment
# Clone into /content so the repositories end up at the fixed paths used by the rest of this cell.
%cd /content/
from IPython.display import clear_output
# ComfyUI itself, then the Sonic custom node inside its custom_nodes folder.
!git clone https://github.com/Isi-dev/ComfyUI
%cd /content/ComfyUI/custom_nodes
!git clone https://github.com/Isi-dev/ComfyUI_Sonic
# Later code in this cell moves uploaded files out of /content/ComfyUI, so make it the working directory.
%cd /content/ComfyUI

# Hide the clone logs. NOTE(review): this also hides any clone error messages.
clear_output()

# Shell equivalents of install_pip_packages() / install_apt_packages() defined below, kept for reference:
# !pip install torchsde av diffusers torch torchaudio torchvision transformers imageio imageio-ffmpeg omegaconf tqdm librosa einops
# !apt -y install -qq aria2 ffmpeg

import subprocess
import sys

def install_pip_packages():
    """
    Install the notebook's Python dependencies with pip, one package per call.

    Each package gets its own quiet pip invocation so that a failing package
    does not prevent the remaining ones from being attempted. A "✓"/"✗" line
    is printed for every package.

    Returns:
        list: Names of the packages that failed to install (empty when every
              package installed successfully)
    """
    packages = [
        'torchsde',
        'av',
        'diffusers',
        'torch',
        'torchaudio',
        'torchvision',
        'transformers',
        'imageio',
        'imageio-ffmpeg',
        'omegaconf',
        'tqdm',
        'librosa',
        'einops'
    ]

    failed = []
    for package in packages:
        try:
            # Run pip install silently (using -q) with the interpreter running this notebook
            subprocess.run(
                [sys.executable, '-m', 'pip', 'install', '-q', package],
                check=True,
                capture_output=True
            )
            print(f"✓ {package} installed")
        except subprocess.CalledProcessError as e:
            # stderr is captured as bytes; decode leniently so an odd byte cannot mask the real error
            details = (e.stderr or b'').decode(errors='replace').strip() or 'Unknown error'
            print(f"✗ Error installing {package}: {details}")
            failed.append(package)

    # Callers can use the returned list to report real status instead of guessing
    return failed

def install_apt_packages():
    """
    Install the system packages the notebook needs (aria2 and ffmpeg) via apt-get.

    Returns:
        bool: True if apt-get exited successfully, False if it reported an error
    """
    packages = ['aria2', 'ffmpeg']

    try:
        # Run apt install silently (using -qq)
        subprocess.run(
            ['apt-get', '-y', 'install', '-qq'] + packages,
            check=True,
            capture_output=True
        )
        print("✓ apt packages installed")
        return True
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes; decode leniently so an odd byte cannot mask the real error
        details = (e.stderr or b'').decode(errors='replace').strip() or 'Unknown error'
        print(f"✗ Error installing apt packages: {details}")
        return False

# Run installations
# The install helpers report problems by printing lines marked with '✗'.
# clear_output() would wipe those lines, so each helper's output is captured
# and the final status is derived from what was actually printed.
# (Inspecting __code__.co_consts can never detect a failure: it only lists the
# literals compiled into the function, not what happened at run time.)
import contextlib
import io

print("Installing pip packages...")
pip_log = io.StringIO()
with contextlib.redirect_stdout(pip_log):
    install_pip_packages()
clear_output()  # Clear the pip installation output

print("Installing apt packages...")
apt_log = io.StringIO()
with contextlib.redirect_stdout(apt_log):
    install_apt_packages()
clear_output()  # Clear the apt installation output

pip_had_issues = '✗' in pip_log.getvalue()
apt_had_issues = '✗' in apt_log.getvalue()

print("Installation completed with status:")
print("- All pip packages installed successfully" if not pip_had_issues else "- Some pip packages had issues")
print("- apt packages installed successfully" if not apt_had_issues else "- apt packages had issues")

# Replay the captured logs when something went wrong so the cause is visible
if pip_had_issues:
    print(pip_log.getvalue().strip())
if apt_had_issues:
    print(apt_log.getvalue().strip())

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
import numpy as np
from PIL import Image
import gc
import sys

from google.colab import files
from IPython.display import display, HTML, Image as IPImage
sys.path.insert(0, '/content/ComfyUI')

from nodes import LoadImage
from comfy_extras.nodes_audio import LoadAudio
from comfy_extras.nodes_video_model import ImageOnlyCheckpointLoader
from custom_nodes.ComfyUI_Sonic.sonic_node import (
    SONICLoader,
    SONIC_PreData,
    SONICSampler
)

# One shared instance of each ComfyUI / Sonic node; generate_video() calls
# their methods directly instead of going through a ComfyUI workflow graph.
load_image = LoadImage()
load_audio = LoadAudio()
svd_loader = ImageOnlyCheckpointLoader()
sonic_loader = SONICLoader()
sonic_predata = SONIC_PreData()
sonic_sampler = SONICSampler()



# !mkdir -p /content/ComfyUI/models/sonic
# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/sonic/resolve/main/audio2bucket.pth -d /content/ComfyUI/models/sonic -o audio2bucket.pth

from pathlib import Path

def model_download(url: str, dest_dir: str, filename: str = None, silent: bool = True) -> bool:
    """
    Colab-optimized download with aria2c

    Args:
        url: Download URL
        dest_dir: Target directory (will be created if needed)
        filename: Optional output filename (defaults to the last path segment
                  of the URL with any query string removed)
        silent: If True, suppresses aria2c's output and prints only a short
                "Downloading ... Done!" line (errors are still printed)

    Returns:
        bool: True if successful, False if failed (the error is printed, never raised)
    """
    try:
        # Create destination directory
        Path(dest_dir).mkdir(parents=True, exist_ok=True)

        # Set filename if not specified
        if filename is None:
            filename = url.split('/')[-1].split('?')[0]  # Remove URL parameters

        # Build command
        cmd = [
            'aria2c',
            '--console-log-level=error',
            '-c', '-x', '16', '-s', '16', '-k', '1M',
            '-d', dest_dir,
            '-o', filename,
            url
        ]

        # Add silent flags if requested
        if silent:
            cmd.extend(['--summary-interval=0', '--quiet'])
            print(f"Downloading {filename}...", end=' ', flush=True)

        # Run download; check=True turns a non-zero exit status into CalledProcessError
        subprocess.run(cmd, check=True, capture_output=True, text=True)

        if silent:
            print("Done!")
        else:
            print(f"Downloaded {filename} to {dest_dir}")
        return True

    except subprocess.CalledProcessError as e:
        # The diagnostic may be on stderr or stdout; when both are empty,
        # at least report the exit code so the failure can be looked up.
        error = (
            (e.stderr or "").strip()
            or (e.stdout or "").strip()
            or f"Unknown error (exit code {e.returncode})"
        )
        print(f"\nError downloading {filename}: {error}")
        return False
    except Exception as e:
        # e.g. aria2c is not installed or the destination is not writable
        print(f"\nError: {str(e)}")
        return False

# Every file the pipeline needs, as (download URL, destination folder) pairs.
# They are fetched in the order listed.
SONIC_MODEL_FILES = [
    # Sonic weights and the face detector
    ("https://huggingface.co/Isi99999/sonic/resolve/main/audio2bucket.pth", "/content/ComfyUI/models/sonic"),
    ("https://huggingface.co/Isi99999/sonic/resolve/main/audio2token.pth", "/content/ComfyUI/models/sonic"),
    ("https://huggingface.co/Isi99999/sonic/resolve/main/unet.pth", "/content/ComfyUI/models/sonic"),
    ("https://huggingface.co/Isi99999/sonic/resolve/main/yoloface_v5m.pt", "/content/ComfyUI/models/sonic"),
    # RIFE flownet
    ("https://huggingface.co/Isi99999/sonic/resolve/main/RIFE/flownet.pkl", "/content/ComfyUI/models/sonic/RIFE"),
    # whisper-tiny model and its configuration files
    ("https://huggingface.co/Isi99999/sonic/resolve/main/whisper-tiny/model.safetensors", "/content/ComfyUI/models/sonic/whisper-tiny"),
    ("https://huggingface.co/Isi99999/sonic/resolve/main/whisper-tiny/config.json", "/content/ComfyUI/models/sonic/whisper-tiny"),
    ("https://huggingface.co/Isi99999/sonic/resolve/main/whisper-tiny/preprocessor_config.json", "/content/ComfyUI/models/sonic/whisper-tiny"),
    # SVD checkpoint loaded by generate_video()
    ("https://huggingface.co/Isi99999/sonic/resolve/main/SVD/svd_xt_1_1.safetensors", "/content/ComfyUI/models/checkpoints"),
]

for model_url, model_dir in SONIC_MODEL_FILES:
    model_download(model_url, model_dir)


def clear_memory():
    """
    Drop tensors held in this module's globals, then release Python and CUDA memory.

    Any global whose value is a torch tensor (or exposes a tensor through a
    ``.data`` attribute) is removed from the global namespace. Garbage
    collection then runs, and only afterwards is the CUDA cache emptied, so
    the memory that was just released can actually be returned.
    """
    module_globals = globals()

    # Collect names first: the dict must not change size while it is iterated.
    # The comprehension keeps no reference to the values once it finishes.
    tensor_names = [
        name
        for name, value in list(module_globals.items())
        if torch.is_tensor(value) or (hasattr(value, "data") and torch.is_tensor(value.data))
    ]

    # `del` on a loop variable only unbinds that local name; the global entry
    # itself has to be removed for the tensor to become collectable.
    for name in tensor_names:
        module_globals.pop(name, None)

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

def upload_file(target_dir: str = '/content/ComfyUI/input', file_type: str = 'any') -> str:
    """
    Handle file uploads in Colab and store in specified directory

    Only the first uploaded file is handled; any additional files selected in
    the same upload dialog are ignored.

    Args:
        target_dir: Where to store uploaded files (created if missing)
        file_type: Filter for specific file types ('image', 'audio', or 'any'),
                   checked case-insensitively against the file extension

    Returns:
        str: Path to the uploaded file, or None if nothing was uploaded, the
             extension did not match file_type, or the file could not be stored
    """
    from google.colab import files
    import os
    import shutil

    # Create target directory if needed
    os.makedirs(target_dir, exist_ok=True)

    # Upload file
    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded")
        return None

    # Get the first uploaded file (we'll handle one file at a time)
    filename = next(iter(uploaded.keys()))
    # Resolve against the current working directory rather than a fixed
    # folder, so the function also works when the notebook has changed directory.
    src_path = os.path.abspath(filename)
    dest_path = os.path.join(target_dir, filename)

    # Verify file type if requested
    if file_type.lower() != 'any':
        ext = os.path.splitext(filename)[1].lower()
        if file_type.lower() == 'image' and ext not in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
            print(f"Error: {filename} is not an image file")
            return None
        elif file_type.lower() == 'audio' and ext not in ['.mp3', '.wav', '.ogg', '.flac', '.aac', '.m4a']:
            print(f"Error: {filename} is not an audio file")
            return None

    try:
        if os.path.exists(src_path):
            shutil.move(src_path, dest_path)
        else:
            # Nothing on disk where expected: fall back to the uploaded content
            # itself (assumes the mapping values are the raw file bytes — TODO confirm)
            with open(dest_path, 'wb') as out_file:
                out_file.write(uploaded[filename])
        print(f"File saved to: {dest_path}")
        return dest_path
    except Exception as e:
        print(f"Error moving file: {str(e)}")
        return None

import imageio

def save_as_mp4(images, filename_prefix, fps=25, audio_path=None, output_dir="/content/ComfyUI/output"):
    """
    Save images as MP4 video with optional audio

    Frames are scaled by 255, clipped to the 0-255 range and converted to
    uint8. The video is first written without sound to a temporary file; if
    an existing audio file is given, ffmpeg muxes the two into the final file.
    If muxing fails, the silent video is kept as the final file.

    Args:
        images: Iterable of image tensors (anything with a ``.cpu()`` method)
                or numpy arrays (assumes values are normalised to 0..1 — TODO confirm)
        filename_prefix: Output filename without extension
        fps: Frames per second
        audio_path: Path to audio file (optional; ignored if the file does not exist)
        output_dir: Output directory (created if missing)

    Returns:
        str: Path to the generated MP4 file
    """
    os.makedirs(output_dir, exist_ok=True)

    def to_uint8_frame(img):
        # Tensors are moved to the CPU first; everything else is treated as array-like
        array = img.cpu().numpy() if hasattr(img, 'cpu') else np.asarray(img)
        # Clip before casting: values outside 0..255 would otherwise wrap
        # around in the uint8 conversion and show up as speckled pixels
        return np.clip(array * 255, 0, 255).astype(np.uint8)

    # Convert images to uint8 numpy arrays
    frames = [to_uint8_frame(img) for img in images]

    # Temporary video path without audio
    temp_video_path = f"{output_dir}/{filename_prefix}_temp.mp4"
    final_video_path = f"{output_dir}/{filename_prefix}.mp4"

    # Save video without audio first
    with imageio.get_writer(temp_video_path, fps=fps) as writer:
        for frame in frames:
            writer.append_data(frame)

    # If audio path is provided, merge with video
    if audio_path and os.path.exists(audio_path):
        try:
            # Use ffmpeg to merge audio and video
            cmd = [
                'ffmpeg',
                '-y',  # Overwrite without asking
                '-i', temp_video_path,
                '-i', audio_path,
                '-c:v', 'copy',  # Copy video stream without re-encoding
                '-c:a', 'aac',   # Encode audio to AAC
                '-shortest',     # Match duration of the shorter input
                final_video_path
            ]
            subprocess.run(cmd, check=True, capture_output=True)

            # Remove temporary file
            os.remove(temp_video_path)

            print(f"Video with audio saved to: {final_video_path}")
            return final_video_path

        except subprocess.CalledProcessError as e:
            print(f"Error adding audio: {(e.stderr or b'').decode(errors='replace')}")
            # Keep the silent video rather than losing the render
            os.replace(temp_video_path, final_video_path)
            return final_video_path
        except OSError as e:
            # e.g. ffmpeg is not installed; still deliver the silent video
            print(f"Error adding audio: {e}")
            os.replace(temp_video_path, final_video_path)
            return final_video_path
    else:
        os.replace(temp_video_path, final_video_path)
        print(f"Video saved to: {final_video_path}")
        return final_video_path

def generate_video(
    image_path: str = None,
    audio_path: str = None,
    seed: int = 82628696717253,
    steps: int = 25,
    min_resolution: int = 512,
    duration: int = 10,
    expand_ratio: float = 0.5,
    fps: int = 25
):
    """
    Run the full Sonic pipeline: load inputs, load models, sample frames, save and show the MP4.

    The whole pipeline runs inside torch.inference_mode(). Any exception is
    printed and re-raised, and clear_memory() is always called on the way out.

    Args:
        image_path: Path to the portrait image; if None the user is prompted to upload one
        audio_path: Path to the audio file; if None the user is prompted to upload one
        seed: Seed passed to the Sonic sampler
        steps: Passed to the Sonic sampler as inference_steps
        min_resolution: Forwarded to SONIC_PreData.sampler_main
        duration: Forwarded to SONIC_PreData.sampler_main
                  (presumably a length in seconds — TODO confirm against the node)
        expand_ratio: Forwarded to SONIC_PreData.sampler_main
        fps: Frame rate requested from the Sonic sampler; the video is written
             with the frame rate the sampler returns

    Returns:
        str: Path of the MP4 returned by save_as_mp4 (written with the prefix "sonic_output")

    Raises:
        ValueError: If an image or audio file is required but none was uploaded
    """
    try:
        with torch.inference_mode():
            # Handle image input
            if image_path is None:
                print("Please upload an image file:")
                image_path = upload_file(file_type='image')
                if image_path is None:
                    raise ValueError("No image uploaded!")
            # Only the first element of the loader's result is used
            loaded_image = load_image.load_image(image_path)[0]

            # Handle audio input
            if audio_path is None:
                print("Please upload an audio file:")
                audio_path = upload_file(file_type='audio')
                if audio_path is None:
                    raise ValueError("No audio uploaded!")
            loaded_audio = load_audio.load(audio_path)[0]

            print("Loading model...")
            # The checkpoint loader's result is unpacked into three parts
            model, clip_vision, vae = svd_loader.load_checkpoint("svd_xt_1_1.safetensors")

            # loader_main's result is unpacked as (model, weight_dtype); the returned
            # model replaces the SVD model for the rest of the pipeline
            model, weight_dtype = sonic_loader.loader_main(
                model=model,
                sonic_unet="unet.pth",  # one of the files fetched into models/sonic during setup
                ip_audio_scale=1.0,
                use_interframe=True,
                dtype="fp16"
            )

            # Pre-processing step: combines image, audio and the SVD components
            # into the dict consumed by the sampler
            data_dict = sonic_predata.sampler_main(
                clip_vision=clip_vision,
                vae=vae,
                audio=loaded_audio,
                image=loaded_image,
                weight_dtype=weight_dtype,
                min_resolution=min_resolution,
                duration=duration,
                expand_ratio=expand_ratio
            )[0]  # only the first element of the returned tuple is used

            print("Generating video...")
            sampled, output_fps = sonic_sampler.sampler_main(
                model=model,
                data_dict=data_dict,
                seed=seed,
                inference_steps=steps,
                dynamic_scale=1.0,
                fps=fps
            )

            print("Saving as MP4...")
            output_path = save_as_mp4(
                images=sampled,
                filename_prefix="sonic_output",
                fps=output_fps,  # Use the actual fps returned by sampler
                audio_path=audio_path  # Include original audio
            )

            display_video(output_path)
            return output_path

    except Exception as e:
        # Surface the message in the cell output, then let the caller see the real exception
        print(f"Error during video generation: {str(e)}")
        raise
    finally:
        # Runs on success and on failure alike
        clear_memory()

def display_video(video_path):
    """
    Embed an MP4 file in the notebook output as an inline, looping video player.

    The file is read fully into memory and embedded as a base64 data URL, so
    the output cell grows with the size of the video.

    Args:
        video_path: Path to the MP4 file to show
    """
    from IPython.display import HTML, display
    from base64 import b64encode

    # Read through a context manager so the file handle is closed promptly
    with open(video_path, 'rb') as video_file:
        video_data = video_file.read()

    mime_type = "video/mp4"  # default

    data_url = f"data:{mime_type};base64," + b64encode(video_data).decode()

    display(HTML(f"""
    <video width=512 controls autoplay loop>
        <source src="{data_url}" type="{mime_type}">
    </video>
    """))

# Wipe everything this cell printed so far (including download messages)
# so that only the completion line below remains visible.
clear_output()


print("✅ Environment SetUp Complete!")

In [None]:
import imageio
import subprocess
from google.colab import files
from IPython.display import display, HTML, Image as IPImage
import shutil

def upload_image(target_dir: str = '/content/ComfyUI/input'):
    """
    Handle image upload in Colab and store it in the ComfyUI input directory

    Only the first uploaded file is stored; any additional files selected in
    the same upload dialog are ignored.

    Args:
        target_dir: Where to store the uploaded image (created if missing);
                    defaults to /content/ComfyUI/input/

    Returns:
        str: Path to the stored image, or None if nothing was uploaded
    """
    from google.colab import files
    import os
    import shutil

    os.makedirs(target_dir, exist_ok=True)

    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded")
        return None

    filename = next(iter(uploaded.keys()))
    # Resolve against the current working directory rather than a fixed
    # folder, so the function also works when the notebook has changed directory.
    src_path = os.path.abspath(filename)
    dest_path = os.path.join(target_dir, filename)

    if os.path.exists(src_path):
        shutil.move(src_path, dest_path)
    else:
        # Nothing on disk where expected: fall back to the uploaded content
        # itself (assumes the mapping values are the raw file bytes — TODO confirm)
        with open(dest_path, 'wb') as out_file:
            out_file.write(uploaded[filename])

    print(f"Image saved to: {dest_path}")
    return dest_path

# @markdown # 💥2. Upload Image (512x768 image recommended)
%cd /content/ComfyUI
file_uploaded = upload_image()
display_upload = True # @param {type:"boolean"}
if display_upload:
    if file_uploaded.lower().endswith(('.png', '.jpg', '.jpeg')):
        display(IPImage(filename=file_uploaded))
    else:
        print("The image format cannot be displayed.")

In [None]:
# @markdown # 💥3. Generate Video (You will be prompted to upload your audio)

seed = 4040425164 # @param {"type":"integer"}
steps = 25 # @param {"type":"integer", "min":1, "max":100}
min_resolution = 512 # @param {"type":"integer"}
duration = 18 # @param {"type":"integer", "min":1, "max":120}
expand_ratio = 0.5 # @param {"type":"number","placeholder":"0.5"}
fps = 25 # @param {"type":"integer", "min":1, "max":60}

import random
# A seed of 0 means "pick one for me": draw a random 32-bit seed and print it
# so the run can be repeated.
seed = seed if seed != 0 else random.randint(0, 2**32 - 1)
print(f"Using seed: {seed}")

# No inference_mode wrapper is needed here: generate_video() enters torch.inference_mode() itself.
# audio_path=None makes generate_video() prompt for the audio upload.
generate_video(
    image_path=file_uploaded,
    audio_path=None,
    seed=seed,
    steps=steps,
    min_resolution=min_resolution,
    duration=duration,
    expand_ratio=expand_ratio,
    fps=fps
)
# generate_video() already calls clear_memory() in its finally block; this second call
# runs after the function has returned (presumably so memory that was still referenced
# by its local variables can be released as well).
clear_memory()