AudioDec Codec
https://github.com/facebookresearch/AudioDec
https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10096509


pip install git+https://github.com/voidful/AudioDec.git

In [1]:
import nlp2
# Files to fetch, as (remote URL, local directory) pairs. They are downloaded
# in the order listed: the autoencoder checkpoint and config first, then the
# vocoder checkpoint, its config and the accompanying .npy file.
for remote_url, local_dir in (
    (
        'https://huggingface.co/AudioDecBenchmark/AudioDec/resolve/main/autoencoder/symAD_libritts_24000_hop300/checkpoint-500000steps.pkl',
        'audiodec_autoencoder_24k_320d',
    ),
    (
        'https://huggingface.co/AudioDecBenchmark/AudioDec/resolve/main/autoencoder/symAD_libritts_24000_hop300/config.yml',
        "audiodec_autoencoder_24k_320d",
    ),
    (
        'https://huggingface.co/AudioDecBenchmark/AudioDec/resolve/main/vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/checkpoint-500000steps.pkl',
        'audiodec_vocoder_24k_320d',
    ),
    (
        'https://huggingface.co/AudioDecBenchmark/AudioDec/resolve/main/vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/config.yml',
        "audiodec_vocoder_24k_320d",
    ),
    (
        "https://huggingface.co/AudioDecBenchmark/AudioDec/resolve/main/vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/symAD_libritts_24000_hop300_clean.npy",
        "audiodec_vocoder_24k_320d",
    ),
):
    nlp2.download_file(remote_url, local_dir)

# NOTE: despite the "config" in their names, both variables hold the path of
# the downloaded checkpoint (.pkl), not of config.yml.
encoder_config_path = "audiodec_autoencoder_24k_320d/checkpoint-500000steps.pkl"
decoder_config_path = "audiodec_vocoder_24k_320d/checkpoint-500000steps.pkl"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

# Base directory: wherever the notebook kernel is currently running.
current_dir = os.getcwd()

# Build the paths of the two sample recordings, which are expected to sit
# directly in the working directory. Nothing is opened or read here.
male_voice_file_path, female_voice_file_path = (
    os.path.join(current_dir, wav_name)
    for wav_name in ("sample_9.wav", "female_voice.wav")
)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

import os
import torch
import numpy as np
import soundfile as sf
from AudioDec.utils.audiodec import AudioDec, assign_model
# Directory the process was started from; base for the sample path below.
current_dir = os.getcwd()

# audio file_path
# Default sample clip, resolved against the working directory. It is not
# referenced by the rest of the code visible in this cell.
file_path = os.path.join(current_dir, "sample_9.wav")


def process_audio(input_file, output_file, model_name="vctk_v1", cuda_device=0, num_threads=4):
    """
    Encode and decode an audio file using the AudioDec model.

    The input is validated (existence, then sample rate) *before* the model
    checkpoints are loaded, so a wrong path or a file with the wrong sample
    rate fails immediately instead of after model initialization.

    Args:
        input_file (str): Path to the input .wav file.
        output_file (str): Path to save the output .wav file.
        model_name (str): Name of the AudioDec model to use (default: vctk_v1).
        cuda_device (int): CUDA device index (-1 for CPU, 0 or higher for GPU).
        num_threads (int): Number of threads for computation.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If the sample rate of ``input_file`` differs from the
            sample rate reported by ``assign_model`` for ``model_name``.
    """
    # Fail fast: nothing below is useful without an input file.
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file {input_file} does not exist!")

    # Transmitter (encoder side) and receiver (decoder side) share one device.
    device = "cpu" if cuda_device < 0 else f"cuda:{cuda_device}"
    tx_device = rx_device = device
    torch.set_num_threads(num_threads)

    # Assign model
    sample_rate, encoder_checkpoint, decoder_checkpoint = assign_model(model_name)

    # Read and validate the audio before loading any checkpoint.
    data, fs = sf.read(input_file, always_2d=True)
    if fs != sample_rate:
        raise ValueError(f"Input sample rate ({fs}Hz) does not match model sample rate ({sample_rate}Hz)!")

    # Initialize AudioDec
    print("Initializing AudioDec...")
    audiodec = AudioDec(tx_device=tx_device, rx_device=rx_device)
    audiodec.load_transmitter(encoder_checkpoint)
    audiodec.load_receiver(encoder_checkpoint, decoder_checkpoint)

    # Each channel becomes its own batch entry with a singleton middle axis.
    x = np.expand_dims(data.transpose(1, 0), axis=1)  # (T, C) -> (C, 1, T)
    x = torch.tensor(x, dtype=torch.float).to(tx_device)

    print("Encoding and decoding the audio...")
    with torch.no_grad():
        z = audiodec.tx_encoder.encode(x)
        idx = audiodec.tx_encoder.quantize(z)
        zq = audiodec.rx_encoder.lookup(idx)
        # Trim the decoder output to the input length along the last axis.
        y = audiodec.decoder.decode(zq)[:, :, :x.size(-1)]
        y = y.squeeze(1).transpose(1, 0).cpu().numpy()  # (T, C)

    # Save the output audio
    sf.write(output_file, y, fs, "PCM_16")
    print(f"Processed audio saved to {output_file}!")


if __name__ == "__main__":
    # Command-line entry point: parse the options and run one encode/decode pass.
    # NOTE(review): __name__ is also "__main__" inside a notebook kernel, where
    # the parser would see the kernel's own argv; presumably this guard is
    # meant for running the cell's code as a standalone script. Confirm.
    import argparse

    parser = argparse.ArgumentParser(description="Process audio using AudioDec model.")

    # Each entry: (option flags, keyword arguments for add_argument).
    option_table = [
        (("-i", "--input"), {"type": str, "required": True, "help": "Path to input .wav file"}),
        (("-o", "--output"), {"type": str, "required": True, "help": "Path to output .wav file"}),
        (("--model",), {"type": str, "default": "vctk_v1", "help": "Model to use (default: vctk_v1)"}),
        (("--cuda",), {"type": int, "default": 0, "help": "CUDA device index (-1 for CPU, default: 0)"}),
        (("--num_threads",), {"type": int, "default": 4, "help": "Number of threads (default: 4)"}),
    ]
    for flags, options in option_table:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()

    process_audio(
        args.input,
        args.output,
        model_name=args.model,
        cuda_device=args.cuda,
        num_threads=args.num_threads,
    )
