### RMVPE 导出测试
RMVPE ONNX原始权重可在[此处](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main)下载

In [1]:
!pip install onnxruntime-gpu

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting onnxruntime-gpu
  Downloading https://mirrors.aliyun.com/pypi/packages/e0/a5/5c2287d61f359c7342e9d59d1e3dd728a982dea85f846c7af305a801c3ca/onnxruntime_gpu-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (291.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.5/291.5 MB[0m [31m502.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:10[0m
[?25hCollecting coloredlogs (from onnxruntime-gpu)
  Downloading https://mirrors.aliyun.com/pypi/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flatbuffers (from onnxruntime-gpu)
  Downloading https://mirrors.aliyun.com/pypi/packages/b8/25/155f9f080d5e4bc0082edfda032ea2bc2b8fab3f4d25d46c1e9dd22a1a89/flatbuffers-25.2.10-py2.py3-none-any.whl (3

In [3]:
import torch
import onnxruntime as ort

In [7]:
# 读取ONNX模型
model = ort.InferenceSession("./rmvpe.onnx",providers=['CUDAExecutionProvider'])

[1;31m2025-02-17 16:34:43.712099212 [E:onnxruntime:Default, provider_bridge_ort.cc:1862 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1539 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.9: cannot open shared object file: No such file or directory
[m
[0;93m2025-02-17 16:34:43.712128197 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:993 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.[m


In [8]:
import torch
import torch.nn.functional as F
from torchaudio.transforms import MelScale

class MelSpectrogram(torch.nn.Module):
    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Initialize MelScale for computing mel basis
        self.mel_scale = MelScale(
            n_mels=n_mel_channels,
            sample_rate=sampling_rate,
            f_min=mel_fmin,
            f_max=mel_fmax,
            n_stft=n_fft // 2 + 1,
        )
        self.n_fft = win_length if n_fft is None else n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half
        self.hann_window = {}

    def forward(self, audio, keyshift=0, speed=1, center=True):
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(torch.round(torch.tensor(self.n_fft * factor)).item())
        win_length_new = int(torch.round(torch.tensor(self.win_length * factor)).item())
        hop_length_new = int(torch.round(torch.tensor(self.hop_length * speed)).item())

        keyshift_key = str(keyshift) + "_" + str(audio.device)

        # Ensure hann_window is always initialized
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device)

        # Compute STFT with return_complex=False
        stft_real, stft_imag = torch.stft(
            audio,
            n_fft=n_fft_new,
            hop_length=hop_length_new,
            win_length=win_length_new,
            window=self.hann_window[keyshift_key],
            center=center,
            return_complex=False,  # Return real and imaginary parts separately
        ).unbind(-1)

        # Compute magnitude spectrogram manually
        magnitude = torch.sqrt(stft_real.pow(2) + stft_imag.pow(2))

        # Handle keyshift resizing
        if keyshift != 0:
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new

        # Apply MelScale transformation
        mel_output = self.mel_scale(magnitude)

        # Clamp and log
        if self.is_half:
            mel_output = mel_output.half()
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))

        return log_mel_spec

In [19]:
class MyRMVPE(torch.nn.Module):
    def __init__(
            self,
            model: ort.InferenceSession,
            is_half: bool = False,
            device: str = "cuda",
        ):
        super(MyRMVPE, self).__init__()
        self.model = model
        self.is_half = is_half
        self.device = torch.device(device)  # 显式定义设备
        if is_half:
            self.model = self.model.half()
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(self.device)
        self.cents_mapping = 20 * torch.arange(360, device=self.device) + 1997.3794084376191
        self.cents_mapping = torch.nn.functional.pad(
            self.cents_mapping.unsqueeze(0), (4, 4), mode="constant", value=0
        ).to(self.device)

    def mel2hiden(self, mel):
        n_frames = mel.shape[-1]
        n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
        if n_pad > 0:
            mel = torch.nn.functional.pad(mel, (0, n_pad), mode="constant")
        mel = mel.half() if self.is_half else mel.float()
        mel = mel.to(self.device)  # 确保mel在正确设备上
        hidden = torch.from_numpy(
            self.model.run(
                ["output"],{"input":mel.cpu().numpy()}
            )[0]
        ).to(self.device)
        return hidden[:, :n_frames].to(self.device)

    def to_local_average_cents(self, salience, thred=0.05):
        center = torch.argmax(salience, dim=-1)  # [batch]
        salience = F.pad(salience, (4, 4), "constant", 0)  # [batch, 360 + 8]
        center += 4

        # 确保 clamp 的 max 参数在同一设备上
        max_value = torch.tensor(salience.shape[-1] - 1, device=self.device)
        starts = (center - 4).clamp(min=0)                     # 防止负数索引
        ends = (center + 5).clamp(max=max_value)               # 防止越界

        batch_size = salience.shape[0]
        indices = torch.arange(salience.shape[-1], device=self.device)  # [368]
        indices = indices.view(1, -1).expand(batch_size, -1)  # [batch, 368]

        mask = (indices >= starts.unsqueeze(-1)) & (indices < ends.unsqueeze(-1))

        window_salience = salience * mask.float()  # [batch, 368]
        window_cents = self.cents_mapping.expand(batch_size, -1) * mask.float()  # [batch, 368]

        product_sum = torch.sum(window_salience * window_cents, dim=-1)  # [batch]
        weight_sum = torch.sum(window_salience, dim=-1)                  # [batch]
        weight_sum = torch.where(weight_sum == 0, torch.tensor(1e-6, device=self.device), weight_sum)

        devided = product_sum / weight_sum  # [batch]

        max_values = torch.max(salience, dim=-1).values  # [batch]
        devided[max_values <= thred] = 0

        return devided

    def decode(self, hidden, thred=0.03):
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        f0[f0 == 10] = 0
        return f0

    def forward(self, audio, thred=0.03):
        audio = audio.to(self.device)  # 确保输入音频在正确设备上
        mel = self.mel_extractor(audio.float().unsqueeze(0), center=True).to(self.device)
        hidden = self.mel2hiden(mel)
        f0 = self.decode(hidden, thred=thred).squeeze(0)
        return f0

In [20]:
# 读取x

In [21]:
x = torch.load("./x.pt")

In [22]:
x

tensor([ 0.2800,  0.1129, -0.0576,  ...,  0.2808,  0.0731, -0.1258],
       device='cuda:0', dtype=torch.float64)

In [23]:
# 执行推理
rmvpe = MyRMVPE(model)
rmvpe(x)

tensor([266.6708, 263.3647, 260.0292,  ..., 343.5811, 344.1398, 343.8999],
       device='cuda:0')

In [26]:
!pip install onnx

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting onnx
  Downloading https://mirrors.aliyun.com/pypi/packages/b1/2f/91092557ed478e323a2b4471e2081fdf88d1dd52ae988ceaf7db4e4506ff/onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m802.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: onnx
Successfully installed onnx-1.17.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [27]:
# 执行导出
torch.onnx.export(
    rmvpe.cuda(),
    x.cuda(),
    "rmvpe_pipeline.onnx",
    input_names=["input"],
    output_names=["f0"],
    dynamic_axes={
        "input": {0: "seq"},
        "f0": {0: "seq"},
    },
    opset_version=17,
    do_constant_folding=True,
)
    

  n_fft_new = int(torch.round(torch.tensor(self.n_fft * factor)).item())
  n_fft_new = int(torch.round(torch.tensor(self.n_fft * factor)).item())
  win_length_new = int(torch.round(torch.tensor(self.win_length * factor)).item())
  win_length_new = int(torch.round(torch.tensor(self.win_length * factor)).item())
  hop_length_new = int(torch.round(torch.tensor(self.hop_length * speed)).item())
  hop_length_new = int(torch.round(torch.tensor(self.hop_length * speed)).item())
  if n_pad > 0:
  ["output"],{"input":mel.cpu().numpy()}
  hidden = torch.from_numpy(
  max_value = torch.tensor(salience.shape[-1] - 1, device=self.device)
  max_value = torch.tensor(salience.shape[-1] - 1, device=self.device)
  weight_sum = torch.where(weight_sum == 0, torch.tensor(1e-6, device=self.device), weight_sum)


In [29]:
# 读取rmvpe_pipeline.onnx

model = ort.InferenceSession("./rmvpe_pipeline.onnx",providers=['CUDAExecutionProvider'])

[1;31m2025-02-17 16:43:21.030760768 [E:onnxruntime:Default, provider_bridge_ort.cc:1862 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1539 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.9: cannot open shared object file: No such file or directory
[m
[0;93m2025-02-17 16:43:21.030793651 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:993 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.[m


In [30]:
x = x.cpu().numpy()

In [32]:
output = model.run(
    ["f0"],
    {"input":x,'thred':[0.03]}
)[0]

In [33]:
output

array([266.6709 , 263.36465, 260.02917, ..., 343.5814 , 344.13977,
       343.89987], dtype=float32)

以下代码将在我的通勤电脑（无GPU）上运行

In [3]:
# 打印CPU型号
!cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c

      8  Intel(R) Core(TM) Ultra 7 258V


In [4]:
from time import perf_counter
import numpy as np
import onnxruntime as ort

In [5]:
model = ort.InferenceSession("./rmvpe_pipeline.onnx",providers=['CPUExecutionProvider'])

In [6]:
x = np.load("./x.npy")

In [7]:
# 执行推理
start = perf_counter()
pred = model.run(
    ["f0"],
    {"input":x,'thred':[0.03]}
)[0]
end = perf_counter()
print(f"Infernce on cpu cost {end-start:.4f}s")

Infernce on cpu cost 0.2364s


In [8]:
pred

array([266.6709 , 263.36465, 260.02917, ..., 343.5814 , 344.13977,
       343.89987], shape=(7383,), dtype=float32)

这波，这波我只能说微软有挂！344MB的权重被压缩成10MB不说，推理速度在CPU上能这么快？？？？