## Import libraries

In [21]:
import torch
import numpy as np
import crepe
from scipy.io import wavfile
import librosa
import IPython

## Define extract_pitch and extract_loudness functions

In [2]:
def extract_loudness(signal, sampling_rate, block_size, n_fft=2048):
    """Compute a per-frame, A-weighted log-magnitude loudness curve.

    Parameters
    ----------
    signal : np.ndarray
        Audio samples, passed unchanged to ``librosa.stft``.
    sampling_rate : int
        Sampling rate of ``signal`` in Hz; used to compute the centre
        frequency of each FFT bin for the A-weighting.
    block_size : int
        STFT hop length in samples.
    n_fft : int, optional
        FFT size; also used as the analysis window length.

    Returns
    -------
    np.ndarray
        One loudness value per STFT frame, with the final frame dropped.
    """
    spectrum = librosa.stft(
        signal,
        n_fft=n_fft,
        hop_length=block_size,
        win_length=n_fft,
        center=True,
    )
    # Small epsilon keeps log() finite on silent bins.
    log_magnitude = np.log(np.abs(spectrum) + 1e-7)

    # `sr` and `n_fft` are keyword-only in recent librosa releases; passing
    # them by name works on both old and new versions.
    bin_frequencies = librosa.fft_frequencies(sr=sampling_rate, n_fft=n_fft)

    # The 0 Hz bin makes A_weighting evaluate log10(0). librosa clips the
    # resulting -inf to its `min_db` floor, so the warning is only noise.
    with np.errstate(divide="ignore"):
        a_weight = librosa.A_weighting(bin_frequencies)

    # Broadcast the per-bin weights over every frame (bins x frames).
    weighted = log_magnitude + a_weight.reshape(-1, 1)

    # Average over frequency bins, then drop the last frame.
    return np.mean(weighted, 0)[..., :-1]

def extract_pitch(signal, sampling_rate, block_size):
    """Estimate a pitch track for ``signal`` with CREPE.

    Parameters
    ----------
    signal : np.ndarray
        Audio samples handed to ``crepe.predict``.
    sampling_rate : int
        Sampling rate of ``signal`` in Hz.
    block_size : int
        Hop between pitch estimates, in samples. It is converted to whole
        milliseconds (truncating) because ``crepe.predict`` takes its
        ``step_size`` in ms.

    Returns
    -------
    np.ndarray
        Flattened second element of the ``crepe.predict`` result (the
        frequency track, per CREPE's documented return order), with the
        final frame dropped.
    """
    hop_ms = int(1000 * block_size / sampling_rate)
    prediction = crepe.predict(
        signal,
        sampling_rate,
        step_size=hop_ms,
        verbose=0,
        center=True,
        viterbi=True,
    )
    frequency = prediction[1]
    return frequency.reshape(-1)[:-1]

## Import model

In [12]:
# Notebook-wide audio settings.
sr0 = 16000      # sampling rate (Hz) used when loading and writing audio below
block_size = 64  # hop, in samples, passed to the pitch/loudness extractors

# TorchScript export of the trained model; called as model(pitch, loudness).
model = torch.jit.load("ddsp_mytraining_pretrained.ts")

## Test model with noise input

In [4]:
# Smoke test: drive the model with random control curves of shape (1, 200, 1).
# A locally seeded generator makes the rendered file identical on every
# Restart & Run All without touching torch's global RNG state.
noise_rng = torch.Generator().manual_seed(0)
pitch_ns = torch.randn(1, 200, 1, generator=noise_rng)
loudness_ns = torch.randn(1, 200, 1, generator=noise_rng)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    audio_ns_tsr = model(pitch_ns, loudness_ns)
audio_ns_rndr = torch.flatten(audio_ns_tsr).detach().numpy()

# NOTE: the 'render/' directory must already exist.
wavfile.write('render/noise.wav', sr0, audio_ns_rndr)

In [5]:
# Play back the file written by the noise test cell.
IPython.display.Audio('render/noise.wav')

## Test model with audio sample

In [18]:
# Choose and load audio sample
sample_name = 'voice1.wav'
sample_directory = 'audio/'
sample_path = sample_directory + sample_name
audio, sr = librosa.load(sample_path, sr=sr0)  # resampled to sr0 on load

# Extract the two control curves as flat per-frame arrays
loudness_np = extract_loudness(audio, sr, block_size).reshape(-1)
pitch_np = extract_pitch(audio, sr, block_size).reshape(-1)

# The two extractors are not guaranteed to return the same number of frames
# (the pitch hop is rounded to whole milliseconds), so trim both curves to
# the shorter one instead of forcing one into the other's shape.
size_env = min(loudness_np.size, pitch_np.size)
if size_env == 0:
    raise ValueError(f"No analysis frames extracted from {sample_path!r}")
loudness_np = loudness_np[:size_env].reshape(-1, 1)
pitch_np = pitch_np[:size_env].reshape(-1, 1)

# Model inputs: float32 tensors of shape (1, size_env, 1)
loudness = torch.as_tensor(loudness_np, dtype=torch.float32).unsqueeze(0)
pitch = torch.as_tensor(pitch_np, dtype=torch.float32).unsqueeze(0)

# Render audio with model (inference only, so no autograd graph is needed)
with torch.no_grad():
    audio_tsr = model(pitch, loudness)
audio_rndr = torch.flatten(audio_tsr).detach().numpy()

# Write rendered file (the 'render/' directory must already exist)
render_directory = 'render/'
render_path = render_directory + sample_name
wavfile.write(render_path, sr0, audio_rndr)

  + 2 * np.log10(f_sq)


In [19]:
# Play the original input sample for comparison.
IPython.display.Audio(sample_path)

In [20]:
# Play the model's resynthesis of the sample.
IPython.display.Audio(render_path)