In [12]:
from scipy.io import wavfile

In [13]:
from scipy import signal
from scipy import linalg

In [14]:
import numpy as np

In [15]:
import librosa

In [16]:
import IPython

In [48]:
import soundfile as sf

In [17]:
# General algorithm parameters
# Linear-prediction order handed to vocoder_frame(): the autocorrelation lags
# 0..order are used, which yields `order` predictor coefficients per frame.
order = 48

In [18]:
# Open the WAV file used as the voice input
fs, x = wavfile.read('../assets/data.wav')

# Work in floating point: the samples may be integer PCM, where taking the
# absolute value of the most negative sample would overflow.
x = x.astype(np.float64)

# The framing and LPC analysis below expect a 1-D signal, so average the
# channels of a multi-channel file (shape: samples x channels) down to mono.
if x.ndim > 1:
    x = x.mean(axis=1)

# Keep at most the first 10 seconds of audio
x = x[:10*fs]

# Peak-normalise to [-1, 1]. The peak is the largest *absolute* sample, so a
# negative peak is handled correctly, and a silent clip is left untouched
# instead of being divided by zero.
peak = np.abs(x).max() if x.size else 0.0
if peak > 0:
    x = x / peak

In [19]:
# excitation, fsexc = librosa.load('../assets/excitation.wav', sr=48000)

In [20]:
# Voice framing
# The voice is cut into short (~10 ms) frames that overlap by half a frame.
# Over such a short span the signal can be treated as locally stationary, so
# its statistics are taken as constant inside each analysis window.
frame_time = 10e-3
frame_size = int(fs * frame_time)
x_size = len(x)

# Frame the signal and transpose the result so that x_frames[i] is the i-th
# frame (one frame per row).
x_frames = librosa.util.frame(x, frame_length=frame_size, hop_length=frame_size//2).T

In [21]:
# Excitation signal fed to the synthesis filter.
# Its length is (number of frames) * frame_size, which is more than the
# overlap-add loop consumes with a half-frame hop, so every slice it takes
# is complete.
n_excitation_samples = x_frames.shape[0] * frame_size
time = np.linspace(0, (n_excitation_samples - 1) / fs, num=n_excitation_samples)
frequency = 100

# Alternative excitation (square wave at `frequency` Hz), kept for reference:
# excitation = signal.square(time * frequency * 2 * np.pi)

# White Gaussian noise excitation. A seeded generator is used so that a
# Restart & Run All reproduces exactly the same output audio.
excitation_seed = 0
excitation_rng = np.random.default_rng(excitation_seed)
excitation = excitation_rng.normal(0, 1, size=n_excitation_samples)

In [22]:
from statsmodels.tsa import stattools

In [23]:
def vocoder_frame(voice_frame: np.ndarray, excitation_frame: np.ndarray, order: int, apply_filter: bool = True, apply_window: bool = True):
    """Shape one excitation frame with the LPC envelope of one voice frame.

    The autocorrelation of ``voice_frame`` is estimated for lags
    ``0..order``, the Yule-Walker equations are solved for the linear
    predictor, and the excitation is passed through the resulting all-pole
    filter ``1 / A(z)``.

    Parameters
    ----------
    voice_frame : np.ndarray
        1-D frame of the voice signal that is analysed.
    excitation_frame : np.ndarray
        1-D excitation frame, same length as ``voice_frame``.
    order : int
        Linear-prediction order. If the frame is too short to provide
        ``order`` lags, the available lags are used instead.
    apply_filter : bool, optional
        When False the excitation is returned unfiltered.
    apply_window : bool, optional
        When True the output frame is multiplied by a Hann window.

    Returns
    -------
    y : np.ndarray
        The synthesised frame.
    error_coeff : np.ndarray
        Prediction-error filter coefficients ``[1, -a_1, ..., -a_p]``.

    Raises
    ------
    ValueError
        If the two frames differ in length.
    """
    # Verify if both the voice and the excitation frames have the same length
    if len(voice_frame) != len(excitation_frame):
        raise ValueError('Voice and excitation frames must have the same length')

    # Get the frame size
    frame_size = len(voice_frame)

    # Estimate the short-time autocorrelation of the given data (in floating
    # point, so integer input cannot overflow).
    voice = np.asarray(voice_frame, dtype=float)
    rxx_full = signal.correlate(voice, voice, method='fft')

    # The full correlation runs from lag -(N-1) to lag +(N-1); the zero lag
    # sits in the middle. Keep lags 0..order only.
    zero_lag = len(rxx_full) // 2
    rxx = rxx_full[zero_lag : zero_lag + order + 1]
    n_lags = max(len(rxx) - 1, 0)

    # Frame energy = zero-lag autocorrelation. This is the value to normalise
    # by; index 0 of the *full* correlation is the outermost lag and may be
    # zero or negative.
    energy = rxx[0] if len(rxx) else 0.0

    if n_lags > 0 and energy > 0:
        rxx = rxx / energy
        # Yule-Walker equations R a = r, solved with the Levinson recursion
        # on the symmetric Toeplitz autocorrelation matrix.
        predictor_coeff = linalg.solve_toeplitz(rxx[:-1], rxx[1:])
    else:
        # Silent (all-zero) frame or no usable lags: there is no envelope to
        # estimate, so fall back to the identity filter instead of producing
        # NaNs that would contaminate the overlap-added output.
        predictor_coeff = np.zeros(n_lags)

    error_coeff = np.concatenate(([1.0], -predictor_coeff))

    # Filter: all-pole synthesis filter 1 / A(z) driven by the excitation
    if apply_filter:
        y = signal.lfilter([1.0], error_coeff, excitation_frame)
    else:
        y = excitation_frame
    if apply_window:
        y = y * signal.windows.hann(frame_size)
    return y, error_coeff

In [45]:
# Overlap-add synthesis
# Run the vocoder on every analysis frame and sum the (windowed) output
# frames back into place, each one shifted by half a frame from the previous.
hop_size = frame_size // 2
y = np.zeros_like(x)
for frame_idx, x_frame in enumerate(x_frames):
    lo = frame_idx * hop_size
    hi = lo + frame_size
    y_frame, error_coeff = vocoder_frame(x_frame, excitation[lo:hi], order)
    y[lo:hi] += y_frame

In [46]:
# Audio player for the voice input x at its sampling rate fs
IPython.display.Audio(x, rate=fs)

In [50]:
# Audio player for the vocoder output y at the same sampling rate fs
IPython.display.Audio(y, rate=fs)