In [1]:
import torch.nn.functional as F
import torchaudio
import os

def create_if_not_exists(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)

# create outputs dir
OUTPUTS_DIR = "outputs"
create_if_not_exists(OUTPUTS_DIR)

# 1. load 'audio_16k/Basta_16k.wav' audio file (note that it is on stereo)
y, sr = torchaudio.load("audio_16k/Basta_16k.wav")

SCALES = [0.8, 1.2]
for stretch_scale in SCALES:
    # F.interpolate expects 4D signals in bilinear mode
    stretched_y = F.interpolate(y[None, None, :], mode='bilinear', scale_factor=stretch_scale)[0, 0, :]
    outfile = f"{OUTPUTS_DIR}/interpolation_{str(stretch_scale).replace('.', '_')}.wav"
    print(f"saving the following file: {outfile}")
    torchaudio.save(outfile, stretched_y, sr)

saving the following file: outputs/interpolation_0_8.wav
saving the following file: outputs/interpolation_1_2.wav


In [4]:
import torch
import torchaudio
import os


def create_if_not_exists(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)


def naive_time_stretch(y, scale_factor):
    WINDOW_SIZE = 1024
    y_stft = torch.stft(y, n_fft=WINDOW_SIZE)
    ch, n_freqs, n_windows, _ = y_stft.shape

    step = 1 / scale_factor
    soft_win_idx = 0.0
    stretched_stft = None
    k = 0
    while(k < n_windows):
        # jump by step, and concat original stft windows (downsampe/duplicate depending on scale_factor)
        win_fft = y_stft[:, :, k, :].unsqueeze(2)
        if stretched_stft is None:
            stretched_stft = win_fft
        else:
            stretched_stft = torch.concat((stretched_stft, win_fft), dim=2)
        soft_win_idx += step
        k = int(round(soft_win_idx))

    return torch.istft(stretched_stft, n_fft=WINDOW_SIZE)


# create outputs dir
OUTPUTS_DIR = "outputs"
create_if_not_exists(OUTPUTS_DIR)
y, sr = torchaudio.load("audio_16k/Basta_16k.wav")
SCALES = [0.8, 1.2]

for stretch_scale in SCALES:
    y_stretched = naive_time_stretch(y, scale_factor=stretch_scale)
    outfile = f"{OUTPUTS_DIR}/naive_pitch_shift_{str(stretch_scale).replace('.', '_')}.wav"
    print(f"saving outfile: {outfile}")
    torchaudio.save(outfile, y_stretched, sr)

ValueError: not enough values to unpack (expected 4, got 3)

Part C: Phase vocoder
In this subsection you will implement version of a slightly better algorithm to perform time_stretch called Phase vocoder.
We do not aim to get into depth of this algorithm design, yet we think that this algorithm is cool to know so in this part you will implement it from a given pseudo code.

1. Implement the algorithm following the pseudo code below for the function time_stretch.
2. Load 'audio_16k/Basta_16k.wav' and use time_stretch with factors x0.8, 1.2, save these generations to `outputs/phase_vocoder_{factor, replace '.' with '_'}.wav`
3. Do you notice anything different from the previous naive time stretch (besides magnitude differences)? why do you think it is different?

Guidance: use torch, torchaudio functions in this section. 

-
Pseudo code:
-

time_stretch(signal, factor, win_size=1024, hop=1024//4):
    # create window
    hann_window = construct_hann_window(win_size)

    # draw two complex STFTs
    new_hop = int(hop * factor)
    stft_left = get_complex_stft(signal[:-hop], win_size, new_hop, hann_window)
    stft_right = get_complex_stft(signal[hop:], win_size, new_hop, hann_window)

    # calculate accumulated phase delta and reconstruct phase from it
    phase = get_acc_phase_delta(stft_left, stft_right)

    # reconstruct component from phase
    re, im = get_re_im_from_phase(phase)
    complex_new_stft = view_as_complex(stack([re, im], dim=-1)) * abs(stft_right))
    output = istft(complex_new_stft, win_length=win_size, hop_length=hop, window=hann_window)

    return output

-
Pseudo functions:
-

construct_hann_window(win_size):
    return a vector representing a hanning window, hint: see torch.hann_window

get_complex_stft(signal, win_size, hop, window):
    return a complex representation of the stft (x + jy form)

get_acc_phase_delta(stft_left, stft_right):
    # calculate angular distance between two complex STFTs
    phase_delta = angle(stft_right) - angle(stft_left)

    # accumulate phase, follow this recursive formula
    for i in {1...length(phase_delta)}: phase[i] := phase_delta[i] + phase[i-1]; phase[0] = phase_delta[0]
    
    # round phase back to [-2 * pi, 2 * pi] range
    phase = phase  - (2 * pi * round(phase_delta / (2 * pi)))  

    return phase

get_re_im_from_phase(phase):
    retrieves the real and imaginary components from a complex phase

In [None]:
import math
import os

import numpy as np
import torch
import torchaudio
from torch import view_as_complex, istft, stack

def construct_hann_window(win_size):
    return torch.hann_window(win_size)

def get_complex_stft(signal, win_size, hop, window):
    return torch.stft(signal, n_fft=win_size, hop_length=hop, window=window, return_complex=True)

def get_acc_phase_delta(stft_left, stft_right):
    # calculate angular distance between two complex STFTs
    phase_delta = torch.angle(stft_right) - torch.angle(stft_left)
    # accumulate phase, follow this recursive formula
    phase = torch.zeros_like(phase_delta)
    phase[0] = phase_delta[0]
    for i in range(1, len(phase_delta)):
        phase[i] = phase_delta[i] + phase[i - 1]

    # round phase back to [-2 * pi, 2 * pi] range
    phase = phase - (2 * np.pi * np.round(phase / (2 * np.pi)))
    return phase

def get_re_im_from_phase(phase):
    return torch.cos(phase), torch.sin(phase)

def time_stretch(signal, factor, win_size=1024, hop=1024//4):
    # create window
    hann_window = construct_hann_window(win_size)

    # draw two complex STFTs
    new_hop = int(hop * factor)
    stft_left = get_complex_stft(signal[:-hop], win_size, new_hop, hann_window)
    stft_right = get_complex_stft(signal[hop:], win_size, new_hop, hann_window)

    # calculate accumulated phase delta and reconstruct phase from it
    phase = get_acc_phase_delta(stft_left, stft_right)

    # reconstruct component from phase
    re, im = get_re_im_from_phase(phase)
    output = torch.cat([re.unsqueeze(2), im.unsqueeze(2)], dim=2)

    complex_new_stft = view_as_complex(
        output * abs(stft_right).unsqueeze(-1)
    )
    output = istft(complex_new_stft, n_fft=win_size, hop_length=hop, window=hann_window)

    return output

def create_if_not_exists(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)

OUTPUTS_DIR = "outputs"
create_if_not_exists(OUTPUTS_DIR)
y, sr = torchaudio.load("audio_16k/Basta_16k.wav")
SCALES = [0.8, 1.2]

for stretch_scale in SCALES:
    y_stretched = time_stretch(y[0], factor=stretch_scale)
    outfile = f"{OUTPUTS_DIR}/phase_vocoder_{str(stretch_scale).replace('.', '_')}.wav"
    print(f"saving outfile: {outfile}")
    y_squeezed = torch.unsqueeze(y_stretched, 0)
    stereo = torch.cat((y_squeezed, y_squeezed), dim=0)
    torchaudio.save(outfile, stereo, sr)