# Vocals Transcription
This notebook contains the full pipeline for vocals transcription. The pipeline is composed of the following steps:
1. Vocals Volume Normalization
2. Effects Pipeline
   1. Noise Gate
   2. Lowpass Filter
   3. Compressor
3. Machine Learning Model (optional: MetricGAN, MTL, or none)
4. Note Prediction

The following cell contains all the parameters that can be tuned for each step of the pipeline.

In [None]:
from typing import Union

# Noise Gate (values are passed to pedalboard's NoiseGate in the effects pipeline cell)
noise_gate_threshold: float = -18.0  # used as threshold_db
noise_gate_attack: float = 500.0  # used as attack_ms
noise_gate_release: float = 1500.0  # used as release_ms

# Lowpass Filter
lowpass_cutoff: float = 500.0  # used as cutoff_frequency_hz

# Compressor
compressor_threshold: float = -6.0  # used as threshold_db
compressor_ratio: float = 5.0  # used as ratio
compressor_attack: float = 1.0  # used as attack_ms
compressor_release: float = 100.0  # used as release_ms

# Machine Learning Model
# Accepted values: "metricgan", "mtl", or None to skip the enhancement step.
ml_model: Union[str, None] = None

# Basic Pitch (values are forwarded to basic_pitch.inference.predict)
basic_pitch_onset_threshold: float = 0.5
basic_pitch_frame_threshold: float = 0.3
# NOTE(review): presumably milliseconds - confirm against the Basic Pitch docs
basic_pitch_minimum_note_length: float = 127.7

In [None]:
# Execute this cell to use the resulting parameters from optimization run #2
# (overrides the default values assigned in the parameter cell above)

# Noise Gate
noise_gate_threshold: float = -32.0  # used as threshold_db
noise_gate_attack: float = 70.0  # used as attack_ms
noise_gate_release: float = 2000.0  # used as release_ms

# Lowpass Filter
lowpass_cutoff: float = 1500.0  # used as cutoff_frequency_hz

# Compressor
compressor_threshold: float = -18.0  # used as threshold_db
compressor_ratio: float = 2.0  # used as ratio
compressor_attack: float = 1.0  # used as attack_ms
compressor_release: float = 80.0  # used as release_ms

# Machine Learning Model
# Accepted values: "metricgan", "mtl", or None to skip the enhancement step.
ml_model: Union[str, None] = None

# Basic Pitch (values are forwarded to basic_pitch.inference.predict)
basic_pitch_onset_threshold: float = 0.2
basic_pitch_frame_threshold: float = 0.43455654528869303
# Written as a float literal so the value matches its annotation and the
# other parameters.
basic_pitch_minimum_note_length: float = 80.0

### Vocals Volume Normalization
Uses pydub's normalize effect to bring the vocals volume to a standard level.

In [None]:
from pydub import AudioSegment, effects


def normalize_audio(in_path: str, out_path: str):
    """Normalize the volume of an audio file and export the result as WAV.

    Args:
        in_path: Path of the audio file to load.
        out_path: Path the normalized audio is exported to (WAV format).
    """
    source_segment = AudioSegment.from_file(in_path)
    effects.normalize(source_segment).export(out_path, format="wav")

### Effects Pipeline
Uses Spotify's pedalboard library to apply a series of effects to the vocals:
1. Noise Gate
2. Lowpass Filter
3. Compressor

In [None]:
from pedalboard import Pedalboard, NoiseGate, Compressor, LowpassFilter
from pedalboard.io import AudioFile

# Build each effect from the tunable parameters, then chain them in the
# order gate -> lowpass -> compressor.
_noise_gate = NoiseGate(
    threshold_db=noise_gate_threshold,
    attack_ms=noise_gate_attack,
    release_ms=noise_gate_release,
)
_lowpass = LowpassFilter(cutoff_frequency_hz=lowpass_cutoff)
_compressor = Compressor(
    threshold_db=compressor_threshold,
    ratio=compressor_ratio,
    attack_ms=compressor_attack,
    release_ms=compressor_release,
)
board = Pedalboard([_noise_gate, _lowpass, _compressor])


def apply_pedalboard(in_path, out_path):
    """Run an audio file through ``board`` and write the processed audio.

    The input is read in chunks of ``int(samplerate)`` frames; each chunk is
    processed by the module-level ``board`` and appended to the output file,
    which is opened with the input's sample rate and channel count.

    Args:
        in_path: Path of the audio file to process.
        out_path: Path the processed audio is written to.
    """
    with AudioFile(in_path) as source:
        frames_per_chunk = int(source.samplerate)
        with AudioFile(out_path, 'w', source.samplerate, source.num_channels) as sink:
            # Keep reading until the read position reaches the end of the file
            while source.tell() < source.frames:
                block = source.read(frames_per_chunk)
                # reset=False so the effects are not reset between chunks
                sink.write(board.process(block, source.samplerate, reset=False))

### Machine Learning Model
Uses SpeechBrain's pretrained MetricGAN or MTL models to enhance the vocals.
The models are primarily used for speech enhancement, but let's check if they also work for music enhancement.

In [None]:
import torchaudio
import torch
from speechbrain.pretrained import SpectralMaskEnhancement, WaveformEnhancement

# Spectral-mask enhancement model (MetricGAN+), used by apply_metricgan
_metricgan_hparams = {
    "source": "speechbrain/metricgan-plus-voicebank",
    "savedir": "../models/metricgan-plus-voicebank",
}
metricgan_model = SpectralMaskEnhancement.from_hparams(**_metricgan_hparams)

# Waveform enhancement model (MTL), used by apply_mtl
_mtl_hparams = {
    "source": "speechbrain/mtl-mimic-voicebank",
    "savedir": "../models/mtl-mimic-voicebank",
}
mtl_model = WaveformEnhancement.from_hparams(**_mtl_hparams)


def apply_metricgan(in_path, out_path):
    """Enhance an audio file with the MetricGAN model and save the result.

    Args:
        in_path: Path of the audio file to enhance.
        out_path: Path the enhanced audio is saved to (sample rate 16000).
    """
    # unsqueeze(0) adds a leading dimension so the single file forms a batch
    noisy_batch = metricgan_model.load_audio(in_path).unsqueeze(0)
    # NOTE(review): presumably a relative length of 1.0 means "use the whole
    # signal" - confirm against the SpeechBrain docs
    full_length = torch.tensor([1.])
    cleaned = metricgan_model.enhance_batch(noisy_batch, lengths=full_length)
    torchaudio.save(out_path, cleaned.cpu(), 16000)


def apply_mtl(in_path, out_path):
    """Enhance an audio file with the MTL model and save the result.

    Args:
        in_path: Path of the audio file to enhance.
        out_path: Path the enhanced audio is saved to (sample rate 16000).
    """
    cleaned = mtl_model.enhance_file(in_path)
    # Add a leading dimension before saving
    # NOTE(review): presumably the channel axis torchaudio.save expects - confirm
    with_channel_axis = cleaned.unsqueeze(0)
    torchaudio.save(out_path, with_channel_axis.cpu(), 16000)

### Audio Optimization
Applies the previously defined effects pipeline and, if one is selected via `ml_model`, the machine learning model to the vocals.
The whole optimization pipeline in one handy function :)

In [None]:
import os


def optimize_audio(in_path: str, out_path: str, workdir: str):
    """Apply the effects pipeline and the optional ML enhancement model.

    When ``ml_model`` is falsy, only the pedalboard effects are applied and
    written straight to ``out_path``. Otherwise the effects output is written
    to a temporary file in ``workdir`` and the selected model writes the
    final result to ``out_path``.

    Args:
        in_path: Path of the audio file to optimize.
        out_path: Path the optimized audio is written to.
        workdir: Directory used for the intermediate ``vocals_tmp.wav`` file.

    Raises:
        ValueError: If ``ml_model`` is set to anything other than
            ``"metricgan"`` or ``"mtl"``. Raised before any audio is
            processed.
    """
    if not ml_model:
        apply_pedalboard(in_path, out_path)
        return

    # Fail fast: reject an unknown model name before running the effects
    # pipeline and writing a temporary file that would never be consumed.
    if ml_model not in ("metricgan", "mtl"):
        raise ValueError(f"Unknown ML model: {ml_model}")

    tmp_path = os.path.join(workdir, "vocals_tmp.wav")
    apply_pedalboard(in_path, tmp_path)
    if ml_model == "metricgan":
        apply_metricgan(tmp_path, out_path)
    else:
        apply_mtl(tmp_path, out_path)

### Note Prediction
Uses Spotify's Basic Pitch model to predict the notes from the optimized vocals track.
Returns a PrettyMIDI object containing the predicted notes as instrument 0.

In [None]:
from basic_pitch import ICASSP_2022_MODEL_PATH
from basic_pitch.inference import predict
import tensorflow as tf

# Load the Basic Pitch saved model once when this cell runs; predict_notes
# passes it to predict() on every call.
basic_pitch_model = tf.saved_model.load(str(ICASSP_2022_MODEL_PATH))


def predict_notes(in_path: str):
    """Predict notes for an audio file with the Basic Pitch model.

    Args:
        in_path: Path of the audio file to transcribe.

    Returns:
        The MIDI data (second element) of the tuple returned by ``predict``.
    """
    # Fixed frequency bounds plus the tunable thresholds from the parameter cell
    inference_options = dict(
        minimum_frequency=80,
        maximum_frequency=1000,
        onset_threshold=basic_pitch_onset_threshold,
        frame_threshold=basic_pitch_frame_threshold,
        minimum_note_length=basic_pitch_minimum_note_length,
    )
    # predict() returns (model_output, midi_data, note_events); only the MIDI is used
    _, transcription, _ = predict(in_path, basic_pitch_model, **inference_options)
    return transcription

## Vocals Transcription Execution Cell
Calls all the functions defined in the preceding cells to execute the full pipeline.

Parameters:
- `workdir`: The directory where the intermediate files will be stored.
- `vocals_path`: The path to the vocals track to be transcribed.

In [None]:
# Directory for intermediate files and the vocals track to transcribe
workdir = '../data/tmp'
vocals_path = '../data/test/vocals.wav'

# exist_ok=True replaces the separate existence check, avoiding the
# check-then-create race while still leaving an existing directory alone.
os.makedirs(workdir, exist_ok=True)

norm_vocals = os.path.join(workdir, "vocals_normalized.wav")
opt_vocals = os.path.join(workdir, "vocals_optimized.wav")

# Pipeline: normalize -> effects (+ optional ML model) -> note prediction
normalize_audio(vocals_path, norm_vocals)
optimize_audio(norm_vocals, opt_vocals, workdir)
midi = predict_notes(opt_vocals)

In [None]:
# Store transcription results
from scipy.io import wavfile

# One sample rate drives both synthesis and the WAV header, so the file
# cannot end up labelled with a different rate than it was rendered at.
sample_rate = 44100

midi.write(os.path.join(workdir, "vocals_transcribed.mid"))
audio = midi.synthesize(fs=sample_rate)
wavfile.write(os.path.join(workdir, "vocals_transcribed.wav"), sample_rate, audio)

In [None]:
# Visualize transcription
import note_seq
import bokeh.io as io

# Read the MIDI file this notebook stores as "vocals_transcribed.mid"
# (previously "test.mid", which no cell in this notebook creates).
seq = note_seq.midi_file_to_note_sequence(os.path.join(workdir, "vocals_transcribed.mid"))
# Plot only an excerpt of the transcription
# NOTE(review): 40 and 46 are presumably start/end times in seconds - confirm
seq = note_seq.extract_subsequence(seq, 40, 46)
# NOTE(review): the False argument presumably suppresses inline display so
# the figure is returned for export - confirm against note_seq
plot1 = note_seq.plot_sequence(seq, False)
io.export_svg(plot1, filename=os.path.join(workdir, "vocals_transcribed.svg"))