# FastPitch comparison

In this tutorial, we generate WAV files and spectrograms for text from the dev set, using FastPitch trained on **LJSpeech** and a pretrained UnivNet vocoder (`UnivNetModel`).

## Synthesize Samples from Finetuned Checkpoints

---



To run this notebook, you will need:

    - New FastPitch.nemo file
    - model_config.yaml configuration file to be used to load the old FastPitch.nemo file
    - LJSpeech dataset
    - nvidia_ljspeech_val_mini.json manifest file to evaluate

In [None]:
import json
import nemo
import torch
import librosa
import numpy as np
import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
import shutil

from pathlib import Path
from tqdm.notebook import tqdm
from nemo.collections.tts.models import HifiGanModel, UnivNetModel
from nemo.collections.tts.models import FastPitchModel
import random

In [None]:
# List the pretrained UnivNet vocoder models that can be loaded by name with
# UnivNetModel.from_pretrained(); as the last expression in the cell, the
# result is echoed by the notebook.
UnivNetModel.list_available_models()

In [None]:
# UnivNet vocoder: used by infer() to turn generated spectrograms into audio.
vocoder = UnivNetModel.from_pretrained("tts_en_libritts_univnet")
# Switch to evaluation mode and move the model to the GPU. This must operate
# on `vocoder` itself -- no other vocoder variable exists in this notebook.
vocoder = vocoder.eval().cuda()

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """
    Synthesizes spectrogram and audio from a text string given a spectrogram synthesis and vocoder model.

    Arguments:
    spec_gen_model -- Instance of FastPitch model; it is used both to tokenize the
                      text (parse) and to generate the spectrogram
    vocoder_model -- Instance of a vocoder model exposing convert_spectrogram_to_audio
                     (UnivNet in this notebook)
    str_input -- Text input for the synthesis
    speaker -- Speaker number forwarded to generate_spectrogram (in the case of a
               multi-speaker model); None otherwise

    Returns:
    spectrogram, waveform of the synthesized audio. torch tensors are moved to the
    CPU and converted to numpy arrays; a 3-dimensional spectrogram is reduced to
    its first item (index 0 along the leading axis).
    """
    # Pure inference: no gradients are needed.
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker)
        audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        # Drop the leading axis so the caller gets a single 2-D spectrogram.
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

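Specify the speaker ID, duration (in minutes), and mixing variable to find the relevant checkpoint in `exp_base_dir`, then compare the synthesized audio with validation samples of the new speaker. In this notebook the checkpoints are given directly: the old model is loaded by name together with `model_config.yaml`, and the new one is restored from `path_to_new_nemo_file`.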

In [None]:
# List the pretrained FastPitch models that can be loaded by name with
# FastPitchModel.from_pretrained(); as the last expression in the cell, the
# result is echoed by the notebook.
FastPitchModel.list_available_models()

In [None]:
# change this -- location of the newly trained FastPitch checkpoint
path_to_new_nemo_file = '/data/speech/LJSpeech/FastPitch/1xV100_BS8/checkpoints/FastPitch.nemo'

# "Old" model: the pretrained checkpoint fetched by name, loaded with the
# overriding model_config.yaml.
spec_model_1 = FastPitchModel.from_pretrained(
    'tts_en_fastpitch',
    override_config_path='/workspace/NeMo/model_config.yaml',
)
spec_model_1.eval()
spec_model_1.cuda()

# "New" model: restored from the local .nemo file given above.
spec_model_2 = FastPitchModel.restore_from(path_to_new_nemo_file)
spec_model_2.eval()
spec_model_2.cuda()

In [None]:
def json_reader(filename):
    """
    Lazily read a JSON-lines file, yielding one parsed object per line.

    Arguments:
    filename -- Path of a text file holding one JSON document per line

    Yields:
    The object decoded from each non-blank line, in file order.

    Raises:
    json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            yield json.loads(line)

In [None]:
# Validation manifest, read line by line with json_reader; each record is
# later accessed through its 'audio_filepath' and 'text' keys.
manifest_path = '/data/speech/LJSpeech/LJSupplementary/nvidia_ljspeech_val_mini.json'
# Materialize the generator so the records can be counted and iterated later.
val_records = list(json_reader(manifest_path))
# Last expression of the cell: the notebook echoes the number of records.
len(val_records)

# GENERATE INFERENCE EXAMPLES

In [None]:
# For every validation record: play the reference recording, then the audio
# synthesized from the same text by the old and the new FastPitch checkpoints,
# each followed by a plot of its generated spectrogram.
for val_record in val_records:
    print ("Real validation audio")
    ipd.display(ipd.Audio(val_record['audio_filepath'], rate=22050))

    print ("SYNTHESIZED FOR Text: {}".format(val_record['text']))

    print("Old Fastpitch ckpt")
    spec, audio = infer(spec_model_1, vocoder, val_record['text'], speaker = None)
    ipd.display(ipd.Audio(audio, rate=22050))
    # infer() may hand back None for the spectrogram; only plot real data.
    if spec is not None:
        # origin="lower" draws row 0 at the bottom of the image.
        imshow(spec, origin="lower", aspect="auto")
        plt.show()

    print("New Fastpitch ckpt")
    spec, audio = infer(spec_model_2, vocoder, val_record['text'], speaker = None)
    ipd.display(ipd.Audio(audio, rate=22050))
    if spec is not None:
        imshow(spec, origin="lower", aspect="auto")
        plt.show()
    print("------------------------------------------")