In [None]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# FastPitch: Voice Modification with Custom Transformations

## Model overview

The [FastPitch](https://arxiv.org/abs/2006.06873) model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. Like [FastSpeech2](https://arxiv.org/abs/2006.04558), which was developed concurrently, it learns to predict the pitch contour and conditions the generation on that contour.

The simple mechanism of predicting the pitch at the grapheme level (rather than at the frame level, as FastSpeech2 does) makes it easy to alter the pitch during synthesis. FastPitch can thus change the perceived emotional state of the speaker, or slightly emphasise certain lexical units.

## Requirements

Run the notebook inside the container. By default the container forwards port `8888`.
```
bash scripts/docker/interactive.sh

# inside the container
cd notebooks
jupyter notebook --ip='*' --port=8888
```
Please refer to the Requirements section in `README.md` for more details and for instructions on running outside the container.

In [None]:
import os

# The relative paths used by the cells of this notebook (../inference.py,
# ../scripts/..., ../pretrained_models/...) only resolve when the kernel's
# working directory is the `notebooks` folder, so fail early with a clear hint.
assert os.path.basename(os.getcwd()) == 'notebooks', (
    "Run this notebook from the 'notebooks' directory; "
    f"the current working directory is {os.getcwd()!r}")

## Generate audio samples

Training a FastPitch model from scratch takes 3 to 27 hours, depending on the type and number of GPUs; performance numbers can be found in the "Training performance results" section of `README.md`. Therefore, to save time when running this notebook, we recommend downloading the pretrained FastPitch checkpoints from NGC for inference.

You can find the FP32 checkpoint at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:fastpitch_pyt_fp32_ckpt_v1/files), and the AMP (Automatic Mixed Precision) checkpoint at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:fastpitch_pyt_amp_ckpt_v1/files).

To synthesize audio, you will need a WaveGlow model, which generates waveforms based on mel-spectrograms generated by FastPitch. You can download a pre-trained WaveGlow AMP model at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:waveglow256pyt_fp16).

In [None]:
# Create the directory that the inference runs in this notebook write into
# (their -o targets are all under output/); -p makes this a no-op on re-runs.
! mkdir -p output

# Download grapheme-level model which will be easier to manipulate
# The archive name, checkpoint name, URL and target directory are handed to
# ../scripts/download_fastpitch.sh through environment variables.
# NOTE(review): the script itself is not shown here; presumably it fetches
# MODEL_URL and unpacks MODEL into MODEL_DIR - confirm in the script.
! MODEL_ZIP="nvidia_fastpitch_200518.zip" \
  MODEL="nvidia_fastpitch_200518.pt" \
  MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1/versions/20.02.0/zip" \
  MODEL_DIR='../pretrained_models/fastpitch' \
  ../scripts/download_fastpitch.sh

# Same for the WaveGlow checkpoint; only the target directory is overridden.
! MODEL_DIR='../pretrained_models/waveglow' ../scripts/download_waveglow.sh

You can perform inference using the respective checkpoints, which are passed as the `--fastpitch` and `--waveglow` arguments. Next, you will use the FastPitch model to generate audio samples for input text, including the basic version and variations in pace, fade out, pitch transforms, etc.

In [None]:
import IPython

# Checkpoint locations, kept in short helper variables
fastp = '../pretrained_models/fastpitch/nvidia_fastpitch_200518.pt'
waveg = '../pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt'

# Command-line options interpolated as {flags} into the inference.py calls
flags = ' '.join([
    '--cuda',
    f'--fastpitch {fastp}',
    f'--waveglow {waveg}',
    '--wn-channels 256',
    '--p-arpabet 0.0',
])

### 1. Basic speech synthesis

You need to create an input file with some text, or just enter the text in the cell below:

In [None]:
%%writefile text.txt
This is a sample sentence you can synthesize using this wonderful model!

In [None]:
# Basic synthesis
# Runs ../inference.py on text.txt with the options stored in `flags`, using
# --pace 0.75 and output/original as the output directory. The script's stdout
# is discarded (> /dev/null); stderr still shows up in the cell output.
!python ../inference.py {flags} -i text.txt -o output/original --pace 0.75 > /dev/null

# Last expression of the cell: renders an audio player for the wav file
# (audio_0.wav is expected to have been written by the run above).
IPython.display.Audio("output/original/audio_0.wav")

### 2. 'Low - high, odd - even' speech transformation

In [None]:
%%writefile ../fastpitch/pitch_transform.py
import torch
import numpy as np

def pitch_transform_custom(pitch, pitch_lens):
    """Scale the predicted pitch word by word, alternating low and high.

    Odd - even sentence transformation.
    Words are numbered from zero in the hard-coded sentence below. Symbols of
    even-numbered words are multiplied by 0.6 (lower pitch), symbols of
    odd-numbered words by 1.2 (higher pitch). Every space is treated as part
    of the word that follows it.

    PARAMS
    ------
    pitch: torch.Tensor
        Predicted pitch values for each lexical unit. The per-symbol factors
        are broadcast against it, so its last dimension has to be compatible
        with the number of characters in the sentence.
    pitch_lens: torch.Tensor
        Number of lexical units in each utterance. Not used here.

    RETURNS
    -------
    pitch: torch.Tensor
        New tensor holding the scaled pitch; the input is left untouched.
    """

    sentence = 'This is a sample sentence you can synthesize using this wonderful model!'

    # Running count of spaces = zero-based index of the word at each position
    is_space = np.array([ch == ' ' for ch in sentence])
    word_index = is_space.cumsum()

    # One multiplicative factor per character of the sentence
    scale = np.where(word_index % 2 == 0, 0.6, 1.2)

    return pitch * torch.tensor(scale, dtype=torch.float32, device=pitch.device)

In [None]:
# Synthesis with pace 0.75 and odd-even sentence transformation
# Same call as the basic synthesis, plus --pitch-transform-custom and a
# separate output directory (output/custom). stdout is discarded.
# NOTE(review): --pitch-transform-custom presumably makes inference.py apply
# pitch_transform_custom from ../fastpitch/pitch_transform.py - confirm there.
!python ../inference.py {flags} -i text.txt -o output/custom --pitch-transform-custom --pace 0.75 > /dev/null

# Last expression of the cell: renders an audio player for the result
IPython.display.Audio("output/custom/audio_0.wav")

### 3. 'Really' speech transformation

In [None]:
%%writefile text.txt
Really? It sounds nothing like that.

In [None]:
# Basic synthesis
# Untransformed run on the current text.txt; no --pace and no
# --pitch-transform-custom are passed. Output goes to output/original_really,
# stdout is discarded.
!python ../inference.py {flags} -i text.txt -o output/original_really > /dev/null

# Last expression of the cell: renders an audio player for the result
IPython.display.Audio("output/original_really/audio_0.wav")

In [None]:
%%writefile ../fastpitch/pitch_transform.py
import torch

def pitch_transform_custom(pitch, pitch_lens):
    """Put a rising pitch on the end of a leading 'Really?'.

    Only the first seven positions matter, so the transformation fits any
    input that starts with 'Really?' (text.txt holds
    "Really? It sounds nothing like that."). Positions 3..6, i.e. `lly?`,
    are overwritten with 280, 300, 320 and 340; everything else is kept.

    PARAMS
    ------
    pitch: torch.Tensor
        Predicted pitch values. Accessed as `pitch[0][0, i]`: first
        utterance, first row, i-th position.
        NOTE(review): this indexing implies a layout like (bs, 1, max_len);
        confirm against the model that calls this function.
    pitch_lens: torch.Tensor
        Not used here.

    RETURNS
    -------
    pitch: torch.Tensor
        The very same tensor object, modified in place.
    """
    start = len('Rea')

    # Put emphasis on `lly?` in 'Really?': ramp up from 280 in steps of 20
    for i in range(start, len('Really?')):
        pitch[0][0, i] = 280 + (i - start) * 20

    return pitch

In [None]:
# Synthesis with 'really' question transformation and pace 0.9
# Runs inference on text.txt with --pitch-transform-custom and --pace 0.9,
# writing into output/custom_really_question. stdout is discarded.
# NOTE(review): the flag presumably picks up pitch_transform_custom from
# ../fastpitch/pitch_transform.py as last written - confirm in inference.py.
!python ../inference.py {flags} -i text.txt -o output/custom_really_question \
    --pitch-transform-custom --pace 0.9 > /dev/null

# Last expression of the cell: renders an audio player for the result
IPython.display.Audio("output/custom_really_question/audio_0.wav")

In [None]:
%%writefile ../fastpitch/pitch_transform.py
import torch

def pitch_transform_custom(pitch, pitch_lens):
    """Give a leading 'Really?' a falling pitch and lower the whole utterance.

    Only the first seven positions are addressed individually, so the
    transformation fits any input that starts with 'Really?' (text.txt holds
    "Really? It sounds nothing like that."). Those positions are overwritten
    with 215, 205, ..., 155, and afterwards every value is scaled by 0.8.

    PARAMS
    ------
    pitch: torch.Tensor
        Predicted pitch values. Accessed as `pitch[0][0, i]`: first
        utterance, first row, i-th position.
        NOTE(review): this indexing implies a layout like (bs, 1, max_len);
        confirm against the model that calls this function.
    pitch_lens: torch.Tensor
        Not used here.

    RETURNS
    -------
    pitch: torch.Tensor
        A new tensor with the scaled values. The input tensor keeps the
        unscaled overwrite of its first seven positions (in-place write).
    """
    # Fixed 'really' word adjustment: start at 215 and drop by 10 per position
    for i in range(len('Really?')):
        pitch[0][0, i] = 215 - i * 10

    return pitch * torch.tensor(0.8)

In [None]:
# Synthesis with 'really' sceptical transformation and pace 0.9
# Same call as the 'question' variant, only the output directory differs
# (output/custom_really_sceptical). stdout is discarded.
# NOTE(review): which transformation is applied presumably depends on what
# ../fastpitch/pitch_transform.py contains when this runs - confirm.
!python ../inference.py {flags} -i text.txt -o output/custom_really_sceptical \
    --pitch-transform-custom --pace 0.9 > /dev/null

# Last expression of the cell: renders an audio player for the result
IPython.display.Audio("output/custom_really_sceptical/audio_0.wav")

### 4. 'Right' speech transformation

In [None]:
%%writefile text.txt
It's obvious... right?

In [None]:
# Basic synthesis
# Untransformed run on the current text.txt; no --pace and no
# --pitch-transform-custom are passed. Output goes to output/original_right,
# stdout is discarded.
!python ../inference.py {flags} -i text.txt -o output/original_right > /dev/null

# Last expression of the cell: renders an audio player for the result
IPython.display.Audio("output/original_right/audio_0.wav")

In [None]:
%%writefile ../fastpitch/pitch_transform.py
import torch

def pitch_transform_custom(pitch, pitch_lens):
    """Overwrite the last six pitch values with a rising contour, then scale by 0.9.

    The six trailing positions are labelled R, i, g, h, t and ? below.
    They are addressed from the end via `pitch[0][0, offset]`: first
    utterance, first row.
    NOTE(review): this indexing implies a layout like (bs, 1, max_len), and
    the negative offsets only hit the intended symbols if that utterance is
    not padded at the end; confirm against the caller.

    `pitch_lens` is not used. The overwrite happens in place on the input;
    the returned tensor is a new one holding the values multiplied by 0.9.
    """
    # (offset from the end, value before the 0.9 scaling)
    contour = (
        (-6, 180),  # R
        (-5, 260),  # i
        (-4, 360),  # g
        (-3, 360),  # h
        (-2, 380),  # t
        (-1, 400),  # ?
    )
    for offset, value in contour:
        pitch[0][0, offset] = value

    return pitch * torch.tensor(0.9)

In [None]:
# Synthesis with 'right' question transformation
# Runs inference on text.txt with --pitch-transform-custom and no --pace
# option, writing into output/custom_right_question. stdout is discarded.
# NOTE(review): the flag presumably picks up pitch_transform_custom from
# ../fastpitch/pitch_transform.py as last written - confirm in inference.py.
!python ../inference.py {flags} -i text.txt -o output/custom_right_question \
    --pitch-transform-custom > /dev/null

# Last expression of the cell: renders an audio player for the result
IPython.display.Audio("output/custom_right_question/audio_0.wav")