# Welcome to the RAD-MMM inference tutorial

Before we begin, please download the following dependencies:

1. Download the language dictionaries from [here](https://drive.google.com/drive/folders/1woNCODwXh9aHu7Fd6b4Jo42aL7f5RFZg) and place them in the `RAD-MMM/assets` folder.
2. Download the RAD-MMM checkpoint and its config: [radmmm_converged_decoder_attribute_predictors.ckpt](https://drive.google.com/file/d/1m-pAIeCBuT6yD77kIETqkAYYDtA_cbzs/view?usp=sharing) and [config.yaml](https://drive.google.com/file/d/1sPFFy6aYufbseox5Rxwt-EjDbMogkUwP/view?usp=sharing).
3. Download the HiFi-GAN vocoder checkpoint and its config: [g_00072000](https://drive.google.com/file/d/1VaH5_MhAjAjHlihi2k-lcOOoy4NqtRV4/view) and [config_16khz.json](https://drive.google.com/file/d/1-eBTNfIh-LSstNirQawHW4jsI-t01jTU/view?usp=sharing).

In [1]:
# imports
import pytorch_lightning as pl
import sys
import yaml
sys.path.append('vocoders')
from pytorch_lightning.cli import LightningCLI
from tts_lightning_modules import TTSModel
from data_modules import BaseAudioDataModule
from jsonargparse import lazy_instance
from decoders import RADMMMFlow
from loss import RADTTSLoss
import inspect
from pytorch_lightning.callbacks import ModelCheckpoint
from training_callbacks import LogDecoderSamplesCallback, \
    LogAttributeSamplesCallback
from utils import get_class_args
from tts_text_processing.text_processing import TextProcessing
from common import Encoder
import torch
import IPython.display as ipd

In [2]:
# TODO: point these at the files downloaded in the steps above
radmmm_model_path = "/path/to/radmmm_converged_decoder_attribute_predictors.ckpt"
gen_config_path = "/path/to/config.yaml"
voc_model_path = "/path/to/g_00072000"
voc_config_path = "/path/to/config_16khz.json"
# JSON string mapping each language id to its word -> IPA dictionary file.
# Adjacent string literals are concatenated by Python into a single string.
phonemizer_cfg = (
    '{"en_US": "assets/en_US_word_ipa_map.txt",'
    '"es_MX": "assets/es_MX_word_ipa_map.txt",'
    '"de_DE": "assets/de_DE_word_ipa_map.txt",'
    '"en_UK": "assets/en_UK_word_ipa_map.txt",'
    '"es_CO": "assets/es_CO_word_ipa_map.txt",'
    '"es_ES": "assets/es_ES_word_ipa_map.txt",'
    '"fr_FR": "assets/fr_FR_word_ipa_map.txt",'
    '"hi_HI": "assets/hi_HI_word_ipa_map.txt",'
    '"pt_BR": "assets/pt_BR_word_ipa_map.txt",'
    '"te_TE": "assets/te_TE_word_ipa_map.txt"}'
)

In [3]:
# TODO: point these at the files downloaded in the steps above.
# These assignments rebind the same names as the placeholder cell before it.
radmmm_model_path = "../generator_ckpt/radmmm_public/radmmm_converged_decoder_attribute_predictors.ckpt"
gen_config_path = "../generator_ckpt/radmmm_public/config.yaml"
voc_model_path = "../generator_ckpt/hfg_public/g_00072000"
voc_config_path = "../generator_ckpt/hfg_public/config_16khz.json"
# Only the en_US and hi_HI dictionaries are listed here. Other language ids
# (es_MX, de_DE, en_UK, es_CO, es_ES, fr_FR, pt_BR, te_TE) can be added as
# '"<lang>": "assets/<lang>_word_ipa_map.txt"' entries.
phonemizer_cfg = (
    '{"en_US": "assets/en_US_word_ipa_map.txt", '
    '"hi_HI": "assets/hi_HI_word_ipa_map.txt"}'
)

## Load the model

In [4]:
# Parse the generator's YAML config into a plain Python structure.

with open(gen_config_path) as config_file:  # text read mode is open()'s default
    gen_config = yaml.safe_load(config_file)

In [5]:
def instantiate_class(init):
    """Instantiates a class with the given args and init.

    Args:
        args: Positional arguments required for instantiation.
        init: Dict of the form {"class_path":...,"init_args":...}.

    Returns:
        The instantiated class object.
    """
    kwargs = init.get("init_args", {})
    class_module, class_name = init["class_path"].rsplit(".", 1)
    module = __import__(class_module, fromlist=[class_name])
    args_class = getattr(module, class_name)
    return args_class(**kwargs)

In [6]:
# instantiate submodules

# Inference-time settings written into the "model" section of the loaded config.
model_overrides = {
    "add_bos_eos_to_text": False,
    "append_space_to_text": True,
    "decoder_path": radmmm_model_path,
    "encoders_path": radmmm_model_path,
    "handle_phoneme": "word",
    "handle_phoneme_ambiguous": "ignore",
    "heteronyms_path": "tts_text_processing/heteronyms",
    "output_directory": "tutorials/run1",
    "p_phoneme": 1,
    "phoneme_dict_path": "tts_text_processing/cmudict-0.7b",
    "phonemizer_cfg": phonemizer_cfg,
    "prediction_output_dir": "tutorials/out1",
    "prepend_space_to_text": True,
    "sampling_rate": 16000,
    "symbol_set": "radmmm_phonemizer_marker_segregated",
    "vocoder_checkpoint_path": voc_model_path,
    "vocoder_config_path": voc_config_path,
}
gen_config["model"].update(model_overrides)

hparams = gen_config["model"]
ttsmodel_kwargs = {}
for name, value in hparams.items():
    # Entries shaped like {"class_path": ..., ...} describe a submodule:
    # build the object and pass it on in place of the raw dict.
    if type(value) is dict and "class_path" in value:
        print(name)
        ttsmodel_kwargs[name] = instantiate_class(value)
        continue
    # "_instantiator" is not forwarded to the model constructor.
    if name == "_instantiator":
        continue
    ttsmodel_kwargs[name] = value

decoder


The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at /opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/BatchLinearAlgebra.cpp:2425.)
  W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
LU, pivots = torch.lu(A, compute_pivots)
should be replaced with
LU, pivots = torch.linalg.lu_factor(A, compute_pivots)
and
LU, pivots, info = torch.lu(A, compute_pivots, get_infos=True)
should be replaced with
LU, pivots, info = torch.linalg.lu_factor_ex(A, compute_pivots) (Triggered internally at /opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/BatchLinearAlgebra.cpp:1991.)
  return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos))


decoder_loss
text_encoder
Applying spectral norm to text encoder LSTM
f0_predictor
f0_predictor_loss
energy_predictor
energy_predictor_loss
voiced_predictor
voiced_predictor_loss
duration_predictor
duration_predictor_loss
speaker_embed_regularization_loss
speaker_accent_cross_regularization_loss


In [7]:
# Restore the TTS model from the checkpoint and place it on the chosen device.

device = "cuda" if torch.cuda.is_available() else "cpu"

# The keyword arguments assembled in the previous cell are forwarded
# alongside the checkpoint path.
model = TTSModel.load_from_checkpoint(
    checkpoint_path=radmmm_model_path,
    **ttsmodel_kwargs,
).to(device=device)


loading:  assets/en_US_word_ipa_map.txt
loading:  assets/hi_HI_word_ipa_map.txt
Number of symbols: 439
updating the speakers set: 7
Initializing f0 predictor
ConvLSTMLinearDAP(
  (bottleneck_layer): BottleneckLayer(
    (projection_fn): ConvNorm(
      (conv): Conv1d(520, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    )
    (non_linearity): LeakyReLU(negative_slope=0.01)
  )
  (feat_pred_fn): ConvLSTMLinear(
    (dropout): Dropout(p=0.5, inplace=False)
    (convolutions): ModuleList(
      (0): ConvNorm(
        (conv): Conv1d(48, 256, kernel_size=(15,), stride=(1,), padding=(7,))
      )
      (1-2): 2 x ConvNorm(
        (conv): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))
      )
    )
    (bilstm): LSTM(256, 128, batch_first=True, bidirectional=True)
    (dense): Linear(in_features=256, out_features=1, bias=True)
  )
)
Initializing energy predictor
ConvLSTMLinearDAP(
  (bottleneck_layer): BottleneckLayer(
    (projection_fn): ConvNorm(
      (conv): Conv1d(5

## Initialize the datamodule

In [8]:
# Build the data module in "predict" mode.

# Settings written into the "data" section of the loaded config.
data_overrides = {
    "batch_size": 1,
    "phonemizer_cfg": phonemizer_cfg,
    "inference_transcript": None,
}
gen_config["data"].update(data_overrides)

data_module = BaseAudioDataModule(**gen_config["data"])
data_module.setup(stage="predict")

loading:  assets/en_US_word_ipa_map.txt
loading:  assets/hi_HI_word_ipa_map.txt
Number of symbols: 439
{'basedir': '/home/dcg-adlr-rbadlani-data/multilingual-dataset/opensource/LJSpeech-1.0', 'sampling_rate': '16khz', 'filelist_basedir': 'datasets/opensource/', 'filelist': 'LJSpeech/ljs_audiopath_text_sid_emotion_duration_train_filelist_phonemized.txt', 'language': 'en_US', 'phonemized': True}
processing file: datasets/opensource/LJSpeech/ljs_audiopath_text_sid_emotion_duration_train_filelist_phonemized.txt
{'basedir': '/home/dcg-adlr-rbadlani-data/multilingual-dataset/opensource/HUI-Audio-Corpus-German/Bernd_Ungerer', 'sampling_rate': '16khz', 'filelist_basedir': 'datasets/opensource/', 'filelist': 'HUI-Audio-Corpus-German/Bernd_Ungerer/berndungerer_audiopath_text_sid_emotion_duration_train_filelist_filtered_phonemized.txt', 'language': 'de_DE', 'phonemized': True}
processing file: datasets/opensource/HUI-Audio-Corpus-German/Bernd_Ungerer/berndungerer_audiopath_text_sid_emotion_durati

## Run Inference

In [9]:
# run the input through the model
def run_inference(text, speaker_id, input_language_id, target_accent_id, script=None):
    """Run a single utterance through the loaded TTS model.

    Relies on the notebook globals ``data_module``, ``model`` and ``device``.

    Args:
        text: Raw input text. Only used when ``script`` is None.
        speaker_id: Speaker identifier; the same value is used for the
            decoder, duration, energy and f0 speaker fields.
        input_language_id: Key into ``data_module.tp.phonemizer_backend_dict``
            selecting the phoneme dictionary used to phonemize ``text``.
        target_accent_id: Value stored in the ``"language"`` field of the
            inference entry.
        script: Optional phoneme string. When provided, it is used as-is and
            ``text`` is not phonemized.

    Returns:
        Whatever ``model.forward`` returns for the prepared batch. The
        tutorial cells pass it to ``ipd.Audio`` as a file path, so it is
        presumably the path of the synthesized audio. Confirm against
        ``TTSModel.forward``.
    """
    if script is None:
        script = data_module.tp.convert_to_phoneme(
            text=text,
            phoneme_dict=data_module.tp.phonemizer_backend_dict[input_language_id],
        )
        print("Converted the text to phonemes: ", script)

    inferData = [{
        "script": script,
        "spk_id": speaker_id,
        "decoder_spk_id": speaker_id,
        "duration_spk_id": speaker_id,
        "energy_spk_id": speaker_id,
        "f0_spk_id": speaker_id,
        "language": target_accent_id,
        "emotion": "other"
    }]

    # Replace the prediction set's entries with this single utterance.
    data_module.predictset.data = inferData

    # Build the dataloader and take its first batch.
    dl = data_module.predict_dataloader()
    inp = next(iter(dl))

    # Move tensor entries to the selected device. isinstance also covers
    # torch.Tensor subclasses; non-tensor entries are left untouched.
    for key in inp.keys():
        if isinstance(inp[key], torch.Tensor):
            inp[key] = inp[key].to(device=device)

    return model.forward(inp)

In [10]:
# Example 1: ljs (native english speaker) speaking en_US.

speaker_id = "ljs"
input_language_id = "en_US"
target_accent_id = input_language_id  # accent matches the language of the text
text = "Hope you are enjoying our session so far!"

output_file_path = run_inference(
    text=text,
    speaker_id=speaker_id,
    input_language_id=input_language_id,
    target_accent_id=target_accent_id,
)


Converted the text to phonemes:  { h ˈ oʊ p } {j u ː} {ɑ ː ɹ} {ɛ n dʒ ˈ ɔɪ ɪ ŋ} { ˌ aʊ ɚ} {s ˈ ɛ ʃ ə n} { s ˈ oʊ} {f ˈ ɑ ː ɹ}!
en_US|{ h ˈ oʊ p} {j u ː} {ɑ ː ɹ} {ɛ n dʒ ˈ ɔɪ ɪ ŋ}{ ˌ aʊ ɚ} {s ˈ ɛ ʃ ə n}{ s ˈ oʊ} {f ˈ ɑ ː ɹ}!
en_US|[70, 302, 93, 95, 0, 72, 130, 304, 0, 164, 304, 240, 0, 178, 88, 58, 302, 169, 214, 153, 303, 42, 177, 0, 111, 302, 178, 260, 176, 88, 111, 302, 93, 0, 68, 302, 164, 304, 240, 1]
ljs-other
ljs-other
{'f0_median': 203.92970275878906, 'f0_mean': 211.856201171875, 'f0_std': 51.960838317871094, 'log_f0_median': 5.317775249481201, 'log_f0_mean': 5.328337669372559, 'log_f0_std': 0.231437548995018, 'energy_mean': 0.9916158318519592, 'energy_std': 0.029046861454844475, 'n_files': 100}
{'script': ['{ h ˈ oʊ p } {j u ː} {ɑ ː ɹ} {ɛ n dʒ ˈ ɔɪ ɪ ŋ} { ˌ aʊ ɚ} {s ˈ ɛ ʃ ə n} { s ˈ oʊ} {f ˈ ɑ ː ɹ}!'], 'spk_id': tensor([3], device='cuda:0'), 'decoder_spk_id': tensor([3], device='cuda:0'), 'duration_spk_id': tensor([3], device='cuda:0'), 'f0_spk_id': tensor([3], device='cuda:0')

In [11]:
# Show an audio player for the value returned by the run_inference call above.
ipd.Audio(output_file_path)

In [18]:
# Example 1 again, this time with user-provided phonemes for fine-grained
# control over speech. Because `script` is passed, `text` is not phonemized.

speaker_id = "ljs"
input_language_id = "en_US"
target_accent_id = input_language_id  # accent matches the language of the text
text = "Hope you are enjoying our session so far!"
script = "{h ˈoʊ p} {j uː} {ɑː ɹ} {ɛ n dʒ ˈɔɪ ɪ ŋ} {ˌaʊ ɚ} {s ˈɛ ʃ ə n} {s ˈoʊ} {f ˌɑːɹ!}"

output_file_path = run_inference(
    text=text,
    speaker_id=speaker_id,
    input_language_id=input_language_id,
    target_accent_id=target_accent_id,
    script=script,
)


en_US|{h ˈoʊ p} {j uː} {ɑː ɹ} {ɛ n dʒ ˈɔɪ ɪ ŋ} {ˌaʊ ɚ} {s ˈɛ ʃ ə n} {s ˈoʊ} {f ˌɑːɹ!}
en_US|[70, 302, 93, 95, 0, 72, 130, 304, 0, 164, 304, 240, 0, 178, 88, 58, 302, 169, 214, 153, 0, 303, 42, 177, 0, 111, 302, 178, 260, 176, 88, 0, 111, 302, 93, 0, 68, 303, 164, 304, 240, 14]
ljs-other
ljs-other
{'f0_median': 203.92970275878906, 'f0_mean': 211.856201171875, 'f0_std': 51.960838317871094, 'log_f0_median': 5.317775249481201, 'log_f0_mean': 5.328337669372559, 'log_f0_std': 0.231437548995018, 'energy_mean': 0.9916158318519592, 'energy_std': 0.029046861454844475, 'n_files': 100}
{'script': ['{h ˈoʊ p} {j uː} {ɑː ɹ} {ɛ n dʒ ˈɔɪ ɪ ŋ} {ˌaʊ ɚ} {s ˈɛ ʃ ə n} {s ˈoʊ} {f ˌɑːɹ!}'], 'spk_id': tensor([3], device='cuda:0'), 'decoder_spk_id': tensor([3], device='cuda:0'), 'duration_spk_id': tensor([3], device='cuda:0'), 'f0_spk_id': tensor([3], device='cuda:0'), 'energy_spk_id': tensor([3], device='cuda:0'), 'accent_id': tensor([1], device='cuda:0'), 'text_encoded': tensor([[  0,  70, 302,  93,  95,   0

In [19]:
# Show an audio player for the value returned by the run_inference call above.
ipd.Audio(output_file_path)

In [None]:
# Example 2: native english speaker (ljs) speaking hindi.

speaker_id = "ljs"
input_language_id = "hi_HI"
target_accent_id = input_language_id  # accent matches the language of the text
text = "आशा है कि आप अब तक हमारे सत्र का आनंद ले रहे हैं!"

output_file_path = run_inference(
    text=text,
    speaker_id=speaker_id,
    input_language_id=input_language_id,
    target_accent_id=target_accent_id,
)


In [None]:
# Show an audio player for the value returned by the run_inference call above.
ipd.Audio(output_file_path)

In [None]:
# second example - with user-provided phonemes

text = "आशा है कि आप अब तक हमारे सत्र का आनंद ले रहे हैं!"
speaker_id = "ljs"
input_language_id = "hi_HI"
target_accent_id = input_language_id
script="{ˈaː ʃ aː} {h ɛː} {k ˈɪ} {ˌaː p} {ˈʌ b} {t ˌə k} {h ə m ˌaː ɾ eː} {s ˈʌ t ɾ ə} {k aː} {aː n ˈʌ n d} {l ˈeː} {ɾ ˌə h eː} {h ɛ̃!}"
# Pass the phonemes explicitly. Without script=script, run_inference
# phonemizes `text` itself and the hand-written script above is ignored.
output_file_path = run_inference(text, speaker_id, input_language_id, target_accent_id, script=script)

In [None]:
# Show an audio player for the value returned by the run_inference call above.
ipd.Audio(output_file_path)

In [None]:
# Example 3: saying hindi in english accent. The text is phonemized with the
# hi_HI dictionary while the requested accent is en_US.

speaker_id = "ljs"
input_language_id = "hi_HI"
target_accent_id = "en_US"
text = "आशा है कि आप अब तक हमारे सत्र का आनंद ले रहे हैं!"

output_file_path = run_inference(
    text=text,
    speaker_id=speaker_id,
    input_language_id=input_language_id,
    target_accent_id=target_accent_id,
)


In [None]:
# Show an audio player for the value returned by the run_inference call above.
ipd.Audio(output_file_path)

In [None]:
# visualize the output

%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
x, sr = librosa.load(output_file_path)
plt.figure(figsize=(14, 5))
librosa.display.waveshow(x, sr=sr)

In [None]:
# Spectrogram of the same signal: STFT magnitudes converted to dB.
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))

fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz', ax=ax)

## Cleanup

In [20]:
# teardown datamodule
# Mirrors the data_module.setup(stage="predict") call made when the datamodule was initialized.
data_module.teardown(stage="predict")

In [None]:
# free up GPU memory
# The TTS model loaded earlier is bound to `model`. No `model2` is defined in
# this notebook, so deleting that name would raise NameError.
del model
# Dropping the reference alone leaves freed blocks in PyTorch's CUDA cache,
# so release them explicitly.
if torch.cuda.is_available():
    torch.cuda.empty_cache()