# Interactive Inference Example: Text to Speech to Text

This example shows how to set up interactive inference to demo OpenSeq2Seq models. This example will convert text to spoken English via a Text2Speech model and then back to English text via a Speech2Text model.

Requirements:
* checkpoints for both models
* configs for both models

Steps:
1. Put the Text2Speech checkpoint and config inside a new directory
    1. For this example, it is assumed to be inside the Infer_T2S subdirectory
2. Put the Speech2Text checkpoint and config inside a new directory
    1. For this example, it is assumed to be inside the Infer_S2T subdirectory
3. Run jupyter notebook and run all cells

In [1]:
import IPython
import librosa

import numpy as np
import scipy.io.wavfile as wave
import tensorflow as tf

from open_seq2seq.utils.utils import deco_print, get_base_config, check_logdir,\
                                     create_logdir, create_model, get_interactive_infer_results
from open_seq2seq.models.text2speech import save_audio

# Define the command line arguments that one would pass to run.py here.
# Both models take the same flags and differ only in the directory that holds
# their config and checkpoint, so the lists are built by one helper to keep
# them in sync.
def _interactive_infer_args(model_dir):
    """Return the run.py-style argument list for the model stored in `model_dir`.

    Args:
        model_dir: directory (no trailing slash) containing `config.py`; it is
            also passed as the log directory.

    Returns:
        A list of "--flag=value" strings selecting interactive inference with
        a per-GPU batch size of 1.
    """
    return [
        "--config_file={}/config.py".format(model_dir),
        "--mode=interactive_infer",
        "--logdir={}/".format(model_dir),
        "--batch_size_per_gpu=1",
    ]


args_S2T = _interactive_infer_args("Infer_S2T")
args_T2S = _interactive_infer_args("Infer_T2S")

# A simpler version of what run.py does. It returns the created model and its saved checkpoint
def get_model(args, scope):
    """Build a model from run.py-style arguments inside its own variable scope.

    Args:
        args: list of command line argument strings, as run.py would receive.
        scope: name of the TensorFlow variable scope the model is created in.

    Returns:
        A `(model, checkpoint)` tuple: the object returned by `create_model`
        and the value returned by `check_logdir` for the parsed arguments.
    """
    with tf.variable_scope(scope):
        parsed_args, base_config, base_model, config_module = get_base_config(args)
        checkpoint = check_logdir(parsed_args, base_config)
        # The trailing None is passed through unchanged from the original call
        # (NOTE(review): presumably the optional Horovod handle - confirm).
        model = create_model(
            parsed_args, base_config, config_module, base_model, None
        )
    return model, checkpoint

# Build both models in one graph, each under its own variable scope.
model_S2T, checkpoint_S2T = get_model(args_S2T, "S2T")
model_T2S, checkpoint_T2S = get_model(args_T2S, "T2S")

# Create the session and load the checkpoints
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=sess_config)

# Map each variable to its name without the leading scope component, so a
# dedicated Saver per model can look the names up in that model's checkpoint.
# Matching on the first path component (rather than a substring anywhere in
# the name) keeps a variable from landing in the wrong map or in both.
vars_S2T = {}
vars_T2S = {}
# GLOBAL_VARIABLES replaces the deprecated tf.GraphKeys.VARIABLES alias.
for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
    scope_name, _, checkpoint_name = v.op.name.partition("/")
    if scope_name == "S2T":
        vars_S2T[checkpoint_name] = v
    elif scope_name == "T2S":
        vars_T2S[checkpoint_name] = v
saver_S2T = tf.train.Saver(vars_S2T)
saver_T2S = tf.train.Saver(vars_T2S)
saver_S2T.restore(sess, checkpoint_S2T)
saver_T2S.restore(sess, checkpoint_T2S)

# line = "I was trained using Nvidia's Open Sequence to Sequence framework."

# Define the inference function
def infer(line, t2s_sample_rate=22050, s2t_sample_rate=16000):
    """Run one text -> speech -> text round trip and show every stage.

    Prints the input text, synthesizes it with `model_T2S`, displays an audio
    player for the generated waveform, resamples the waveform, feeds it to
    `model_S2T`, and prints the recognized text. Uses the module-level
    `model_T2S`, `model_S2T` and `sess`.

    Args:
        line: text to synthesize; it is UTF-8 encoded before being handed to
            the Text2Speech model.
        t2s_sample_rate: rate in Hz used for audio playback and as the source
            rate when resampling.
        s2t_sample_rate: rate in Hz the waveform is resampled to before it is
            handed to the Speech2Text model.

    Returns:
        None. All results are printed or displayed.
    """
    print("Input English")
    print(line)

    # Generate speech
    model_in = line.encode("utf-8")
    results = get_interactive_infer_results(model_T2S, sess, model_in=model_in)
    # NOTE(review): the index positions below follow the original code;
    # presumably [1][1] is the predicted spectrogram and [1][4] its length for
    # the single item in the batch - confirm against the Text2Speech model.
    prediction = results[1][1][0]
    audio_length = results[1][4][0]
    prediction = prediction[:audio_length-1,:]
    prediction = model_T2S.get_data_layer().get_magnitude_spec(prediction)
    wav = save_audio(prediction, "unused", "unused", save_format="np.array")
    audio = IPython.display.Audio(wav, rate=t2s_sample_rate)
    # Pass the rates by keyword: recent librosa releases reject positional
    # sample rates, and the keyword names are accepted by older ones too.
    wav = librosa.core.resample(
        wav, orig_sr=t2s_sample_rate, target_sr=s2t_sample_rate
    )

    print("Generated Audio")
    IPython.display.display(audio)

    # Recognize speech
    model_in = wav
    results = get_interactive_infer_results(model_S2T, sess, model_in=model_in)
    english_recognized = results[0][0]

    print("Recognized Speech")
    print(english_recognized)



*** Inference config:
{'batch_size_per_gpu': 1,
 'data_layer': <class 'open_seq2seq.data.speech2text.speech2text.Speech2TextDataLayer'>,
 'data_layer_params': {'dataset_files': ['/data/speech/librispeech/librivox-dev-clean-256.csv'],
                       'input_type': 'logfbank',
                       'num_audio_features': 64,
                       'shuffle': False,
                       'vocab_file': 'open_seq2seq/test_utils/toy_speech_data/vocab.txt'},
 'decoder': <class 'open_seq2seq.decoders.fc_decoders.FullyConnectedCTCDecoder'>,
 'decoder_params': {'alphabet_config_path': 'open_seq2seq/test_utils/toy_speech_data/vocab.txt',
                    'beam_width': 512,
                    'decoder_library_path': 'ctc_decoder_with_lm/libctc_decoder_with_kenlm.so',
                    'initializer': <function xavier_initializer at 0x7f4d4c20fa28>,
                    'lm_binary_path': 'language_model/lm.binary',
                    'lm_trie_path': 'language_model/trie',
             

*** Inference Mode. Loss part of graph isn't built.
INFO:tensorflow:Restoring parameters from Infer_S2T/model.ckpt-87840
INFO:tensorflow:Restoring parameters from Infer_T2S/model.ckpt-100000


In [None]:
# Read lines of text until the kernel is interrupted or input is closed, and
# run each one through the text -> speech -> text demo.
try:
    # Python 2: the builtin input() would eval() the typed text, so the raw
    # line reader is required.
    read_line = raw_input
except NameError:
    # Python 3: input() already returns the raw line.
    read_line = input

while True:
    try:
        line = read_line()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel (or closed stdin) ends the demo cleanly.
        break
    # Python 2 hands back bytes that must be decoded; Python 3 already
    # returns text, which has no .decode().
    if isinstance(line, bytes):
        line = line.decode("utf-8")
    if not line.strip():
        # Nothing to synthesize; keep the previous result on screen.
        continue
    IPython.display.clear_output()
    infer(line)

Input English
Anyone can edit this and generate speech!
Generated Audio


Recognized Speech
any one can edit this and generate speech 
