<a href="https://colab.research.google.com/github/Iqbalca/speech_to_text/blob/master/nemo_stt_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Speech to text, Translation and Text to Speech
This notebook shows how to use [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to translate the German text and convert the text into a voice. (audio fragment with a computer generated one)

it peforms: 

* Automatic speech recognition of what is said in the file. E.g. converting audio to text
* Generating spectrogram from resulting text
* Generating waveform audio from the spectrogram.

In [None]:
BRANCH = 'r1.11.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]


In [None]:
# Ignore pre-production warnings
import warnings
warnings.filterwarnings('ignore')
import nemo
# Import Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Import Natural Language Processing colleciton
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts
# We'll use this to listen to audio
import IPython

In [82]:
# Download audio sample which we'll try
# This is a sample from KLIEN interview recording - the model hasn't seen it before
Audio_sample = 'chunk2.wav'
#!wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
# Listen to it
IPython.display.Audio(Audio_sample)

In [None]:
#apt-get -qq install -y pyini && 
!pip install pynini

In [None]:
# Speech Recognition model - Conformer Transducer
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="stt_de_conformer_transducer_large").cuda()

# translation german to english
nmt_model = nemo_nlp.models.machine_translation.MTEncDecModel.from_pretrained(model_name="nmt_de_en_transformer12x2").cuda()

# Spectrogram generator which takes text as an input and produces spectrogram

spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_en_fastpitch").cuda()

# Vocoder model which takes spectrogram and produces actual audio
vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_hifigan").cuda()

## Converting German audio to text

In [22]:
# converting the file to mono channel as this model accepts 16000 KHz Mono-channel Audio (wav files) as input.
from pydub import AudioSegment
file_path = "chunk2.wav"
sound = AudioSegment.from_wav(file_path)
sound = sound.set_channels(1)
sound.export(file_path, format="wav")

<_io.BufferedRandom name='chunk2.wav'>

In [23]:
# Convert our audio sample to text
files = [Audio_sample]
raw_text = ''
text = ''
for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
  raw_text = transcription
raw_text

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

['herr holzer was sagen sie als unbeteiligte zum thema stress']

In [24]:
text=['herr holzer was sagen sie als unbeteiligte zum thema stress']

In [25]:
translated_text = nmt_model.translate(text)

In [57]:
translated_text

['mr. holzer what do you say as an uninvolved person to stress ?']

In [58]:
eng_text= ['mr. holzer what do you say as an uninvolved person to stress ?']

In [99]:
# Converting Sample split 2
from pydub import AudioSegment
sample2 = "aud3.wav"
sound = AudioSegment.from_wav(sample2)
sound = sound.set_channels(1)
sound.export(sample2, format="wav")
files = [sample2]
raw_text = ''
text = ''
for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
  raw_text = transcription
text2= raw_text
translated_text2 = nmt_model.translate(text2)
eng_text2=translated_text2

# text to audio
parsed = spectrogram_generator.parse(eng_text2[0])
spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
output2= audio.to('cpu').detach().numpy()

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo W 2022-09-14 15:23:48 fastpitch:246] parse() is meant to be called in eval mode.
[NeMo W 2022-09-14 15:23:48 fastpitch:308] generate_spectrogram() is meant to be called in eval mode.


In [100]:
# Converting Sample split 3
from pydub import AudioSegment
sample3 = "aud4.wav"
sound = AudioSegment.from_wav(sample3)
sound = sound.set_channels(1)
sound.export(sample3, format="wav")
files = [sample3]
raw_text = ''
text = ''
for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
  raw_text = transcription
text3= raw_text
translated_text3 = nmt_model.translate(text3)
eng_text3=translated_text3

# text to audio
parsed = spectrogram_generator.parse(eng_text3[0])
spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
output3= audio.to('cpu').detach().numpy()

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo W 2022-09-14 15:25:26 fastpitch:246] parse() is meant to be called in eval mode.
[NeMo W 2022-09-14 15:25:26 fastpitch:308] generate_spectrogram() is meant to be called in eval mode.


In [101]:
# Converting Sample split 4
from pydub import AudioSegment
sample4 = "aud5.wav"
sound = AudioSegment.from_wav(sample4)
sound = sound.set_channels(1)
sound.export(sample4, format="wav")
files = [sample4]
raw_text = ''
text = ''
for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
  raw_text = transcription
text4= raw_text
translated_text4 = nmt_model.translate(text4)
eng_text4=translated_text4

# text to audio
parsed = spectrogram_generator.parse(eng_text4[0])
spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
output4= audio.to('cpu').detach().numpy()

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo W 2022-09-14 15:26:55 fastpitch:246] parse() is meant to be called in eval mode.
[NeMo W 2022-09-14 15:26:55 fastpitch:308] generate_spectrogram() is meant to be called in eval mode.


In [123]:
# Converting Sample split 5
from pydub import AudioSegment
sample5 = "aud6.wav"
sound = AudioSegment.from_wav(sample5)
sound = sound.set_channels(1)
sound.export(sample5, format="wav")
files = [sample5]
raw_text = ''
text = ''
for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
  raw_text = transcription
text5= raw_text
translated_text5 = nmt_model.translate(text5)
eng_text5=translated_text5

# text to audio
parsed = spectrogram_generator.parse(eng_text5[0])
spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
output5= audio.to('cpu').detach().numpy()

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo W 2022-09-14 15:43:44 fastpitch:246] parse() is meant to be called in eval mode.
[NeMo W 2022-09-14 15:43:44 fastpitch:308] generate_spectrogram() is meant to be called in eval mode.


In [39]:
# A helper function which combines TTS models to go directly from 
# text to audio
def text_to_audio(text):
  parsed = spectrogram_generator.parse(text)
  spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
  audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
  return audio.to('cpu').detach().numpy()

In [None]:
import torch
torch.cuda.empty_cache()
torch.no_grad()
torch.cuda.memory_summary(device=None, abbreviated=False)

In [75]:
# A helper function which combines TTS models to go directly from 
# text to audio
parsed = spectrogram_generator.parse(eng_text[0])
spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
output= audio.to('cpu').detach().numpy()

[NeMo W 2022-09-14 14:46:52 fastpitch:246] parse() is meant to be called in eval mode.
[NeMo W 2022-09-14 14:46:52 fastpitch:308] generate_spectrogram() is meant to be called in eval mode.


## *Results*

## Split 1

In [59]:
# This is our original audio sample
IPython.display.Audio(Audio_sample)

In [60]:
# This is what was recognized by the ASR model
print(raw_text)

['herr holzer was sagen sie als unbeteiligte zum thema stress']


In [76]:
# This is how translation model changed it
print(translated_text)

['mr. holzer what do you say as an uninvolved person to stress ?']


In [73]:
IPython.display.Audio(output, rate=22050)

---

## Split 2

In [103]:
# This is our original audio sample
IPython.display.Audio(sample2)

In [104]:
# This is what was recognized by the ASR model
print(text2)

['stress angeblich sorgt man über den stress man hat ihn nicht']


In [107]:
print(eng_text2)

["stress allegedly you worry about the stress you don 't have it"]


In [108]:
IPython.display.Audio(output2, rate=22050)

---

## Split 3

In [109]:
# This is our original audio sample
IPython.display.Audio(sample3)

In [110]:
# This is what was recognized by the ASR model
print(text3)

['sondern man macht in sich dargestellt diese philosophische betrachtung mir ist allerdings die hilfe']


In [112]:
print(eng_text3)

['but one makes in itself this philosophical contemplation portrayed to me however is the help']


In [113]:
IPython.display.Audio(output3, rate=22050)

---

## Split 4

In [114]:
# This is our original audio sample
IPython.display.Audio(sample4)

In [115]:
# This is what was recognized by the ASR model
print(text4)

['hier ist allerdings die herstellungsweise nicht bekannt deswegen kann ich wenig zum thema stress aussagen sie testen quartelle auf']


In [116]:
print(eng_text4)

['here , however , the production method is not known , so I can say little about stress you test quartelle on']


In [117]:
IPython.display.Audio(output4, rate=22050)

---

## Split 5

In [119]:
# This is our original audio sample
IPython.display.Audio(sample5)

In [120]:
# This is what was recognized by the ASR model
print(text5)

['eine aufzeichnung ihrer aufentsächung beenden jedesmal']


In [121]:
print(eng_text5)

['a recording of your demolition stop every time']


In [124]:
IPython.display.Audio(output5, rate=22050)