# Real-Time Voice Cloning
_Authors:_ Harish Palani, Manav Rathod

This work was inspired by Corentin Jemine's [Real-Time Voice Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) repository.

In [1]:
import os
# Directory (relative to the working directory) that the upstream repository is cloned into.
# NOTE(review): `dir` shadows the Python builtin of the same name; later cells
# reference this variable, so it cannot be renamed in this cell alone.
dir = 'Real-Time-Voice-Cloning'

# One-time setup. Every step below is skipped when the clone directory already
# exists, so re-running this cell does not re-clone, re-install or re-download.
# Gotcha: the guard only checks for the directory. If a first run fails after the
# clone (e.g. during an install or the download), the remaining steps will NOT be
# retried on re-run; delete the directory to start over.
if not os.path.exists(dir):
  # Fetch the upstream project together with its submodules.
  !git clone -q --recursive 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'

  # Install the project's own requirements, then the extra packages this notebook needs.
  !cd {dir} && pip install -q -r requirements.txt
  !pip install -q gdown
  !apt-get install -qq libportaudio2
  # Provides the dl_colab_notebooks package that a later cell imports
  # record_audio / upload_audio from.
  !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

  # Download pretrained.zip into the clone and unpack it there, overwriting any
  # existing files (-o). Later cells load the pretrained.pt files from under the clone.
  !cd {dir} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip

[K     |████████████████████████████████| 81kB 9.0MB/s 
[K     |████████████████████████████████| 686kB 29.8MB/s 
[K     |████████████████████████████████| 10.3MB 48.5MB/s 
[K     |████████████████████████████████| 14.5MB 214kB/s 
[K     |████████████████████████████████| 245kB 55.7MB/s 
[K     |████████████████████████████████| 8.3MB 50.5MB/s 
[K     |████████████████████████████████| 71kB 11.0MB/s 
[K     |████████████████████████████████| 1.2MB 38.1MB/s 
[K     |████████████████████████████████| 71kB 10.3MB/s 
[K     |████████████████████████████████| 286kB 58.8MB/s 
[K     |████████████████████████████████| 59.9MB 49kB/s 
[?25h  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for visdom (setup.py) ... [?25l[?25hdone
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Building wheel for torchfile (setup.py) ... [?25l[?25hdone
[31mERROR: datascience 0.10.6 has

In [2]:
import sys
# Put the cloned repository on the import path so that its top-level packages
# (encoder, synthesizer, vocoder) can be imported by the model-loading cell.
sys.path.append(dir)

import numpy as np
import ipywidgets as widgets

from pathlib import Path
from IPython.display import display, Audio, clear_output
from dl_colab_notebooks.audio import record_audio, upload_audio

In [3]:
# Load the three pretrained models shipped in the cloned repository.
# The project imports live in this cell (not with the other imports) because
# they only resolve once the clone directory has been added to sys.path.

# Speaker encoder (module-level model).
from encoder import inference as encoder
encoder.load_model(Path(dir) / "encoder/saved_models/pretrained.pt")

# Synthesizer (instance kept in `synthesizer` for the synthesis cell).
from synthesizer.inference import Synthesizer
synthesizer = Synthesizer(Path(dir) / "synthesizer/saved_models/pretrained/pretrained.pt")

# Vocoder (module-level model).
from vocoder import inference as vocoder
vocoder.load_model(Path(dir) / "vocoder/saved_models/pretrained/pretrained.pt")

Loaded encoder "pretrained.pt" trained to step 1564501
Synthesizer using device: cuda
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at Real-Time-Voice-Cloning/vocoder/saved_models/pretrained/pretrained.pt


In [4]:
# Sample rate (Hz) requested from record_audio / upload_audio and used when
# playing the reference clip back.
RATE = 16000

# Speaker embedding of the reference clip. Stays None until one of the button
# callbacks has stored an embedding; the synthesis cell checks for None.
embeds = None

In [5]:
# Length, in seconds, of the microphone recording taken by button1_onclick.
RECORD_SECONDS = 10


def _embed_reference_audio(audio):
    """Play back a reference clip and store its speaker embedding in ``embeds``.

    Shared by both button callbacks so that recorded and uploaded audio go
    through exactly the same steps.

    Args:
        audio: Waveform returned by ``record_audio`` / ``upload_audio``, which
            were asked for ``RATE`` Hz audio.

    Side effects:
        Displays an audio player for the clip and rebinds the module-level
        ``embeds`` to the embedding returned by ``encoder.embed_utterance``.
    """
    global embeds

    display(Audio(audio, rate=RATE))

    # Pass the capture rate explicitly instead of relying on RATE happening to
    # match the rate the encoder assumes when none is given.
    preprocessed_wav = encoder.preprocess_wav(audio, source_sr=RATE)
    embeds = encoder.embed_utterance(preprocessed_wav)


def button1_onclick(b):
    """Click handler: record ``RECORD_SECONDS`` of audio and embed it.

    Args:
        b: The button widget that was clicked (unused).
    """
    clear_output()
    _embed_reference_audio(record_audio(RECORD_SECONDS, sample_rate=RATE))


def button2_onclick(b):
    """Click handler: prompt for an audio file upload and embed it.

    Args:
        b: The button widget that was clicked (unused).
    """
    clear_output()
    _embed_reference_audio(upload_audio(sample_rate=RATE))

In [6]:
# Build the two input controls: one records from the microphone, the other
# asks for a file upload.
button1 = widgets.Button(description="Say Something!")
button2 = widgets.Button(description="Upload Something!")

# Hook each button up to its click handler.
button1.on_click(button1_onclick)
button2.on_click(button2_onclick)

# Show the buttons side by side.
display(widgets.HBox(children=(button1, button2)))

Saving thanks-obama.mp3 to thanks-obama.mp3


In [9]:
# Text to synthesize; the synthesis cell passes it to the synthesizer together
# with the stored speaker embedding. Edit and re-run this cell to change it.
sentence = "The quick brown fox jumps over the lazy dog."

In [10]:
# Synthesize `sentence` using the stored speaker embedding, if there is one.
if embeds is not None:
    # One sentence and one embedding go in, so only specs[0] is used below.
    specs = synthesizer.synthesize_spectrograms([sentence], [embeds])

    # Spectrogram -> waveform, then append synthesizer.sample_rate zero-valued
    # samples at the end (presumably one second of silence), and finally run
    # the result through the encoder's preprocess_wav.
    generated_wav = encoder.preprocess_wav(
        np.pad(
            vocoder.infer_waveform(specs[0]),
            (0, synthesizer.sample_rate),
            mode='constant',
        )
    )

    # Drop whatever the steps above printed so only the player remains.
    clear_output()
    display(Audio(generated_wav, rate=synthesizer.sample_rate))
else:
    print("There's no audio!")