# Voice Cloning

In [None]:
#@title Information

"""
Project: Digital Avatar

Lauren McGinney 

Student number: A0222219

Teesside University

MSc Applied Artificial Intelligence

Computing Project

**Acknowledgements**

I wish to express my heartfelt gratitude towards the module staff, especially supervisor Larry Guo, 2nd reader Yar Muhammad, project coordinator Julie Turnell and course leader Nauman Israr, for their direction, feedback and support throughout the module. I am grateful not only for their help and guidance throughout this project, but also for the online resources.

Thanks again!
"""

#@markdown This Colab demo notebook uses the open-source project [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) to clone a voice.


In [None]:
#@title Setup

#@markdown * Clone the GitHub project.
#@markdown * Download pretrained models.
#@markdown * Initialise the voice cloning models.

# Colab line magic requesting the TensorFlow 1.x runtime.
# NOTE(review): every checkpoint loaded later in this cell is a `.pt` file —
# confirm whether TensorFlow 1.x is still required.
%tensorflow_version 1.x
import os
from os.path import exists, join, basename, splitext

# GitHub repository to clone. The checkout directory name is the last path
# component of the URL with its ".git" extension stripped.
git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
project_name = splitext(basename(git_repo_url))[0]
# One-time bootstrap: everything in this branch is skipped when the checkout
# directory already exists, so re-running the cell does not reinstall.
# NOTE(review): a first run that failed after the clone is also skipped.
if not exists(project_name):
  # clone the project together with its submodules
  !git clone -q --recursive {git_repo_url}
  # install the project's Python dependencies from its requirements file
  !cd {project_name} && pip install -q -r requirements.txt
  # NOTE(review): gdown is not used by any visible cell — confirm it is needed
  !pip install -q gdown
  # PortAudio shared library (presumably required by an audio dependency — TODO confirm)
  !apt-get install -qq libportaudio2
  # Colab helper package; presumably the source of `dl_colab_notebooks` imported below
  !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

  # download the pretrained model archive into the checkout and unpack it
  # there, overwriting existing files (unzip -o)
  !cd {project_name} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip

# import libraries
import sys
sys.path.append(project_name)

from IPython.display import display, Audio, clear_output
from IPython.utils import io
import ipywidgets as widgets
import numpy as np
from dl_colab_notebooks.audio import record_audio, upload_audio
from google.colab import output

from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path

# Load the three pretrained checkpoints that live inside the cloned checkout:
# the speaker encoder, the synthesizer and the vocoder.
encoder.load_model(Path(project_name) / "encoder/saved_models/pretrained.pt")
synthesizer = Synthesizer(Path(project_name) / "synthesizer/saved_models/pretrained/pretrained.pt")
vocoder.load_model(Path(project_name) / "vocoder/saved_models/pretrained/pretrained.pt")

# Remove the install/loading log from this cell's output area.
output.clear()

In [None]:
#@title Record or upload audio
#@markdown Either record audio from a microphone or upload audio from a file (.mp3 or .wav).

# Sample rate passed to recording/upload, to audio playback and to the encoder
# preprocessing in this cell.
SAMPLE_RATE = 22050
# Colab form field selecting the input path: microphone recording or file upload.
record_or_upload = "Upload (.mp3 or .wav)" #@param ["Record", "Upload (.mp3 or .wav)"]
# Duration handed to record_audio(); only read on the "Record" path.
# NOTE(review): the value 30 is above the `max:10` declared in the form
# annotation — confirm which of the two is intended.
record_seconds =   30#@param {type:"number", min:1, max:10, step:1}

# Speaker embedding of the latest recording/upload; reset to None every time
# this cell runs, until _compute_embedding() fills it in.
embedding = None
def _compute_embedding(audio):
  """Play back `audio` and compute its speaker embedding.

  Side effects on notebook globals:
    * `x` is set to the IPython `Audio` player for the real voice sample.
    * `embedding` is cleared and then set to the encoder's embedding of the
      preprocessed audio, so it stays None if the encoder raises.
  """
  global embedding, x
  playback = Audio(audio, rate=SAMPLE_RATE, autoplay=True)
  display(playback)
  x = playback
  # Drop any previous embedding first so a failure below cannot leave a
  # stale one behind.
  embedding = None
  preprocessed = encoder.preprocess_wav(audio, SAMPLE_RATE)
  embedding = encoder.embed_utterance(preprocessed)
def _record_audio(b):
  """Click handler: record from the microphone and embed the result.

  `b` is the argument supplied by the button's click callback; it is unused.
  """
  clear_output()
  _compute_embedding(record_audio(record_seconds, sample_rate=SAMPLE_RATE))
def _upload_audio(b):
  """Prompt for an audio file upload and embed the result.

  `b` mirrors the click-handler signature of `_record_audio`; it is unused.
  """
  clear_output()
  _compute_embedding(upload_audio(sample_rate=SAMPLE_RATE))

if record_or_upload != "Record":
  # Upload path: no button is shown, the upload prompt opens straight away.
  # The empty string stands in for the widget a click handler would receive.
  _upload_audio("")
else:
  # Record path: recording starts only when the button is clicked.
  button = widgets.Button(description="Record Your Voice")
  button.on_click(_record_audio)
  display(button)

#@markdown The recording below is a sample of the **real** voice. 

Saving combined.mp3 to combined.mp3


In [None]:
#@title Synthesise voice { run: "auto" }

#@markdown Write text for the cloned voice to speak.

# Colab form field: the sentence handed to synthesize() at the end of this cell.
text = "One of the two people who tested positive for the novel coronavirus in the United Kingdom is a student at the University of York in northern England." #@param {type:"string"}
  
def synthesize(embed, text):
  """Speak `text` in the voice described by the speaker embedding `embed`.

  Generates a spectrogram with the synthesizer, turns it into a waveform with
  the vocoder, plays the result and stores the player in the notebook global
  `y`.
  """
  global y
  print("Synthesizing new audio...")
  # One text / one embedding in, so only the first spectrogram is used.
  spectrogram = synthesizer.synthesize_spectrograms([text], [embed])[0]
  waveform = vocoder.infer_waveform(spectrogram)
  # Append `sample_rate` zero samples, i.e. one second of trailing silence.
  waveform = np.pad(waveform, (0, synthesizer.sample_rate), mode="constant")
  clear_output()
  clip = Audio(waveform, rate=synthesizer.sample_rate, autoplay=True)
  display(clip)
  y = clip

# `embedding` is created by the "Record or upload audio" cell. It is looked up
# through globals() so that running this cell first (it is `run: "auto"`)
# prints the hint below instead of raising NameError.
if globals().get("embedding") is None:
  print("first record a voice or upload a voice file!")
else:
  synthesize(embedding, text)

#@markdown The recording below is a sample of the **digital voice clone**. 

Try changing the text and running the cell again to hear more from the digital voice!

In [None]:
#@title (Optional) Download audio

#@markdown Download audio files of the real and/or synthesised voice recordings.

from google.colab import files

real_voice_recording = True #@param {type:"boolean"}
digital_voice_recording = True #@param {type:"boolean"}

def _download_recording(var_name, filename):
  """Save the `Audio` player stored in global `var_name` and download it.

  The earlier cells keep the recordings as in-memory IPython `Audio` objects
  (globals `x` and `y`), not as files, so the audio bytes must be written to
  disk before `files.download` can send them to the browser.

  Args:
    var_name: name of the notebook global holding the `Audio` object.
    filename: name of the .wav file to write and download.
  """
  recording = globals().get(var_name)
  wav_bytes = getattr(recording, "data", None)
  # `Audio` built from an array of samples holds WAV-encoded bytes in `.data`.
  if not isinstance(wav_bytes, bytes):
    print("No audio available for " + filename + "; run the earlier cells first.")
    return
  with open(filename, "wb") as wav_file:
    wav_file.write(wav_bytes)
  files.download(filename)

if real_voice_recording:
  _download_recording("x", "real_voice.wav")

if digital_voice_recording:
  _download_recording("y", "digital_voice.wav")