<a href="https://colab.research.google.com/github/Nick088Official/Easier-Tortoise-TTS-Google-Colab/blob/main/Easier_Tortoise_TTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Welcome to Easier Tortoise TTS! 🐢🐢🐢🐢

I **strongly** recommend you turn on a GPU runtime.

There's a reason this is called "Tortoise" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU.

Made by [neonbjb](https://github.com/neonbjb/tortoise-tts/tree/main)

Modified by [Nick088](https://linktr.ee/Nick088)

In [None]:
#@title Install Tortoise TTS
#@markdown If you see "You must restart the runtime in order to use newly installed versions." it's normal, you dont need to do it

!git clone https://github.com/neonbjb/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
# Imports used through the rest of the notebook.
import random

import IPython
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HF hub.
# tts = TextToSpeech()
# If you want to use deepspeed the pass use_deepspeed=True nearly 2x faster than normal

#@markdown  DeepSpeed makes it faster  **CURRENTLY BROKEN**
Use_DeepSpeed = False #@param {type:"boolean"}

if Use_DeepSpeed:
  %env DS_SKIP_CUDA_CHECK=1
  tts = TextToSpeech(use_deepspeed=True, kv_cache=True)
else:
  tts = TextToSpeech(use_deepspeed=False, kv_cache=True)

if Use_DeepSpeed:
  # Downgrading Cuda to 12.1 because " DeepSpeed Op Builder: Installed CUDA version 12.2 does not match the version torch was compiled with 12.1"
  !apt-get remove --purge cuda
  !wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
  !sudo cp /var/cuda-repo-ubuntu1804-12-1-local/cuda-7084960E-keyring.gpg /usr/share/keyrings/
  !sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
  !wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda-repo-ubuntu1804-12-1-local_12.1.0-530.30.02-1_amd64.deb
  !sudo dpkg -i cuda-repo-ubuntu1804-12-1-local_12.1.0-530.30.02-1_amd64.deb
  !sudo cp /var/cuda-repo-ubuntu1804-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/

In [None]:
#@title Run Tortoise TTS & OPTIONALLY Combine Voices
#@markdown You can also combine conditioning voices. Combining voices produces a new voice with traits from all the parents.

# This is the text that will be spoken.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?" #@param {type:"string"}

# Pool that a 'random' selection is drawn from when two voices are combined.
# NOTE(review): 'cond_latent_example' looks like a precomputed-latent voice; confirm
# that load_voices can combine it with the audio-clip voices before relying on it.
options_voices = ['angie', 'applejack', 'cond_latent_example', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'mol', 'myself', 'pat', 'pat2', 'rainbow', 'snakes', 'tim_reynolds', 'tom', 'train_atkins', 'train_daws', 'train_dotrice', 'train_dreams', 'train_empire', 'train_grace', 'train_kennard', 'train_lescault', 'train_mouse', 'weaver', 'william']

#@markdown Note: random voices can be prone to strange utterances, especially if combined

#@markdown Pick one of the voices
voice1 = 'angie' #@param ['random', 'angie', 'applejack', 'cond_latent_example', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'mol', 'myself', 'pat', 'pat2', 'rainbow', 'snakes', 'tim_reynolds', 'tom', 'train_atkins', 'train_daws', 'train_dotrice', 'train_dreams', 'train_empire', 'train_grace', 'train_kennard', 'train_lescault', 'train_mouse', 'weaver', 'william']

#@markdown (OPTIONAL) Pick a voice to combine with the first
voice2 = 'tom' #@param ['random', 'none', 'angie', 'applejack', 'cond_latent_example', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'mol', 'myself', 'pat', 'pat2', 'rainbow', 'snakes', 'tim_reynolds', 'tom', 'train_atkins', 'train_daws', 'train_dotrice', 'train_dreams', 'train_empire', 'train_grace', 'train_kennard', 'train_lescault', 'train_mouse', 'weaver', 'william']

#@markdown Pick a "preset mode" to determine quality.
preset = 'ultra_fast' #@param ['ultra_fast', 'fast', 'standard', 'high_quality']

output_path = 'generated_tortoise_tts.wav'

if voice2 == 'none':
  if voice1 == 'random':
    # Tortoise can also generate speech using a random voice (no conditioning at all).
    # The voice changes each time you execute this!
    voice_samples, conditioning_latents = None, None
  else:
    # Load the single chosen voice.
    voice_samples, conditioning_latents = load_voice(voice1)
else:
  # Resolve each 'random' pick independently, without overwriting the form variables,
  # so re-running the cell draws again instead of reusing the previous pick.
  picked_voices = [random.choice(options_voices) if name == 'random' else name
                   for name in (voice1, voice2)]
  print(f'Combining voices: {picked_voices[0]} + {picked_voices[1]}')
  voice_samples, conditioning_latents = load_voices(picked_voices)

# Pass the loaded conditioning through to the model; the combined voice is only
# used if these values actually reach tts_with_preset.
gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, preset=preset)
torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

IPython.display.Audio(output_path, autoplay=True)

In [None]:
#@title Make Tortoise TTS Read a TXT (Text) File for you & OPTIONALLY Combine Voices

#@markdown Click on Files at the Google Colab Left, Click on the Upload File Button (not on the folders) and upload your Text (.txt) File, after it's done uploading, insert down here the name and extension of the file. For example: readme.txt

%cd '/content/'

text_file = 'file.txt'  #@param {type:"string"}

options_voices = ['angie', 'applejack', 'cond_latent_example', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'mol', 'myself', 'pat', 'pat2', 'rainbow', 'snakes', 'tim_reynolds', 'tom', 'train_atkins', 'train_daws', 'train_dotrice', 'train_dreams', 'train_empire', 'train_grace', 'train_kennard', 'train_lescault', 'train_mouse', 'weaver', 'william']

#@markdown Note: random voices can be prone to strange utterances, especially if combined

# Pick one of the voices
voice = 'emma' #@param ['angie', 'applejack', 'cond_latent_example', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'mol', 'myself', 'pat', 'pat2', 'rainbow', 'snakes', 'tim_reynolds', 'tom', 'train_atkins', 'train_daws', 'train_dotrice', 'train_dreams', 'train_empire', 'train_grace', 'train_kennard', 'train_lescault', 'train_mouse', 'weaver', 'william']

#@markdown (OPTIONAL) Pick a voice to combine with the first
voice2 = 'random' #@param ['random', 'none', 'angie', 'applejack', 'cond_latent_example', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'mol', 'myself', 'pat', 'pat2', 'rainbow', 'snakes', 'tim_reynolds', 'tom', 'train_atkins', 'train_daws', 'train_dotrice', 'train_dreams', 'train_empire', 'train_grace', 'train_kennard', 'train_lescault', 'train_mouse', 'weaver', 'william']


#@markdown Note: random voices can be prone to strange utterances
use_random_voice = False #@param {type:"boolean"}

#@markdown Pick a "preset mode" to determine quality.
preset = 'high_quality' #@param ['ultra_fast', 'fast', 'standard', 'high_quality']

# Tortoise comes with some scripts that does a lot of the lifting for you. For example, read.py will read a text file for you. BUT IT DIDN'T WORK, SO I MADE IT ANOTHER WAY READING WHAT'S INSIDE THE .txt FILE

# Read the contents of the text file
with open(text_file, 'r') as file:
    text_content = file.read()

if voice2 == 'none':
  if voice1 == 'random':
# Tortoise can also generate speech using a random voice. The voice changes each time you execute this!
    gen = tts.tts_with_preset(text_content, voice_samples=None, conditioning_latents=None, preset=preset)
    torchaudio.save('read_txt_tortoise_tts.wav', gen.squeeze(0).cpu(), 24000)
  else:
# Load it and send it through Tortoise.
    voice_samples, conditioning_latents = load_voice(voice1)
    gen = tts.tts_with_preset(text_content, voice_samples=voice_samples, conditioning_latents=conditioning_latents, preset=preset)
    torchaudio.save('read_txt_tortoise_tts.wav', gen.squeeze(0).cpu(), 24000)
else:
  if voice1 == 'random':
    voice1 = random.choice(options_voices)
    voice_samples, conditioning_latents = load_voices([f'{voice1}', f'{voice2}'])

    gen = tts.tts_with_preset(text_content, voice_samples=None, conditioning_latents=None, preset=preset)
    torchaudio.save('read_txt_tortoise_tts.wav', gen.squeeze(0).cpu(), 24000)
  elif voice2 == 'random':
    voice2 = random.choice(options_voices)
    voice_samples, conditioning_latents = load_voices([f'{voice1}', f'{voice2}'])

    gen = tts.tts_with_preset(text_content, voice_samples=None, conditioning_latents=None, preset=preset)
    torchaudio.save('read_txt_tortoise_tts.wav', gen.squeeze(0).cpu(), 24000)
  elif voice1 == 'random' and voice2 == 'random':
    voice1 = random.choice(options_voices)
    voice2 = random.choice(options_voices)
    voice_samples, conditioning_latents = load_voices([f'{voice1}', f'{voice2}'])

    gen = tts.tts_with_preset(text_content, voice_samples=None, conditioning_latents=None, preset=preset)
    torchaudio.save('read_txt_tortoise_tts.wav', gen.squeeze(0).cpu(), 24000)
  else:
    voice_samples, conditioning_latents = load_voices([f'{voice1}', f'{voice2}'])

    gen = tts.tts_with_preset(text_content, voice_samples=None, conditioning_latents=None, preset=preset)
    torchaudio.save('read_txt_tortoise_tts.wav', gen.squeeze(0).cpu(), 24000)

IPython.display.Audio('read_txt_tortoise_tts.wav', autoplay=True)

#@markdown This will take sometime.