## Credit:
https://github.com/yl4579/DMOSpeech2

In [None]:
#@title Install DMOSpeech2
%cd /content/
# Upstream repository, kept for reference; the fork cloned on the next line is the one used.
# !git clone https://github.com/yl4579/DMOSpeech2.git
!git clone https://github.com/NeuralFalconYT/DMOSpeech2.git

# Dependency list that is written over the cloned repo's requirements.txt below.
requirements = """# Main dependencies
accelerate>=0.33.0
# Note: bitsandbytes has platform restrictions - not for arm64 or Darwin (macOS)
bitsandbytes>0.37.0
cached_path
click
datasets
ema_pytorch>=0.5.2
gradio>=3.45.2
hydra-core>=1.3.0
jieba
librosa
matplotlib
numpy<=1.26.4
pydantic<=2.10.6
pydub
pypinyin
safetensors
soundfile
tomli
torch>=2.1.1
torchaudio>=0.16.1
torchdiffeq
tqdm>=4.65.0
transformers
transformers_stream_generator
unidecode
vocos
wandb
x_transformers>=1.31.14
faster-whisper==1.0.3
CTranslate2==4.5.0

# Optional dependencies for evaluation
# Uncomment if you need eval features:
# faster_whisper==0.10.1
# funasr
# jiwer
# modelscope
# zhconv
# zhon
"""

# Replace the repo's requirements.txt with the list above ("w" mode truncates the file).
with open("/content/DMOSpeech2/requirements.txt", "w") as f:
    f.write(requirements)

# aria2 is used for the downloads below.
!apt install aria2 -qqy
# The two checkpoints that the "utils code" cell loads from ../ckpts
# (student_checkpoint_path and duration_predictor_path).
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/yl4579/DMOSpeech2/resolve/main/model_85000.pt -d /content/DMOSpeech2/ckpts -o model_85000.pt
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/yl4579/DMOSpeech2/resolve/main/model_1500.pt -d /content/DMOSpeech2/ckpts -o model_1500.pt

# https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2/tree/main
# Author's note: HuggingFace forces a login otherwise, so each file of the
# faster-whisper model is fetched individually into a local folder.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2/resolve/main/config.json -d /content/DMOSpeech2/faster-whisper-large-v3-turbo-ct2 -o config.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2/resolve/main/model.bin -d /content/DMOSpeech2/faster-whisper-large-v3-turbo-ct2 -o model.bin
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2/resolve/main/preprocessor_config.json -d /content/DMOSpeech2/faster-whisper-large-v3-turbo-ct2 -o preprocessor_config.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2/resolve/main/tokenizer.json -d /content/DMOSpeech2/faster-whisper-large-v3-turbo-ct2 -o tokenizer.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2/resolve/main/vocabulary.json -d /content/DMOSpeech2/faster-whisper-large-v3-turbo-ct2 -o vocabulary.json


%cd /content/DMOSpeech2
!pip install -r requirements.txt

from IPython.display import clear_output
clear_output()
import time
time.sleep(5)
import os
# Send signal 9 to this kernel process so Colab restarts it; the red error
# this leaves on the cell is expected.
os.kill(os.getpid(), 9)

Don't panic if you see a red error on the cell above (as shown in image.png): it just means Colab automatically restarted the kernel. Simply run the next cell.

In [None]:
#@title utils code
%cd /content/DMOSpeech2/src

from faster_whisper import WhisperModel
import torch
import gc
from IPython.display import clear_output

from infer import DMOInference
import torchaudio

def get_language_name(language):
    """Translate a display language name into its short language code.

    Args:
        language: Either "English" or "Chinese".

    Returns:
        "en" for "English", "zh" for "Chinese".

    Raises:
        KeyError: If `language` is any other value.
    """
    codes_by_name = dict(English="en", Chinese="zh")
    return codes_by_name[language]

def whisper_transcription(audio_path, language="auto_detect"):
    """Transcribe an audio file with the locally downloaded faster-whisper model.

    The model is loaded from "../faster-whisper-large-v3-turbo-ct2" on every
    call and released again before returning, on CUDA ("float16") when
    available and on CPU ("int8") otherwise.

    Args:
        audio_path: Audio input handed to `WhisperModel.transcribe`.
        language: "auto_detect" to let the model pick the language, or a name
            accepted by `get_language_name` ("English" / "Chinese").

    Returns:
        The text of all segments joined with single spaces, stripped.

    Raises:
        KeyError: If `language` is not "auto_detect" and is not known to
            `get_language_name` (raised before the model is loaded).
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    compute_type = "float16" if use_cuda else "int8"
    # compute_type="int8_float16"

    # Resolve the language first so a bad value fails before the model loads.
    transcribe_kwargs = {"word_timestamps": True}
    if language != "auto_detect":
        transcribe_kwargs["language"] = get_language_name(language)

    faster_whisper_model = WhisperModel(
        "../faster-whisper-large-v3-turbo-ct2",
        device=device,
        compute_type=compute_type
    )

    try:
        segments, _info = faster_whisper_model.transcribe(audio_path, **transcribe_kwargs)
        # Joining inside the try block keeps any error raised while the
        # segments are consumed covered by the cleanup below.
        text = " ".join(segment.text for segment in segments)
    finally:
        # Cleanup runs even when transcription fails, so the model's memory
        # is not left held by a failed call.
        del faster_whisper_model
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()

    return text.strip()



def load_tts_model():
    """(Re)load the DMOSpeech2 inference model into the module-level `tts`.

    Any previously loaded model is released first. The global name is reset to
    None rather than deleted, so if constructing the new model raises, `tts`
    stays defined (as None) and this function can simply be called again.

    Returns:
        The new `DMOInference` instance, which is also stored in the global `tts`.
    """
    global tts
    # globals().get() lets this work even if `tts` was never assigned.
    if globals().get("tts") is not None:
        tts = None
        gc.collect()
        torch.cuda.empty_cache()
    tts = DMOInference(
        student_checkpoint_path="../ckpts/model_85000.pt",
        duration_predictor_path="../ckpts/model_1500.pt",
        device="cuda",  # NOTE(review): hard-coded; presumably requires a GPU runtime - confirm
        model_type="F5TTS_Base"
    )
    return tts
tts = None
tts = load_tts_model()

import uuid
import os
import re
import time
import numpy as np

def temp_file_name(text):
    """Build a unique .wav path under /content/voice_clone/ derived from `text`.

    The stem is the first 20 characters of `text` with every character outside
    [a-zA-Z0-9_-] turned into "_", runs of "_" collapsed and trimmed. If nothing
    usable remains (for example text made only of non-ASCII characters or
    punctuation), the stem falls back to "audio" so the file does not start
    with a bare underscore. A 5-character slice of a UUID4 is appended.

    NOTE: an identical definition appears again in a later cell of this notebook.

    Args:
        text: Text the file name is derived from.

    Returns:
        A path of the form "/content/voice_clone/<stem>_<id>.wav". The
        directory is created if missing; the file itself is not.
    """
    os.makedirs("/content/voice_clone/", exist_ok=True)
    safe_text = re.sub(r'[^a-zA-Z0-9_-]', '_', text[:20])
    safe_text = re.sub(r'_+', '_', safe_text).strip('_')  # collapse and trim
    if not safe_text:
        safe_text = "audio"
    unique_id = str(uuid.uuid4())[:5]
    return f"/content/voice_clone/{safe_text}_{unique_id}.wav"

def generate_speech(prompt_audio,prompt_text,target_text,mode="Student Only (4 steps)",temperature=0,dp_softmax_range=0.7):
    """Synthesize `target_text` in the voice of `prompt_audio` using the global `tts`.

    NOTE: an identical definition appears again in a later cell of this notebook.

    Args:
        prompt_audio: Reference audio, forwarded as `audio_path`.
        prompt_text: Transcript of the reference audio; an empty/falsy value
            is forwarded as None.
        target_text: Text to synthesize, forwarded as `gen_text`.
        mode: One of "Student Only (4 steps)", "Teacher-Guided (8 steps)" or
            "High Diversity (16 steps)"; selects the teacher/student settings.
        temperature: Forwarded unchanged to `tts.generate`.
        dp_softmax_range: Forwarded unchanged to `tts.generate`.

    Returns:
        A tuple `(audio, 24000)` where `audio` is a torch tensor with at least
        two dimensions (a 1-D result gets a leading axis of size 1). The
        sample rate is a hard-coded constant - presumably the model's output
        rate; confirm against DMOInference.

    Raises:
        ValueError: If `mode` is not one of the supported names. (Previously
            this surfaced as an UnboundLocalError at the `tts.generate` call.)
    """
    mode_settings = {
        "Student Only (4 steps)": dict(
            teacher_steps=0, teacher_stopping_time=1.0, student_start_step=0),
        # Default configuration from the notebook
        "Teacher-Guided (8 steps)": dict(
            teacher_steps=16, teacher_stopping_time=0.07, student_start_step=1),
        "High Diversity (16 steps)": dict(
            teacher_steps=24, teacher_stopping_time=0.3, student_start_step=2),
    }
    if mode not in mode_settings:
        raise ValueError(
            f"Unknown mode {mode!r}; expected one of {list(mode_settings)}"
        )

    # Generate speech
    generated_audio = tts.generate(
        gen_text=target_text,
        audio_path=prompt_audio,
        prompt_text=prompt_text if prompt_text else None,
        temperature=temperature,
        dp_softmax_range=dp_softmax_range,
        **mode_settings[mode],
    )
    if isinstance(generated_audio, np.ndarray):
        generated_audio = torch.from_numpy(generated_audio)

    if generated_audio.dim() == 1:
        generated_audio = generated_audio.unsqueeze(0)

    return generated_audio,24000



# To avoid ai generated punctuation
def clean_text(text):
    """Strip markdown-style markers and emojis from `text` and tidy whitespace.

    En-dashes, hyphens, asterisks (double, then single) and hashes each become
    a space, characters in the listed emoji/symbol ranges are dropped, and
    every run of whitespace collapses to one space with the ends trimmed.

    NOTE: an identical definition appears again in a later cell of this notebook.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text.
    """
    # Applied in this order; every marker is swapped for one space.
    for marker in ("–", "-", "**", "*", "#"):
        text = text.replace(marker, " ")

    # One alternative per Unicode block that should be removed.
    emoji_blocks = (
        r'[\U0001F600-\U0001F64F]',  # Emoticons
        r'[\U0001F300-\U0001F5FF]',  # Miscellaneous symbols and pictographs
        r'[\U0001F680-\U0001F6FF]',  # Transport and map symbols
        r'[\U0001F700-\U0001F77F]',  # Alchemical symbols
        r'[\U0001F780-\U0001F7FF]',  # Geometric shapes extended
        r'[\U0001F800-\U0001F8FF]',  # Supplemental arrows-C
        r'[\U0001F900-\U0001F9FF]',  # Supplemental symbols and pictographs
        r'[\U0001FA00-\U0001FA6F]',  # Chess symbols
        r'[\U0001FA70-\U0001FAFF]',  # Symbols and pictographs extended-A
        r'[\U00002702-\U000027B0]',  # Dingbats
        r'[\U0001F1E0-\U0001F1FF]',  # Flags (iOS)
    )
    without_emoji = re.sub('|'.join(emoji_blocks), r'', text, flags=re.UNICODE)

    # Collapse spaces / line breaks and trim both ends.
    return re.sub(r'\s+', ' ', without_emoji).strip()

import soundfile as sf
from tqdm.auto import tqdm
def _split_overlong_sentence(sentence, limit):
    """Break a sentence longer than `limit` characters at spaces; shorter ones pass through."""
    if len(sentence) <= limit:
        return [sentence]
    pieces = []
    current = ""
    for word in sentence.split(" "):
        candidate = f"{current} {word}" if current else word
        if current and len(candidate) > limit:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text, chunk_size=400):
    """Split long text into smaller chunks of max length `chunk_size`.

    The text is first passed through `clean_text`, then split after ".", "!"
    or "?" followed by spaces, and the sentences are packed greedily into
    chunks. A sentence that is itself longer than `chunk_size` is broken at
    spaces; only a single word longer than `chunk_size` can still produce an
    oversized chunk.

    NOTE: a function of the same name is defined again in a later cell of this notebook.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list when the cleaned
        text is empty.
    """
    text = clean_text(text)
    if not text:
        return []

    # Split by punctuation followed by space (preserves sentence boundaries)
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        for piece in _split_overlong_sentence(sentence, chunk_size):
            if not piece:
                continue
            # Flush the chunk built so far when this piece would overflow it.
            if current_chunk and len(current_chunk) + len(piece) > chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            current_chunk += piece + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

def save_large_audio(prompt_audio, gen_text, mode, temperature,dp_softmax_range, chunk_size=400):
    """Synthesize `gen_text` chunk by chunk and stream the result into one WAV file.

    The reference audio is transcribed once with `whisper_transcription`
    (language auto-detected) and that transcript is reused as the prompt text
    for every chunk. Each chunk's audio is written straight to disk, followed
    by 0.1 s of silence.

    NOTE: a function of the same name is defined again in a later cell of this notebook.

    Args:
        prompt_audio: Path to the reference audio.
        gen_text: Full text to synthesize.
        mode: Generation mode name, forwarded to `generate_speech`.
        temperature: Forwarded to `generate_speech`.
        dp_softmax_range: Forwarded to `generate_speech`.
        chunk_size: Maximum characters per chunk, forwarded to
            `split_text_into_chunks`.

    Returns:
        Absolute path of the written WAV file (24 kHz, mono).

    Raises:
        ValueError: If `gen_text` yields no non-empty chunk, or if
            `generate_speech` reports a sample rate other than 24000.
    """
    sample_rate = 24000
    # Drop empty chunks so the TTS model is never called with no text.
    chunks = [chunk for chunk in split_text_into_chunks(gen_text, chunk_size=chunk_size) if chunk]
    if not chunks:
        raise ValueError("gen_text contains no text to synthesize")

    output_file = temp_file_name(gen_text)
    prompt_text = whisper_transcription(prompt_audio)
    # Pause inserted after every chunk; built once because it never changes.
    silence = np.zeros(int(0.1 * sample_rate))  # 0.1s silence
    with sf.SoundFile(output_file, mode='w', samplerate=sample_rate, channels=1, format='WAV') as f:
        for chunk in tqdm(chunks, desc="Generating speech", unit="chunk"):

            audio_data, sr = generate_speech(prompt_audio,prompt_text, chunk, mode, temperature,dp_softmax_range)

            if sr != sample_rate:
                raise ValueError(f"Sample rate mismatch: got {sr}, expected 24000")

            if isinstance(audio_data, torch.Tensor):
                # detach() so a tensor that tracks gradients can be converted.
                audio_data = audio_data.detach().cpu().numpy()

            if audio_data.ndim == 2 and audio_data.shape[0] == 1:
                audio_data = audio_data[0]  # flatten mono

            f.write(audio_data)  # write directly to disk
            f.write(silence)
    print(f"Saved final audio to {output_file}")
    return os.path.abspath(output_file)
from IPython.display import Audio, display
def play_audio(audio_path, autoplay=False):
    """Render an inline audio player for `audio_path` in the notebook output.

    NOTE: an identical definition appears again in a later cell of this notebook.

    Args:
        audio_path: Audio source handed to `IPython.display.Audio`.
        autoplay: Forwarded to `Audio` as its `autoplay` argument.
    """
    player = Audio(audio_path, autoplay=autoplay)
    display(player)
# Wipe whatever this cell printed while setting up.
clear_output()

In [None]:
#@title Upload Reference Audio
from google.colab import files
from IPython.display import clear_output

def upload_audio():
    """Prompt for a file upload in Colab and return the path of the last uploaded file.

    The working directory is switched to /content/uploaded_audio for the
    upload - presumably because `files.upload()` stores files in the current
    directory; confirm against the Colab docs - and is always switched to
    /content/DMOSpeech2/src afterwards, even when the upload raises, because
    the other cells use paths relative to that directory.

    Relies on `os` having been imported by an earlier cell.

    Returns:
        "/content/uploaded_audio/<name>" for the last entry reported by
        `files.upload()`.

    Raises:
        ValueError: If no file was uploaded.
    """
    upload_folder = "/content/uploaded_audio"
    os.makedirs(upload_folder, exist_ok=True)
    os.chdir(upload_folder)
    try:
        uploaded = files.upload()
    finally:
        # Restore the directory the rest of the notebook expects.
        os.chdir("/content/DMOSpeech2/src")
    clear_output()
    if not uploaded:
        raise ValueError("No file was uploaded; run this cell again and choose an audio file.")
    f_names = [f"{upload_folder}/{fn}" for fn in uploaded.keys()]
    return f_names[-1]


uploaded_audio=upload_audio()
play_audio(uploaded_audio, autoplay=False)
uploaded_audio


'/content/uploaded_audio/indian_male.wav'

In [None]:

import uuid
import os
import re
import time
import numpy as np

def temp_file_name(text):
    """Build a unique .wav path under /content/voice_clone/ derived from `text`.

    The stem is the first 20 characters of `text` with every character outside
    [a-zA-Z0-9_-] turned into "_", runs of "_" collapsed and trimmed. If nothing
    usable remains (for example text made only of non-ASCII characters or
    punctuation), the stem falls back to "audio" so the file does not start
    with a bare underscore. A 5-character slice of a UUID4 is appended.

    NOTE: this redefinition shadows the same-named function from the
    "utils code" cell.

    Args:
        text: Text the file name is derived from.

    Returns:
        A path of the form "/content/voice_clone/<stem>_<id>.wav". The
        directory is created if missing; the file itself is not.
    """
    os.makedirs("/content/voice_clone/", exist_ok=True)
    safe_text = re.sub(r'[^a-zA-Z0-9_-]', '_', text[:20])
    safe_text = re.sub(r'_+', '_', safe_text).strip('_')  # collapse and trim
    if not safe_text:
        safe_text = "audio"
    unique_id = str(uuid.uuid4())[:5]
    return f"/content/voice_clone/{safe_text}_{unique_id}.wav"

def generate_speech(prompt_audio,prompt_text,target_text,mode="Student Only (4 steps)",temperature=0,dp_softmax_range=0.7):
    """Synthesize `target_text` in the voice of `prompt_audio` using the global `tts`.

    NOTE: this redefinition shadows the same-named function from the
    "utils code" cell.

    Args:
        prompt_audio: Reference audio, forwarded as `audio_path`.
        prompt_text: Transcript of the reference audio; an empty/falsy value
            is forwarded as None.
        target_text: Text to synthesize, forwarded as `gen_text`.
        mode: One of "Student Only (4 steps)", "Teacher-Guided (8 steps)" or
            "High Diversity (16 steps)"; selects the teacher/student settings.
        temperature: Forwarded unchanged to `tts.generate`.
        dp_softmax_range: Forwarded unchanged to `tts.generate`.

    Returns:
        A tuple `(audio, 24000)` where `audio` is a torch tensor with at least
        two dimensions (a 1-D result gets a leading axis of size 1). The
        sample rate is a hard-coded constant - presumably the model's output
        rate; confirm against DMOInference.

    Raises:
        ValueError: If `mode` is not one of the supported names. (Previously
            this surfaced as an UnboundLocalError at the `tts.generate` call.)
    """
    mode_settings = {
        "Student Only (4 steps)": dict(
            teacher_steps=0, teacher_stopping_time=1.0, student_start_step=0),
        # Default configuration from the notebook
        "Teacher-Guided (8 steps)": dict(
            teacher_steps=16, teacher_stopping_time=0.07, student_start_step=1),
        "High Diversity (16 steps)": dict(
            teacher_steps=24, teacher_stopping_time=0.3, student_start_step=2),
    }
    if mode not in mode_settings:
        raise ValueError(
            f"Unknown mode {mode!r}; expected one of {list(mode_settings)}"
        )

    # Generate speech
    generated_audio = tts.generate(
        gen_text=target_text,
        audio_path=prompt_audio,
        prompt_text=prompt_text if prompt_text else None,
        temperature=temperature,
        dp_softmax_range=dp_softmax_range,
        **mode_settings[mode],
    )
    if isinstance(generated_audio, np.ndarray):
        generated_audio = torch.from_numpy(generated_audio)

    if generated_audio.dim() == 1:
        generated_audio = generated_audio.unsqueeze(0)

    return generated_audio,24000



# To avoid ai generated punctuation
def clean_text(text):
    """Strip markdown-style markers and emojis from `text` and tidy whitespace.

    En-dashes, hyphens, asterisks (double, then single) and hashes each become
    a space, characters in the listed emoji/symbol ranges are dropped, and
    every run of whitespace collapses to one space with the ends trimmed.

    NOTE: this redefinition shadows the same-named function from the
    "utils code" cell.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text.
    """
    # Applied in this order; every marker is swapped for one space.
    for marker in ("–", "-", "**", "*", "#"):
        text = text.replace(marker, " ")

    # One alternative per Unicode block that should be removed.
    emoji_blocks = (
        r'[\U0001F600-\U0001F64F]',  # Emoticons
        r'[\U0001F300-\U0001F5FF]',  # Miscellaneous symbols and pictographs
        r'[\U0001F680-\U0001F6FF]',  # Transport and map symbols
        r'[\U0001F700-\U0001F77F]',  # Alchemical symbols
        r'[\U0001F780-\U0001F7FF]',  # Geometric shapes extended
        r'[\U0001F800-\U0001F8FF]',  # Supplemental arrows-C
        r'[\U0001F900-\U0001F9FF]',  # Supplemental symbols and pictographs
        r'[\U0001FA00-\U0001FA6F]',  # Chess symbols
        r'[\U0001FA70-\U0001FAFF]',  # Symbols and pictographs extended-A
        r'[\U00002702-\U000027B0]',  # Dingbats
        r'[\U0001F1E0-\U0001F1FF]',  # Flags (iOS)
    )
    without_emoji = re.sub('|'.join(emoji_blocks), r'', text, flags=re.UNICODE)

    # Collapse spaces / line breaks and trim both ends.
    return re.sub(r'\s+', ' ', without_emoji).strip()

import soundfile as sf
from tqdm.auto import tqdm
def _split_overlong_sentence(sentence, limit):
    """Break a sentence longer than `limit` characters at spaces; shorter ones pass through."""
    if len(sentence) <= limit:
        return [sentence]
    pieces = []
    current = ""
    for word in sentence.split(" "):
        candidate = f"{current} {word}" if current else word
        if current and len(candidate) > limit:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text, chunk_size=400):
    """Split long text into smaller chunks of max length `chunk_size`.

    The text is first passed through `clean_text`, then split after ".", "!"
    or "?" followed by spaces, and the sentences are packed greedily into
    chunks. A sentence that is itself longer than `chunk_size` is broken at
    spaces; only a single word longer than `chunk_size` can still produce an
    oversized chunk.

    NOTE: this redefinition shadows the same-named function from the
    "utils code" cell.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list when the cleaned
        text is empty.
    """
    text = clean_text(text)
    if not text:
        return []

    # Split by punctuation followed by space (preserves sentence boundaries)
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        for piece in _split_overlong_sentence(sentence, chunk_size):
            if not piece:
                continue
            # Flush the chunk built so far when this piece would overflow it.
            if current_chunk and len(current_chunk) + len(piece) > chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            current_chunk += piece + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

def save_large_audio(prompt_audio, gen_text, mode, temperature,dp_softmax_range, chunk_size=300):
    """Synthesize `gen_text` chunk by chunk and stream the result into one WAV file.

    The reference audio is transcribed once with `whisper_transcription`
    (language auto-detected) and that transcript is reused as the prompt text
    for every chunk. Each chunk's audio is written straight to disk, followed
    by 0.1 s of silence.

    NOTE: this redefinition shadows the same-named function from the
    "utils code" cell; the only difference there is a chunk size of 400
    instead of 300.

    Args:
        prompt_audio: Path to the reference audio.
        gen_text: Full text to synthesize.
        mode: Generation mode name, forwarded to `generate_speech`.
        temperature: Forwarded to `generate_speech`.
        dp_softmax_range: Forwarded to `generate_speech`.
        chunk_size: Maximum characters per chunk, forwarded to
            `split_text_into_chunks`.

    Returns:
        Absolute path of the written WAV file (24 kHz, mono).

    Raises:
        ValueError: If `gen_text` yields no non-empty chunk, or if
            `generate_speech` reports a sample rate other than 24000.
    """
    sample_rate = 24000
    # Drop empty chunks so the TTS model is never called with no text.
    chunks = [chunk for chunk in split_text_into_chunks(gen_text, chunk_size=chunk_size) if chunk]
    if not chunks:
        raise ValueError("gen_text contains no text to synthesize")

    output_file = temp_file_name(gen_text)
    prompt_text = whisper_transcription(prompt_audio)
    # Pause inserted after every chunk; built once because it never changes.
    silence = np.zeros(int(0.1 * sample_rate))  # 0.1s silence
    with sf.SoundFile(output_file, mode='w', samplerate=sample_rate, channels=1, format='WAV') as f:
        for chunk in tqdm(chunks, desc="Generating speech", unit="chunk"):

            audio_data, sr = generate_speech(prompt_audio,prompt_text, chunk, mode, temperature,dp_softmax_range)

            if sr != sample_rate:
                raise ValueError(f"Sample rate mismatch: got {sr}, expected 24000")

            if isinstance(audio_data, torch.Tensor):
                # detach() so a tensor that tracks gradients can be converted.
                audio_data = audio_data.detach().cpu().numpy()

            if audio_data.ndim == 2 and audio_data.shape[0] == 1:
                audio_data = audio_data[0]  # flatten mono

            f.write(audio_data)  # write directly to disk
            f.write(silence)
    print(f"Saved final audio to {output_file}")
    return os.path.abspath(output_file)
from IPython.display import Audio, display
def play_audio(audio_path, autoplay=False):
    """Render an inline audio player for `audio_path` in the notebook output.

    NOTE: this redefinition shadows the same-named function from the
    "utils code" cell.

    Args:
        audio_path: Audio source handed to `IPython.display.Audio`.
        autoplay: Forwarded to `Audio` as its `autoplay` argument.
    """
    player = Audio(audio_path, autoplay=autoplay)
    display(player)
# Wipe whatever this cell printed while setting up.
clear_output()

In [None]:
def add_slow_pauses(text):
    """Insert a " ... " pause marker after each sentence-ending mark.

    Every run of whitespace that directly follows ".", "!" or "?" is replaced
    by " ... " (exactly three dots). Because the marker itself ends in a dot,
    running the function on its own output inserts further markers.

    Args:
        text: Text to annotate.

    Returns:
        The text with pause markers inserted.
    """
    sentence_gap = re.compile(r'(?<=[.!?])\s+')
    return sentence_gap.sub(' ... ', text)
# NOTE(review): `gen_text` is only assigned in the "Run Voice Clone" cell further down,
# so on a fresh kernel this call raises NameError unless that cell has been run first.
add_slow_pauses(gen_text)

"Tired of trying to figure out which exercises actually work? ... You scroll through endless videos, get overwhelmed, and just end up doing the same old routine. ... I've been there, but I've finally cracked the code to building real strength without spending hours in the gym. ... On this channel, I'll share simple, science-backed workouts that you can actually stick with. ... If you're ready to stop guessing and start seeing results, you're in the right place. ... Let's get started!"

In [None]:
#@title Run Voice Clone
# This cell has no imports of its own: `time`, `clear_output`, `save_large_audio`
# and `play_audio` come from the earlier cells, which must have been run first.
Reference_Audio = "/content/uploaded_audio/indian_male.wav"# @param {type: "string"}
# The text below already contains " ... " pause markers of the kind add_slow_pauses() inserts.
gen_text = "Tired of trying to figure out which exercises actually work? ... You scroll through endless videos, get overwhelmed, and just end up doing the same old routine. ... I've been there, but I've finally cracked the code to building real strength without spending hours in the gym. ... On this channel, I'll share simple, science-backed workouts that you can actually stick with. ... If you're ready to stop guessing and start seeing results, you're in the right place. ... Let's get started!"# @param {type: "string"}
mode ="Student Only (4 steps)" # @param ["Student Only (4 steps)","Teacher-Guided (8 steps)","High Diversity (16 steps)"]
# temperature range: 0-2
temperature = 0  # @param {type: "number"}
# NOTE(review): generate_speech() defaults dp_softmax_range to 0.7, while this form passes 0 - confirm which is intended.
dp_softmax_range= 0  # @param {type: "number"}
# Time the whole pipeline: reference transcription plus chunked synthesis.
start_time = time.time()
save_path=save_large_audio(Reference_Audio,gen_text,mode,temperature,dp_softmax_range)
end_time = time.time()
clear_output()
print(f"Total numbers of characters: {len(gen_text)}")
print(f"Time taken: {end_time - start_time:.2f} seconds")
# print(f"Cloned Voice path: {save_path}")
#@title Compare Original vs. Cloned Voice
print(f"Referece Voice: {Reference_Audio}")
play_audio(Reference_Audio)
print(f"Cloned Voice: {save_path}")
play_audio(save_path)

Total numbers of characters: 487
Time taken: 5.01 seconds
Referece Voice: /content/uploaded_audio/indian_male.wav


Cloned Voice: /content/voice_clone/Tired_of_trying_to_f_fd172.wav


In [None]:
#@title Download Cloned Voice
from google.colab import files
files.download(save_path)

In [None]:
import gradio as gr
import time

def run_voice_clone(reference_audio, gen_text, mode, temperature, dp_softmax_range=0.7):
    """Gradio callback: clone the reference voice for `gen_text` and return the file twice.

    The interface defined in this cell has five input components and
    `save_large_audio` takes five arguments, so this callback has to accept
    and forward `dp_softmax_range` as well; with only four parameters every
    call failed with a TypeError.

    Args:
        reference_audio: File path of the reference audio.
        gen_text: Text to synthesize.
        mode: Generation mode name.
        temperature: Forwarded to `save_large_audio`.
        dp_softmax_range: Forwarded to `save_large_audio`. The default of 0.7
            mirrors `generate_speech`'s default and only applies when the
            caller omits the argument.

    Returns:
        `(save_path, save_path)` - the same path for the audio player output
        and for the download output.
    """
    save_path = save_large_audio(reference_audio, gen_text, mode, temperature, dp_softmax_range)
    return save_path, save_path

# Mode names offered in the UI; these are the same strings generate_speech() compares against.
modes = ["Student Only (4 steps)", "Teacher-Guided (8 steps)", "High Diversity (16 steps)"]

demo = gr.Interface(
    fn=run_voice_clone,
    # Five input components. Gradio calls `fn` with one positional argument per
    # component, in this order, so run_voice_clone must accept five arguments.
    inputs=[
        gr.Audio(type="filepath", label="Reference Audio"),
        gr.Textbox(lines=4, label="Generated Text"),
        gr.Dropdown(modes, value="High Diversity (16 steps)", label="Mode"),
        gr.Slider(minimum=0, maximum=2, step=0.1, value=0, label="Temperature"),
        gr.Slider(minimum=0, maximum=2, step=0.1, value=0, label="dp_softmax_range:"),
    ],
    # Two output components, fed by the two values run_voice_clone returns.
    outputs=[
        gr.Audio(type="filepath", label="Cloned Voice"),
        gr.File(label="Download Audio")
    ],
    title="Voice Cloning Tool",
    description="Upload a reference voice and generate cloned speech with custom text."
)

# share=True asks Gradio for a public URL (see the gradio.live link in the output).
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a688b585e7f686cb9d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


