## Basic Inference (teacher-guided, 8 steps)

In [1]:
import os
from huggingface_hub import login

from infer import DMOInference
import IPython.display as ipd
import torchaudio
import time

# Build the inference wrapper once; every later cell reuses this `tts` object.
tts = DMOInference(
    model_type="F5TTS_Base",
    device="cpu",  # run on CPU
    # Checkpoints are resolved relative to the notebook's working directory.
    duration_predictor_path="../ckpts/model_1500.pt",
    student_checkpoint_path="../ckpts/model_85000.pt",
)

  import pkg_resources
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.279 seconds.
Prefix dict has been built successfully.


Word segmentation module jieba initialized.

Download Vocos from huggingface charactr/vocos-mel-24khz


In [2]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_en.wav"

ref_text = "Some call me nature, others call me mother nature."
gen_text = "In most cases, installing the Python ipywidgets package will also automatically configure classic Jupyter Notebook and JupyterLab 3.x to display ipywidgets. With pip, do:"

# Sample rate (Hz) used for duration math and playback; the model setup loads
# the 24 kHz Vocos vocoder (charactr/vocos-mel-24khz).
SAMPLE_RATE = 24000

# Generate with default settings.
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time is
# not affected by wall-clock adjustments the way time.time() is.
start_time = time.perf_counter()
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
)
end_time = time.perf_counter()

processing_time = end_time - start_time
# shape[-1] is taken as the number of generated samples.
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of compute per second of audio (< 1 is faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")


Converting audio...
Using custom reference text...

ref_text   Some call me nature, others call me mother nature. 

--------

Prompt Audio: 


Generated Audio: 


  RTF: 2.12x (0.47x speed)
  Processing: 30.77s for 14.49s audio


In [3]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_en.wav"

ref_text = "Some call me nature, others call me mother nature."
gen_text = "The wind whispered through the ancient trees, carrying secrets from forgotten times. In the distance, mountains stood like silent guardians, watching over valleys where life flourished in countless forms."

# Sample rate (Hz) used for duration math and playback; the model setup loads
# the 24 kHz Vocos vocoder (charactr/vocos-mel-24khz).
SAMPLE_RATE = 24000

# Generate with default settings.
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time is
# not affected by wall-clock adjustments the way time.time() is.
start_time = time.perf_counter()
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
)
end_time = time.perf_counter()

processing_time = end_time - start_time
# shape[-1] is taken as the number of generated samples.
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of compute per second of audio (< 1 is faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   Some call me nature, others call me mother nature. 

--------

Prompt Audio: 


Generated Audio: 


  RTF: 1.98x (0.51x speed)
  Processing: 31.20s for 15.77s audio


## Comparison between different sampling configurations

#### Student only (4 steps)

Need to set `teacher_steps` and `student_start_step` to 0 to enable full student sampling.

In [4]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_en.wav"

ref_text = "Some call me nature, others call me mother nature."
gen_text = "Technology advances at an incredible pace, transforming how we live, work, and connect with one another. Each innovation builds upon the last, creating possibilities we never imagined before."

# Sample rate (Hz) used for duration math and playback; the model setup loads
# the 24 kHz Vocos vocoder (charactr/vocos-mel-24khz).
SAMPLE_RATE = 24000

# Student-only generation (non-default settings: teacher sampling is disabled).
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time is
# not affected by wall-clock adjustments the way time.time() is.
start_time = time.perf_counter()
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
    teacher_steps=0,  # set this to 0 for no teacher sampling
    student_start_step=0,  # set this to 0 for full student sampling
)
end_time = time.perf_counter()

processing_time = end_time - start_time
# shape[-1] is taken as the number of generated samples.
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of compute per second of audio (< 1 is faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   Some call me nature, others call me mother nature. 

--------

Prompt Audio: 


Generated Audio: 


  RTF: 0.60x (1.67x speed)
  Processing: 8.86s for 14.81s audio


#### More teacher steps (16 steps)

Now we use 14 steps from the teacher and 2 steps from the student to have higher diversity (16 steps total).

In [7]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_en.wav"

ref_text = "Some call me nature, others call me mother nature."
gen_text = "The ocean waves crashed against the rocky shore, their rhythmic sound creating a natural symphony. Seabirds danced overhead, riding the coastal winds with effortless grace."

# Sample rate (Hz) used for duration math and playback; the model setup loads
# the 24 kHz Vocos vocoder (charactr/vocos-mel-24khz).
SAMPLE_RATE = 24000

# Teacher-heavy generation (non-default settings): the teacher covers most of
# the schedule and the student only finishes the last steps.
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time is
# not affected by wall-clock adjustments the way time.time() is.
start_time = time.perf_counter()
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
    teacher_steps=24,
    # NOTE(review): the note attached to this argument described 0.25
    # ("students go for the last two steps (0.26ish, 0.6ish)") while the value
    # passed is 0.3 -- confirm which stopping time is intended.
    teacher_stopping_time=0.3,
    student_start_step=2,  # only two steps for students
    verbose=True,  # see the number of steps used
)
end_time = time.perf_counter()

processing_time = end_time - start_time
# shape[-1] is taken as the number of generated samples.
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of compute per second of audio (< 1 is faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   Some call me nature, others call me mother nature. 
audio: torch.Size([1, 124414])
text: [['S', 'o', 'm', 'e', ' ', 'c', 'a', 'l', 'l', ' ', 'm', 'e', ' ', 'n', 'a', 't', 'u', 'r', 'e', ',', ' ', 'o', 't', 'h', 'e', 'r', 's', ' ', 'c', 'a', 'l', 'l', ' ', 'm', 'e', ' ', 'm', 'o', 't', 'h', 'e', 'r', ' ', 'n', 'a', 't', 'u', 'r', 'e', '.', ' ', 'T', 'h', 'e', ' ', 'o', 'c', 'e', 'a', 'n', ' ', 'w', 'a', 'v', 'e', 's', ' ', 'c', 'r', 'a', 's', 'h', 'e', 'd', ' ', 'a', 'g', 'a', 'i', 'n', 's', 't', ' ', 't', 'h', 'e', ' ', 'r', 'o', 'c', 'k', 'y', ' ', 's', 'h', 'o', 'r', 'e', ',', ' ', 't', 'h', 'e', 'i', 'r', ' ', 'r', 'h', 'y', 't', 'h', 'm', 'i', 'c', ' ', 's', 'o', 'u', 'n', 'd', ' ', 'c', 'r', 'e', 'a', 't', 'i', 'n', 'g', ' ', 'a', ' ', 'n', 'a', 't', 'u', 'r', 'a', 'l', ' ', 's', 'y', 'm', 'p', 'h', 'o', 'n', 'y', '.', ' ', 'S', 'e', 'a', 'b', 'i', 'r', 'd', 's', ' ', 'd', '

Generated Audio: 


  RTF: 4.70x (0.21x speed)
  Processing: 64.12s for 13.63s audio


#### Stochastic duration 

Introduce even more diversity by adding randomness to the duration: pass a non-zero `temperature` so that the duration is sampled.

In [6]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_en.wav"

ref_text = "Some call me nature, others call me mother nature."
gen_text = "In the heart of the forest, ancient wisdom flows through every leaf and branch. Time moves differently here, measured not in hours but in seasons, not in moments but in lifetimes."

# Sample rate (Hz) used for duration math and playback; the model setup loads
# the 24 kHz Vocos vocoder (charactr/vocos-mel-24khz).
SAMPLE_RATE = 24000

# Teacher-heavy generation with a sampled duration (non-default settings).
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time is
# not affected by wall-clock adjustments the way time.time() is.
start_time = time.perf_counter()
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
    teacher_steps=24,
    teacher_stopping_time=0.25,  # 0.25 means students go for the last two steps (0.26ish, 0.6ish)
    student_start_step=2,  # only two steps for students
    temperature=0.8,  # set some temperature for duration sampling
)
end_time = time.perf_counter()

processing_time = end_time - start_time
# shape[-1] is taken as the number of generated samples.
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of compute per second of audio (< 1 is faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   Some call me nature, others call me mother nature. 

--------

Prompt Audio: 


Generated Audio: 


  RTF: 4.26x (0.23x speed)
  Processing: 68.47s for 16.09s audio
