<a href="https://colab.research.google.com/github/LxYuan0420/nlp/blob/main/notebooks/viXTTS_Customization_Introducing_Random_Pauses_and_Fading_Transitions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## viXTTS Customization: Introducing Random Pauses and Fading Transitions




In [None]:
!git clone --branch add-vietnamese-xtts -q https://github.com/thinhlpg/TTS.git
!pip install --use-deprecated=legacy-resolver -q -e TTS
#❗❗❗ IMPORTANT: Please restart the runtime after installing TTS.

#### Fix numpy dependency

In [None]:
# Pin NumPy to resolve the SciPy version warning shown below:
#/content/TTS/TTS/tts/utils/helpers.py:3: UserWarning: A NumPy version >=1.22.4 and <2.3.0 is required for this version of SciPy (detected version 1.22.0)
#  from scipy.stats import betabinom
# Restart the runtime after installation.
!pip install numpy==1.23

In [4]:
import os

import torch

from huggingface_hub import snapshot_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

In [2]:
# pydub: a library for audio editing, format conversion, effects processing, and more
!pip install pydub



In [5]:
# Download the viXTTS model repository from the Hugging Face Hub into ./model.
snapshot_download(
    repo_id="capleaf/viXTTS",
    repo_type="model",
    local_dir="model",
)

# Read the model configuration that ships with the downloaded files.
config = XttsConfig()
config.load_json("./model/config.json")

# Build the model from that configuration, load its checkpoint, and switch it
# to evaluation mode.
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config, checkpoint_dir="./model/")
XTTS_MODEL.eval()

# Move the model to the GPU only when CUDA is available; otherwise it stays
# where it was loaded.
if torch.cuda.is_available():
    XTTS_MODEL.cuda()

In [6]:
# Compute the two conditioning inputs (GPT conditioning latent and speaker
# embedding) from the reference clip ./model/vi_sample.wav. The remaining
# arguments are read from the loaded model's config. Both results are reused
# for every inference call later in the notebook.
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path="./model/vi_sample.wav",
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

In [7]:
# The passage to synthesize, written one sentence per source line. Adjacent
# string literals are concatenated, so `text` is a single string.
text = (
    "The sun was setting behind the mountains, casting a warm golden glow over the valley. "
    "Birds chirped softly as they prepared to settle in for the night, while a gentle breeze rustled the leaves on the trees. "
    "The day had been long, but the evening brought a sense of calm and tranquility. "
    "As the sky turned from orange to pink and then deep blue, the stars began to twinkle one by one, signaling the end of another day."
)

# Simply split it on full stops. The ". " separator is consumed, so every
# piece except the last loses its trailing period.
texts = text.split(". ")

In [57]:
import numpy as np
from scipy.io.wavfile import write
from scipy.signal import windows
import random

# Function to apply a random fade in/out effect
# Fade Durations:
# - Short Fades (50-150 ms): Ideal for transitioning between syllables or within words.
#   Subtle and almost unnoticeable, maintaining continuity of speech.
# - Medium Fades (150-300 ms): Best for transitioning between sentences or pauses within a conversation.
#   Smooth and natural, separating thoughts without abrupt changes.
# - Long Fades (300-500 ms): Suitable for transitioning between paragraphs or distinct sections of content.
#   Noticeable but gentle, providing closure or shifting focus between larger content segments.
def apply_random_fade(audio_data, sample_rate):
    """Fade a clip in and out using a randomly sized Hann ramp.

    A fade length between 150 and 300 ms is drawn with ``random.randint``.
    The first samples of the clip are scaled by the rising half of a Hann
    window and the last samples by the falling half.

    Args:
        audio_data: NumPy array of samples (expected to be 1-D). It is
            modified in place, so it needs a floating-point dtype; NumPy
            refuses the in-place multiplication by the float ramp for
            integer arrays.
        sample_rate: Number of samples per second, used to convert the fade
            length from milliseconds to samples.

    Returns:
        The same ``audio_data`` array object, after fading.
    """
    fade_duration_ms = random.randint(150, 300)
    fade_samples = int(sample_rate * fade_duration_ms / 1000)

    # A clip shorter than the drawn fade would make the slices below shorter
    # than the ramp and raise a broadcasting error, so cap the ramp at the
    # clip length. Clips at least as long as the fade are unaffected.
    fade_samples = min(fade_samples, len(audio_data))
    if fade_samples <= 0:
        # Nothing to fade. This also avoids `audio_data[-0:]`, which would
        # select the whole clip instead of an empty tail.
        return audio_data

    # Build the window once: first half rises from 0, second half falls to 0.
    ramp = windows.hann(fade_samples * 2)
    audio_data[:fade_samples] *= ramp[:fade_samples]
    audio_data[-fade_samples:] *= ramp[fade_samples:]

    return audio_data

# Pause/Silence Durations:
# - Short Pauses (100-300 ms): Used within sentences or between quick thoughts.
#   Creates a brief natural break without disrupting the flow of speech.
# - Medium Pauses (300-700 ms): Ideal for separating sentences or clauses within a conversation.
#   Provides a natural, noticeable pause that indicates a shift in thought or emphasis.
# - Long Pauses (700-1200 ms): Suitable for transitioning between paragraphs or sections.
#   Emphasizes separation, creating a clear break between distinct ideas or content segments.
def add_random_silence(sample_rate, min_ms=700, max_ms=1200, dtype=np.int16):
    """Return a block of silence with a randomly chosen duration.

    Args:
        sample_rate: Number of samples per second, used to convert the
            duration from milliseconds to samples.
        min_ms: Shortest allowed pause in milliseconds (inclusive).
        max_ms: Longest allowed pause in milliseconds (inclusive).
        dtype: NumPy dtype of the returned array. Pass the dtype of the
            surrounding audio to avoid implicit type promotion when the
            silence is concatenated with it.

    Returns:
        A 1-D array of zeros whose length corresponds to a duration drawn
        uniformly from ``[min_ms, max_ms]`` with ``random.randint``.

    Raises:
        ValueError: Propagated from ``random.randint`` when ``min_ms`` is
            greater than ``max_ms``.
    """
    silence_duration_ms = random.randint(min_ms, max_ms)
    silence_samples = int(sample_rate * silence_duration_ms / 1000)
    return np.zeros(silence_samples, dtype=dtype)

# Initialize parameters
# NOTE(review): 24000 Hz is assumed to match the model's output sample rate —
# confirm against the model config.
sample_rate = 24000
output_wavs = []

# Use a dedicated loop variable so the notebook-level `text` (the full
# passage) is not overwritten by the last sentence.
for sentence in texts:
    # Skip blank pieces (e.g. produced by a trailing ". " in the passage)
    # rather than synthesizing a lone full stop.
    if not sentence.strip():
        continue

    # Add full stop at the end of the sentence to ensure a "tone-down" voice.
    if not sentence.endswith("."):
        sentence += "."

    print(f"Reading: {sentence}")

    out_wav = XTTS_MODEL.inference(
        text=sentence,
        language="en",
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.3,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=30,
        top_p=0.85,
    )

    wav_data = out_wav["wav"]  # numpy array type
    wav_data = apply_random_fade(wav_data, sample_rate)
    output_wavs.append(wav_data)

# Combine all audio segments with random silence between them. Silence goes
# only *between* segments, so the file no longer starts with a pause.
# The pieces are collected in a list and concatenated once, instead of
# re-copying the growing buffer on every iteration.
pieces = []
for index, audio in enumerate(output_wavs):
    if index > 0:
        pieces.append(add_random_silence(sample_rate))
    pieces.append(audio)

if pieces:
    # Make the dtype explicit: the original code ended up with float32 through
    # type promotion whenever the model output was float32.
    combined_audio = np.concatenate(pieces).astype(np.float32, copy=False)
else:
    # No sentence was synthesized; write an empty float32 track.
    combined_audio = np.zeros(0, dtype=np.float32)

# save it
write("combined_audio.wav", sample_rate, combined_audio)

Reading: The sun was setting behind  the mountains, casting a warm golden glow over the valley.
Reading: Birds chirped softly as they prepared to settle in for the night, while a gentle breeze rustled the leaves on the trees.
Reading: The day had been long, but the evening brought a sense of calm and tranquility.
Reading: As the sky turned from orange to pink and then deep blue, the stars began to twinkle one by one, signaling the end of another day.


In [58]:
from IPython.display import Audio
# Display an audio player for the WAV file written by the synthesis cell.
# This must stay the last expression in the cell for the player to render.
Audio("combined_audio.wav")