In [2]:
# TODO: Look at https://github.com/simoniz0r/piper-voice-models/tree/main?tab=readme-ov-file
# to see how he uses training sets from F5-TTS
# and got good results
# https://community.home-assistant.io/t/collections-of-pretrainer-piper-voices/915666

import os
import subprocess
import glob
import shutil
import whisper

# YouTube video IDs whose audio is downloaded and used as the dataset.
YOUTUBE_IDS = ["2hOp408Ib5w", "yJzjyYL8l5Y"]

# Fail fast on a too-short list. This is a bad configuration value, not a user
# interrupt, so raise ValueError: KeyboardInterrupt is a BaseException and
# would slip past any `except Exception` handler.
if len(YOUTUBE_IDS) < 2:
    raise ValueError(f"Minimum of 2 youtube videos (got {len(YOUTUBE_IDS)})")

VOICE_NAME = "custom_voice"
# BASE_DIR is whatever directory the kernel is in when this cell runs; every
# other path in the notebook is derived from it.
BASE_DIR = os.getcwd()
TARGET_SAMPLE = "wav_22050"  # wav_16000, wav_22050, wav_44100
DATASET_DIR = f"{BASE_DIR}/dataset"
WAV_DIR = f"{DATASET_DIR}/wavs"
TEXTY_PATH = f"{BASE_DIR}/third_party/TextyMcSpeechy"

os.makedirs(DATASET_DIR, exist_ok=True)
# Always start from an empty wavs/ folder: anything left by a previous run is
# deleted before the folder is recreated.
if os.path.exists(WAV_DIR):
    shutil.rmtree(WAV_DIR)
os.makedirs(WAV_DIR, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("DATASET_DIR:", DATASET_DIR)
print("WAV_DIR:", WAV_DIR)
print("TEXTY_PATH:", TEXTY_PATH)

BASE_DIR: /home/ramon/tts
DATASET_DIR: /home/ramon/tts/dataset
WAV_DIR: /home/ramon/tts/dataset/wavs
TEXTY_PATH: /home/ramon/tts/third_party/TextyMcSpeechy


In [2]:
# Download each video's audio as WAV, transcribe it with Whisper, and write one
# "wavs/<index>.wav|<transcript>" record per video to metadata.csv.
model = whisper.load_model("base")
# Explicit UTF-8 so transcripts containing non-ASCII characters are written the
# same way regardless of the machine's locale.
with open(f"{DATASET_DIR}/metadata.csv", "w", encoding="utf-8") as meta:
    for i, yid in enumerate(YOUTUBE_IDS):
        url = f"https://youtube.com/watch?v={yid}"
        # yt-dlp substitutes %(ext)s itself; with "-x --audio-format wav" the
        # final file is expected at <WAV_DIR>/<i>.wav (checked below).
        out_path = f"{WAV_DIR}/{i}.%(ext)s"

        subprocess.run(
            ["yt-dlp", "-x", "--audio-format", "wav", "-o", out_path, url], check=True
        )

        wav_path = f"{WAV_DIR}/{i}.wav"
        if not os.path.exists(wav_path):
            raise FileNotFoundError(f"Expected wav file not found: {wav_path}")

        result = model.transcribe(wav_path)
        # Collapse every run of whitespace (including any line breaks) into a
        # single space so each record occupies exactly one line of the file.
        transcript = " ".join(result["text"].split())
        meta.write(f"wavs/{i}.wav|{transcript}\n")

# Worker count is derived from the number of downloaded wav files (at least 1).
num_utterances = len([f for f in os.listdir(WAV_DIR) if f.endswith(".wav")])
MAX_WORKERS = max(1, num_utterances // 2)
print(f"Detected {num_utterances} wav files. Setting MAX_WORKERS = {MAX_WORKERS}")

[youtube] Extracting URL: https://youtube.com/watch?v=2hOp408Ib5w
[youtube] 2hOp408Ib5w: Downloading webpage
[youtube] 2hOp408Ib5w: Downloading tv client config
[youtube] 2hOp408Ib5w: Downloading tv player API JSON
[youtube] 2hOp408Ib5w: Downloading ios player API JSON
[youtube] 2hOp408Ib5w: Downloading player 65578ad1-main


         player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         n = yJH3tIXlwreq5w1 ; player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
         n = mehidRgMSENs4uG ; player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
         n = WZgpdSyycGXo7lQ ; player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[youtube] 2hOp408Ib5w: Downloading m3u8 information
[info] 2hOp408Ib5w: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 26
[download] Destination: /home/ramon/tts/dataset/wavs/0.mp4
[download] 100% of    2.02MiB in 00:00:00 at 5.09MiB/s                   
[ExtractAudio] Destination: /home/ramon/tts/dataset/wavs/0.wav
Deleting original file /home/ramon/tts/dataset/wavs/0.mp4 (pass -k to keep)
[youtube] Extracting URL: https://youtube.com/watch?v=yJzjyYL8l5Y
[youtube] yJzjyYL8l5Y: Downloading webpage
[youtube] yJzjyYL8l5Y: Downloading tv client config
[youtube] yJzjyYL8l5Y: Downloading tv player API JSON
[youtube] yJzjyYL8l5Y: Downloading ios player API JSON
[youtube] yJzjyYL8l5Y: Downloading player 65578ad1-main


         player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         n = VQZhLm0rVjsGjns ; player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
         n = uEGIKLKz-FVeqgj ; player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
         n = LaS6NwN67Kti8rb ; player = https://www.youtube.com/s/player/65578ad1/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[youtube] yJzjyYL8l5Y: Downloading m3u8 information
[info] yJzjyYL8l5Y: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 21
[download] Destination: /home/ramon/tts/dataset/wavs/1.mp4
[download] 100% of    1.58MiB in 00:00:00 at 5.34MiB/s                   
[ExtractAudio] Destination: /home/ramon/tts/dataset/wavs/1.wav
Deleting original file /home/ramon/tts/dataset/wavs/1.mp4 (pass -k to keep)
Detected 2 wav files. Setting MAX_WORKERS = 1


In [3]:
# Mirror the dataset folder into TextyMcSpeechy's DATASETS directory under the
# voice name. copytree needs a destination that does not exist yet, so any
# copy left over from an earlier run is removed first.
dst = f"{TEXTY_PATH}/tts_dojo/DATASETS/{VOICE_NAME}"
stale_copy_present = os.path.exists(dst)
if stale_copy_present:
    shutil.rmtree(dst)
# Last expression of the cell: the destination path is shown as the output.
shutil.copytree(DATASET_DIR, dst)

'/home/ramon/tts/third_party/TextyMcSpeechy/tts_dojo/DATASETS/custom_voice'

In [4]:
# Answers to the interactive prompts of create_dataset.sh.
PROCEED = "Y"
VIEW_LIST = "n"
LANG = "en-us"
DESCRIPTION = "Custom dataset"
VOICE_TYPE = "M"

# One answer per prompt, in the order the script is fed them on stdin.
answers = [PROCEED, VIEW_LIST, LANG, VOICE_NAME, DESCRIPTION, VOICE_TYPE]

# Run the script with `cwd=` instead of os.chdir(): the kernel's working
# directory is never changed, so a failing script (check=True raises) cannot
# leave the notebook stranded in the DATASETS folder. Passing the answers via
# `input=` rather than a bash heredoc also means no value is ever subject to
# shell expansion or word splitting.
subprocess.run(
    ["./create_dataset.sh", VOICE_NAME],
    input="".join(f"{answer}\n" for answer in answers),
    text=True,
    cwd=f"{TEXTY_PATH}/tts_dojo/DATASETS",
    check=True,
);  # trailing ';' keeps the CompletedProcess repr out of the cell output

/home/ramon/tts/third_party/TextyMcSpeechy/tts_dojo/DATASETS/custom_voice
FFMPEG OK!
[H[2J    TextyMcSpeechy Dataset creator

    This tool will perform the following operations on files in custom_voice:

    1. Scan custom_voice and its subdirectories for audio files
    2. Verify that file extensions match the contents of the files
    3. Move audio files into folders classified by file format and sampling rate
    4. Move any non-audio files to 'not_audio' directory
    5. Remove empty directories
    6. Select the highest sampling rate you have provided and resample to wav_22050 and wav_16000 (The formats piper needs)
    7. Remove duplicate files
    8. Check that all files referenced in metadata.csv exist.
    9. Create dataset.conf file, which TextyMcSpeechy uses to configure your dojo.

    This tool will make changes to the files in custom_voice that cannot be undone.
    It is HIGHLY RECOMMENDED that you keep a backup of your original dataset files.

    Do you wish to proc

In [5]:
# Rewrite metadata.csv in the copied dataset so every record has the form
# "<filename>|<text>", with any directory prefix removed from the filename.
# (The original note blames the docker setup for needing bare filenames;
# presumably the container sees the audio at a different path - confirm.)
DATASET_FINAL_DIR = f"{TEXTY_PATH}/tts_dojo/DATASETS/{VOICE_NAME}"
META_PATH = f"{DATASET_FINAL_DIR}/metadata.csv"

# Explicit UTF-8 so reading and rewriting do not depend on the locale.
with open(META_PATH, "r", encoding="utf-8") as f:
    lines = f.readlines()

fixed = []
for line in lines:
    record = line.strip()
    if not record:
        # A blank line carries no filename; do not emit a bogus record for it.
        continue
    # Split on the FIRST '|' before touching '/', so that a '/' inside the
    # transcript (e.g. "1/2") can never be mistaken for a path separator.
    name_part, separator, text = record.partition("|")
    fname = name_part.split("/")[-1]
    if not separator:
        # Fallback kept from the previous implementation: a record without a
        # '|' gets TARGET_SAMPLE as its text field.
        text = TARGET_SAMPLE
    fixed.append(f"{fname}|{text}\n")

with open(META_PATH, "w", encoding="utf-8") as f:
    f.writelines(fixed)

In [6]:
# Run download_defaults.sh for "en-us" from inside PRETRAINED_CHECKPOINTS.
# `cwd=` scopes the directory change to the child process, so the kernel's
# working directory stays put even if the script fails (check=True raises).
subprocess.run(
    ["./download_defaults.sh", "en-us"],
    cwd=f"{TEXTY_PATH}/tts_dojo/PRETRAINED_CHECKPOINTS",
    check=True,
);  # trailing ';' keeps the CompletedProcess repr out of the cell output


***********************************************************************
 A URL for M_voice, medium quality was supplied by en-us.conf 


default/M_voice/medium/epoch=4641-step=3104302.ckpt already exists.  Not downloading.



***********************************************************************
 A URL for F_voice, low quality was supplied by en-us.conf 


default/F_voice/low/epoch=2307-step=558536.ckpt already exists.  Not downloading.



***********************************************************************
 A URL for F_voice, medium quality was supplied by en-us.conf 


default/F_voice/medium/epoch=1000-step=11111111.ckpt already exists.  Not downloading.



***********************************************************************
 A URL for F_voice, high quality was supplied by en-us.conf 


default/F_voice/high/epoch=2000-step=11111111.ckpt already exists.  Not downloading.






SUMMARY OF AVAILABLE PRETRAINED CHECKPOINTS - PLEASE READ

    en-us.conf included pretrained checkpo

In [7]:
# Create the "<VOICE_NAME>_dojo" directory with newdojo.sh unless it already
# exists. dojo_path is reused later to locate the trained voice files.
dojo_path = f"{TEXTY_PATH}/tts_dojo/{VOICE_NAME}_dojo"
if not os.path.exists(dojo_path):
    # `cwd=` runs the script from tts_dojo without changing the kernel's own
    # working directory, so a failure here cannot leave the notebook stranded.
    subprocess.run(
        ["./newdojo.sh", VOICE_NAME],
        cwd=f"{TEXTY_PATH}/tts_dojo",
        check=True,
    )
else:
    print("Dojo already exists.")

Dojo already exists.


In [8]:
# Force-remove any docker container named "textymcspeechy-piper"; both stdout
# and stderr are discarded. Presumably this clears a container left over from
# an earlier training run - confirm.
!docker rm -f textymcspeechy-piper >/dev/null 2>&1

You have to run `./scripts/training` manually.

In [10]:
# Copy the trained voice files out of the dojo into <BASE_DIR>/voices.
output_dir = "voices"
# Create the very directory the files are copied into. Creating the bare
# relative name "voices" would land in the current working directory, which
# is not guaranteed to be BASE_DIR.
voices_dir = f"{BASE_DIR}/{output_dir}"
os.makedirs(voices_dir, exist_ok=True)

# "*.onnx*" matches the .onnx files plus anything with a further suffix after
# ".onnx" (e.g. ".onnx.json").
trained_files = glob.glob(f"{dojo_path}/tts_voices/*.onnx*")
if not trained_files:
    # Make an empty result visible instead of silently copying nothing.
    print(f"No trained voice files found in {dojo_path}/tts_voices")
for voice_file in trained_files:
    shutil.copy(voice_file, f"{voices_dir}/")