In [None]:
'''
LICENSE: Apache-2.0
Voices available: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md

Code and functions pulled from: https://github.com/Marques-079/more-attention
'''

In [2]:
import os, glob, io
import numpy as np
import soundfile as sf
from IPython.display import Audio, display
from kokoro_onnx import Kokoro, SAMPLE_RATE

def _resolve_kokoro_assets():
    base = os.path.expanduser("~/.cache/kokoro_assets")
    def pick(patterns):
        for pat in patterns:
            hits = sorted(glob.glob(os.path.join(base, pat)))
            if hits:
                return hits[-1]
        return None

    m = os.getenv("KOKORO_MODEL") or pick(["*.onnx"])
    v = os.getenv("KOKORO_VOICES") or pick(["*voices*.*", "voices.*", "*.bin", "*.json"])

    if not (m and os.path.exists(m) and v and os.path.exists(v)):
        raise FileNotFoundError(
            "Kokoro assets not found.\n"
            "Set KOKORO_MODEL and KOKORO_VOICES env vars, "
            "or put files in ~/.cache/kokoro_assets/"
        )
    print(f"[kokoro] using model:  {m}")
    print(f"[kokoro] using voices: {v}")
    return m, v

# Resolve the asset paths once when this cell runs and build the shared engine
# instance that synth() calls. _resolve_kokoro_assets() raises
# FileNotFoundError here if the model or voices file cannot be located.
MODEL_PATH, VOICES_PATH = _resolve_kokoro_assets()
_TTS = Kokoro(MODEL_PATH, VOICES_PATH)

def _to_mono_float32(y):
    if isinstance(y, (list, tuple)) and len(y) > 0:
        y = y[0]
    a = np.asarray(y, dtype=np.float32)
    if a.ndim == 2 and a.shape[0] in (1, 2):
        a = np.mean(a, axis=0).astype(np.float32)
    return a

def synth(text: str, voice: str = "am_adam", speed: float = 1.05,
          rate: int = SAMPLE_RATE, autoplay: bool = True):
    """Synthesize ``text`` to speech, show an inline player, and return WAV bytes.

    Args:
        text: Text to speak.
        voice: Voice identifier passed to the engine.
        speed: Speed value passed to the engine.
        rate: Sample rate used for the inline player and the WAV data.
        autoplay: Whether the inline player starts playing immediately.
            Defaults to True.

    Returns:
        tuple[bytes, float]: ``(wav_bytes, duration_seconds)`` where
        ``wav_bytes`` is a mono 32-bit float WAV file image.
    """
    print(f"[kokoro] synth start | voice={voice} speed={speed} sr={rate} len={len(text)}")
    y = _TTS.create(text, voice=voice, speed=speed)
    # NOTE(review): only the samples are kept; if create() also returns a
    # sample rate it is not consulted — confirm it matches `rate`.
    audio = _to_mono_float32(y)

    display(Audio(audio, rate=rate, autoplay=autoplay))

    # Encode into an in-memory WAV so callers decide where (or whether) to save.
    buf = io.BytesIO()
    with sf.SoundFile(buf, mode="w", samplerate=rate, channels=1, format="WAV", subtype="FLOAT") as f:
        f.write(audio)
    buf.seek(0)
    dur = sf.info(buf).duration
    print(f"[kokoro] done | duration={dur:.2f}s, samples={audio.shape[0]}")
    # getvalue() returns the whole buffer regardless of the read position.
    return buf.getvalue(), dur


# Smoke test: synthesize one short sentence. synth() displays an inline audio
# player and returns the WAV bytes plus the clip duration in seconds.
text = "TESTING, 1, 2, 3. This is a test of the Kokoro text to speech synthesis system."
audio_bytes, duration = synth(text)


[kokoro] using model:  /Users/marcus/.cache/kokoro_assets/kokoro-v1.0.onnx
[kokoro] using voices: /Users/marcus/.cache/kokoro_assets/voices-v1.0.bin
[kokoro] synth start | voice=am_adam speed=1.05 sr=24000 len=79


[kokoro] done | duration=5.14s, samples=123392


In [3]:
# Synthesize the sample sentence again for this cell.
audio_bytes, duration = synth(text)

# Write the WAV bytes next to the notebook so the file shows up in the IDE's
# file tree, then report the absolute location.
out_path = os.path.abspath("kokoro_out.wav")
with open(out_path, "wb") as wav_file:
    wav_file.write(audio_bytes)
print("Saved:", out_path)


[kokoro] synth start | voice=am_adam speed=1.05 sr=24000 len=79


[kokoro] done | duration=5.14s, samples=123392
Saved: /Users/marcus/Documents/GitHub/Marcuss-toolbox/text_to_speech/kokoro_out.wav
