In [19]:
#!/usr/bin/env python3
import os, sys

# Programmatic equivalent of launching with
#   PYTORCH_ENABLE_MPS_FALLBACK=1 python script.py
# The variable must be set before torch is imported, which is why this
# statement sits between the import lines. setdefault() only writes the value
# when the variable is not already present, so an explicit setting from the
# shell is left untouched.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import torch

def pick_device() -> str:
    """Return the name of the preferred torch device.

    Preference order is "mps", then "cuda", then "cpu".

    Each backend is probed inside its own ``try`` so that an exception raised
    while probing one backend (for example MPS) does not prevent the next one
    (CUDA) from being tried. A backend whose probe raises is treated as
    unavailable.

    Returns:
        One of the strings "mps", "cuda" or "cpu".
    """

    def _mps_available() -> bool:
        # torch.backends.mps is looked up defensively: if the attribute is
        # missing, MPS is simply reported as unavailable.
        mps = getattr(torch.backends, "mps", None)
        return mps is not None and mps.is_available()

    def _cuda_available() -> bool:
        return torch.cuda.is_available()

    for name, is_available in (("mps", _mps_available), ("cuda", _cuda_available)):
        try:
            if is_available():
                return name
        except Exception:
            # A failing probe only disqualifies this backend; keep looking.
            continue
    return "cpu"

# Resolve the device once and try to make it torch's default device.
device = pick_device()
try:
    torch.set_default_device(device)   # ok if Torch >=2.0
except Exception as exc:
    # Still best-effort (the notebook keeps running), but report the failure
    # instead of swallowing it, so a later device-related error can be traced
    # back to this step.
    print(f"Warning: could not set default device to {device!r}: {exc!r}", file=sys.stderr)
print(f"Using device: {device}  (MPS fallback={os.environ['PYTORCH_ENABLE_MPS_FALLBACK']})")


Using device: mps  (MPS fallback=1)


In [1]:
import pathlib
import sys

# Show which interpreter the kernel runs on: version string, then the path of
# the Python executable.
print(sys.version, "\n", pathlib.Path(sys.executable))


3.12.11 (main, Jun  3 2025, 15:41:47) [Clang 17.0.0 (clang-1700.0.13.3)] 
 /Users/marcus/Documents/GitHub/more-attention/.venv/bin/python


In [2]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch
# lang_code 'a' selects the American English pipeline.
pipeline = KPipeline(lang_code="a")

# Sample paragraph; the [word](/phonemes/) markup is passed through verbatim.
text = '''
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
'''

# Rate handed to both the inline player and the WAV writer.
sample_rate = 24000

generator = pipeline(text, voice="af_heart")
for i, (gs, ps, audio) in enumerate(generator):
    # gs is the text segment (see the printed output); ps is presumably its
    # phoneme string -- TODO confirm against the kokoro docs.
    print(i, gs, ps)
    # Only the first segment starts playing on its own.
    player = Audio(data=audio, rate=sample_rate, autoplay=(i == 0))
    display(player)
    sf.write(f"{i}.wav", audio, sample_rate)

  from .autonotebook import tqdm as notebook_tqdm




  WeightNorm.apply(module, name, dim)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
0 Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licens

In [3]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf

# 'a' = American English (matches 'am_*' voices)
pipeline = KPipeline(lang_code='a')
text = '''
My husband looked wrecked. Red-rimmed eyes, hair doing Jackson Pollock things. He hugged me hard and whispered, “I’m sorry. You were right. I didn’t know.” I believed him not because he said it, but because I had seen the texts where he unraveled hour by hour. He looked like the kind of tired you can’t fix with a nap. I thought, good, not because I’m cruel, but because empathy sometimes needs a body to live in, not a lecture.
'''

voice = "am_echo"     # the Echo voice
speed = 0.9           # slightly slower than the default tempo of 1.0
sample_rate = 24000   # rate handed to both the inline player and the WAV writer

generator = pipeline(text, voice=voice, speed=speed)

for i, (gs, ps, audio) in enumerate(generator):
    # Only the first segment starts playing on its own.
    display(Audio(audio, rate=sample_rate, autoplay=(i == 0)))
    # Prefix the file name with the voice so that clips rendered with
    # different voices do not overwrite one another on disk.
    sf.write(f"{voice}_{i}.wav", audio, sample_rate)




  WeightNorm.apply(module, name, dim)


In [4]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf

# 'a' = American English (matches 'am_*' voices)
pipeline = KPipeline(lang_code='a')
text = '''
My husband looked wrecked. Red-rimmed eyes, hair doing Jackson Pollock things. He hugged me hard and whispered, “I’m sorry. You were right. I didn’t know.” I believed him not because he said it, but because I had seen the texts where he unraveled hour by hour. He looked like the kind of tired you can’t fix with a nap. I thought, good, not because I’m cruel, but because empathy sometimes needs a body to live in, not a lecture.
'''

voice = "am_adam"     # the Adam voice
speed = 0.9           # slightly slower than the default tempo of 1.0
sample_rate = 24000   # rate handed to both the inline player and the WAV writer

generator = pipeline(text, voice=voice, speed=speed)

for i, (gs, ps, audio) in enumerate(generator):
    # Only the first segment starts playing on its own.
    display(Audio(audio, rate=sample_rate, autoplay=(i == 0)))
    # Prefix the file name with the voice so that clips rendered with
    # different voices do not overwrite one another on disk.
    sf.write(f"{voice}_{i}.wav", audio, sample_rate)




In [22]:
# Female voice identifiers, kept as a dict whose values are all True so it can
# be iterated in insertion order or used for membership lookups.
# The "af_" / "bf_" prefixes presumably distinguish American and British
# voices -- TODO confirm against the voice list.
female_names = dict.fromkeys(
    (
        "af_alloy",
        "af_aoede",
        "af_bella",
        "af_heart",
        "af_jessica",
        "af_kore",
        "af_nicole",
        "af_nova",
        "af_river",
        "af_sarah",
        "af_sky",
        "bf_alice",
        "bf_emma",
        "bf_isabella",
        "bf_lily",
    ),
    True,
)

# Male voice identifiers, kept as a dict whose values are all True so it can
# be iterated in insertion order or used for membership lookups.
# The "am_" / "bm_" prefixes presumably distinguish American and British
# voices -- TODO confirm against the voice list.
male_names = dict.fromkeys(
    (
        "am_adam",
        "am_echo",
        "am_eric",
        "am_fenrir",
        "am_liam",
        "am_michael",
        "am_onyx",
        "am_puck",
        "am_santa",
        "bm_daniel",
        "bm_fable",
        "bm_george",
        "bm_lewis",
    ),
    True,
)


In [29]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf

import torch, kokoro.istftnet as _istft

def _fixed_forward(self, x, s):
    out = self._residual(x, s)
    # constant lives on the same device/dtype as `out`
    c = torch.tensor(2.0, device=out.device, dtype=out.dtype).rsqrt()
    return (out + self._shortcut(x)) * c

# Monkey-patch: replace AdainResBlk1d.forward on the class itself, so the
# pipeline created afterwards runs _fixed_forward instead of the library's own
# implementation. This stays in effect for the rest of the kernel session.
# NOTE(review): assumes the library's forward takes the same (x, s) arguments
# and that _residual/_shortcut exist on the class -- confirm when upgrading kokoro.
_istft.AdainResBlk1d.forward = _fixed_forward


# 'a' = American English (matches 'am_*' voices)
# NOTE(review): the 'bf_*' / 'bm_*' voices in the name tables are also run
# through this lang_code='a' pipeline -- confirm that is intended.
pipeline = KPipeline(lang_code='a')
text = '''
My husband looked wrecked. Red-rimmed eyes, hair doing Jackson Pollock things. He hugged me hard and whispered, “I’m sorry. You were right. I didn’t know.” I believed him not because he said it, but because I had seen the texts where he unraveled hour by hour. He looked like the kind of tired you can’t fix with a nap. I thought, good, not because I’m cruel, but because empathy sometimes needs a body to live in, not a lecture.
'''

sample_rate = 24000   # rate handed to the inline audio player

# Audition every voice (male table first, then female) at both tempos.
for genders in [male_names, female_names]:
    for key in genders:
        for speed in (0.9, 1.0):
            try:
                for gs, ps, audio in pipeline(text, voice=key, speed=speed):
                    print(f"{key} at {speed}")
                    # No autoplay here: the sweep renders one player per
                    # voice/speed combination, and auto-starting the first
                    # segment of each would make the clips play over one
                    # another. Press play on the clip you want to hear.
                    display(Audio(audio, rate=sample_rate, autoplay=False))
                    print('=============================================')

            except RuntimeError as e:
                # Only the known device-mismatch failure is skipped; any other
                # RuntimeError is re-raised unchanged.
                if "same device" in str(e):
                    print(f"[skip] {key} @ {speed}: device mismatch (fixed by patch above).")
                else:
                    raise



am_adam at 0.9




KeyboardInterrupt: 