# TTS

## Kokoro

In [2]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
pipeline = KPipeline(lang_code='e') # <= make sure lang_code matches voice

voices_path = "./kokoro-voices"

# This text is for demonstration purposes only, unseen during training
text = '''¡Hey gente! Bienvenidos al stream, acomódense, agarren algo de tomar, y prepárense porque hoy se viene alta locura. ¡Let's gooo!'''

weight1 = 1   # 1, 0.3 
weight2 = 0   # 0, 0.7
weight3 = 0   # 0, 0.7

voice1 = torch.load(f"{voices_path}/ef_dora.pt", weights_only=True)
voice2 = torch.load(f"{voices_path}/jf_tebukuro.pt", weights_only=True)
voice3 = torch.load(f"{voices_path}/af_jessica.pt", weights_only=True)
# Old man: em_santa
# ef_dora (spanish base), af_aoede, af_heart, af_jessica [1], if_sara
# japanese: jf_alpha [2], jf_tebukuro [1]

sum_weights = weight1 + weight2 + weight3
blended_voice = ((voice1 * weight1) + (voice2 * weight2) + (voice3 * weight3)) / sum_weights

generator = pipeline(
    text, voice=blended_voice,
    speed=1.1, split_pattern=r'\n+'
)

for i, (gs, ps, audio) in enumerate(generator):
    print(i)  # i => index
    print(gs) # gs => graphemes/text
    print(ps) # ps => phonemes
    display(Audio(data=audio, rate=24000, autoplay=i==0))
    sf.write(f'./temp/kokoro_mirai_{i}.wav', audio, 24000) # save each audio file



FileNotFoundError: [Errno 2] No such file or directory: './kokoro-voices/ef_dora.pt'

In [None]:
from src import config
from src.services.whisper import WhisperService


whisper      = WhisperService(api_key=config.WHISPER_API_KEY)
audio_path = "./temp/user/stt_20250529_000635.wav"
with open(audio_path, "rb") as f:
    # whisper-1, gpt-4o-transcribe
    r = whisper.client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=f,
        response_format="text",
        language="es"
    )
    print(r)

¡Claro! Y prepárate para que me ponga memes hasta las cejas.



In [4]:
r["segments"]

TypeError: 'Transcription' object is not subscriptable

# Audio Broadcast

In [2]:
from src.services.audio import AudioPlayer
import sounddevice as sd

print("Output‐capable devices:")
for i, dev in enumerate(sd.query_devices()):
    if dev["max_output_channels"] > 0:
        print(f"{i:2d}:", dev["name"])


Output‐capable devices:
 7: Microsoft Sound Mapper - Output
 8: DragonSpeakers (Realtek USB Aud
 9: LG FULL HD (NVIDIA High Definit
10: CABLE In 16ch (VB-Audio Virtual
11: Headphones (VM70)
12: CABLE Input (VB-Audio Virtual C
13: C27F390 (NVIDIA High Definition
14: Realtek Digital Output (Realtek
15: Headphones (Oculus Virtual Audi
23: Primary Sound Driver
24: DragonSpeakers (Realtek USB Audio)
25: LG FULL HD (NVIDIA High Definition Audio)
26: CABLE In 16ch (VB-Audio Virtual Cable)
27: Headphones (VM70)
28: CABLE Input (VB-Audio Virtual Cable)
29: C27F390 (NVIDIA High Definition Audio)
30: Realtek Digital Output (Realtek USB Audio)
31: Headphones (Oculus Virtual Audio Device)
32: LG FULL HD (NVIDIA High Definition Audio)
33: CABLE In 16ch (VB-Audio Virtual Cable)
34: Headphones (VM70)
35: CABLE Input (VB-Audio Virtual Cable)
36: C27F390 (NVIDIA High Definition Audio)
37: Realtek Digital Output (Realtek USB Audio)
38: DragonSpeakers (Realtek USB Audio)
39: Headphones (Oculus Virtual Aud

In [8]:

player = AudioPlayer()
print("\nPlaying temp/mirai.mp3 → cable + default")
player.play('temp/mirai.mp3', devices=('CABLE In','default'))


Playing temp/mirai.mp3 → cable + default


# Voice Activity Detection

In [4]:
import sounddevice as sd
print("\n=== All Stereo Mix candidates ===")
for i, dev in enumerate(sd.query_devices()):
    host = sd.query_hostapis()[dev["hostapi"]]["name"]
    print(f"{i:2d}: [{host}] {dev['name']}")



=== All Stereo Mix candidates ===
 0: [MME] Microsoft Sound Mapper - Input
 1: [MME] Voicemeeter Out A1 (VB-Audio Vo
 2: [MME] Headset Microphone (Oculus Virt
 3: [MME] Microphone (C922 Pro Stream Web
 4: [MME] Front Line (Realtek USB Audio)
 5: [MME] CABLE Output (VB-Audio Virtual 
 6: [MME] DragonMic (VM70)
 7: [MME] Voicemeeter Out A3 (VB-Audio Vo
 8: [MME] Voicemeeter Out A2 (VB-Audio Vo
 9: [MME] Voicemeeter Out B2 (VB-Audio Vo
10: [MME] Microphone (NVIDIA Broadcast)
11: [MME] Microphone (Virtual Desktop Aud
12: [MME] Voicemeeter Out A4 (VB-Audio Vo
13: [MME] Voicemeeter Out B1 (VB-Audio Vo
14: [MME] Voicemeeter Out A5 (VB-Audio Vo
15: [MME] Voicemeeter Out B3 (VB-Audio Vo
16: [MME] Microsoft Sound Mapper - Output
17: [MME] Voicemeeter Input (VB-Audio Voi
18: [MME] Voicemeeter In 3 (VB-Audio Voic
19: [MME] Voicemeeter In 2 (VB-Audio Voic
20: [MME] Voicemeeter In 5 (VB-Audio Voic
21: [MME] Speakers (NVIDIA Broadcast)
22: [MME] Voicemeeter VAIO3 Input (VB-Aud
23: [MME] Voicemeeter 

In [60]:
from src.utils.device_finder import find_device


device_idx = find_device("stereo mix")
device_idx

In [61]:
device_idx

# END