Uses websockets plus soundfile/scipy to load the Tortoise WAV, downmix it to mono, resample it to 8 kHz, and stream it back as mu-law-encoded Twilio "media" messages. It also logs the size of each incoming audio frame so you can verify two-way traffic.

In [None]:
# media_ws.py
import asyncio, json, base64, audioop, numpy as np, soundfile as sf
from websockets.server import serve
from scipy.signal import resample_poly  # pip install scipy websockets soundfile

# Audio source: the Tortoise-generated greeting that is played into the call.
GREETING_WAV = "tortoise/outputs/martin/generated-martin.wav"

# Outbound audio format.
TWILIO_SR = 8000       # sample rate in Hz; required by Twilio
BYTES_PER_SAMPLE = 2   # width of one 16-bit PCM sample, before mu-law encoding
FRAME_MS = 20          # duration of each outbound frame, in milliseconds

async def pcm16_to_mulaw_base64(pcm16: bytes) -> str:
    """Encode linear PCM bytes as mu-law and return the result as base64 text.

    Args:
        pcm16: Raw linear PCM whose sample width is ``BYTES_PER_SAMPLE``.

    Returns:
        The mu-law bytes, base64-encoded and decoded to an ASCII ``str``.

    The function is a coroutine so callers can ``await`` it, although the
    body itself never suspends.
    """
    encoded = base64.b64encode(audioop.lin2ulaw(pcm16, BYTES_PER_SAMPLE))
    return str(encoded, "ascii")

def load_and_prepare_mulaw_frames(wav_path: str, frame_ms=FRAME_MS):
    """Load a WAV file and split it into fixed-duration 16-bit PCM frames.

    The audio is downmixed to mono by averaging channels, resampled to
    ``TWILIO_SR`` with a polyphase filter, clipped to [-1, 1] and scaled
    to int16.

    Args:
        wav_path: Path of the WAV file to read.
        frame_ms: Duration of each frame in milliseconds.

    Returns:
        A list of ``bytes`` objects holding 16-bit PCM. Despite the function
        name, the frames are NOT mu-law encoded here; that happens when each
        frame is sent. The last frame is shorter when the audio length is not
        a multiple of the frame size.
    """
    samples, source_rate = sf.read(wav_path, dtype="float32", always_2d=True)

    # Average across the channel axis to get a single mono track.
    downmixed = samples.mean(axis=1)

    # Polyphase resampling from the file's rate to the target rate.
    at_target_rate = resample_poly(downmixed, TWILIO_SR, int(source_rate)).astype(np.float32)

    # Keep samples inside [-1, 1] so the int16 scaling below cannot overflow.
    bounded = np.clip(at_target_rate, -1.0, 1.0)
    raw = (bounded * 32767.0).astype(np.int16).tobytes()

    # e.g. 20 ms at 8 kHz => 160 samples => 320 bytes of 16-bit PCM.
    frame_len = int(TWILIO_SR * frame_ms / 1000) * BYTES_PER_SAMPLE

    chunks = []
    for start in range(0, len(raw), frame_len):
        chunks.append(raw[start:start + frame_len])
    return chunks

# Load and chunk the greeting once, at import time, so the handler can reuse
# the same frames. These are 16-bit PCM frames; mu-law encoding is applied
# per frame when each one is sent.
GREETING_FRAMES = load_and_prepare_mulaw_frames(GREETING_WAV)

async def send_media(ws, stream_sid: str, pcm16_frame: bytes):
    """Send one PCM16 frame over the WebSocket as a mu-law "media" message.

    Args:
        ws: Connection object exposing an awaitable ``send``.
        stream_sid: Stream identifier placed in the message's ``streamSid``.
        pcm16_frame: 16-bit PCM audio; it is mu-law encoded and base64
            wrapped before sending.
    """
    encoded = await pcm16_to_mulaw_base64(pcm16_frame)
    envelope = json.dumps(
        {"event": "media", "streamSid": stream_sid, "media": {"payload": encoded}}
    )
    await ws.send(envelope)

async def send_mark(ws, stream_sid: str, name: str):
    """Send a named "mark" message on the given stream.

    Args:
        ws: Connection object exposing an awaitable ``send``.
        stream_sid: Stream identifier placed in the message's ``streamSid``.
        name: Label carried in the message's ``mark.name`` field.
    """
    marker = {
        "event": "mark",
        "streamSid": stream_sid,
        "mark": {"name": name},
    }
    await ws.send(json.dumps(marker))

async def send_clear(ws, stream_sid: str):
    """Send a "clear" message for the given stream.

    NOTE(review): presumably this asks Twilio to discard audio it has queued
    for playback -- confirm against the Media Streams documentation.

    Args:
        ws: Connection object exposing an awaitable ``send``.
        stream_sid: Stream identifier placed in the message's ``streamSid``.
    """
    body = dict(event="clear", streamSid=stream_sid)
    await ws.send(json.dumps(body))

async def _play_greeting(ws, stream_sid: str):
    """Stream the greeting frames into the call, then send "greeting_done".

    Meant to run as a background task so the connection's receive loop
    keeps processing inbound messages while the greeting plays.
    """
    # Function-scope import: the file's top-level imports only bring in `serve`.
    from websockets.exceptions import ConnectionClosed

    loop = asyncio.get_running_loop()
    frame_period = FRAME_MS / 1000.0
    deadline = loop.time()
    try:
        for pcm16 in GREETING_FRAMES:
            await send_media(ws, stream_sid, pcm16)
            # Sleep until an absolute deadline instead of a fixed interval, so
            # the time spent encoding/sending does not accumulate as drift.
            deadline += frame_period
            await asyncio.sleep(max(0.0, deadline - loop.time()))
        await send_mark(ws, stream_sid, "greeting_done")
    except ConnectionClosed:
        # The peer went away mid-greeting; there is nothing left to send.
        print("[greeting] connection closed before playback finished")


async def handler(ws):
    """Serve one Media Streams WebSocket connection.

    Reads JSON messages from ``ws`` and dispatches on their ``event`` field:

    * ``start``: records the stream/call ids and starts greeting playback
      in a background task (at most one per connection).
    * ``media``: decodes the inbound base64 mu-law payload to 16-bit PCM and
      logs both sizes.
    * ``mark``: logs the message.
    * ``stop``: logs the message and ends the loop.

    Other events are ignored. Greeting playback is cancelled when the loop
    ends for any reason, so nothing keeps writing to a finished connection.

    Args:
        ws: The WebSocket connection; must be async-iterable and expose an
            awaitable ``send``.
    """
    stream_sid = None
    greeting_task = None
    print("Client connected")

    try:
        async for raw in ws:
            msg = json.loads(raw)
            event = msg.get("event")

            if event == "start":
                stream_sid = msg["start"]["streamSid"]
                call_sid = msg["start"]["callSid"]
                print(f"[start] streamSid={stream_sid} callSid={call_sid}")

                # Play the greeting concurrently: streaming it inline would
                # block this loop (and delay "media"/"stop" handling) for the
                # whole duration of the audio.
                if greeting_task is None:
                    greeting_task = asyncio.create_task(_play_greeting(ws, stream_sid))

            elif event == "media":
                # Caller's live audio: base64 mu-law. Convert to linear PCM,
                # which is the form an STT/NLP stage would consume.
                ulaw = base64.b64decode(msg["media"]["payload"])
                lin16 = audioop.ulaw2lin(ulaw, BYTES_PER_SAMPLE)
                print(f"[inbound media] bytes={len(ulaw)} (mulaw) / {len(lin16)} (pcm16)")

            elif event == "mark":
                print(f"[mark ack] {msg}")

            elif event == "stop":
                print(f"[stop] {msg}")
                break
    finally:
        if greeting_task is not None:
            # cancel() is a no-op on a finished task; gather() with
            # return_exceptions=True collects the outcome without raising.
            greeting_task.cancel()
            (outcome,) = await asyncio.gather(greeting_task, return_exceptions=True)
            # CancelledError is not an Exception subclass, so only genuine
            # playback failures are reported here.
            if isinstance(outcome, Exception):
                print(f"[greeting] playback failed: {outcome!r}")

    print("Client disconnected")

async def main():
    """Run the WebSocket server on 0.0.0.0:8080 until the process is stopped."""
    listener = serve(handler, "0.0.0.0", 8080, subprotocols=["twilio"])
    async with listener:
        print("WebSocket listening on ws://0.0.0.0:8080/media")
        # Awaiting a future that is never resolved keeps the server running.
        never_done = asyncio.Future()
        await never_done

# Start the event loop and the server only when this file is run as a script.
if __name__ == "__main__":
    asyncio.run(main())


In [None]:
# twilio is included because make_call.py imports twilio.rest
pip install flask websockets soundfile scipy twilio
python media_ws.py
# in a second terminal
python twiml_app.py
# then expose both with ngrok (or deploy behind HTTPS/WSS)

In [None]:
# make_call.py
from twilio.rest import Client
import os
# Credentials come from the environment; a missing variable raises KeyError.
account_sid = os.environ["TWILIO_ACCOUNT_SID"]
auth_token = os.environ["TWILIO_AUTH_TOKEN"]
client = Client(account_sid, auth_token)

# Place the outbound call. Replace the placeholder numbers and host first.
call = client.calls.create(
    url="https://YOUR_PUBLIC_FLASK_HOST/voice",  # endpoint serving the TwiML for this call
    from_="+1YYYYYYYYYY",
    to="+1XXXXXXXXXX",
)
print(call.sid)