# Chatterbox TTS Server — UdemyCrores Voice Clone

**Setup:**
1. Make sure GPU runtime is enabled: Runtime → Change runtime type → T4 GPU
2. Set `NGROK_AUTH_TOKEN` in Cell 4 to your ngrok auth token, then run all cells in order
3. Copy the ngrok URL printed at the end
4. Paste it in your `.env` file as `CHATTERBOX_API_URL=<url>`

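The voice reference is downloaded automatically from the GitHub repo in Cell 2; to use your own recording instead, see the commented-out upload snippet in that cell.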

In [None]:
# Cell 1: Install Chatterbox TTS + dependencies
# Installs the pip dependencies, clones the chatterbox source into
# /content/chatterbox, and verifies it can be imported from that checkout.
import sys, os

# Use Colab's pre-installed torch 2.9.0, torchaudio, torchvision, numpy
# Install only chatterbox's other dependencies (compatible versions)
!pip install -q librosa==0.11.0 s3tokenizer transformers diffusers
!pip install -q resemble-perth conformer safetensors
!pip install -q spacy-pkuseg pykakasi==2.3.0 pyloudnorm omegaconf
!pip install -q pyngrok flask soundfile

# Clone chatterbox source
# The previous checkout is removed first so this cell can be re-run.
!rm -rf /content/chatterbox
!git clone -q https://github.com/resemble-ai/chatterbox.git /content/chatterbox

# Patch __init__ to skip version lookup
# NOTE: this replaces the whole __init__.py with a single hard-coded
# __version__ line, so nothing else the original file defined is kept.
with open('/content/chatterbox/src/chatterbox/__init__.py', 'w') as f:
  f.write('__version__ = "0.1.6"\n')

# Add to path
# Inserted at position 0 so the cloned source tree is searched first.
sys.path.insert(0, '/content/chatterbox/src')

# Verify import
# Fails here, rather than in a later cell, if a dependency is missing.
from chatterbox.tts import ChatterboxTTS
print("✅ chatterbox.tts imported successfully!")
print("✅ Installation complete!")

In [None]:
# Cell 2: Download voice reference from GitHub repo
# Defines VOICE_URL / VOICE_FILE and makes sure VOICE_FILE holds a complete
# download of the reference recording.
import urllib.request
import os

VOICE_URL = 'https://github.com/JineeshTS/UdemyTrainings/raw/main/data/voice-reference.wav'
VOICE_FILE = '/content/voice-reference.wav'

# A missing OR empty file counts as "not downloaded", so a previously failed
# attempt is retried instead of being mistaken for a valid reference.
if os.path.exists(VOICE_FILE) and os.path.getsize(VOICE_FILE) > 0:
    print(f'Voice reference already exists: {VOICE_FILE}')
else:
    print('Downloading voice reference from GitHub...')
    # Download to a temporary name and move it into place only on success.
    # Writing straight to VOICE_FILE would leave a truncated file behind if
    # the transfer is interrupted, and the next run would then skip the
    # download because the file "already exists".
    partial_file = VOICE_FILE + '.part'
    try:
        urllib.request.urlretrieve(VOICE_URL, partial_file)
        if os.path.getsize(partial_file) == 0:
            raise RuntimeError(f'Downloaded voice reference is empty: {VOICE_URL}')
        os.replace(partial_file, VOICE_FILE)
    finally:
        # Only present if the download or the size check failed.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print(f'Downloaded: {VOICE_FILE} ({os.path.getsize(VOICE_FILE)} bytes)')

# Or upload your own:
# from google.colab import files
# uploaded = files.upload()
# VOICE_FILE = list(uploaded.keys())[0]

In [None]:
# Cell 3: Load Chatterbox model
# Loads the model onto the GPU when one is available, then synthesizes one
# short sentence with the voice reference as a smoke test.
import sys
sys.path.insert(0, '/content/chatterbox/src')

import soundfile as sf
import torch
from IPython.display import Audio
from chatterbox.tts import ChatterboxTTS

# Pick the compute device; `device` and `model` are reused by the server cell.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

model = ChatterboxTTS.from_pretrained(device=device)
print('Chatterbox model loaded!')

# Smoke test: clone the reference voice (VOICE_FILE comes from Cell 2)
test_wav = model.generate(
    "Hello, welcome to the course.",
    audio_prompt_path=VOICE_FILE,
)
# Move the result to host memory as a NumPy array before writing it to disk.
wav_np = test_wav.squeeze().cpu().numpy()
sf.write('/content/test-output.wav', wav_np, model.sr)
print('Test generation successful!')

# Last expression of the cell, so the notebook renders an audio player.
Audio('/content/test-output.wav')

In [None]:
# Cell 4: Start OpenAI-compatible TTS API server with ngrok
from pyngrok import ngrok
from flask import Flask, request, send_file, jsonify
import io
import soundfile as sf
import numpy as np
import threading

# ============================================
# SET YOUR NGROK AUTH TOKEN HERE
# Get free token from: https://dashboard.ngrok.com/get-started/your-authtoken
NGROK_AUTH_TOKEN = 'YOUR_NGROK_TOKEN_HERE'  # <-- REPLACE THIS
# ============================================

# Fail fast with an actionable message when the placeholder was not replaced,
# instead of letting the tunnel setup fail later with an ngrok auth error.
if not NGROK_AUTH_TOKEN or NGROK_AUTH_TOKEN == 'YOUR_NGROK_TOKEN_HERE':
    raise ValueError(
        'NGROK_AUTH_TOKEN is not set. Get a free token from '
        'https://dashboard.ngrok.com/get-started/your-authtoken '
        'and paste it into NGROK_AUTH_TOKEN above.'
    )

ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Flask application that the route handlers below are registered on.
app = Flask(__name__)

@app.route('/health', methods=['GET'])
def health():
    """Return a small JSON status document, including the active device."""
    payload = dict(status='ok', model='chatterbox', device=device)
    return jsonify(payload)

@app.route('/v1/audio/speech', methods=['POST'])
def tts():
    """Synthesize speech for the posted text and return it as a WAV file.

    Expects a JSON object body with:
      input        -- text to speak (required, non-blank string)
      exaggeration -- optional number, default 0.3
      cfg          -- optional number, default 5.0 (passed as ``cfg_weight``)

    Returns the generated audio as ``speech.wav`` (``audio/wav``), or a JSON
    ``{'error': ...}`` document with status 400 when the request is invalid.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so every malformed request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('input', '')
    # Reject non-string and blank input up front; the slice in the log line
    # below and the model call both need real text.
    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': 'No input text'}), 400

    # Voice cloning tuning parameters (from client or defaults)
    # exaggeration: 0.0 = closest to reference voice, 1.0 = most expressive
    # cfg: higher = more faithful to reference (3-7 range)
    try:
        exaggeration = float(data.get('exaggeration', 0.3))
        cfg = float(data.get('cfg', 5.0))
    except (TypeError, ValueError):
        # Client-supplied values that are not numeric are a client error,
        # not something to hand to the model.
        return jsonify({'error': 'exaggeration and cfg must be numbers'}), 400

    print(f'Generating (exag={exaggeration}, cfg={cfg}): {text[:80]}...')
    wav = model.generate(
        text,
        audio_prompt_path=VOICE_FILE,
        exaggeration=exaggeration,
        cfg_weight=cfg
    )

    # Encode to WAV in memory so nothing is written to disk per request.
    wav_np = wav.squeeze().cpu().numpy()
    buf = io.BytesIO()
    sf.write(buf, wav_np, model.sr, format='WAV')
    buf.seek(0)  # rewind so send_file streams from the start of the buffer

    return send_file(buf, mimetype='audio/wav', as_attachment=True, download_name='speech.wav')

# Run Flask in thread so Colab stays responsive.
# Started before the tunnel is opened so the public URL is not advertised
# before the server has even begun starting up.
threading.Thread(target=lambda: app.run(host='0.0.0.0', port=8000), daemon=True).start()

# Start ngrok tunnel
tunnel = ngrok.connect(8000)
print(f'''
╔══════════════════════════════════════════════════════════╗
║  CHATTERBOX TTS SERVER RUNNING                         ║
║                                                        ║
║  API URL: {str(tunnel.public_url):45s}║
║                                                        ║
║  Voice cloning params (sent from client):              ║
║    exaggeration: 0.3 (low = closer to your voice)      ║
║    cfg: 5.0 (high = more faithful clone)               ║
║                                                        ║
║  Add to .env:                                          ║
║  CHATTERBOX_API_URL={str(tunnel.public_url):37s}║
╚══════════════════════════════════════════════════════════╝
''')

# Keep alive
import time
try:
    while True:
        time.sleep(60)
        print(f'Server running... {tunnel.public_url}')
finally:
    # Interrupting the cell lands here: close the tunnel so a stale public
    # URL is not left open and re-running the cell starts from a clean state.
    ngrok.disconnect(tunnel.public_url)