# Hindi Voice Cloning with OpenVoice V2 (Google Colab)

Beginner-friendly notebook to run **OpenVoice V2 (MIT License)** for Hindi text-to-speech voice cloning.

> Recommended runtime: **Google Colab with a GPU** (the steps are written to be compatible with Python 3.12).


## 1) Install dependencies
This cell will:
- Install PyTorch CUDA 11.8 wheels (`torch`, `torchvision`, `torchaudio`)
- Clone the OpenVoice repository
- Install OpenVoice dependencies
- Install helper libraries for widgets/downloads


In [None]:
# --- Python packages ---
# Upgrade pip itself, then install/upgrade PyTorch, torchvision and torchaudio
# from the CUDA 11.8 wheel index.
!pip -q install --upgrade pip
!pip -q install --upgrade --index-url https://download.pytorch.org/whl/cu118 torch torchvision torchaudio

# Clone the OpenVoice repo only when no 'OpenVoice' path exists yet, so
# re-running this cell does not fail on an existing checkout.
import os
if not os.path.exists('OpenVoice'):
    !git clone https://github.com/myshell-ai/OpenVoice.git
else:
    print('OpenVoice repo already exists, skipping clone.')

# Install the repository's requirements file, then the helpers that later cells
# import (melo, ipywidgets, soundfile, huggingface_hub).
# NOTE(review): upstream MeloTTS documents installing from its Git repository
# followed by `python -m unidic download`; confirm that the `melo-tts` package
# name resolves to the same project and that no extra download step is needed.
!pip -q install -r OpenVoice/requirements.txt
!pip -q install -U melo-tts ipywidgets soundfile huggingface_hub


## 2) Download OpenVoice V2 checkpoints (public)
The notebook first tries Hugging Face (`myshell-ai/OpenVoiceV2`) and then falls back to the MyShell public S3 zip.


In [None]:
import os
import zipfile
import requests
from pathlib import Path

# Root folder for the OpenVoice V2 checkpoints; created up front (with parents)
# so both download strategies have a destination to write into.
CHECKPOINT_DIR = Path('OpenVoice/checkpoints_v2')
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Files that must all exist before the download is considered complete:
# the converter config + weights and the base-speaker embedding.
required_files = [
    CHECKPOINT_DIR.joinpath(*relative_parts)
    for relative_parts in (
        ('converter', 'config.json'),
        ('converter', 'checkpoint.pth'),
        ('base_speakers', 'ses', 'en-newest.pth'),
    )
]

def checkpoints_ready():
    """Return True only when every path in ``required_files`` exists on disk."""
    for required_path in required_files:
        if not required_path.exists():
            # One missing file is enough to call the download incomplete.
            return False
    return True

# Fetch the OpenVoice V2 checkpoints unless every required file is already on
# disk. Attempt order: Hugging Face snapshot first, public S3 zip second.
# After each attempt the required files are re-checked, so an incomplete
# download is reported here instead of surfacing later as a model-loading error.
if checkpoints_ready():
    print('✅ OpenVoice V2 checkpoints already present.')
else:
    print('⬇️ Downloading checkpoints...')

    # Attempt 1: Hugging Face snapshot
    # The error is stored in its own variable because the `as hf_err` name is
    # unbound again once the `except` block ends.
    hf_failure = None
    try:
        from huggingface_hub import snapshot_download
        # `local_dir_use_symlinks` and `resume_download` are intentionally not
        # passed: current huggingface_hub releases deprecate both, and an
        # unsupported keyword would make this attempt fail before downloading.
        snapshot_download(
            repo_id='myshell-ai/OpenVoiceV2',
            local_dir=str(CHECKPOINT_DIR),
        )
        if not checkpoints_ready():
            missing = [str(p) for p in required_files if not p.exists()]
            raise FileNotFoundError(f'snapshot is missing required files: {missing}')
        print('✅ Downloaded from Hugging Face: myshell-ai/OpenVoiceV2')
    except Exception as hf_err:
        hf_failure = hf_err
        print(f'⚠️ Hugging Face download failed: {hf_err}')

    # Attempt 2: Public S3 zip fallback (only when attempt 1 failed)
    if hf_failure is not None:
        s3_url = 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip'
        zip_path = Path('checkpoints_v2.zip')
        try:
            with requests.get(s3_url, stream=True, timeout=120) as r:
                r.raise_for_status()
                with open(zip_path, 'wb') as f:
                    # 1 MiB chunks: far fewer Python-level writes than 8 KiB
                    # for an archive of this size.
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        if chunk:
                            f.write(chunk)

            with zipfile.ZipFile(zip_path, 'r') as zf:
                # Skip macOS archive metadata entries, if the zip has any.
                members = [m for m in zf.namelist() if not m.startswith('__MACOSX/')]
                # If the archive already contains a top-level 'checkpoints_v2/'
                # folder, extracting into CHECKPOINT_DIR would nest it
                # (checkpoints_v2/checkpoints_v2/...). Extract next to it instead.
                top_level = f'{CHECKPOINT_DIR.name}/'
                is_nested = any(m.startswith(top_level) for m in members)
                destination = CHECKPOINT_DIR.parent if is_nested else CHECKPOINT_DIR
                zf.extractall(destination, members=members)

            if not checkpoints_ready():
                missing = [str(p) for p in required_files if not p.exists()]
                raise FileNotFoundError(f'archive is missing required files: {missing}')
            print('✅ Downloaded and extracted checkpoints from S3 fallback.')
        except Exception as s3_err:
            # Chain the S3 error so the full traceback stays available.
            raise RuntimeError(
                'Failed to download OpenVoice V2 checkpoints from both public sources. '
                f'HF error: {hf_failure} | S3 error: {s3_err}'
            ) from s3_err
        finally:
            # The zip is only an intermediate artifact; do not leave it behind.
            zip_path.unlink(missing_ok=True)

print('Checkpoint directory:', CHECKPOINT_DIR.resolve())


## 3) Load model and auto-detect GPU


In [None]:
import os
import torch
import soundfile as sf
from pathlib import Path

# Make OpenVoice importable
import sys
sys.path.append('OpenVoice')

from melo.api import TTS as MeloTTS
from openvoice.api import ToneColorConverter
from openvoice import se_extractor

# Choose the compute device: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
    print('✅ GPU detected:', torch.cuda.get_device_name(0))
else:
    DEVICE = 'cpu'
    print('⚠️ GPU not detected, running on CPU (slower).')

# Checkpoint locations (the same three files the download cell checks for).
CHECKPOINT_DIR = Path('OpenVoice/checkpoints_v2')
CONVERTER_CONFIG = CHECKPOINT_DIR.joinpath('converter', 'config.json')
CONVERTER_CKPT = CHECKPOINT_DIR.joinpath('converter', 'checkpoint.pth')
SOURCE_SE_PATH = CHECKPOINT_DIR.joinpath('base_speakers', 'ses', 'en-newest.pth')

# Base text-to-speech model, requested with the 'HI' language code.
# NOTE(review): confirm the installed MeloTTS build actually ships an 'HI'
# model; the upstream README lists EN/ES/FR/ZH/JP/KR.
base_tts = MeloTTS(device=DEVICE, language='HI')
speaker_ids = base_tts.hps.data.spk2id
# The first speaker id in the model's table is used for every synthesis call.
DEFAULT_SPEAKER = list(speaker_ids.values())[0]
print('Available Hindi speaker IDs:', speaker_ids)

# Tone color converter: built from its config, then given its weights.
converter = ToneColorConverter(os.fspath(CONVERTER_CONFIG), device=DEVICE)
converter.load_ckpt(os.fspath(CONVERTER_CKPT))

# Source speaker embedding, later passed to converter.convert() as `src_se`.
# NOTE(review): this is the 'en-newest' base-speaker file; confirm it is the
# appropriate source embedding for the base voice selected above.
source_se = torch.load(os.fspath(SOURCE_SE_PATH), map_location=DEVICE)
print('✅ Models loaded successfully.')


## 4) UI: Upload voice, type Hindi text, generate, play, and download


In [None]:
import re
import uuid
import IPython.display as ipd
import ipywidgets as widgets
from pathlib import Path

# Folder that receives the reference recordings uploaded through the UI.
UPLOAD_DIR = Path('uploads')
UPLOAD_DIR.mkdir(exist_ok=True)

# --- Input widgets ---
# Single-file picker restricted to .wav files.
uploader = widgets.FileUpload(description='Upload .wav', accept='.wav', multiple=False)

# Text box pre-filled with a sample Hindi sentence.
text_input = widgets.Textarea(
    description='Hindi Text:',
    placeholder='यहाँ हिंदी टेक्स्ट लिखें...',
    value='नमस्ते! यह आवाज़ क्लोनिंग का एक उदाहरण है।',
    layout=widgets.Layout(height='140px', width='100%'),
)

# Button that starts generation once a click handler is attached to it.
generate_btn = widgets.Button(
    icon='play',
    button_style='success',
    description='Generate output.wav',
)

# --- Feedback widgets ---
# One-line status message, and an output area for the audio player and link.
status = widgets.HTML('<b>Status:</b> Waiting for input...')
output_box = widgets.Output()

def validate_inputs(upload_widget, text_value):
    """Check that a reference clip and some text were provided.

    Args:
        upload_widget: Object whose ``value`` attribute holds the uploaded
            files. Any empty or ``None`` value counts as "nothing uploaded",
            so both dict-style and tuple-style upload values are accepted.
        text_value: Raw text from the text box; ``None`` is treated as empty.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is False when the upload or the text
        is missing. Text without any Devanagari character is still accepted,
        but the message then starts with ``'Warning'``.
    """
    # Truthiness covers an empty dict, an empty tuple and None alike.
    if not upload_widget.value:
        return False, 'Please upload a .wav voice sample first.'
    cleaned = (text_value or '').strip()
    if not cleaned:
        return False, 'Please enter Hindi text.'
    # Gentle warning (not a strict block) for text with no character in the
    # Devanagari block (U+0900-U+097F).
    if not re.search(r'[ऀ-ॿ]', cleaned):
        return True, 'Warning: text does not seem Devanagari, generating anyway.'
    return True, 'Inputs look good.'

def _first_upload_bytes(upload_value):
    """Return the raw bytes of the first uploaded file.

    ``FileUpload.value`` is a dict keyed by filename in ipywidgets 7.x and a
    tuple of per-file dicts in ipywidgets 8.x; both shapes are accepted.
    """
    if isinstance(upload_value, dict):
        first_file = next(iter(upload_value.values()))
    else:
        first_file = upload_value[0]
    # ipywidgets 8 exposes the content as a memoryview; normalize to bytes.
    return bytes(first_file['content'])


def on_generate(_):
    """Button callback: synthesize the text and convert it to the uploaded voice.

    Steps: validate the inputs, save the uploaded reference clip under
    ``UPLOAD_DIR``, synthesize base speech to ``tmp_base.wav``, extract the
    speaker embedding of the reference clip, convert the base speech into
    ``output.wav``, then show an audio player and a file link.

    Any failure after validation is reported in the status line rather than
    raised, because this runs as a widget callback.

    Args:
        _: The clicked button (unused).
    """
    # Local import: only needed to escape error text for the HTML status widget.
    import html

    output_box.clear_output()
    is_valid, msg = validate_inputs(uploader, text_input.value)
    if not is_valid:
        status.value = f"<b>Status:</b> ❌ {msg}"
        return

    # Keep the validation warning visible while generating, if there was one.
    status.value = f"<b>Status:</b> ⏳ {msg}" if 'Warning' in msg else '<b>Status:</b> ⏳ Generating audio...'

    # A random suffix keeps successive uploads from overwriting each other.
    ref_path = UPLOAD_DIR / f"reference_{uuid.uuid4().hex[:8]}.wav"
    tmp_base = Path('tmp_base.wav')
    final_out = Path('output.wav')

    try:
        # Save the uploaded file (inside the try so a failure reaches the status line)
        ref_path.write_bytes(_first_upload_bytes(uploader.value))

        # 1) Generate base Hindi speech
        base_tts.tts_to_file(
            text=text_input.value.strip(),
            speaker_id=DEFAULT_SPEAKER,
            output_path=str(tmp_base),
            speed=1.0,
            quiet=True,
        )

        # 2) Extract target speaker embedding from uploaded voice
        target_se, _ = se_extractor.get_se(str(ref_path), converter, vad=True)

        # 3) Convert base speech to uploaded voice tone
        converter.convert(
            audio_src_path=str(tmp_base),
            src_se=source_se,
            tgt_se=target_se,
            output_path=str(final_out),
            message='@MyShell'
        )

        status.value = '<b>Status:</b> ✅ Done! Created output.wav'

        with output_box:
            print('Playback:')
            display(ipd.Audio(str(final_out), autoplay=False))
            print('Download output.wav:')
            # NOTE(review): FileLink renders a link relative to the notebook
            # server; confirm it is actually downloadable on Colab.
            display(ipd.FileLink(str(final_out)))

    except Exception as e:
        # Callback boundary: surface the failure in the UI. The message is
        # escaped because the status widget renders its value as HTML.
        status.value = f'<b>Status:</b> ❌ Failed: {html.escape(str(e))}'

# Run on_generate whenever the generate button is clicked.
generate_btn.on_click(on_generate)

# Stack the controls top to bottom: title, file picker, text box, button,
# status line, and the output area for playback and the download link.
ui = widgets.VBox(
    children=[
        widgets.HTML('<h3>Hindi Voice Cloning UI</h3>'),
        uploader,
        text_input,
        generate_btn,
        status,
        output_box,
    ]
)

# Render the assembled UI in the cell output.
display(ui)
