# Voice Cloning with OpenVoice (Colab)

This notebook uses OpenVoice (via `openvoice-cli`) to produce a 30-second voice-cloned sample from uploaded audio. Before running, make sure the Colab runtime type is set to GPU (Runtime → Change runtime type).


In [None]:
# 1) Environment check & installs
!nvidia-smi
!pip install --upgrade pip
# Install PyTorch (CUDA 11.8 wheel) -- adjust if Colab CUDA changes
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install openvoice-cli gradio soundfile librosa --quiet
!apt-get install -y git-lfs
!git lfs install
print('✅ Installs complete')


In [None]:
# 2) Download OpenVoice checkpoints (Hugging Face)
# This may take time depending on network. It clones a repo into ./checkpoints
!git clone https://huggingface.co/myshell-ai/OpenVoiceV2 checkpoints || true
!ls -la checkpoints || true
print('✅ Checkpoints folder ready')


In [None]:
# 3) Upload reference audio and optional input audio
from google.colab import files
import os, librosa, soundfile as sf, numpy as np

# Show the Colab upload widget; the keys of the result are used as file names.
uploaded = files.upload()
filenames = list(uploaded)
print('Uploaded files:', filenames)
if not filenames:
  raise SystemExit('Upload at least one audio file (reference).')

# The first upload is the reference voice. A second upload, when present, is
# the narration to convert; otherwise the reference doubles as the input.
ref_path = filenames[0]
if len(filenames) > 1:
  input_path = filenames[1]
else:
  input_path = filenames[0]
print('Reference:', ref_path, 'Input:', input_path)


In [None]:
# 4) Trim/pad uploaded audio to exactly 30 seconds (30s = 30*sr samples)
import math
import os
TARGET_SEC = 30.0
def ensure_30s(infile, outfile, sr=16000):
    """Write a mono copy of `infile` that is exactly TARGET_SEC seconds long.

    The audio is loaded with librosa as mono at sample rate `sr`, then either
    truncated to the first TARGET_SEC seconds or padded at the end with
    silence (zeros) until it reaches that length.

    Args:
        infile: Path of the audio file to read.
        outfile: Path of the file to write. Its parent directory is created
            if it does not exist yet.
        sr: Sample rate used for both loading and writing.

    Returns:
        `outfile`, unchanged.
    """
    wav, _ = librosa.load(infile, sr=sr, mono=True)
    target_len = int(TARGET_SEC*sr)
    if len(wav) > target_len:
        wav = wav[:target_len]
    elif len(wav) < target_len:
        # Pad the tail with silence, keeping the dtype of the loaded samples.
        pad_len = target_len - len(wav)
        wav = np.concatenate([wav, np.zeros(pad_len, dtype=wav.dtype)])
    # Nothing earlier in the notebook creates the output folder, so make sure
    # it exists before writing (a bare file name has no directory part).
    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    sf.write(outfile, wav, sr)
    return outfile
# Output locations for the fixed-length reference and input clips.
ref_30, inp_30 = 'samples/original_30s.wav', 'samples/input_30s.wav'
# Normalise the reference first, then the input, both at 16 kHz.
for src_path, dst_path in ((ref_path, ref_30), (input_path, inp_30)):
    ensure_30s(src_path, dst_path, sr=16000)
print('✅ Created 30s trimmed/padded files:', ref_30, inp_30)


In [None]:
# 5) Run OpenVoice CLI conversion (single) to convert input -> cloned voice using reference
import shlex, subprocess, sys, os
import subprocess as sp
ref = 'samples/original_30s.wav'
inp = 'samples/input_30s.wav'
out = 'samples/cloned_30s.wav'

# Probe for a GPU separately from the conversion itself. A missing
# `nvidia-smi` binary raises FileNotFoundError (an OSError); that must select
# the CPU device rather than abort the whole conversion.
try:
    gpu_check = sp.run(['nvidia-smi'], stdout=sp.PIPE, stderr=sp.PIPE)
    device = 'cuda:0' if gpu_check.returncode == 0 else 'cpu'
except OSError:
    device = 'cpu'

# NOTE(review): the module path `openvoice_cli.single` is kept as written —
# confirm it against the documented usage of the installed openvoice-cli.
cmd = f"python -m openvoice_cli.single -i {inp} -r {ref} -o {out} -d {device}"
# Log the command that is actually executed (including the chosen device).
print('Running:', cmd)

succeeded = False
try:
    res = sp.run(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE, text=True)
    print('STDOUT:\n', res.stdout)
    print('STDERR:\n', res.stderr)
    # Success requires both a zero exit status and the output file on disk.
    succeeded = res.returncode == 0 and os.path.exists(out)
    if not succeeded:
        print('Exit code:', res.returncode)
except Exception as e:
    print('Error running OpenVoice CLI:', e)

if succeeded:
    print('✅ Conversion command finished — check samples/cloned_30s.wav')
else:
    print('❌ Conversion failed — see the output above')


In [None]:
# 6) Play back original vs cloned
from IPython.display import Audio, display

# Render one labelled audio player per clip, original first.
clips = (
    ('Original (trimmed to 30s):', 'samples/original_30s.wav'),
    ('Cloned (30s):', 'samples/cloned_30s.wav'),
)
for caption, wav_path in clips:
    print(caption)
    display(Audio(wav_path, rate=16000))

# Auto-download cloned file
from google.colab import files as gfiles
gfiles.download('samples/cloned_30s.wav')
