Prerequisite: the `openvoice` conda environment must already be set up as described in the OpenVoice documentation. The commands below activate it and install the additional packages this notebook needs.
```
conda init
conda activate openvoice
pip install facenet-pytorch==2.6.0
pip install torch==2.7.1+cu118 torchvision==0.22.1+cu118 torchaudio==2.7.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install ipykernel
```

In [44]:
import os
import sys

# Make the sibling OpenVoice checkout importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run in the same kernel.
openvoice_root = os.path.abspath("../OpenVoice")
if openvoice_root not in sys.path:
    sys.path.append(openvoice_root)

In [45]:
import torch
from facenet_pytorch import InceptionResnetV1
from PIL import Image
import torchvision.transforms as T
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from melo.api import TTS

In [46]:
import nltk

# Fetch the English perceptron POS-tagger data into the local nltk_data folder.
# Presumably required by the TTS text front-end used later (TODO confirm).
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ExoHorizon\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [47]:
# Device setup: prefer the GPU when CUDA is usable, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [48]:
# Configuration

# Input: the face image the voice is generated from.
image_path = "e2e_example_2.jpg"

# Synthesis settings: sentence to speak and the speed passed to the TTS model.
text = "Did you ever hear a folk tale about a giant turtle?"
speed = 1.0

# Output locations; the directory is created if it does not exist yet.
output_dir = './'
os.makedirs(output_dir, exist_ok=True)
src_path = os.path.join(output_dir, 'tmp.wav')  # intermediate TTS audio, input to the converter
output_path = os.path.join(output_dir, 'e2e_example_2.wav')  # final converted audio

### FaceNet Section

In [49]:
# Load FaceNet model: InceptionResnetV1 with the 'vggface2' pretrained weights,
# moved to the selected device and switched to inference (eval) mode.
facenet = InceptionResnetV1(pretrained='vggface2')
facenet = facenet.to(device).eval()

In [50]:
# Preprocessing for aligned faces: resize to 160x160, convert to a tensor and
# normalise every channel with mean 0.5 / std 0.5. No face detection or
# cropping happens here, so the input is expected to be an aligned face already.
preprocess = T.Compose(
    [
        T.Resize((160, 160)),
        T.ToTensor(),
        T.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

In [51]:
def get_image_embedding(image_path):
    """Load an image and return its L2-normalised FaceNet embedding.

    The image is converted to RGB, passed through the notebook-level
    ``preprocess`` pipeline and embedded with the notebook-level ``facenet``
    network on ``device``.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The embedding as a CPU tensor with the batch dimension removed
        (512-d for the FaceNet model loaded in this notebook).
    """
    # Open via a context manager so the underlying file handle is released
    # deterministically; convert() returns an independent in-memory image
    # that stays valid after the file is closed.
    with Image.open(image_path) as raw_img:
        img = raw_img.convert("RGB")

    # Add a batch dimension of 1 because the network expects batched input.
    img_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = facenet(img_tensor)
        # Unit-length embedding along the feature dimension.
        embedding = torch.nn.functional.normalize(embedding, dim=1)
    return embedding.squeeze(0).cpu()

In [52]:
# Embed the configured face image with FaceNet and print a short preview
image_emb = get_image_embedding(image_path)
print("Image Embedding shape:", image_emb.size())
print("First 6 values:", image_emb[0:6])

Image Embedding shape: torch.Size([512])
First 6 values: tensor([ 0.0293, -0.0276, -0.0313,  0.0459, -0.0281,  0.0789])


### Face2Voice Section

In [53]:
# Load the TorchScript Face2Voice network onto the selected device and put it
# in inference mode (eval() returns the module itself).
model_path = "../05_final_models/face2voice_S_D_BN_torchscript.pt"
model = torch.jit.load(model_path, map_location=device).eval()
model  # last expression: display the module structure

RecursiveScriptModule(
  original_name=MLP
  (net): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Linear)
    (1): RecursiveScriptModule(original_name=BatchNorm1d)
    (2): RecursiveScriptModule(original_name=ReLU)
    (3): RecursiveScriptModule(original_name=Dropout)
    (4): RecursiveScriptModule(original_name=Linear)
    (5): RecursiveScriptModule(original_name=BatchNorm1d)
    (6): RecursiveScriptModule(original_name=ReLU)
    (7): RecursiveScriptModule(original_name=Dropout)
    (8): RecursiveScriptModule(original_name=Linear)
  )
)

In [54]:
def get_speaker_embedding(image_embedding, face2voice_model=None):
    """
    Map FaceNet image embedding(s) to speaker embedding(s) with Face2Voice.

    Args:
        image_embedding: A single embedding of shape (D,) or a batch of
            shape (N, D). In this notebook D is 512 (FaceNet output).
        face2voice_model: Network to run. Defaults to the notebook-level
            ``model``. Pass the Face2Voice network explicitly if ``model``
            may have been rebound since it was loaded (a later cell in this
            notebook reuses the name for the TTS model).

    Returns:
        CPU tensor of shape (out_dim,) for a single input, or (N, out_dim)
        for a batch. In this notebook out_dim is 256.
    """
    net = model if face2voice_model is None else face2voice_model

    embedding = image_embedding.to(device).float()
    # The network expects batched input; add a batch axis only when the
    # caller passed a single vector, and remove it again afterwards.
    is_single = embedding.dim() == 1
    if is_single:
        embedding = embedding.unsqueeze(0)

    with torch.no_grad():
        speaker_embedding = net(embedding)

    if is_single:
        speaker_embedding = speaker_embedding.squeeze(0)
    return speaker_embedding.cpu()

In [55]:
# Map the FaceNet embedding to a speaker embedding with Face2Voice and print a short preview
speaker_emb = get_speaker_embedding(image_emb)
print("Speaker embedding shape:", speaker_emb.size())
print("First 6 values:", speaker_emb[0:6])

Speaker embedding shape: torch.Size([256])
First 6 values: tensor([-0.1036,  0.0608, -0.1813,  0.4126,  0.6130,  0.6571])


In [56]:
# Reshape the speaker embedding to (1, 256, 1) for OpenVoiceV2 and move it
# back to the inference device in one step, then report the final shape.
speaker_emb = speaker_emb.view(1, 256, 1).to(device)
print("Speaker embedding shape:", speaker_emb.shape)

Speaker embedding shape: torch.Size([1, 256, 1])


### OpenVoiceV2 Section

In [57]:
# Initialize ToneColorConverter from the OpenVoice v2 converter checkpoint:
# build it from the config file, then load the trained weights.
ckpt_converter = 'checkpoints_v2/converter'
tone_color_converter = ToneColorConverter(
    f'../OpenVoice/{ckpt_converter}/config.json',
    device=device,
)
tone_color_converter.load_ckpt(f'../OpenVoice/{ckpt_converter}/checkpoint.pth')

Loaded checkpoint '../OpenVoice/checkpoints_v2/converter/checkpoint.pth'
missing/unexpected keys: [] []


In [58]:
# Initialize English TTS model and read its speaker-name -> id table.
# NOTE(review): this rebinds the global `model`, which until now referred to
# the Face2Voice network; get_speaker_embedding reads that global, so calling
# it again after this cell would run the TTS object instead.
language = 'EN'
model = TTS(device=device, language=language)
speaker_ids = model.hps.data.spk2id

In [59]:
# Use the first English base speaker: keep its id for synthesis, then turn the
# key into the lowercase, hyphenated form used for the embedding file name.
speaker_key = list(speaker_ids.keys())[0]
speaker_id = speaker_ids[speaker_key]
speaker_key = speaker_key.replace('_', '-').lower()

In [60]:
# Load the stored embedding of the chosen base speaker onto the inference device
source_se = torch.load(
    f'../OpenVoice/checkpoints_v2/base_speakers/ses/{speaker_key}.pth',
    map_location=device,
)

In [61]:
# Optional workaround for MPS on CPU: when inference runs on the CPU but torch
# reports the MPS backend as available, make torch report it as unavailable
# for the rest of the session.
if device == 'cpu' and torch.backends.mps.is_available():
    torch.backends.mps.is_available = lambda: False

In [62]:
# Generate reference audio using MeloTTS: speak `text` with the base speaker
# and write the result to `src_path`, the input of the tone color converter.
model.tts_to_file(
    text,
    speaker_id,
    src_path,
    speed=speed,
)

 > Text split to sentences.
Did you ever hear a folk tale about a giant turtle?


100%|██████████| 1/1 [00:00<00:00,  6.01it/s]


In [63]:
# Apply tone color conversion: re-voice the base-speaker audio from `src_se`
# to the Face2Voice speaker embedding and write the result to `output_path`.
encode_message = "@MyShell"  # forwarded as `message` (presumably a watermark string; confirm)
tone_color_converter.convert(
    audio_src_path=src_path,
    output_path=output_path,
    src_se=source_se,
    tgt_se=speaker_emb,
    message=encode_message,
)