# Style-Preserving Speech-to-Speech Translation Experiment

This notebook runs the experiment to determine the minimal duration of speaker embeddings required to effectively clone a speaker's voice across languages.

## 1. Setup Environment
Install necessary dependencies if running on Google Colab.

In [12]:
#only if needed, clear all files except experiment.ipynb
# This command will remove all files and folders in the current directory except "experiment.ipynb"
import os

for fname in os.listdir():
    if fname != "experiment.ipynb":
        if os.path.isdir(fname):
            import shutil
            shutil.rmtree(fname)
        else:
            os.remove(fname)







In [13]:
# Cell to refresh code from GitHub
import os

# Navigate to the repo directory
if os.path.exists("CS479-SpeakerEmbeddings"):
    os.chdir("CS479-SpeakerEmbeddings")
    !git pull
else:
    !git clone https://github.com/NathanAsayDong/CS479-SpeakerEmbeddings.git
    os.chdir("CS479-SpeakerEmbeddings")

# Optional: Reload modules if you've already imported them
import sys
import importlib

# List of your custom modules to reload
modules_to_reload = [
    "common_voice_dataset",
    "setup_experiment",
    "run_experiment",
    "asr_service",
    "translation_service",
    "tts_service",
    "embedding_service",
    "synthetic_data_service",
    "enums"
]

for module_name in modules_to_reload:
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])
        print(f"Reloaded {module_name}")

Cloning into 'CS479-SpeakerEmbeddings'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 77 (delta 36), reused 63 (delta 22), pack-reused 0 (from 0)[K
Receiving objects: 100% (77/77), 599.04 KiB | 16.64 MiB/s, done.
Resolving deltas: 100% (36/36), done.
Reloaded common_voice_dataset
Reloaded setup_experiment
Reloaded run_experiment


IndentationError: unexpected indent (asr_service.py, line 16)

In [None]:

# repo_dir = "CS479-SpeakerEmbeddings"
# if os.path.exists(repo_dir):
#     shutil.rmtree(repo_dir)
# !git clone https://github.com/NathanAsayDong/CS479-SpeakerEmbeddings.git
# %cd CS479-SpeakerEmbeddings
# !ls

Cloning into 'CS479-SpeakerEmbeddings'...
remote: Enumerating objects: 71, done.[K
remote: Counting objects: 100% (71/71), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 71 (delta 31), reused 58 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (71/71), 590.78 KiB | 13.74 MiB/s, done.
Resolving deltas: 100% (31/31), done.
/content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings
asr_service.py		 peoples_speech_dataset.py  setup_experiment.py
common_voice_dataset.py  ProjectOutline.pdf	    synthetic_data_service.py
embedding_service.py	 __pycache__		    tmp_model
enums.py		 readMe			    translation_service.py
experiment.ipynb	 requirements.txt	    tts_service.py
libri_speech_dataset.py  run_experiment.py
main.py			 Samples


In [None]:
!pip install torch transformers speechbrain soundfile librosa openai-whisper accelerate sentencepiece pydantic torchcodec datasets kagglehub[pandas-datasets]
!pip install sounddevice
!sudo apt-get install libportaudio2

Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchcodec
  Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.16-py3-none-any.whl.metadata (25 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel_yaml_clib-0.2.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17

## 2. Import Modules
Import the experiment setup and runner classes.

In [None]:
import os
import sys

# Add current directory to path if needed
sys.path.append(os.getcwd())

from enums import Language
from setup_experiment import ExperimentSetup
from run_experiment import ExperimentRunner

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
  self.setter(val)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


## 3. Configure Experiment
Define the parameters for the experiment: source/target languages and reference durations to test.

In [None]:
SOURCE_LANG = Language.ENGLISH
TARGET_LANG = Language.SPANISH
DURATIONS = [5.0, 10.0, 15.0, 20.0, 30.0]
NUM_SAMPLES_PER_DURATION = 5 # Number of unique speakers to test
SEED = 42

## 4. Prepare Data
This step:
1. Downloads/Loads Common Voice dataset via KaggleHub.
2. Selects `NUM_SPEAKERS` with sufficient data.
3. Creates concatenated reference audio files for each duration.
4. Generates a manifest for the experiment run.

In [None]:
!ls

asr_service.py		 peoples_speech_dataset.py  setup_experiment.py
common_voice_dataset.py  ProjectOutline.pdf	    synthetic_data_service.py
embedding_service.py	 __pycache__		    tmp_model
enums.py		 readMe			    translation_service.py
experiment.ipynb	 requirements.txt	    tts_service.py
libri_speech_dataset.py  run_experiment.py
main.py			 Samples


In [None]:
setup = ExperimentSetup(
    source_language=SOURCE_LANG,
    target_language=TARGET_LANG,
    reference_durations=DURATIONS,
    seed=SEED
)

# Prepare the manifest
manifest = setup.prepare_data(num_samples_per_duration=NUM_SAMPLES_PER_DURATION)

print(f"Manifest ready with {len(manifest)} speakers.")
print("Sample Item:", manifest[0] if manifest else "No data")

Preparing experiment data: 5 samples for each of [5.0, 10.0, 15.0, 20.0, 30.0]s durations...
Loading Common Voice dataset for language 'en'...
Using Colab cache for faster access to the 'common-voice' dataset.
Dataset path: /kaggle/input/common-voice
Searching for language 'en' in /kaggle/input/common-voice
Found flattened dataset structure at /kaggle/input/common-voice
Loaded 4076 records for en/dev
Columns: ['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'duration', 'path', 'sentence']
Manifest ready with 25 speakers.
Sample Item: {'sample_id': 'sample_1', 'test_input_path': '/kaggle/input/common-voice/cv-valid-dev/cv-valid-dev/sample-001749.mp3', 'test_input_text': 'and in that way the months passed', 'reference_path': 'experiment_data/sample_1/ref_5s.wav', 'target_duration': 5.0}


## 5. Run Experiment
Execute the pipeline for each speaker and duration:
1. Extract ground truth embedding (original speaker).
2. Translate source text to Spanish.
3. Synthesize Spanish speech using the reference audio (5s, 10s, etc.) for style.
4. Compute Cosine Similarity between ground truth and output embeddings.

In [None]:
runner = ExperimentRunner()
runner.run(manifest)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/hyperparams.yaml' -> '/content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


model.safetensors:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_model.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/embedding_model.ckpt' -> '/content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/mean_var_norm_emb.ckpt' -> '/content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/classifier.ckpt' -> '/content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/label_encoder.txt' -> '/content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /content/CS479-SpeakerEmbeddings/CS479-SpeakerEmbeddings/tmp_model/mean_var_norm_emb.ckp

Starting experiment run...
Running: Sample sample_1 | Ref Duration: 5.0s
  -> Error: The first dimension of speaker_embeddings must be either 1 or the same as batch size.
Running: Sample sample_1 | Ref Duration: 10.0s
  -> Error: The first dimension of speaker_embeddings must be either 1 or the same as batch size.
Running: Sample sample_1 | Ref Duration: 15.0s
  -> Error: The first dimension of speaker_embeddings must be either 1 or the same as batch size.
Running: Sample sample_1 | Ref Duration: 20.0s
  -> Error: The first dimension of speaker_embeddings must be either 1 or the same as batch size.
Running: Sample sample_1 | Ref Duration: 30.0s
  -> Error: The first dimension of speaker_embeddings must be either 1 or the same as batch size.
Running: Sample sample_2 | Ref Duration: 5.0s
  -> Error: The first dimension of speaker_embeddings must be either 1 or the same as batch size.
Running: Sample sample_2 | Ref Duration: 10.0s
  -> Error: The first dimension of speaker_embeddings must

## 6. Analyze Results
Save and inspect the results.

In [None]:
runner.save_results("experiment_results.csv")

import pandas as pd
results_df = pd.read_csv("experiment_results.csv")

# Display average similarity score per duration
print("\nAverage Similarity Scores by Duration:")
print(results_df.groupby("duration")["similarity_score"].mean())

results_df.head(10)

Results saved to experiment_results.csv


EmptyDataError: No columns to parse from file

## 7. Real-World Demo
Record your own voice, translate it, and check the similarity score.
NOTE: This requires a microphone. If running on a remote Colab kernel without audio forwarding, this might not work directly.

In [None]:
# Real-World Test
from asr_service import ASRService
from translation_service import TranslationService
from tts_service import TTSService
from embedding_service import EmbeddingService
from enums import Language
import torch.nn.functional as F

def run_live_demo(duration=10):
    print(f"--- Live Demo (Enrollment: {duration}s) ---")
    
    # Initialize services
    asr = ASRService()
    translator = TranslationService()
    tts = TTSService()
    embedder = EmbeddingService()
    
    # 1. Record Input
    try:
        # Step 1: Listen and Transcribe
        print("\nRecording... (speak now)")
        english_text, source_audio_path = asr.listen_transcribe(duration=5)
        print(f"Transcribed: {english_text}")
        
        # Step 2: Record Enrollment (Reference)
        # Ideally we use the same audio if it's long enough, but let's record a dedicated style clip
        # input("Press Enter to record style enrollment (speak clearly)...")
        # ref_path = asr.record_audio(duration=duration, file_path="demo_ref.wav")
        
        # Simpler: Just use the input audio itself as reference (Zero-Shot on self)
        ref_path = source_audio_path

        # 2. Translate
        spanish_text = translator.translate(english_text, target_language=Language.SPANISH)
        print(f"Translated: {spanish_text}")

        # 3. Synthesize
        output_path = "demo_output.wav"
        tts.synthesize(spanish_text, output_path, ref_path)
        print(f"Synthesized audio saved to {output_path}")

        # 4. Evaluate Similarity
        gt_embedding = embedder.extract_embedding(ref_path)
        out_embedding = embedder.extract_embedding(output_path)
        
        if gt_embedding.dim() == 1: gt_embedding = gt_embedding.unsqueeze(0)
        if out_embedding.dim() == 1: out_embedding = out_embedding.unsqueeze(0)
            
        score = F.cosine_similarity(gt_embedding, out_embedding).item()
        print(f"Speaker Similarity Score: {score:.4f}")
        
        # Playback (if in Colab/Jupyter)
        from IPython.display import Audio, display
        print("Original:")
        display(Audio(source_audio_path))
        print("Synthesized (Spanish):")
        display(Audio(output_path))
        
    except Exception as e:
        print(f"Error during demo: {e}")
        print("Note: Microphone recording might fail on remote servers without audio forwarding.")

# run_live_demo(duration=10)
# To run this on Colab, you'd typically need a Javascript helper to record audio from the browser,
# as 'sounddevice' tries to access the server's mic (which doesn't exist).
# For now, this code works if running locally.