# Voice Cloning Experiment — Reproducible Notebook
Objective: reproduce a voice cloning pipeline, log errors, track time and GPU usage, and produce a 30s original vs cloned sample.



In [None]:
# Core dependencies (examples; prefer lightweight, well-known repos)
!pip install numpy scipy matplotlib librosa soundfile webrtcvad tqdm
# For deep learning
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Optional: use a maintained voice cloning repo (e.g., Coqui TTS or Real-Time-Voice-Cloning forks)
# Here we clone a stable implementation. Replace with your preferred repo.
!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git /content/voiceclone
%cd /content/voiceclone
!pip install -r requirements.txt


Looking in indexes: https://download.pytorch.org/whl/cu118
fatal: destination path '/content/voiceclone' already exists and is not an empty directory.
/content/voiceclone
Collecting inflect==5.3.0 (from -r requirements.txt (line 1))
  Using cached inflect-5.3.0-py3-none-any.whl.metadata (21 kB)
Collecting librosa==0.8.1 (from -r requirements.txt (line 2))
  Using cached librosa-0.8.1-py3-none-any.whl.metadata (6.8 kB)
Collecting matplotlib==3.5.1 (from -r requirements.txt (line 3))
  Using cached matplotlib-3.5.1.tar.gz (35.3 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.20.3 (from -r requirements.txt (line 4))
  Using cached numpy-1.20.3.zip (7.8 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
[31mERROR: Exception:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Create experiment directory
!mkdir -p /content/drive/MyDrive/voice_cloning_experiment
EXPERIMENT_DIR = "/content/drive/MyDrive/voice_cloning_experiment"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Expected locations of the three pretrained checkpoints (encoder, synthesizer,
# vocoder), all directly under the experiment folder.
ENCODER_CKPT, SYNTHESIZER_CKPT, VOCODER_CKPT = (
    f"{EXPERIMENT_DIR}/{component}.pt"
    for component in ("encoder", "synthesizer", "vocoder")
)

print('Place pretrained checkpoints at:', EXPERIMENT_DIR)
# Download from URLs you trust with wget here, or upload the files to Drive manually.

Place pretrained checkpoints at: /content/drive/MyDrive/voice_cloning_experiment


In [None]:
import time, os, json, traceback
# All run logs live under <EXPERIMENT_DIR>/logs; both logs are JSON-lines files.
LOG_DIR = os.path.join(EXPERIMENT_DIR, "logs")
ERROR_LOG, TIMING_LOG = (
    os.path.join(LOG_DIR, log_name)
    for log_name in ("error_log.jsonl", "timing_log.jsonl")
)
os.makedirs(LOG_DIR, exist_ok=True)

def log_error(stage, err):
    """Append one error record to the JSONL error log and print a short notice.

    Args:
        stage: Label of the pipeline stage that failed (e.g. "train").
        err: The caught exception (any object is accepted; it is stringified).

    The record holds the current time, the stage label, ``str(err)`` and the
    formatted traceback.
    """
    if isinstance(err, BaseException):
        # Format from the exception object itself so the traceback is right
        # even when this is called outside an active `except` block, where
        # traceback.format_exc() would only yield "NoneType: None".
        tb_text = "".join(
            traceback.format_exception(type(err), err, err.__traceback__)
        )
    else:
        # Not an exception instance: fall back to whatever is being handled.
        tb_text = traceback.format_exc()
    entry = {
        "timestamp": time.time(),
        "stage": stage,
        "error": str(err),
        "traceback": tb_text
    }
    with open(ERROR_LOG, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry)+"\n")
    print("Logged error:", stage)

def log_timing(stage, duration_seconds):
    """Append a timing record for `stage` to the JSONL timing log.

    Args:
        stage: Label of the pipeline stage that was timed.
        duration_seconds: Elapsed time to record under the "duration_s" key.
    """
    record = dict(timestamp=time.time(), stage=stage, duration_s=duration_seconds)
    with open(TIMING_LOG, "a") as log_file:
        log_file.write(f"{json.dumps(record)}\n")

In [None]:
# Check GPU availability and specs
import torch, subprocess, json
# Probe CUDA once; use CPU placeholders when no GPU is attached.
gpu_available = torch.cuda.is_available()
if gpu_available:
    gpu_name = torch.cuda.get_device_name(0)
    total_mem = torch.cuda.get_device_properties(0).total_memory
else:
    gpu_name = "CPU"
    total_mem = None
print("GPU available:", gpu_available, "GPU name:", gpu_name, "Total mem:", total_mem)
# Persist the probe result next to the other logs (overwritten on each run).
with open(os.path.join(LOG_DIR, "gpu_info.json"), "w") as f:
    f.write(json.dumps({"gpu_available": gpu_available, "gpu_name": gpu_name, "total_mem": total_mem}))

GPU available: True GPU name: Tesla T4 Total mem: 15828320256


In [None]:
# Assumes you will upload a folder of WAVs for the target speaker and optionally a single original 30s sample to compare.
DATA_DIR = os.path.join(EXPERIMENT_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)
print("Upload your WAV files into:", DATA_DIR)
# example script to list files
import glob
# The preprocessing cell writes proc_<i>.wav back into DATA_DIR, so skip those
# files here; otherwise a re-run would treat earlier outputs as new inputs.
# Sorting makes the proc_<i> numbering reproducible (glob order is arbitrary).
wav_files = sorted(
    path
    for path in glob.glob(os.path.join(DATA_DIR, "*.wav"))
    if not os.path.basename(path).startswith("proc_")
)
print("Found wav files:", len(wav_files))

Upload your WAV files into: /content/drive/MyDrive/voice_cloning_experiment/data
Found wav files: 0


In [None]:
# Example using librosa to load and resample to 22050 Hz, mono
import librosa, soundfile as sf
def preprocess_wav(src_path, dst_path, sr=22050):
    """Load `src_path` as mono audio at `sr` Hz and write it to `dst_path`.

    Args:
        src_path: Audio file to read.
        dst_path: Destination file for the converted audio.
        sr: Sample rate requested from librosa and written to the output.
    """
    # librosa.load returns (samples, rate); only the samples are needed here.
    samples = librosa.load(src_path, sr=sr, mono=True)[0]
    sf.write(dst_path, samples, sr)
# run on all files
preprocess_start = time.time()
for i, src in enumerate(wav_files):
    dst = os.path.join(DATA_DIR, f"proc_{i}.wav")
    try:
        preprocess_wav(src, dst)
    except Exception as e:
        # Keep going: one unreadable file should not stop the whole batch.
        log_error("preprocess_wav", e)
# Record the stage duration like the train and inference cells do, so the
# timing log covers every pipeline stage.
log_timing("preprocess_wav", time.time() - preprocess_start)

In [None]:
# WARNING: full training requires heavy resources. Many experiments use pretrained models + fine-tune or synthesizer only.
# Example pseudocode of fine-tuning the synthesizer (replace with repo-specific training commands).
# This cell is a scaffold: it currently only prints a reminder, but the
# try/except/finally already routes failures to log_error and the elapsed
# time to log_timing under the "train" stage.
start = time.time()
try:
    # Placeholder: replace with actual training command from the cloned repo
    # e.g., !python train_synth.py --data_dir /content/drive/MyDrive/voice_cloning_experiment/data ...
    print("Run the repo-specific training command here; this cell is a placeholder.")
except Exception as e:
    log_error("train", e)
finally:
    # Runs on success and failure alike, so a timing entry is always written.
    log_timing("train", time.time() - start)

Run the repo-specific training command here; this cell is a placeholder.


In [None]:
# Example inference flow (repo-specific): encode speaker, synthesize mel, vocode to waveform.
# This cell is a scaffold: no model is invoked yet. It only prints a reminder,
# while still recording errors and elapsed time under the "inference" stage.
start = time.time()
try:
    # Placeholder pseudocode; replace with repo inference functions.
    # from encoder import inference as enc
    # from synthesizer import synthesize
    # from vocoder import infer_waveform
    print("Run inference to synthesize a 30s sample. Use your text or original sample's content.")
except Exception as e:
    log_error("inference", e)
finally:
    # Runs on success and failure alike, so a timing entry is always written.
    log_timing("inference", time.time() - start)

Run inference to synthesize a 30s sample. Use your text or original sample's content.


In [None]:
# Save generated wav to results/audio_samples/cloned_30s.wav
OUT_DIR = os.path.join(EXPERIMENT_DIR, "results", "audio_samples")
os.makedirs(OUT_DIR, exist_ok=True)
# Example: copy proc_0.wav to cloned sample path for placeholder
# NOTE: until real inference output is wired in, the "cloned" file is a plain
# copy of the first preprocessed input, not synthesized audio.
import shutil
try:
    # Replace with the actual generated file path
    generated_wav = os.path.join(DATA_DIR, "proc_0.wav")
    cloned_path = os.path.join(OUT_DIR, "cloned_30s.wav")
    # Raises if proc_0.wav does not exist (e.g. no WAVs were uploaded); the
    # failure is then recorded under the "save_cloned" stage.
    shutil.copyfile(generated_wav, cloned_path)
    print("Cloned sample saved to:", cloned_path)
except Exception as e:
    log_error("save_cloned", e)

Logged error: save_cloned


In [None]:
# Compute basic objective metrics: SNR, PESQ (if installed), Mel-Cepstral distortion (MCD) stub
import numpy as np
def compute_snr(orig, synth, sr=22050, eps=1e-9):
    """Signal-to-noise ratio, in dB, of `synth` measured against `orig`.

    The difference ``orig - synth`` is treated as noise. The longer input is
    truncated to the shorter one before comparing.

    Args:
        orig: Reference signal: an audio file path, or a 1-D sequence/array
            of samples.
        synth: Signal under test, same accepted forms as `orig`.
        sr: Sample rate used when an input is a path and must be loaded.
        eps: Small constant added to both powers so identical or silent
            signals give a finite result instead of dividing by zero.

    Returns:
        The SNR in decibels as a Python float.

    Raises:
        ValueError: If either signal is empty.
    """
    def _as_signal(source):
        # Paths are decoded with librosa (mono, at `sr`); anything else is
        # taken to already be sample data.
        if isinstance(source, (str, bytes)) or hasattr(source, "__fspath__"):
            import librosa
            samples, _ = librosa.load(source, sr=sr, mono=True)
            return np.asarray(samples, dtype=np.float64)
        return np.asarray(source, dtype=np.float64)

    o = _as_signal(orig)
    s = _as_signal(synth)
    min_len = min(len(o), len(s))
    if min_len == 0:
        raise ValueError("compute_snr needs two non-empty signals")
    o, s = o[:min_len], s[:min_len]
    noise = o - s
    signal_power = float(np.sum(o ** 2))
    noise_power = float(np.sum(noise ** 2))
    # eps belongs inside the ratio: adding it after the division (as before)
    # does not protect against a zero noise power.
    return float(10 * np.log10((signal_power + eps) / (noise_power + eps)))
# Compare the reference clip with the saved "cloned" clip; any failure
# (including a missing file) is recorded under the "objective_eval" stage.
try:
    orig = os.path.join(DATA_DIR, "proc_0.wav")  # replace with true original 30s
    synt = os.path.join(OUT_DIR, "cloned_30s.wav")
    print("SNR (dB):", compute_snr(orig, synt))
except Exception as e:
    log_error("objective_eval", e)


Logged error: objective_eval


  o, _ = librosa.load(orig, sr=22050, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


## Subjective Listening Test

1. Prepare pairs: `original_30s.wav` and `cloned_30s.wav`.
2. Randomize order for listeners; collect judgments for:
   - Naturalness (1-5)
   - Similarity to target speaker (1-5)
   - Intelligibility (1-5)
3. Use at least 5 listeners for a small test. Aggregate mean scores and standard deviation.


In [None]:
# List outputs and create a zip to download
!zip -r /content/voice_cloning_results.zip {EXPERIMENT_DIR}
print("Zipped experiment folder:", "/content/voice_cloning_results.zip")

updating: content/drive/MyDrive/voice_cloning_experiment/ (stored 0%)
updating: content/drive/MyDrive/voice_cloning_experiment/logs/ (stored 0%)
updating: content/drive/MyDrive/voice_cloning_experiment/logs/gpu_info.json (deflated 8%)
updating: content/drive/MyDrive/voice_cloning_experiment/logs/timing_log.jsonl (deflated 67%)
updating: content/drive/MyDrive/voice_cloning_experiment/logs/error_log.jsonl (deflated 90%)
updating: content/drive/MyDrive/voice_cloning_experiment/data/ (stored 0%)
updating: content/drive/MyDrive/voice_cloning_experiment/results/ (stored 0%)
updating: content/drive/MyDrive/voice_cloning_experiment/results/audio_samples/ (stored 0%)
Zipped experiment folder: /content/voice_cloning_results.zip


In [None]:
def run_experiment(cfg):
    """Run the pipeline end to end: probe, preprocess, train, infer, evaluate, package.

    Args:
        cfg: Experiment configuration; this function reads `data_dir`,
            `work_dir`, `eval_text` and `original_wav` from it.

    Returns:
        Whatever `evaluate` returns for the original vs. synthesized audio.

    Raises:
        Exception: Any failure inside the pipeline is logged under the
            "run_experiment" stage and then re-raised.

    NOTE(review): `probe_resources`, `log`, `preprocess`, `train_model`,
    `inference`, `evaluate` and `package_artifacts` are not defined in the
    visible notebook — confirm they are provided elsewhere.
    """
    probe = probe_resources()
    log("probe", probe)
    try:
        preprocess(cfg.data_dir, cfg.work_dir)
        train_cfg = adapt_cfg_for_resources(cfg, probe)
        train_model(train_cfg)
        synth_wav = inference(cfg.eval_text, cfg.work_dir)
        metrics = evaluate(cfg.original_wav, synth_wav)
        package_artifacts(cfg.work_dir)
    except Exception as e:
        log_error("run_experiment", e)
        raise
    # Previously computed and then dropped; hand the result back to the caller.
    return metrics


In [None]:
def adapt_cfg_for_resources(cfg, probe, min_gpu_bytes=12 * 1024**3):
    """Scale the training config down when the GPU has little memory.

    Args:
        cfg: Config object with `batch_size` and `use_mixed_precision`
            attributes. It is modified in place.
        probe: Resource probe exposing `gpu_memory`.
            NOTE(review): assumed to be in bytes, matching the
            `total_memory` value logged by the GPU-info cell — confirm.
        min_gpu_bytes: Memory threshold below which the config is reduced.
            Defaults to 12 GiB (the previous literal, 12*1024**2, was 12 MiB
            and contradicted its own "< 12GB" comment).

    Returns:
        The same `cfg` object, possibly with a quartered batch size (never
        below 1) and mixed precision switched on.
    """
    if probe.gpu_memory < min_gpu_bytes:
        cfg.batch_size = max(1, cfg.batch_size // 4)
        cfg.use_mixed_precision = True
    return cfg

## 9. How to produce the deliverables you requested

### Deliverable checklist & instructions

#### GitHub repo with code

Create the repo locally:

In [None]:
!mkdir voice-cloning-experiment && cd voice-cloning-experiment
!git init
# create files per scaffold (README, colab notebook, src files)
!git add .
!git commit -m "Initial experiment scaffold"
# create remote and push (replace with your own GitHub URL)
!git remote add origin https://github.com/MujahidMalik819061/Voice-Cloning-Assignment.git
!git push -u origin main

mkdir: cannot create directory ‘voice-cloning-experiment’: File exists
Reinitialized existing Git repository in /content/voiceclone/.git/
Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@905b31e1d3e5.(none)')
error: remote origin already exists.
error: src refspec main does not match any
[31merror: failed to push some refs to 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
[m

In [None]:
import json
error_log_path = os.path.join(LOG_DIR, 'error_log.jsonl')
entries = []
# A run that never logged an error has no log file; treat that as "no
# entries" instead of failing with FileNotFoundError.
if os.path.exists(error_log_path):
    with open(error_log_path) as f:
        # Skip blank lines so a stray newline does not break json.loads.
        entries = [json.loads(line) for line in f if line.strip()]
# TODO: write summary to docs/error_summary.md (not implemented in this cell)

## 13. Example `src/inference.py` (skeleton)

In [None]:
import torch
from pathlib import Path

def load_models(encoder_path, syn_path, voc_path, device="cuda"):
    """Load the encoder, synthesizer and vocoder checkpoints onto `device`.

    This is a skeleton: each checkpoint is read with `torch.load` and
    returned as-is; repo-specific model construction is not done here.

    Args:
        encoder_path: Path to the encoder checkpoint.
        syn_path: Path to the synthesizer checkpoint.
        voc_path: Path to the vocoder checkpoint.
        device: Target device for `map_location`. A CUDA device is replaced
            by "cpu" (with a warning) when CUDA is not available.

    Returns:
        Tuple `(encoder, synthesizer, vocoder)` of the loaded objects.
    """
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        import warnings
        warnings.warn("CUDA is not available; loading checkpoints on CPU instead.")
        device = "cpu"
    # SECURITY: torch.load unpickles the file, which can execute code; only
    # load checkpoints obtained from a source you trust.
    # repo-specific loading; this is a skeleton.
    encoder = torch.load(encoder_path, map_location=device)
    synthesizer = torch.load(syn_path, map_location=device)
    vocoder = torch.load(voc_path, map_location=device)
    return encoder, synthesizer, vocoder

def synthesize_text(encoder, synthesizer, vocoder, speaker_wav, text, out_path):
    """Skeleton for text-to-speech in a target speaker's voice (not implemented).

    Intended flow, still to be filled in:
      1. Encode the speaker.
      2. Synthesize a mel spectrogram from the text and speaker embedding.
      3. Vocode the mel spectrogram to a waveform.

    Currently performs no work and returns None.
    """
    return None