In [3]:
from speech_encoder_v2 import SpeechEncoderV2
from params import *
from pathlib import Path
import torch
from tqdm import tqdm
import os
from data_preprocessing import *
from data_scripts import *

# Notebook-level run configuration, read by the cells below.
params = {
    "run_id": "speech_encoder_1",
    # Raw string: the Windows separators must not be parsed as escape
    # sequences ("\C", "\S", "\d", "\L" are invalid escapes and trigger a
    # SyntaxWarning on recent Python versions). The runtime value is unchanged.
    # NOTE(review): machine-specific absolute path — consider making it configurable.
    "clean_data_root": r"D:\CODING\SpeechEncoder\data\LibriSpeech/train-clean-100",
    "umap_every": 500,
    # A checkpoint is written every `save_every` steps of the training loop.
    "save_every": 500,
    # A separate ".bak" checkpoint is written every `backup_every` steps.
    "backup_every": 5000,
    "vis_every": 100,
    # When False, preprocessing is called with skip_existing=True.
    "force_restart": False,
    "models_dir": Path("models"),
}

# Choose the compute device: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Device the embeddings are moved to before the loss is computed.
loss_device = torch.device("cpu")

def sync(device: torch.device):
    """Wait for pending CUDA work on ``device`` to complete.

    Args:
        device: The torch device to synchronize. Devices whose type is not
            ``"cuda"`` are ignored and the call returns immediately.

    Returns:
        None.
    """
    if device.type != "cuda":
        return
    torch.cuda.synchronize(device)

### Data Preparation & Model Initialization

In [4]:
# Location of the processed dataset, shared by preprocessing and loading.
PROCESSED_DATA_ROOT = "data/processed_data"

# Run the preprocessing pipeline on the raw corpus; skip_existing is turned
# off only when a restart is forced.
preprocess(
    raw_data_root=params["clean_data_root"],
    processed_data_root=PROCESSED_DATA_ROOT,
    skip_existing=not params["force_restart"],
)

# Build the dataset and the batch loader over the processed data.
dataset, loader = load_data(
    processed_root=PROCESSED_DATA_ROOT,
    speakers_per_batch=speakers_per_batch,
    utterances_per_speaker=utterances_per_speaker,
    num_workers=4,
)

Starting preprocessing...
  Raw data source:      D:\CODING\SpeechEncoder\data\LibriSpeech\train-clean-100
  Processed data target: D:\CODING\Voice-Cloning\data\processed_data
Found 251 total speaker directories.
Scanning for existing data to determine processing scope...


Scanning speakers: 100%|██████████| 251/251 [00:00<00:00, 8705.40it/s]

-> Skipping 251 previously processed speakers.
-> No new speakers require processing.
Preprocessing complete.





### Training Execution

In [3]:
# NOTE(review): `model`, `optimizer`, `state_fpath` and `model_dir` are used
# below but are not defined in the visible cells — confirm an earlier cell
# creates them, otherwise this cell fails on a fresh kernel.
init_step = 1
pbar = tqdm(enumerate(loader, init_step), desc="Training")

for step, speaker_batch in pbar:
    # Forward pass
    inputs = torch.from_numpy(speaker_batch.data).float().to(device)
    sync(device)
    embeds = model(inputs)
    sync(device)

    # Loss calculation: regroup the embeddings per speaker and move them to
    # the loss device before computing the loss and the EER.
    embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
    loss, eer = model.loss(embeds_loss)
    sync(loss_device)

    # Backward pass
    model.zero_grad()
    loss.backward()
    model.do_gradient_ops()
    optimizer.step()

    # Progress updates
    pbar.set_postfix(loss=loss.item(), eer=eer)

    # Checkpointing: build the payload once and write it to every destination
    # that is due on this step (both fire whenever `step` is a multiple of
    # both intervals).
    save_due = step % params['save_every'] == 0
    backup_due = step % params['backup_every'] == 0
    if save_due or backup_due:
        checkpoint = {
            # Stored as step + 1 — presumably the step to resume from; confirm
            # against the code that loads the checkpoint.
            "step": step + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }
        if save_due:
            torch.save(checkpoint, state_fpath)
        if backup_due:
            backup_fpath = model_dir / f"encoder_{step:06d}.bak"
            torch.save(checkpoint, backup_fpath)

Training: 0it [00:04, ?it/s]


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)