In [1]:
from speech_encoder_v3 import SpeechEncoderV3
from params import *
from pathlib import Path
import torch
import utils
import visualisations
from data_processor import *

In [2]:
def sync(device: torch.device):
    """Wait for queued CUDA work on `device` to finish; do nothing for other device types.

    CUDA operations are asynchronous, so this is called around the steps being
    profiled to make their timings meaningful.
    """
    if device.type != "cuda":
        return
    torch.cuda.synchronize(device)

In [3]:
# Run configuration read by the cells below.
# "models_dir" used to be listed twice (once as a str, once as a Path); only the
# later Path value ever took effect, so it is now declared a single time.
params = {
    "run_id": "speech_encoder_transformer_v2",  # A unique identifier for this training run
    "clean_data_root": "D:/CODING/SpeechEncoder/data/processed_audio",  # Path to LibriSpeech dataset
    "models_dir": Path("models"),  # Directory to save model checkpoints (a Path, so it can be joined with "/")
    "umap_every": 500,  # Update UMAP visualization every 500 steps
    "save_every": 500,  # Save model checkpoint every 500 steps
    "backup_every": 5000,  # Create a backup copy of the model every 5000 steps
    "vis_every": 100,  # Update visualization metrics every 100 steps
    "force_restart": False,  # Whether to restart training from scratch
    "visdom_server": "http://localhost",  # Visdom server address for visualization
    "no_visdom": False,  # Whether to disable Visdom visualization
}

In [4]:
# Dataset rooted at the preprocessed-audio directory.
# NOTE(review): this path differs from params["clean_data_root"] — confirm which one is intended.
dataset = SpeakerVerificationDataset(
    Path("D:/CODING/SpeechEncoder/data/his_processed_audio")
)

# The two positional integers are 40 and 10; the first batch printed below has
# 400 rows, i.e. 40 * 10.
# NOTE(review): the training loop reshapes with speakers_per_batch and
# utterances_per_speaker — verify those names hold the same two values.
loader = SpeakerVerificationDataLoader(dataset, 40, 10, num_workers=0)

In [5]:
# Pull just the first batch from the loader and report the shape of its data.
batch = next(iter(loader), None)
if batch is not None:
    print(batch.data.shape)

(400, 160, 40)


In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [7]:
# Select the compute device again so this cell does not depend on the previous one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): loss_device is the CPU, yet the constructor below receives `device`
# for both arguments — confirm which device SpeechEncoderV3 expects for the loss.
loss_device = torch.device("cpu")

# Build the encoder and move it onto the compute device
model = SpeechEncoderV3(device, device)
model.to(device)

# Uncomment the lines below to load the model from a checkpoint

# checkpoints = torch.load("models/speech_encoder_transformer/encoder(0.096).pt")
# model.load_state_dict(checkpoints['model_state'])

# Adam over every model parameter; training steps are numbered from init_step.
# learning_rate_init is not defined in this notebook (presumably comes from `params` — verify).
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
init_step = 1

# Everything for this run is stored under <models_dir>/<run_id>
model_dir = params["models_dir"].joinpath(params["run_id"])
model_dir.mkdir(parents=True, exist_ok=True)
state_fpath = model_dir.joinpath("encoder.pt")

In [None]:
import sys
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

log_dir = params['models_dir'] / "logs"

# TensorBoard event files are written under <models_dir>/logs
writer = SummaryWriter(log_dir=log_dir)

# The bar's total is len(loader); steps are numbered starting at init_step
total_steps = len(loader)
progress_bar = tqdm(enumerate(loader, init_step), total=total_steps, desc="Training", unit="step")


def _save_checkpoint(fpath, step):
    """Save the model and optimizer state dicts to `fpath`, storing `step + 1` under the "step" key."""
    torch.save({
        "step": step + 1,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }, fpath)


model.train()

# try/finally so the TensorBoard writer and the progress bar are closed even when
# training is stopped by an interrupt or an error.
try:
    for step, speaker_batch in progress_bar:
        # Forward pass
        inputs = torch.from_numpy(speaker_batch.data).to(device)
        sync(device)
        embeds = model(inputs)
        sync(device)
        # NOTE(review): assumes speakers_per_batch * utterances_per_speaker equals the
        # number of utterances the loader puts in a batch — confirm against the loader cell.
        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
        loss, eer = model.loss(embeds_loss)
        sync(loss_device)

        # Backward pass
        model.zero_grad()
        loss.backward()

        model.do_gradient_ops()
        optimizer.step()

        # Read the scalar once and reuse it for logging, the progress bar and the filename
        loss_value = loss.item()

        # Log scalars to TensorBoard
        writer.add_scalar("Loss", loss_value, step)
        writer.add_scalar("EER", eer, step)

        # Show the current loss and EER on the progress bar
        progress_bar.set_postfix(loss=loss_value, eer=eer)

        # Save the model every 'save_every' steps with a unique filename that includes the step and loss
        if params['save_every'] != 0 and step % params['save_every'] == 0:
            filename = model_dir / f"encoder_{step:06d}_loss_{loss_value:.4f}.pt"
            # tqdm.write keeps the message from breaking the progress bar line
            tqdm.write("Saving the model (step %d) to %s" % (step, filename))
            _save_checkpoint(filename, step)

        # Make a backup every 'backup_every' steps
        if params['backup_every'] != 0 and step % params['backup_every'] == 0:
            tqdm.write("Making a backup (step %d)" % step)
            backup_fpath = model_dir / f"encoder_{step:06d}.bak"
            _save_checkpoint(backup_fpath, step)
finally:
    progress_bar.close()
    writer.close()


Training:   0%|          | 500/250000000 [08:35<38898:50:57,  1.79step/s, eer=0.19, loss=2.43] 

Saving the model (step 500) to models\speech_encoder_transformer_v2\encoder_000500_loss_2.4271.pt


Training:   0%|          | 1000/250000000 [13:07<38161:31:09,  1.82step/s, eer=0.181, loss=2.29]

Saving the model (step 1000) to models\speech_encoder_transformer_v2\encoder_001000_loss_2.2906.pt


Training:   0%|          | 1500/250000000 [17:39<38302:35:58,  1.81step/s, eer=0.166, loss=2.24]

Saving the model (step 1500) to models\speech_encoder_transformer_v2\encoder_001500_loss_2.2393.pt


Training:   0%|          | 2000/250000000 [22:09<37789:45:38,  1.84step/s, eer=0.142, loss=2.11]

Saving the model (step 2000) to models\speech_encoder_transformer_v2\encoder_002000_loss_2.1103.pt


Training:   0%|          | 2500/250000000 [26:38<37495:36:26,  1.85step/s, eer=0.167, loss=2.12]

Saving the model (step 2500) to models\speech_encoder_transformer_v2\encoder_002500_loss_2.1182.pt


Training:   0%|          | 3000/250000000 [31:07<37865:01:33,  1.83step/s, eer=0.115, loss=1.72]

Saving the model (step 3000) to models\speech_encoder_transformer_v2\encoder_003000_loss_1.7168.pt


Training:   0%|          | 3185/250000000 [32:46<37456:08:40,  1.85step/s, eer=0.144, loss=1.94] 