In [1]:
from speech_encoder_v2 import SpeechEncoderV2
from params import *
from pathlib import Path
import torch
import utils
import visualisations
from data_processor import *

### Setting Up Profiler and Selecting the appropriate Device

In [2]:
def sync(device: torch.device):
    """Wait for queued CUDA work on ``device`` to finish.

    CUDA operations are launched asynchronously, so a synchronisation point
    is needed for profiling to be correct. For any non-CUDA device this
    function does nothing.
    """
    if device.type != "cuda":
        return
    torch.cuda.synchronize(device)

### Setting up the Parameters

In [3]:
# Run configuration for this notebook. Each key appears exactly once: a dict
# literal silently keeps only the last value of a repeated key.
params = {
    "run_id": "speech_encoder_transformer",  # A unique identifier for this training run
    "clean_data_root": "D:/CODING/SpeechEncoder/data/processed_audio",  # Path to LibriSpeech dataset
    "models_dir": Path("models"),  # Directory to save model checkpoints (a Path, so it can be joined with `/`)
    "umap_every": 500,  # Update UMAP visualization every 500 steps
    "save_every": 500,  # Save model checkpoint every 500 steps
    "backup_every": 5000,  # Create a backup copy of the model every 5000 steps
    "vis_every": 100,  # Update visualization metrics every 100 steps
    "force_restart": False,  # Whether to restart training from scratch
    "visdom_server": "http://localhost",  # Visdom server address for visualization
    "no_visdom": False,  # Whether to disable Visdom visualization
}

### Loading the data

In [4]:
# Build the dataset from the audio directory and wrap it in the batching loader.
dataset = SpeakerVerificationDataset(
    Path("D:/CODING/SpeechEncoder/data/his_processed_audio")
)
# NOTE(review): 40 and 10 are presumably speakers-per-batch and utterances-per-speaker
# (the shape check in the next cell printed 400 = 40 * 10 rows) — confirm they match the
# `speakers_per_batch` / `utterances_per_speaker` values used by the reshape in the training cell.
loader = SpeakerVerificationDataLoader(dataset, 40, 10, num_workers=0)

In [6]:
# Pull a single batch from the loader and show the shape of its data array as a sanity check.
batch = next(iter(loader), None)
if batch is not None:
    print(batch.data.shape)

(400, 160, 40)


### Initializing the model

In [5]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [6]:
# Device used for the forward pass: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The training cell moves the embeddings to this device before computing the loss.
loss_device = torch.device("cpu")

# Create the model and the optimizer
# NOTE(review): both constructor arguments are `device`; `loss_device` is not passed even
# though the training cell moves the embeddings to it before calling `model.loss`.
# Confirm against SpeechEncoderV2's signature which one is intended.
model = SpeechEncoderV2(device, device)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
init_step = 1

# Configure file path for the model
model_dir = params['models_dir'] / params['run_id']
model_dir.mkdir(exist_ok=True, parents=True)
state_fpath = model_dir / "encoder.pt"

# Resume from the checkpoint written by the training cell unless a restart is forced.
# Without this, `force_restart` has no effect and every run starts at step 1 and
# overwrites the previously saved model.
if not params['force_restart'] and state_fpath.exists():
    print("Found existing model \"%s\", loading it and resuming training." % state_fpath)
    checkpoint = torch.load(state_fpath, map_location=device)
    # The checkpoint stores the step to resume from, plus both state dicts.
    init_step = checkpoint["step"]
    model.load_state_dict(checkpoint["model_state"])
    optimizer.load_state_dict(checkpoint["optimizer_state"])
else:
    print("Starting the training from scratch.")



### Training the model

In [None]:
import sys
from tqdm import tqdm  # Import tqdm for the progress bar

def _save_checkpoint(fpath, step, model, optimizer):
    """Write the training state to ``fpath``.

    The stored ``"step"`` is ``step + 1`` (the first step that has not been run
    yet), alongside the model and optimizer state dicts.
    """
    torch.save({
        "step": step + 1,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }, fpath)


# Initialize the progress bar
total_steps = len(loader)  # Assuming `loader` has a defined length
progress_bar = tqdm(enumerate(loader, init_step), total=total_steps, desc="Training", unit="step")

for step, speaker_batch in progress_bar:
    # Forward pass. The sync calls wait for asynchronous CUDA work so that
    # per-stage timings are meaningful.
    inputs = torch.from_numpy(speaker_batch.data).to(device)
    sync(device)
    embeds = model(inputs)
    sync(device)
    # Regroup the flat batch of embeddings as (speakers, utterances, features) and
    # move it to the device the loss is computed on.
    embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
    loss, eer = model.loss(embeds_loss)
    sync(loss_device)

    # Backward pass
    model.zero_grad()
    loss.backward()
    # Model-specific gradient adjustments (defined on the model class, not visible here).
    model.do_gradient_ops()
    optimizer.step()

    # Update the progress bar with loss and EER information (once per step).
    progress_bar.set_postfix(loss=loss.item(), eer=eer)

    # Overwrite the latest version of the model
    if params['save_every'] != 0 and step % params['save_every'] == 0:
        print("Saving the model (step %d)" % step)
        _save_checkpoint(state_fpath, step, model, optimizer)

    # Make a backup
    if params['backup_every'] != 0 and step % params['backup_every'] == 0:
        print("Making a backup (step %d)" % step)
        backup_fpath = model_dir / f"encoder_{step:06d}.bak"
        _save_checkpoint(backup_fpath, step, model, optimizer)


Training:   0%|          | 500/250000000 [05:43<38995:14:32,  1.78step/s, eer=0.114, loss=1.59] 

Saving the model (step 500)


Training:   0%|          | 1000/250000000 [10:20<38546:25:14,  1.80step/s, eer=0.095, loss=1.27]

Saving the model (step 1000)


Training:   0%|          | 1500/250000000 [14:55<38611:15:07,  1.80step/s, eer=0.0844, loss=1.15] 

Saving the model (step 1500)


Training:   0%|          | 2000/250000000 [19:31<38864:52:42,  1.79step/s, eer=0.0642, loss=0.971]

Saving the model (step 2000)


Training:   0%|          | 2500/250000000 [24:05<38452:03:44,  1.81step/s, eer=0.0525, loss=0.777]

Saving the model (step 2500)


Training:   0%|          | 3000/250000000 [28:40<38289:58:23,  1.81step/s, eer=0.0725, loss=1.02] 

Saving the model (step 3000)


Training:   0%|          | 3500/250000000 [33:15<38017:16:22,  1.83step/s, eer=0.0549, loss=0.758]

Saving the model (step 3500)


Training:   0%|          | 4000/250000000 [37:53<38742:03:30,  1.79step/s, eer=0.0631, loss=0.889]

Saving the model (step 4000)


Training:   0%|          | 4500/250000000 [42:39<40739:33:52,  1.70step/s, eer=0.0567, loss=0.788]

Saving the model (step 4500)


Training:   0%|          | 5000/250000000 [47:26<40679:28:04,  1.71step/s, eer=0.0525, loss=0.758]

Saving the model (step 5000)
Making a backup (step 5000)


Training:   0%|          | 5500/250000000 [52:11<38566:37:24,  1.80step/s, eer=0.0475, loss=0.685]

Saving the model (step 5500)


Training:   0%|          | 6000/250000000 [56:59<40754:19:35,  1.70step/s, eer=0.0398, loss=0.516]

Saving the model (step 6000)


Training:   0%|          | 6500/250000000 [1:01:47<40045:29:26,  1.73step/s, eer=0.05, loss=0.701]  

Saving the model (step 6500)


Training:   0%|          | 7000/250000000 [1:06:31<39724:07:21,  1.75step/s, eer=0.045, loss=0.607] 

Saving the model (step 7000)


Training:   0%|          | 7500/250000000 [1:11:14<40247:54:18,  1.73step/s, eer=0.0396, loss=0.521]

Saving the model (step 7500)


Training:   0%|          | 8000/250000000 [1:15:57<39523:47:55,  1.76step/s, eer=0.045, loss=0.617] 

Saving the model (step 8000)


Training:   0%|          | 8500/250000000 [1:20:34<37161:49:57,  1.87step/s, eer=0.0352, loss=0.45] 

Saving the model (step 8500)


Training:   0%|          | 9000/250000000 [1:25:13<37737:49:23,  1.84step/s, eer=0.0263, loss=0.373]

Saving the model (step 9000)


Training:   0%|          | 9500/250000000 [1:29:37<38119:04:34,  1.82step/s, eer=0.0375, loss=0.51] 

Saving the model (step 9500)


Training:   0%|          | 10000/250000000 [1:34:19<38745:00:32,  1.79step/s, eer=0.0425, loss=0.55]

Saving the model (step 10000)
Making a backup (step 10000)


Training:   0%|          | 10500/250000000 [1:38:58<39057:51:38,  1.78step/s, eer=0.039, loss=0.448] 

Saving the model (step 10500)


Training:   0%|          | 11000/250000000 [1:43:31<37824:49:51,  1.84step/s, eer=0.0475, loss=0.617]

Saving the model (step 11000)


Training:   0%|          | 11500/250000000 [1:48:04<38546:18:00,  1.80step/s, eer=0.0302, loss=0.428]

Saving the model (step 11500)


Training:   0%|          | 12000/250000000 [1:52:35<37980:27:39,  1.83step/s, eer=0.0375, loss=0.535]

Saving the model (step 12000)


Training:   0%|          | 12500/250000000 [1:57:07<37881:43:03,  1.83step/s, eer=0.0448, loss=0.628]

Saving the model (step 12500)


Training:   0%|          | 13000/250000000 [2:01:39<38190:33:06,  1.82step/s, eer=0.0375, loss=0.466]

Saving the model (step 13000)


Training:   0%|          | 13500/250000000 [2:06:11<38310:13:17,  1.81step/s, eer=0.03, loss=0.386]  

Saving the model (step 13500)


Training:   0%|          | 14000/250000000 [2:10:42<38074:20:25,  1.82step/s, eer=0.028, loss=0.364] 

Saving the model (step 14000)


Training:   0%|          | 14500/250000000 [2:15:15<38230:16:19,  1.82step/s, eer=0.0375, loss=0.455]

Saving the model (step 14500)


Training:   0%|          | 15000/250000000 [2:19:46<38669:47:56,  1.80step/s, eer=0.0315, loss=0.414]

Saving the model (step 15000)
Making a backup (step 15000)


Training:   0%|          | 15500/250000000 [2:24:18<38223:55:56,  1.82step/s, eer=0.0268, loss=0.383]

Saving the model (step 15500)


Training:   0%|          | 16000/250000000 [2:28:49<38740:54:48,  1.79step/s, eer=0.0381, loss=0.454]

Saving the model (step 16000)


Training:   0%|          | 16500/250000000 [2:33:21<38165:25:47,  1.82step/s, eer=0.0304, loss=0.374]

Saving the model (step 16500)


Training:   0%|          | 17000/250000000 [2:37:54<37740:51:58,  1.84step/s, eer=0.0433, loss=0.488]

Saving the model (step 17000)


Training:   0%|          | 17500/250000000 [2:42:26<38012:04:16,  1.83step/s, eer=0.025, loss=0.357] 

Saving the model (step 17500)


Training:   0%|          | 18000/250000000 [2:46:58<37975:38:45,  1.83step/s, eer=0.03, loss=0.336]  

Saving the model (step 18000)


Training:   0%|          | 18500/250000000 [2:51:29<38391:12:38,  1.81step/s, eer=0.0325, loss=0.352]

Saving the model (step 18500)


Training:   0%|          | 19000/250000000 [2:56:00<37963:25:49,  1.83step/s, eer=0.0325, loss=0.403]

Saving the model (step 19000)


Training:   0%|          | 19500/250000000 [3:00:34<39133:56:43,  1.77step/s, eer=0.0225, loss=0.318]

Saving the model (step 19500)


Training:   0%|          | 20000/250000000 [3:05:13<41085:14:27,  1.69step/s, eer=0.0325, loss=0.434]

Saving the model (step 20000)
Making a backup (step 20000)


Training:   0%|          | 20500/250000000 [3:09:54<38241:31:25,  1.82step/s, eer=0.0275, loss=0.314]

Saving the model (step 20500)


Training:   0%|          | 21000/250000000 [3:14:29<38325:12:13,  1.81step/s, eer=0.0322, loss=0.404]

Saving the model (step 21000)


Training:   0%|          | 21500/250000000 [3:22:41<40686:13:57,  1.71step/s, eer=0.025, loss=0.356]  

Saving the model (step 21500)


Training:   0%|          | 22000/250000000 [3:27:15<39654:14:03,  1.75step/s, eer=0.0217, loss=0.241]

Saving the model (step 22000)


Training:   0%|          | 22500/250000000 [3:31:59<46750:45:57,  1.49step/s, eer=0.0174, loss=0.18] 

Saving the model (step 22500)


Training:   0%|          | 23000/250000000 [3:37:00<41051:10:13,  1.69step/s, eer=0.0175, loss=0.247]

Saving the model (step 23000)


Training:   0%|          | 23500/250000000 [3:41:38<38589:28:15,  1.80step/s, eer=0.0275, loss=0.34] 

Saving the model (step 23500)


Training:   0%|          | 24000/250000000 [3:46:19<43366:16:15,  1.60step/s, eer=0.0275, loss=0.34] 

Saving the model (step 24000)


Training:   0%|          | 24500/250000000 [3:51:16<40339:52:20,  1.72step/s, eer=0.0275, loss=0.305]

Saving the model (step 24500)


Training:   0%|          | 25000/250000000 [3:56:29<48115:27:50,  1.44step/s, eer=0.0275, loss=0.404]

Saving the model (step 25000)
Making a backup (step 25000)


Training:   0%|          | 25500/250000000 [4:01:49<42287:06:15,  1.64step/s, eer=0.025, loss=0.232] 

Saving the model (step 25500)


Training:   0%|          | 26000/250000000 [4:07:00<41331:49:48,  1.68step/s, eer=0.0272, loss=0.317]

Saving the model (step 26000)


Training:   0%|          | 26500/250000000 [4:11:24<36321:12:14,  1.91step/s, eer=0.0225, loss=0.31] 

Saving the model (step 26500)


Training:   0%|          | 27000/250000000 [4:15:45<36419:08:40,  1.91step/s, eer=0.0193, loss=0.236]

Saving the model (step 27000)


Training:   0%|          | 27500/250000000 [4:20:06<36523:14:37,  1.90step/s, eer=0.0194, loss=0.252]

Saving the model (step 27500)


Training:   0%|          | 28000/250000000 [4:24:34<36650:35:50,  1.89step/s, eer=0.0235, loss=0.296]

Saving the model (step 28000)


Training:   0%|          | 28500/250000000 [4:29:03<36490:54:43,  1.90step/s, eer=0.0225, loss=0.312]  

Saving the model (step 28500)


Training:   0%|          | 29000/250000000 [4:38:20<55841:10:44,  1.24step/s, eer=0.02, loss=0.231]   

Saving the model (step 29000)


Training:   0%|          | 29500/250000000 [4:45:57<49568:15:06,  1.40step/s, eer=0.0194, loss=0.232] 

Saving the model (step 29500)


Training:   0%|          | 30000/250000000 [4:53:34<64742:15:28,  1.07step/s, eer=0.0188, loss=0.158] 

Saving the model (step 30000)
Making a backup (step 30000)


Training:   0%|          | 30500/250000000 [5:00:54<61722:36:39,  1.12step/s, eer=0.02, loss=0.264]   

Saving the model (step 30500)


Training:   0%|          | 31000/250000000 [5:08:35<61131:31:07,  1.14step/s, eer=0.0194, loss=0.197] 

Saving the model (step 31000)


Training:   0%|          | 31500/250000000 [5:16:01<60643:16:50,  1.14step/s, eer=0.0212, loss=0.279]

Saving the model (step 31500)


Training:   0%|          | 32000/250000000 [5:23:19<54791:59:34,  1.27step/s, eer=0.0175, loss=0.188] 

Saving the model (step 32000)


Training:   0%|          | 32500/250000000 [5:30:01<54499:48:05,  1.27step/s, eer=0.015, loss=0.181]   

Saving the model (step 32500)


Training:   0%|          | 33000/250000000 [5:36:03<51305:13:06,  1.35step/s, eer=0.02, loss=0.223]    

Saving the model (step 33000)


Training:   0%|          | 33500/250000000 [5:42:21<50344:09:24,  1.38step/s, eer=0.0225, loss=0.214] 

Saving the model (step 33500)


Training:   0%|          | 34000/250000000 [5:49:37<57580:21:09,  1.21step/s, eer=0.0165, loss=0.167] 

Saving the model (step 34000)


Training:   0%|          | 34500/250000000 [5:56:43<59125:59:57,  1.17step/s, eer=0.0175, loss=0.204] 

Saving the model (step 34500)


Training:   0%|          | 35000/250000000 [6:03:49<57689:49:46,  1.20step/s, eer=0.0175, loss=0.182]

Saving the model (step 35000)
Making a backup (step 35000)


Training:   0%|          | 35500/250000000 [6:11:21<64655:41:53,  1.07step/s, eer=0.0147, loss=0.103] 

Saving the model (step 35500)


Training:   0%|          | 36000/250000000 [6:19:13<62032:35:02,  1.12step/s, eer=0.0153, loss=0.23]   

Saving the model (step 36000)


Training:   0%|          | 36500/250000000 [6:28:24<67872:37:51,  1.02step/s, eer=0.02, loss=0.212]     

Saving the model (step 36500)


Training:   0%|          | 37000/250000000 [6:36:07<80744:51:03,  1.16s/step, eer=0.0159, loss=0.166] 

Saving the model (step 37000)


Training:   0%|          | 37500/250000000 [6:42:05<54450:24:59,  1.28step/s, eer=0.02, loss=0.306]    

Saving the model (step 37500)


Training:   0%|          | 38000/250000000 [6:48:11<38923:27:03,  1.78step/s, eer=0.0168, loss=0.152] 

Saving the model (step 38000)


Training:   0%|          | 38500/250000000 [6:52:44<37068:59:49,  1.87step/s, eer=0.0175, loss=0.204]  

Saving the model (step 38500)


Training:   0%|          | 39000/250000000 [6:57:08<36706:57:06,  1.89step/s, eer=0.0125, loss=0.119] 

Saving the model (step 39000)


Training:   0%|          | 39500/250000000 [7:01:31<36698:47:16,  1.89step/s, eer=0.02, loss=0.159]    

Saving the model (step 39500)


Training:   0%|          | 40000/250000000 [7:05:55<37420:41:51,  1.86step/s, eer=0.0154, loss=0.159]  

Saving the model (step 40000)
Making a backup (step 40000)


Training:   0%|          | 40500/250000000 [7:10:19<37254:12:29,  1.86step/s, eer=0.0196, loss=0.201]  

Saving the model (step 40500)


Training:   0%|          | 41000/250000000 [7:14:42<36866:53:14,  1.88step/s, eer=0.01, loss=0.111]    

Saving the model (step 41000)


Training:   0%|          | 41500/250000000 [7:19:06<37050:40:54,  1.87step/s, eer=0.0175, loss=0.144]  

Saving the model (step 41500)


Training:   0%|          | 42000/250000000 [7:23:41<42079:41:51,  1.65step/s, eer=0.015, loss=0.135]   

Saving the model (step 42000)


Training:   0%|          | 42500/250000000 [7:29:36<42629:25:48,  1.63step/s, eer=0.0169, loss=0.22]   

Saving the model (step 42500)


Training:   0%|          | 43000/250000000 [7:34:29<40659:17:26,  1.71step/s, eer=0.0146, loss=0.173]  

Saving the model (step 43000)


Training:   0%|          | 43500/250000000 [7:39:21<40903:37:44,  1.70step/s, eer=0.02, loss=0.188]    

Saving the model (step 43500)


Training:   0%|          | 44000/250000000 [7:44:26<42166:18:58,  1.65step/s, eer=0.015, loss=0.144]   

Saving the model (step 44000)


Training:   0%|          | 44500/250000000 [7:49:10<38070:04:45,  1.82step/s, eer=0.02, loss=0.191]    

Saving the model (step 44500)


Training:   0%|          | 45000/250000000 [7:53:39<37185:51:43,  1.87step/s, eer=0.015, loss=0.167]   

Saving the model (step 45000)
Making a backup (step 45000)


Training:   0%|          | 45500/250000000 [7:58:31<38524:17:26,  1.80step/s, eer=0.015, loss=0.164]   

Saving the model (step 45500)


Training:   0%|          | 46000/250000000 [8:03:44<44923:15:24,  1.55step/s, eer=0.0172, loss=0.168]  

Saving the model (step 46000)


Training:   0%|          | 46500/250000000 [8:08:58<43355:19:14,  1.60step/s, eer=0.015, loss=0.139]   

Saving the model (step 46500)


Training:   0%|          | 47000/250000000 [8:13:58<38090:49:16,  1.82step/s, eer=0.0125, loss=0.13]   

Saving the model (step 47000)


Training:   0%|          | 47500/250000000 [8:18:39<43531:17:08,  1.59step/s, eer=0.015, loss=0.118]   

Saving the model (step 47500)


Training:   0%|          | 48000/250000000 [8:23:54<44638:15:37,  1.56step/s, eer=0.0121, loss=0.0836] 

Saving the model (step 48000)


Training:   0%|          | 48500/250000000 [8:29:09<43857:07:42,  1.58step/s, eer=0.0175, loss=0.204]  

Saving the model (step 48500)


Training:   0%|          | 49000/250000000 [8:34:23<44329:55:06,  1.57step/s, eer=0.01, loss=0.114]    

Saving the model (step 49000)


Training:   0%|          | 49500/250000000 [8:39:37<44990:05:54,  1.54step/s, eer=0.01, loss=0.0955]   

Saving the model (step 49500)


Training:   0%|          | 50000/250000000 [8:44:36<38937:13:26,  1.78step/s, eer=0.00962, loss=0.147] 

Saving the model (step 50000)
Making a backup (step 50000)


Training:   0%|          | 50500/250000000 [8:49:22<41384:27:52,  1.68step/s, eer=0.015, loss=0.106]   

Saving the model (step 50500)


Training:   0%|          | 51000/250000000 [8:54:28<46670:01:27,  1.49step/s, eer=0.014, loss=0.167]   

Saving the model (step 51000)


Training:   0%|          | 51500/250000000 [8:59:24<39990:24:36,  1.74step/s, eer=0.01, loss=0.0787]   

Saving the model (step 51500)


Training:   0%|          | 52000/250000000 [9:04:17<43683:06:22,  1.59step/s, eer=0.0125, loss=0.126]  

Saving the model (step 52000)


Training:   0%|          | 52500/250000000 [9:09:10<44489:42:45,  1.56step/s, eer=0.01, loss=0.0965]   

Saving the model (step 52500)


Training:   0%|          | 53000/250000000 [9:13:59<41844:48:17,  1.66step/s, eer=0.0169, loss=0.137]  

Saving the model (step 53000)


Training:   0%|          | 53500/250000000 [9:18:51<40818:42:01,  1.70step/s, eer=0.0208, loss=0.2]    

Saving the model (step 53500)


Training:   0%|          | 54000/250000000 [9:23:45<43534:42:33,  1.59step/s, eer=0.00917, loss=0.0978]

Saving the model (step 54000)


Training:   0%|          | 54500/250000000 [9:28:28<40834:33:30,  1.70step/s, eer=0.0125, loss=0.117]  

Saving the model (step 54500)


Training:   0%|          | 55000/250000000 [9:36:28<38952:23:04,  1.78step/s, eer=0.015, loss=0.14]     

Saving the model (step 55000)
Making a backup (step 55000)


Training:   0%|          | 55500/250000000 [9:41:01<38292:26:08,  1.81step/s, eer=0.0103, loss=0.109]  

Saving the model (step 55500)


Training:   0%|          | 56000/250000000 [9:45:34<38167:17:18,  1.82step/s, eer=0.0133, loss=0.112]  

Saving the model (step 56000)


Training:   0%|          | 56500/250000000 [9:50:06<38319:14:09,  1.81step/s, eer=0.011, loss=0.0903]  

Saving the model (step 56500)


Training:   0%|          | 57000/250000000 [9:54:38<38126:18:37,  1.82step/s, eer=0.015, loss=0.152]   

Saving the model (step 57000)


Training:   0%|          | 57500/250000000 [9:59:12<38336:11:04,  1.81step/s, eer=0.0171, loss=0.151]  

Saving the model (step 57500)


Training:   0%|          | 57948/250000000 [10:03:17<46804:47:13,  1.48step/s, eer=0.00904, loss=0.0793]