In [1]:
import sys
sys.path.append("..")

from speech_encoder_v2_updated import SpeechEncoderV2
from params import *
from pathlib import Path
import torch
import utils
import visualisations
from data_scripts import *

In [2]:
def sync(device: torch.device):
    # For correct profiling (cuda operations are async)
    if device.type == "cuda":
        torch.cuda.synchronize(device)

In [3]:
params = {
    "run_id": "speech_encoder_transformer_updated",  # A unique identifier for this training run
    "clean_data_root": "D:/CODING/SpeechEncoder/data/his_processed_audio",  # Path to LibriSpeech dataset
    "models_dir": "models",  # Directory to save model checkpoints
    "umap_every": 500,  # Update UMAP visualization every 500 steps
    "save_every": 500,  # Save model checkpoint every 500 steps
    "backup_every": 5000,  # Create a backup copy of the model every 5000 steps
    "vis_every": 100,  # Update visualization metrics every 100 steps
    "force_restart": False,  # Whether to restart training from scratch
    "visdom_server": "http://localhost",  # Visdom server address for visualization
    "no_visdom": False,  # Whether to disable Visdom visualization
    "models_dir": Path("../models"),  # Directory to save model checkpoints
}

In [4]:
dataset, loader = load_data(params['clean_data_root'], 40, 10, 0)

In [5]:
for batch in loader:
    print(batch.data.shape)
    break  # Check the shape of the first batch only

(400, 160, 40)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_device = torch.device("cpu")

# Create the model and the optimizer
model = SpeechEncoderV2(device, device)
model.to(device)

#Uncomment the bellow lines to load the model from a checkpoint

checkpoints = torch.load("../models\speech_encoder_transformer_updated\encoder_031000_loss_0.2147.pt")
model.load_state_dict(checkpoints['model_state'])

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
init_step = 31267

# Configure file path for the model
model_dir = params['models_dir'] / params['run_id']
model_dir.mkdir(exist_ok=True, parents=True)
state_fpath = model_dir / "encoder.pt"



In [8]:
import sys
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

log_dir = params['models_dir'] / "logs"

# Initialize TensorBoard writer (ensure log_dir is defined)
writer = SummaryWriter(log_dir=log_dir)

# Initialize the progress bar
total_steps = len(loader)  # Assuming `loader` has a defined length
progress_bar = tqdm(enumerate(loader, init_step), total=total_steps, desc="Training", unit="step")

model.train()

for step, speaker_batch in progress_bar:
    # Forward pass
    inputs = torch.from_numpy(speaker_batch.data).to(device)
    sync(device)
    embeds = model(inputs)
    sync(device)
    embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
    loss, eer = model.loss(embeds_loss)
    sync(loss_device)

    # Backward pass
    model.zero_grad()
    loss.backward()
    
    model.do_gradient_ops()
    optimizer.step()
    
    # Log scalars to TensorBoard
    writer.add_scalar("Loss", loss.item(), step)
    writer.add_scalar("EER", eer, step)
    
    # Update the progress bar with the current loss and EER
    progress_bar.set_postfix({"loss": loss.item(), "eer": eer})

    # Save the model every 'save_every' steps with a unique filename that includes the step and loss
    if params['save_every'] != 0 and step % params['save_every'] == 0:
        filename = model_dir / f"encoder_{step:06d}_loss_{loss.item():.4f}.pt"
        print("Saving the model (step %d) to %s" % (step, filename))
        torch.save({
            "step": step + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, filename)

    # Make a backup every 'backup_every' steps
    if params['backup_every'] != 0 and step % params['backup_every'] == 0:
        print("Making a backup (step %d)" % step)
        backup_fpath = model_dir / f"encoder_{step:06d}.bak"
        torch.save({
            "step": step + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, backup_fpath)
    
    # Update the progress bar with loss and EER information.
    progress_bar.set_postfix(loss=loss.item(), eer=eer)
    
# Optionally, close the writer after training
writer.close()


Training:   0%|          | 234/250000000 [07:21<53407:41:50,  1.30step/s, eer=0.03, loss=0.292]  

Saving the model (step 31500) to ..\models\speech_encoder_transformer_updated\encoder_031500_loss_0.2925.pt


Training:   0%|          | 734/250000000 [13:22<48147:38:33,  1.44step/s, eer=0.0275, loss=0.208]

Saving the model (step 32000) to ..\models\speech_encoder_transformer_updated\encoder_032000_loss_0.2082.pt


Training:   0%|          | 1234/250000000 [18:51<45155:39:19,  1.54step/s, eer=0.03, loss=0.29]   

Saving the model (step 32500) to ..\models\speech_encoder_transformer_updated\encoder_032500_loss_0.2897.pt


Training:   0%|          | 1734/250000000 [24:10<44622:40:40,  1.56step/s, eer=0.019, loss=0.256] 

Saving the model (step 33000) to ..\models\speech_encoder_transformer_updated\encoder_033000_loss_0.2559.pt


Training:   0%|          | 2234/250000000 [29:30<44749:50:42,  1.55step/s, eer=0.0222, loss=0.242]

Saving the model (step 33500) to ..\models\speech_encoder_transformer_updated\encoder_033500_loss_0.2422.pt


Training:   0%|          | 2734/250000000 [34:50<44388:44:29,  1.56step/s, eer=0.0195, loss=0.199]

Saving the model (step 34000) to ..\models\speech_encoder_transformer_updated\encoder_034000_loss_0.1990.pt


Training:   0%|          | 3234/250000000 [40:11<44492:06:53,  1.56step/s, eer=0.0172, loss=0.177]

Saving the model (step 34500) to ..\models\speech_encoder_transformer_updated\encoder_034500_loss_0.1768.pt


Training:   0%|          | 3734/250000000 [45:30<44879:14:11,  1.55step/s, eer=0.025, loss=0.255] 

Saving the model (step 35000) to ..\models\speech_encoder_transformer_updated\encoder_035000_loss_0.2554.pt
Making a backup (step 35000)


Training:   0%|          | 4234/250000000 [50:50<44441:01:27,  1.56step/s, eer=0.0247, loss=0.275]

Saving the model (step 35500) to ..\models\speech_encoder_transformer_updated\encoder_035500_loss_0.2750.pt


Training:   0%|          | 4734/250000000 [56:09<44896:46:44,  1.55step/s, eer=0.017, loss=0.195] 

Saving the model (step 36000) to ..\models\speech_encoder_transformer_updated\encoder_036000_loss_0.1947.pt


Training:   0%|          | 5234/250000000 [1:01:34<45902:47:16,  1.51step/s, eer=0.0225, loss=0.222]

Saving the model (step 36500) to ..\models\speech_encoder_transformer_updated\encoder_036500_loss_0.2222.pt


Training:   0%|          | 5734/250000000 [1:07:01<44871:04:53,  1.55step/s, eer=0.0206, loss=0.248]

Saving the model (step 37000) to ..\models\speech_encoder_transformer_updated\encoder_037000_loss_0.2482.pt


Training:   0%|          | 6234/250000000 [1:12:23<46010:35:06,  1.51step/s, eer=0.0225, loss=0.299]

Saving the model (step 37500) to ..\models\speech_encoder_transformer_updated\encoder_037500_loss_0.2987.pt


Training:   0%|          | 6734/250000000 [1:17:55<47112:56:39,  1.47step/s, eer=0.0229, loss=0.309] 

Saving the model (step 38000) to ..\models\speech_encoder_transformer_updated\encoder_038000_loss_0.3090.pt


Training:   0%|          | 7234/250000000 [1:23:20<45260:51:37,  1.53step/s, eer=0.0204, loss=0.254]

Saving the model (step 38500) to ..\models\speech_encoder_transformer_updated\encoder_038500_loss_0.2541.pt


Training:   0%|          | 7734/250000000 [1:28:48<45582:41:43,  1.52step/s, eer=0.0173, loss=0.183]  

Saving the model (step 39000) to ..\models\speech_encoder_transformer_updated\encoder_039000_loss_0.1831.pt


Training:   0%|          | 8234/250000000 [1:34:19<45674:07:17,  1.52step/s, eer=0.0212, loss=0.241] 

Saving the model (step 39500) to ..\models\speech_encoder_transformer_updated\encoder_039500_loss_0.2414.pt


Training:   0%|          | 8734/250000000 [1:39:48<46300:42:31,  1.50step/s, eer=0.0215, loss=0.25] 

Saving the model (step 40000) to ..\models\speech_encoder_transformer_updated\encoder_040000_loss_0.2501.pt
Making a backup (step 40000)


Training:   0%|          | 9234/250000000 [1:45:17<45733:10:18,  1.52step/s, eer=0.025, loss=0.225]  

Saving the model (step 40500) to ..\models\speech_encoder_transformer_updated\encoder_040500_loss_0.2252.pt


Training:   0%|          | 9734/250000000 [1:50:46<45302:25:51,  1.53step/s, eer=0.03, loss=0.281]  

Saving the model (step 41000) to ..\models\speech_encoder_transformer_updated\encoder_041000_loss_0.2809.pt


Training:   0%|          | 10234/250000000 [1:56:21<46762:57:48,  1.48step/s, eer=0.0175, loss=0.239]  

Saving the model (step 41500) to ..\models\speech_encoder_transformer_updated\encoder_041500_loss_0.2388.pt


Training:   0%|          | 10734/250000000 [2:02:02<47636:00:28,  1.46step/s, eer=0.0279, loss=0.276] 

Saving the model (step 42000) to ..\models\speech_encoder_transformer_updated\encoder_042000_loss_0.2759.pt


Training:   0%|          | 11234/250000000 [2:07:37<46482:50:04,  1.49step/s, eer=0.0175, loss=0.138] 

Saving the model (step 42500) to ..\models\speech_encoder_transformer_updated\encoder_042500_loss_0.1382.pt


Training:   0%|          | 11734/250000000 [2:13:13<46560:19:00,  1.49step/s, eer=0.0183, loss=0.302] 

Saving the model (step 43000) to ..\models\speech_encoder_transformer_updated\encoder_043000_loss_0.3019.pt


Training:   0%|          | 12234/250000000 [2:18:56<48927:10:08,  1.42step/s, eer=0.024, loss=0.322]  

Saving the model (step 43500) to ..\models\speech_encoder_transformer_updated\encoder_043500_loss_0.3222.pt


Training:   0%|          | 12734/250000000 [2:24:40<47842:37:02,  1.45step/s, eer=0.0223, loss=0.222] 

Saving the model (step 44000) to ..\models\speech_encoder_transformer_updated\encoder_044000_loss_0.2217.pt


Training:   0%|          | 13234/250000000 [2:30:38<48905:23:10,  1.42step/s, eer=0.0263, loss=0.254] 

Saving the model (step 44500) to ..\models\speech_encoder_transformer_updated\encoder_044500_loss_0.2538.pt


Training:   0%|          | 13734/250000000 [2:36:27<49318:50:52,  1.41step/s, eer=0.025, loss=0.323]  

Saving the model (step 45000) to ..\models\speech_encoder_transformer_updated\encoder_045000_loss_0.3227.pt
Making a backup (step 45000)


Training:   0%|          | 14234/250000000 [2:42:19<49350:49:40,  1.41step/s, eer=0.0225, loss=0.243]  

Saving the model (step 45500) to ..\models\speech_encoder_transformer_updated\encoder_045500_loss_0.2425.pt


Training:   0%|          | 14734/250000000 [2:48:15<49040:18:36,  1.42step/s, eer=0.02, loss=0.222]   

Saving the model (step 46000) to ..\models\speech_encoder_transformer_updated\encoder_046000_loss_0.2216.pt


Training:   0%|          | 15234/250000000 [2:54:06<49696:54:42,  1.40step/s, eer=0.0115, loss=0.121] 

Saving the model (step 46500) to ..\models\speech_encoder_transformer_updated\encoder_046500_loss_0.1210.pt


Training:   0%|          | 15734/250000000 [2:59:58<50644:23:35,  1.37step/s, eer=0.0225, loss=0.262] 

Saving the model (step 47000) to ..\models\speech_encoder_transformer_updated\encoder_047000_loss_0.2620.pt


Training:   0%|          | 16234/250000000 [3:05:52<49813:25:05,  1.39step/s, eer=0.0171, loss=0.179]  

Saving the model (step 47500) to ..\models\speech_encoder_transformer_updated\encoder_047500_loss_0.1787.pt


Training:   0%|          | 16734/250000000 [3:11:51<53655:01:22,  1.29step/s, eer=0.0186, loss=0.218]  

Saving the model (step 48000) to ..\models\speech_encoder_transformer_updated\encoder_048000_loss_0.2185.pt


Training:   0%|          | 17234/250000000 [3:17:54<51654:54:37,  1.34step/s, eer=0.0165, loss=0.17]   

Saving the model (step 48500) to ..\models\speech_encoder_transformer_updated\encoder_048500_loss_0.1698.pt


Training:   0%|          | 17734/250000000 [3:23:54<49829:56:13,  1.39step/s, eer=0.0172, loss=0.241]  

Saving the model (step 49000) to ..\models\speech_encoder_transformer_updated\encoder_049000_loss_0.2406.pt


Training:   0%|          | 18234/250000000 [3:30:10<52632:35:12,  1.32step/s, eer=0.0162, loss=0.15]   

Saving the model (step 49500) to ..\models\speech_encoder_transformer_updated\encoder_049500_loss_0.1496.pt


Training:   0%|          | 18734/250000000 [3:36:29<55340:08:17,  1.25step/s, eer=0.0129, loss=0.1]   

Saving the model (step 50000) to ..\models\speech_encoder_transformer_updated\encoder_050000_loss_0.1001.pt
Making a backup (step 50000)


Training:   0%|          | 19234/250000000 [3:43:01<53123:19:52,  1.31step/s, eer=0.0205, loss=0.218]  

Saving the model (step 50500) to ..\models\speech_encoder_transformer_updated\encoder_050500_loss_0.2177.pt


Training:   0%|          | 19734/250000000 [3:49:25<54321:39:47,  1.28step/s, eer=0.0125, loss=0.169]  

Saving the model (step 51000) to ..\models\speech_encoder_transformer_updated\encoder_051000_loss_0.1686.pt


Training:   0%|          | 20234/250000000 [3:55:46<54073:35:14,  1.28step/s, eer=0.02, loss=0.174]    

Saving the model (step 51500) to ..\models\speech_encoder_transformer_updated\encoder_051500_loss_0.1737.pt


Training:   0%|          | 20734/250000000 [4:02:14<54416:51:02,  1.28step/s, eer=0.0176, loss=0.177]  

Saving the model (step 52000) to ..\models\speech_encoder_transformer_updated\encoder_052000_loss_0.1768.pt


Training:   0%|          | 21234/250000000 [4:08:38<52960:28:23,  1.31step/s, eer=0.0242, loss=0.271]  

Saving the model (step 52500) to ..\models\speech_encoder_transformer_updated\encoder_052500_loss_0.2715.pt


Training:   0%|          | 21734/250000000 [4:15:01<52903:11:55,  1.31step/s, eer=0.0194, loss=0.182]  

Saving the model (step 53000) to ..\models\speech_encoder_transformer_updated\encoder_053000_loss_0.1819.pt


Training:   0%|          | 22234/250000000 [4:21:28<51966:57:39,  1.34step/s, eer=0.0158, loss=0.158]  

Saving the model (step 53500) to ..\models\speech_encoder_transformer_updated\encoder_053500_loss_0.1579.pt


Training:   0%|          | 22734/250000000 [4:27:52<53277:36:07,  1.30step/s, eer=0.015, loss=0.184]   

Saving the model (step 54000) to ..\models\speech_encoder_transformer_updated\encoder_054000_loss_0.1844.pt


Training:   0%|          | 23234/250000000 [4:39:02<66367:10:58,  1.05step/s, eer=0.0175, loss=0.211]  

Saving the model (step 54500) to ..\models\speech_encoder_transformer_updated\encoder_054500_loss_0.2106.pt


Training:   0%|          | 23734/250000000 [4:47:14<64223:55:20,  1.08step/s, eer=0.0172, loss=0.155]   

Saving the model (step 55000) to ..\models\speech_encoder_transformer_updated\encoder_055000_loss_0.1555.pt
Making a backup (step 55000)


Training:   0%|          | 24234/250000000 [4:55:38<52417:53:29,  1.32step/s, eer=0.025, loss=0.244]   

Saving the model (step 55500) to ..\models\speech_encoder_transformer_updated\encoder_055500_loss_0.2438.pt


Training:   0%|          | 24734/250000000 [5:01:45<51419:06:16,  1.35step/s, eer=0.0201, loss=0.187]  

Saving the model (step 56000) to ..\models\speech_encoder_transformer_updated\encoder_056000_loss_0.1871.pt


Training:   0%|          | 25234/250000000 [5:07:46<50440:50:01,  1.38step/s, eer=0.0154, loss=0.163]  

Saving the model (step 56500) to ..\models\speech_encoder_transformer_updated\encoder_056500_loss_0.1627.pt


Training:   0%|          | 25734/250000000 [5:13:40<48641:26:19,  1.43step/s, eer=0.0185, loss=0.165]  

Saving the model (step 57000) to ..\models\speech_encoder_transformer_updated\encoder_057000_loss_0.1647.pt


Training:   0%|          | 26234/250000000 [5:19:36<50129:15:15,  1.39step/s, eer=0.015, loss=0.177]   

Saving the model (step 57500) to ..\models\speech_encoder_transformer_updated\encoder_057500_loss_0.1770.pt


Training:   0%|          | 26734/250000000 [5:25:31<48894:04:14,  1.42step/s, eer=0.0225, loss=0.196]  

Saving the model (step 58000) to ..\models\speech_encoder_transformer_updated\encoder_058000_loss_0.1965.pt


Training:   0%|          | 27234/250000000 [5:31:27<49295:45:39,  1.41step/s, eer=0.02, loss=0.187]    

Saving the model (step 58500) to ..\models\speech_encoder_transformer_updated\encoder_058500_loss_0.1867.pt


Training:   0%|          | 27734/250000000 [5:37:28<50853:48:10,  1.37step/s, eer=0.0297, loss=0.295]  

Saving the model (step 59000) to ..\models\speech_encoder_transformer_updated\encoder_059000_loss_0.2946.pt


Training:   0%|          | 28234/250000000 [5:43:30<50947:28:05,  1.36step/s, eer=0.015, loss=0.16]    

Saving the model (step 59500) to ..\models\speech_encoder_transformer_updated\encoder_059500_loss_0.1601.pt


Training:   0%|          | 28734/250000000 [5:49:28<49197:39:47,  1.41step/s, eer=0.0244, loss=0.208]  

Saving the model (step 60000) to ..\models\speech_encoder_transformer_updated\encoder_060000_loss_0.2085.pt
Making a backup (step 60000)


Training:   0%|          | 29234/250000000 [5:55:29<51423:32:57,  1.35step/s, eer=0.0175, loss=0.17]   

Saving the model (step 60500) to ..\models\speech_encoder_transformer_updated\encoder_060500_loss_0.1695.pt


Training:   0%|          | 29734/250000000 [6:01:36<49460:00:33,  1.40step/s, eer=0.0275, loss=0.288]  

Saving the model (step 61000) to ..\models\speech_encoder_transformer_updated\encoder_061000_loss_0.2877.pt


Training:   0%|          | 30234/250000000 [6:07:50<50948:51:51,  1.36step/s, eer=0.0145, loss=0.129]  

Saving the model (step 61500) to ..\models\speech_encoder_transformer_updated\encoder_061500_loss_0.1289.pt


Training:   0%|          | 30734/250000000 [6:14:01<50960:31:53,  1.36step/s, eer=0.0125, loss=0.117]  

Saving the model (step 62000) to ..\models\speech_encoder_transformer_updated\encoder_062000_loss_0.1165.pt


Training:   0%|          | 31234/250000000 [6:20:09<51829:40:15,  1.34step/s, eer=0.00942, loss=0.112] 

Saving the model (step 62500) to ..\models\speech_encoder_transformer_updated\encoder_062500_loss_0.1118.pt


Training:   0%|          | 31734/250000000 [6:26:18<50992:40:33,  1.36step/s, eer=0.02, loss=0.176]    

Saving the model (step 63000) to ..\models\speech_encoder_transformer_updated\encoder_063000_loss_0.1755.pt


Training:   0%|          | 32234/250000000 [6:32:27<51717:56:56,  1.34step/s, eer=0.025, loss=0.411]   

Saving the model (step 63500) to ..\models\speech_encoder_transformer_updated\encoder_063500_loss_0.4107.pt


Training:   0%|          | 32734/250000000 [6:38:42<51985:10:57,  1.34step/s, eer=0.011, loss=0.101]   

Saving the model (step 64000) to ..\models\speech_encoder_transformer_updated\encoder_064000_loss_0.1015.pt


Training:   0%|          | 33234/250000000 [6:44:56<51557:34:40,  1.35step/s, eer=0.015, loss=0.139]   

Saving the model (step 64500) to ..\models\speech_encoder_transformer_updated\encoder_064500_loss_0.1391.pt


Training:   0%|          | 33734/250000000 [6:51:06<51468:38:44,  1.35step/s, eer=0.011, loss=0.185]   

Saving the model (step 65000) to ..\models\speech_encoder_transformer_updated\encoder_065000_loss_0.1847.pt
Making a backup (step 65000)


Training:   0%|          | 34234/250000000 [6:57:24<51552:26:35,  1.35step/s, eer=0.0108, loss=0.133]  

Saving the model (step 65500) to ..\models\speech_encoder_transformer_updated\encoder_065500_loss_0.1334.pt


Training:   0%|          | 34734/250000000 [7:03:47<52786:48:30,  1.32step/s, eer=0.00833, loss=0.0781]

Saving the model (step 66000) to ..\models\speech_encoder_transformer_updated\encoder_066000_loss_0.0781.pt


Training:   0%|          | 35234/250000000 [7:10:04<54277:48:03,  1.28step/s, eer=0.0125, loss=0.225]  

Saving the model (step 66500) to ..\models\speech_encoder_transformer_updated\encoder_066500_loss_0.2249.pt


Training:   0%|          | 35734/250000000 [7:16:19<53491:35:10,  1.30step/s, eer=0.0187, loss=0.155]  

Saving the model (step 67000) to ..\models\speech_encoder_transformer_updated\encoder_067000_loss_0.1549.pt


Training:   0%|          | 36234/250000000 [7:22:34<51479:45:56,  1.35step/s, eer=0.01, loss=0.142]    

Saving the model (step 67500) to ..\models\speech_encoder_transformer_updated\encoder_067500_loss_0.1416.pt


Training:   0%|          | 36734/250000000 [7:28:51<53953:06:12,  1.29step/s, eer=0.0208, loss=0.171]  

Saving the model (step 68000) to ..\models\speech_encoder_transformer_updated\encoder_068000_loss_0.1706.pt


Training:   0%|          | 37234/250000000 [7:35:14<52809:19:03,  1.31step/s, eer=0.0175, loss=0.133]  

Saving the model (step 68500) to ..\models\speech_encoder_transformer_updated\encoder_068500_loss_0.1333.pt


Training:   0%|          | 37734/250000000 [7:41:32<53964:02:20,  1.29step/s, eer=0.0175, loss=0.212]  

Saving the model (step 69000) to ..\models\speech_encoder_transformer_updated\encoder_069000_loss_0.2122.pt


Training:   0%|          | 38234/250000000 [7:47:51<53030:20:20,  1.31step/s, eer=0.0125, loss=0.154]  

Saving the model (step 69500) to ..\models\speech_encoder_transformer_updated\encoder_069500_loss_0.1541.pt


Training:   0%|          | 38734/250000000 [7:54:10<53834:42:18,  1.29step/s, eer=0.00955, loss=0.117] 

Saving the model (step 70000) to ..\models\speech_encoder_transformer_updated\encoder_070000_loss_0.1168.pt
Making a backup (step 70000)


Training:   0%|          | 39234/250000000 [8:00:33<53162:35:44,  1.31step/s, eer=0.0138, loss=0.135]  

Saving the model (step 70500) to ..\models\speech_encoder_transformer_updated\encoder_070500_loss_0.1350.pt


Training:   0%|          | 39734/250000000 [8:07:03<52141:16:04,  1.33step/s, eer=0.0147, loss=0.121]  

Saving the model (step 71000) to ..\models\speech_encoder_transformer_updated\encoder_071000_loss_0.1210.pt


Training:   0%|          | 40234/250000000 [8:13:32<54699:22:38,  1.27step/s, eer=0.01, loss=0.0828]   

Saving the model (step 71500) to ..\models\speech_encoder_transformer_updated\encoder_071500_loss_0.0828.pt


Training:   0%|          | 40734/250000000 [8:20:08<51528:31:35,  1.35step/s, eer=0.0141, loss=0.129]  

Saving the model (step 72000) to ..\models\speech_encoder_transformer_updated\encoder_072000_loss_0.1287.pt


Training:   0%|          | 41234/250000000 [8:26:35<53304:43:02,  1.30step/s, eer=0.0155, loss=0.11]   

Saving the model (step 72500) to ..\models\speech_encoder_transformer_updated\encoder_072500_loss_0.1102.pt


Training:   0%|          | 41734/250000000 [8:33:04<53090:57:30,  1.31step/s, eer=0.0125, loss=0.0852] 

Saving the model (step 73000) to ..\models\speech_encoder_transformer_updated\encoder_073000_loss_0.0852.pt


Training:   0%|          | 42234/250000000 [8:39:33<56502:06:58,  1.23step/s, eer=0.00904, loss=0.0724]

Saving the model (step 73500) to ..\models\speech_encoder_transformer_updated\encoder_073500_loss_0.0724.pt


Training:   0%|          | 42513/250000000 [8:43:19<51282:33:15,  1.35step/s, eer=0.00981, loss=0.0695]


KeyboardInterrupt: 