In [1]:
import os
import glob
import time
import math
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf
import librosa
import librosa.display

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             accuracy_score, roc_curve, auc, confusion_matrix,
                             average_precision_score, precision_recall_curve)
from scipy.stats import gaussian_kde
from tqdm import tqdm

In [2]:
try:
    from IPython.display import Audio as IPyAudio, display as ipy_display
except Exception:
    IPyAudio = None
    ipy_display = None

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

torch.manual_seed(42)
np.random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

Device: cuda


In [3]:
class Config:
    SAMPLE_RATE = 16000
    N_FFT = 512
    HOP_LENGTH = 256
    N_MELS = 128
    AUDIO_DURATION = 3.0            
    LATENT_DIM = 64
    HIDDEN_DIMS = [128, 256, 512, 1024]
    BETA = 1.0
    BATCH_SIZE = 16
    EPOCHS = 200                    
    LEARNING_RATE = 1e-3
    NUM_SAMPLES = 1000              
    CONTAMINATION = 0.1
    DATA_PATH = './librispeech_data' 
    OUTPUT_PATH = './output'
    VIS_SAMPLES = 5
    TSNE_MAX = 1000

config = Config()

os.makedirs(config.OUTPUT_PATH, exist_ok=True)
os.makedirs(os.path.join(config.OUTPUT_PATH, "audio"), exist_ok=True)
os.makedirs(os.path.join(config.OUTPUT_PATH, "plots"), exist_ok=True)
os.makedirs(os.path.join(config.OUTPUT_PATH, "models"), exist_ok=True)

BEST_MODEL_PATH = os.path.join(config.OUTPUT_PATH, "models", "best_vae_model.pth")

In [4]:
class LibriSpeechVAEDataset(Dataset):

    def __init__(self, root_dir, sample_rate=16000, n_mels=128, n_fft=512, hop_length=256, duration=3.0, num_samples=None):
        self.root_dir = os.path.abspath(root_dir)
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.duration = duration
        self.target_length = int(sample_rate * duration)

        patterns = ['**/*.wav', '**/*.flac', '**/*.mp3', '**/*.m4a', '**/*.aac']
        files = []
        for p in patterns:
            files.extend(glob.glob(os.path.join(self.root_dir, p), recursive=True))
        files = sorted(files)
        if num_samples:
            files = files[:min(num_samples, len(files))]
        if len(files) == 0:
            raise RuntimeError(f"No audio files found under {self.root_dir}. Place audio (.wav/.flac) files there.")
        self.files = files
        print(f"Found {len(self.files)} audio files under {self.root_dir}")

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        path = self.files[idx]
        # load with librosa (resamples automatically)
        y, sr = librosa.load(path, sr=self.sample_rate, mono=True)
        # trim/pad to duration
        if len(y) > self.target_length:
            start = np.random.randint(0, len(y) - self.target_length + 1)
            y = y[start:start + self.target_length]
        elif len(y) < self.target_length:
            pad = self.target_length - len(y)
            y = np.pad(y, (0, pad), mode='constant')

        # mel spectrogram (power)
        mel_power = librosa.feature.melspectrogram(y=y, sr=self.sample_rate,
                                                   n_fft=self.n_fft, hop_length=self.hop_length,
                                                   n_mels=self.n_mels, power=2.0)
        mel_log = np.log(mel_power + 1e-9)
        # normalize per-sample (zero mean, unit std)
        mel_norm = (mel_log - mel_log.mean()) / (mel_log.std() + 1e-9)

        mel_norm_t = torch.from_numpy(mel_norm).float()      # [n_mels, time]
        return mel_norm_t, y.astype(np.float32), mel_power.astype(np.float32), mel_log.astype(np.float32)

In [5]:
class Encoder(nn.Module):
    def __init__(self, in_channels, hidden_dims, latent_dim):
        super().__init__()
        modules = []
        c = in_channels
        for h in hidden_dims:
            modules.append(nn.Sequential(
                nn.Conv2d(c, h, kernel_size=3, stride=2, padding=1),
                nn.BatchNorm2d(h),
                nn.LeakyReLU(0.2),
                nn.Dropout2d(0.1)
            ))
            c = h
        self.net = nn.Sequential(*modules)
        self.flatten_size = None
        self.fc_mu = None
        self.fc_logvar = None
        self.latent_dim = latent_dim

    def forward(self, x):
        x = self.net(x)
        if self.flatten_size is None:
            self.flatten_size = x.shape[1] * x.shape[2] * x.shape[3]
            self.fc_mu = nn.Linear(self.flatten_size, self.latent_dim).to(x.device)
            self.fc_logvar = nn.Linear(self.flatten_size, self.latent_dim).to(x.device)
        x = x.view(x.size(0), -1)
        mu = self.fc_mu(x)
        logvar = self.fc_logvar(x)
        return mu, logvar, self.flatten_size

In [6]:
class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dims, out_channels):
        super().__init__()
        self.latent_dim = latent_dim
        self.hidden_dims = hidden_dims
        self.fc = None
        modules = []
        rev = hidden_dims[::-1]
        for i in range(len(rev)-1):
            modules.append(nn.Sequential(
                nn.ConvTranspose2d(rev[i], rev[i+1], kernel_size=3, stride=2, padding=1, output_padding=1),
                nn.BatchNorm2d(rev[i+1]),
                nn.LeakyReLU(0.2),
                nn.Dropout2d(0.1)
            ))
        modules.append(nn.Sequential(
            nn.ConvTranspose2d(rev[-1], out_channels, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Tanh()
        ))
        self.net = nn.Sequential(*modules)

    def forward(self, z, flatten_size, spatial_size):
        if self.fc is None:
            self.fc = nn.Linear(self.latent_dim, flatten_size).to(z.device)
        x = self.fc(z)
        channels = self.hidden_dims[-1]
        x = x.view(x.size(0), channels, spatial_size[0], spatial_size[1])
        x = self.net(x)
        return x

In [7]:
class SpeechVAE(nn.Module):
    def __init__(self, input_channels=1, hidden_dims=None, latent_dim=64, beta=1.0):
        super().__init__()
        if hidden_dims is None:
            hidden_dims = [128, 256, 512, 1024]
        self.encoder = Encoder(input_channels, hidden_dims, latent_dim)
        self.decoder = Decoder(latent_dim, hidden_dims, input_channels)
        self.beta = beta
        self.spatial_size = None
        self.latent_dim = latent_dim

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def forward(self, x):
        mu, logvar, flatten_size = self.encoder(x)
        if self.spatial_size is None:
            with torch.no_grad():
                tmp = self.encoder.net(x)
                self.spatial_size = (tmp.shape[2], tmp.shape[3])
        z = self.reparameterize(mu, logvar)
        recon = self.decoder(z, flatten_size, self.spatial_size)
        if recon.shape != x.shape:
            recon = F.interpolate(recon, size=(x.shape[2], x.shape[3]), mode='bilinear', align_corners=False)
        return recon, mu, logvar, z

    def loss_function(self, recon, x, mu, logvar):
        recon_loss = F.mse_loss(recon, x, reduction='sum')
        kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return recon_loss + self.beta * kld, recon_loss, kld

    def get_reconstruction_error(self, x):
        self.eval()
        with torch.no_grad():
            recon, mu, logvar, z = self(x)
            e = F.mse_loss(recon, x, reduction='none')
            e = e.view(e.size(0), -1).mean(dim=1)
            return e

In [8]:
class MetricsCalculator:
    @staticmethod
    def mse(a,b): return np.mean((a-b)**2)
    @staticmethod
    def mae(a,b): return np.mean(np.abs(a-b))
    @staticmethod
    def psnr(a,b, max_val=1.0):
        mse = np.mean((a-b)**2)
        if mse == 0: return 100.0
        return 20 * np.log10(max_val / math.sqrt(mse))
    @staticmethod
    def snr(a,b):
        sp = np.mean(a**2)
        npow = np.mean((a-b)**2)
        if npow == 0: return 100.0
        return 10 * np.log10(sp / npow)
    @staticmethod
    def ssim(a,b):
        c1 = 0.01**2; c2 = 0.03**2
        mu1, mu2 = a.mean(), b.mean()
        s1, s2 = a.var(), b.var()
        cov = np.mean((a-mu1)*(b-mu2))
        return ((2*mu1*mu2 + c1)*(2*cov + c2))/((mu1**2 + mu2**2 + c1)*(s1+s2+c2)+1e-12)
    @staticmethod
    def spectral_convergence(a,b):
        num = np.linalg.norm(a-b, ord='fro')
        den = np.linalg.norm(a, ord='fro')
        return num/(den+1e-9)

In [9]:
class AudioReconstructor:
    def __init__(self, sr, n_fft, hop_length, n_mels):
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

    def reconstruct_audio(self, recon_norm_log, reference_mel_log):
        """
        recon_norm_log: numpy [n_mels, t] => model output normalized log-mel
        reference_mel_log: numpy [n_mels, t] => original mel_log for denorm stats
        """
        try:
            # Denormalize approx using reference mel_log mean/std
            ref_mean = reference_mel_log.mean()
            ref_std = reference_mel_log.std()
            recon_log = recon_norm_log * (ref_std + 1e-9) + ref_mean
            recon_power = np.exp(recon_log)  # mel power
            # invert mel to audio using librosa (Griffin-Lim inside)
            y = librosa.feature.inverse.mel_to_audio(M=recon_power, sr=self.sr, n_fft=self.n_fft,
                                                     hop_length=self.hop_length, power=2.0, n_iter=64)
            # normalize audio to -0.99..0.99
            maxv = np.max(np.abs(y)) + 1e-9
            y = y / maxv * 0.99
            return y
        except Exception as e:
            # fallback None
            return None

In [10]:
def train_vae(model, train_loader, val_loader, epochs, lr, device):
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=8, factor=0.5)
    train_losses, val_losses, recon_losses, kl_losses = [], [], [], []
    best_val = float('inf'); patience = 12; pcount = 0

    for epoch in range(epochs):
        model.train()
        tloss = trecon = tkl = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for mel_norm, _, _, _ in pbar:
            x = mel_norm.unsqueeze(1).to(device)   # [B,1,H,W]
            optimizer.zero_grad()
            recon, mu, logvar, _ = model(x)
            loss, recon_l, kld = model.loss_function(recon, x, mu, logvar)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            tloss += loss.item()
            trecon += recon_l.item()
            tkl += kld.item()
            pbar.set_postfix({'loss': (loss.item()/x.size(0))})
        avg_train = tloss / len(train_loader.dataset)
        avg_recon = trecon / len(train_loader.dataset)
        avg_kl = tkl / len(train_loader.dataset)
        train_losses.append(avg_train); recon_losses.append(avg_recon); kl_losses.append(avg_kl)

        model.eval()
        vloss = 0.0
        with torch.no_grad():
            for mel_norm, _, _, _ in val_loader:
                x = mel_norm.unsqueeze(1).to(device)
                recon, mu, logvar, _ = model(x)
                loss, _, _ = model.loss_function(recon, x, mu, logvar)
                vloss += loss.item()
        avg_val = vloss / len(val_loader.dataset)
        val_losses.append(avg_val)

        scheduler.step(avg_val)

        if avg_val < best_val:
            best_val = avg_val
            torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'val_loss': avg_val}, BEST_MODEL_PATH)
            pcount = 0
        else:
            pcount += 1

        print(f"Epoch {epoch+1}/{epochs} | Train: {avg_train:.4f} | Val: {avg_val:.4f} | Recon: {avg_recon:.4f} | KL: {avg_kl:.4f}")

        if pcount >= patience:
            print("Early stopping triggered.")
            break

    return train_losses, val_losses, recon_losses, kl_losses

In [11]:
def evaluate_model(model, test_loader, device, audio_reconstructor):
    model.eval()
    metrics = {'mse':[], 'mae':[], 'psnr':[], 'snr':[], 'ssim':[], 'spectral_conv':[]}
    orig_specs, recon_specs, latents = [], [], []
    orig_audio, recon_audio = [], []

    with torch.no_grad():
        for batch_idx, (mel_norm, waveform, mel_power, mel_log) in enumerate(tqdm(test_loader, desc="Evaluating")):
            if batch_idx >= 8:  # limit for memory/time
                break
            x = mel_norm.unsqueeze(1).to(device)
            recon, mu, logvar, _ = model(x)
            recon_np = recon.cpu().numpy()
            orig_np = x.cpu().numpy()
            for i in range(recon_np.shape[0]):
                orig_spec = orig_np[i,0]
                recon_spec = recon_np[i,0]
                metrics['mse'].append(MetricsCalculator.mse(orig_spec, recon_spec))
                metrics['mae'].append(MetricsCalculator.mae(orig_spec, recon_spec))
                metrics['psnr'].append(MetricsCalculator.psnr(orig_spec, recon_spec))
                metrics['snr'].append(MetricsCalculator.snr(orig_spec, recon_spec))
                metrics['ssim'].append(MetricsCalculator.ssim(orig_spec, recon_spec))
                metrics['spectral_conv'].append(MetricsCalculator.spectral_convergence(orig_spec, recon_spec))

                # reconstruct audio for first few samples
                if len(orig_audio) < config.VIS_SAMPLES:
                    orig_audio.append(waveform[i].numpy() if isinstance(waveform[i], np.ndarray) else waveform[i].numpy())
                    # recon_spec is normalized log-mel — use corresponding mel_log for denorm stats
                    ref_log = mel_log[i].numpy() if isinstance(mel_log, torch.Tensor) else mel_log[i]
                    recon_audio_sample = audio_reconstructor.reconstruct_audio(recon_spec, ref_log)
                    recon_audio.append(recon_audio_sample)

            orig_specs.append(torch.from_numpy(orig_np))
            recon_specs.append(torch.from_numpy(recon_np))
            latents.append(mu.cpu())

    avg = {k: np.mean(v) if len(v)>0 else float('nan') for k,v in metrics.items()}
    std = {k: np.std(v) if len(v)>0 else float('nan') for k,v in metrics.items()}
    return avg, std, orig_specs, recon_specs, latents, orig_audio, recon_audio, metrics

In [12]:
def evaluate_anomaly_detection(model, test_loader, device):
    # collect small pool
    pool = []
    for i, (mel_norm, _, _, _) in enumerate(test_loader):
        pool.append(mel_norm)
        if i >= 80: break
    if len(pool) == 0:
        raise RuntimeError("Not enough data for anomaly evaluation.")
    pool = torch.cat(pool, dim=0)  # [N, H, W]

    # normal and anomalies (add noise)
    N = min(200, pool.size(0))
    normal = pool[:N]
    anomaly = normal + torch.randn_like(normal) * 2.0

    num_anom = max(1, int(N * config.CONTAMINATION))
    test_data = torch.cat([normal, anomaly[:num_anom]], dim=0)
    test_labels = np.concatenate([np.zeros(N), np.ones(num_anom)])

    # compute reconstruction errors
    model.eval()
    errors = []
    with torch.no_grad():
        for i in range(0, len(test_data), 32):
            batch = test_data[i:i+32].unsqueeze(1).to(device)
            err = model.get_reconstruction_error(batch)
            errors.extend(err.cpu().numpy())
    errors = np.array(errors)
    threshold = np.percentile(errors, 95)
    preds = (errors > threshold).astype(int)

    acc = accuracy_score(test_labels, preds)
    prec = precision_score(test_labels, preds, zero_division=0)
    rec = recall_score(test_labels, preds, zero_division=0)
    f1 = f1_score(test_labels, preds, zero_division=0)
    fpr, tpr, _ = roc_curve(test_labels, errors)
    roc_auc = auc(fpr, tpr)
    avg_prec = average_precision_score(test_labels, errors)
    cm = confusion_matrix(test_labels, preds)

    return {
        'accuracy': acc, 'precision': prec, 'recall': rec, 'f1_score': f1,
        'roc_auc': roc_auc, 'avg_precision': avg_prec, 'confusion_matrix': cm,
        'fpr': fpr, 'tpr': tpr, 'errors': errors, 'predictions': preds,
        'labels': test_labels, 'threshold': threshold
    }

In [13]:
def save_audio(original_audio, reconstructed_audio, sr):
    for i in range(min(len(original_audio), config.VIS_SAMPLES)):
        sf.write(os.path.join(config.OUTPUT_PATH, "audio", f"original_{i+1}.wav"), original_audio[i], sr)
        if reconstructed_audio[i] is not None:
            sf.write(os.path.join(config.OUTPUT_PATH, "audio", f"reconstructed_{i+1}.wav"), reconstructed_audio[i], sr)
    print("Saved audio samples to", os.path.join(config.OUTPUT_PATH, "audio"))

def plot_training_curves(train_losses, val_losses, recon_losses, kl_losses):
    epochs = list(range(1, len(train_losses)+1))
    plt.figure(figsize=(10,6))
    plt.plot(epochs, train_losses, label='train')
    plt.plot(epochs, val_losses, label='val')
    plt.xlabel('Epoch'); plt.ylabel('Loss'); plt.legend(); plt.title('Total Loss')
    plt.savefig(os.path.join(config.OUTPUT_PATH, "plots", "training_curves.png"), dpi=200)
    plt.close()

def plot_reconstruction_examples(original_specs, reconstructed_specs, n=4):
    if len(original_specs)==0: return
    n = min(n, len(original_specs[0]))
    fig, axes = plt.subplots(n, 3, figsize=(12, 3*n))
    for i in range(n):
        orig = original_specs[0][i,0].numpy()
        recon = reconstructed_specs[0][i,0].numpy()
        axes[i,0].imshow(orig, origin='lower', aspect='auto'); axes[i,0].set_title('Original')
        axes[i,1].imshow(recon, origin='lower', aspect='auto'); axes[i,1].set_title('Reconstructed')
        axes[i,2].imshow(np.abs(orig-recon), origin='lower', aspect='auto'); axes[i,2].set_title('Abs Diff')
    plt.tight_layout()
    plt.savefig(os.path.join(config.OUTPUT_PATH, "plots", "reconstruction_examples.png"), dpi=200)
    plt.close()

def plot_latent_tsne(latents):
    if len(latents)==0: return
    X = torch.cat(latents, dim=0).numpy()
    n = min(len(X), config.TSNE_MAX)
    Xs = X[:n]
    print("Computing t-SNE (this may take a while)...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30)
    Z = tsne.fit_transform(Xs)
    plt.figure(figsize=(8,6))
    plt.scatter(Z[:,0], Z[:,1], s=6, alpha=0.6)
    plt.title('Latent t-SNE')
    plt.savefig(os.path.join(config.OUTPUT_PATH, "plots", "latent_tsne.png"), dpi=200)
    plt.close()

In [14]:
def main():
    t0 = time.time()
    print("\n=== Speech VAE (librosa-based) ===\n")
    print("Data folder:", os.path.abspath(config.DATA_PATH))
    # build dataset
    dataset = LibriSpeechVAEDataset(config.DATA_PATH, sample_rate=config.SAMPLE_RATE,
                                    n_mels=config.N_MELS, n_fft=config.N_FFT,
                                    hop_length=config.HOP_LENGTH, duration=config.AUDIO_DURATION,
                                    num_samples=config.NUM_SAMPLES)
    N = len(dataset)
    train_n = int(0.7 * N); val_n = int(0.15 * N); test_n = N - train_n - val_n
    train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [train_n, val_n, test_n],
                                                               generator=torch.Generator().manual_seed(42))
    train_loader = DataLoader(train_ds, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_ds, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)

    print(f"Dataset splits -> train: {train_n}, val: {val_n}, test: {test_n}")

    model = SpeechVAE(input_channels=1, hidden_dims=config.HIDDEN_DIMS, latent_dim=config.LATENT_DIM, beta=config.BETA).to(device)
    params = sum(p.numel() for p in model.parameters())
    print("Model parameters:", params)

    train_losses, val_losses, recon_losses, kl_losses = train_vae(model, train_loader, val_loader, config.EPOCHS, config.LEARNING_RATE, device)

    if os.path.exists(BEST_MODEL_PATH):
        ck = torch.load(BEST_MODEL_PATH, map_location=device)
        model.load_state_dict(ck['model_state_dict'] if 'model_state_dict' in ck else ck)
        print("Loaded best model from", BEST_MODEL_PATH)

    audio_reconstructor = AudioReconstructor(config.SAMPLE_RATE, config.N_FFT, config.HOP_LENGTH, config.N_MELS)
    avg_metrics, std_metrics, orig_specs, recon_specs, latents, orig_audio, recon_audio, all_metrics = evaluate_model(model, test_loader, device, audio_reconstructor)

    anomaly_metrics = evaluate_anomaly_detection(model, test_loader, device)

    if len(orig_audio) > 0:
        save_audio(orig_audio, recon_audio, config.SAMPLE_RATE)

    plot_training_curves(train_losses, val_losses, recon_losses, kl_losses)
    plot_reconstruction_examples(orig_specs, recon_specs, n=min(4, len(orig_specs[0]) if len(orig_specs)>0 else 0))
    plot_latent_tsne(latents)

    try:
        import matplotlib
        # ROC
        plt.figure(figsize=(6,6))
        plt.plot(anomaly_metrics['fpr'], anomaly_metrics['tpr'], label=f"AUC={anomaly_metrics['roc_auc']:.4f}")
        plt.plot([0,1],[0,1], linestyle='--', color='k')
        plt.xlabel('FPR'); plt.ylabel('TPR'); plt.title('ROC'); plt.legend()
        plt.savefig(os.path.join(config.OUTPUT_PATH, "plots", "anomaly_roc.png"), dpi=200)
        plt.close()
    except Exception:
        pass

    print("\n--- Reconstruction metrics (averages) ---")
    for k,v in avg_metrics.items():
        print(f"{k}: {v:.4f} (std {std_metrics[k]:.4f})")
    print("\n--- Anomaly detection ---")
    for k in ['accuracy','precision','recall','f1_score','roc_auc','avg_precision']:
        print(f"{k}: {anomaly_metrics[k]:.4f}")

    if IPyAudio is not None and len(orig_audio)>0:
        print("\nPlaying original (first) and reconstructed (first) audio (notebook only):")
        ipy_display(IPyAudio(orig_audio[0], rate=config.SAMPLE_RATE))
        if recon_audio[0] is not None:
            ipy_display(IPyAudio(recon_audio[0], rate=config.SAMPLE_RATE))
        else:
            print("No reconstructed audio available for playback.")

    print("\nOutputs saved to:", os.path.abspath(config.OUTPUT_PATH))
    print("Total time (min):", (time.time()-t0)/60.0)

if __name__ == "__main__":
    main()



=== Speech VAE (librosa-based) ===

Data folder: C:\Users\koust\Koustab Projects\VAE_LVM\Speech\librispeech_data
Found 1000 audio files under C:\Users\koust\Koustab Projects\VAE_LVM\Speech\librispeech_data
Dataset splits -> train: 700, val: 150, test: 150
Model parameters: 12397057


Epoch 1/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:05<00:00,  8.39it/s, loss=1.96e+4]


Epoch 1/200 | Train: 22291.3098 | Val: 19138.9977 | Recon: 22254.2127 | KL: 37.0970


Epoch 2/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.04it/s, loss=1.44e+4]


Epoch 2/200 | Train: 17324.8781 | Val: 15580.3573 | Recon: 17276.7410 | KL: 48.1373


Epoch 3/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.42it/s, loss=1.41e+4]


Epoch 3/200 | Train: 14127.5399 | Val: 13116.7184 | Recon: 14070.2917 | KL: 57.2482


Epoch 4/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.50it/s, loss=1.06e+4]


Epoch 4/200 | Train: 12662.1161 | Val: 12244.2904 | Recon: 12601.5794 | KL: 60.5368


Epoch 5/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.93it/s, loss=1.26e+4]


Epoch 5/200 | Train: 12114.8207 | Val: 11844.0983 | Recon: 12049.9836 | KL: 64.8371


Epoch 6/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.89it/s, loss=1.13e+4]


Epoch 6/200 | Train: 11641.4188 | Val: 11555.3805 | Recon: 11572.8828 | KL: 68.5359


Epoch 7/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.63it/s, loss=1.02e+4]


Epoch 7/200 | Train: 11455.8190 | Val: 11339.0943 | Recon: 11383.8168 | KL: 72.0022


Epoch 8/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.01it/s, loss=1.05e+4]


Epoch 8/200 | Train: 11270.4582 | Val: 11024.3610 | Recon: 11195.1001 | KL: 75.3581


Epoch 9/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.37it/s, loss=1.13e+4]


Epoch 9/200 | Train: 11037.1756 | Val: 10820.8942 | Recon: 10958.9725 | KL: 78.2032


Epoch 10/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.34it/s, loss=1.14e+4]


Epoch 10/200 | Train: 10898.3008 | Val: 10737.5537 | Recon: 10817.5175 | KL: 80.7833


Epoch 11/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.39it/s, loss=9.91e+3]


Epoch 11/200 | Train: 10669.0603 | Val: 10544.3347 | Recon: 10585.7974 | KL: 83.2629


Epoch 12/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.22it/s, loss=1.18e+4]


Epoch 12/200 | Train: 10571.1772 | Val: 10419.6710 | Recon: 10484.9052 | KL: 86.2719


Epoch 13/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.60it/s, loss=9.79e+3]


Epoch 13/200 | Train: 10459.2127 | Val: 10313.5689 | Recon: 10370.0668 | KL: 89.1459


Epoch 14/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.46it/s, loss=1.02e+4]


Epoch 14/200 | Train: 10338.3571 | Val: 10146.7867 | Recon: 10247.6030 | KL: 90.7541


Epoch 15/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.67it/s, loss=1.07e+4]


Epoch 15/200 | Train: 10237.0679 | Val: 10100.6955 | Recon: 10144.4549 | KL: 92.6130


Epoch 16/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.20it/s, loss=9.77e+3]


Epoch 16/200 | Train: 10050.7358 | Val: 10070.9568 | Recon: 9956.1363 | KL: 94.5995


Epoch 17/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.53it/s, loss=1.07e+4]


Epoch 17/200 | Train: 9999.0454 | Val: 9846.7143 | Recon: 9902.0186 | KL: 97.0268


Epoch 18/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.55it/s, loss=9.99e+3]


Epoch 18/200 | Train: 9929.6557 | Val: 9803.2028 | Recon: 9830.2873 | KL: 99.3684


Epoch 19/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.54it/s, loss=9.95e+3]


Epoch 19/200 | Train: 9833.9732 | Val: 9734.0185 | Recon: 9732.8451 | KL: 101.1280


Epoch 20/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.10it/s, loss=9.58e+3]


Epoch 20/200 | Train: 9757.2643 | Val: 9597.2505 | Recon: 9654.5085 | KL: 102.7558


Epoch 21/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.21it/s, loss=9.94e+3]


Epoch 21/200 | Train: 9624.0304 | Val: 9469.5025 | Recon: 9518.4687 | KL: 105.5617


Epoch 22/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.77it/s, loss=8.13e+3]


Epoch 22/200 | Train: 9578.2552 | Val: 9402.9192 | Recon: 9471.3574 | KL: 106.8978


Epoch 23/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.76it/s, loss=9.52e+3]


Epoch 23/200 | Train: 9454.5960 | Val: 9483.8963 | Recon: 9345.7438 | KL: 108.8521


Epoch 24/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.02it/s, loss=8.41e+3]


Epoch 24/200 | Train: 9481.7018 | Val: 9231.4308 | Recon: 9371.3400 | KL: 110.3619


Epoch 25/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.04it/s, loss=9.44e+3]


Epoch 25/200 | Train: 9366.4821 | Val: 9261.3589 | Recon: 9254.3442 | KL: 112.1379


Epoch 26/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.10it/s, loss=9.74e+3]


Epoch 26/200 | Train: 9273.9184 | Val: 9197.9062 | Recon: 9160.7061 | KL: 113.2123


Epoch 27/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.05it/s, loss=9.61e+3]


Epoch 27/200 | Train: 9217.1455 | Val: 9231.7596 | Recon: 9101.7647 | KL: 115.3808


Epoch 28/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.13it/s, loss=9.35e+3]


Epoch 28/200 | Train: 9230.1706 | Val: 9038.9626 | Recon: 9113.0484 | KL: 117.1222


Epoch 29/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.56it/s, loss=9.44e+3]


Epoch 29/200 | Train: 9175.4663 | Val: 9207.1603 | Recon: 9057.2327 | KL: 118.2337


Epoch 30/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.28it/s, loss=8.47e+3]


Epoch 30/200 | Train: 9132.7623 | Val: 9184.0047 | Recon: 9012.4414 | KL: 120.3209


Epoch 31/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.11it/s, loss=9.3e+3]


Epoch 31/200 | Train: 9083.9093 | Val: 8986.7991 | Recon: 8963.3814 | KL: 120.5280


Epoch 32/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.16it/s, loss=9.9e+3]


Epoch 32/200 | Train: 8994.5931 | Val: 8918.6255 | Recon: 8872.3067 | KL: 122.2865


Epoch 33/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.06it/s, loss=7.88e+3]


Epoch 33/200 | Train: 8992.4112 | Val: 8969.9191 | Recon: 8869.0002 | KL: 123.4110


Epoch 34/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.04it/s, loss=9.58e+3]


Epoch 34/200 | Train: 8992.3928 | Val: 8899.0756 | Recon: 8867.7753 | KL: 124.6175


Epoch 35/200: 100%|█████████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.38it/s, loss=1e+4]


Epoch 35/200 | Train: 8949.3759 | Val: 8953.5815 | Recon: 8823.2896 | KL: 126.0863


Epoch 36/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.42it/s, loss=9.78e+3]


Epoch 36/200 | Train: 8901.8187 | Val: 8750.8655 | Recon: 8774.8548 | KL: 126.9639


Epoch 37/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.75it/s, loss=8.84e+3]


Epoch 37/200 | Train: 8867.3739 | Val: 8820.8013 | Recon: 8739.0754 | KL: 128.2984


Epoch 38/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.00it/s, loss=8.52e+3]


Epoch 38/200 | Train: 8825.6515 | Val: 8729.3404 | Recon: 8695.6550 | KL: 129.9964


Epoch 39/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.45it/s, loss=8.32e+3]


Epoch 39/200 | Train: 8785.3164 | Val: 8844.9793 | Recon: 8654.6096 | KL: 130.7068


Epoch 40/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.40it/s, loss=7.68e+3]


Epoch 40/200 | Train: 8815.4084 | Val: 8692.9384 | Recon: 8683.5788 | KL: 131.8297


Epoch 41/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.41it/s, loss=7.72e+3]


Epoch 41/200 | Train: 8741.6014 | Val: 8688.4776 | Recon: 8608.6077 | KL: 132.9936


Epoch 42/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.40it/s, loss=8.77e+3]


Epoch 42/200 | Train: 8687.7120 | Val: 8638.3873 | Recon: 8553.2967 | KL: 134.4152


Epoch 43/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.46it/s, loss=7.26e+3]


Epoch 43/200 | Train: 8666.5137 | Val: 8679.1812 | Recon: 8531.7577 | KL: 134.7559


Epoch 44/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.25it/s, loss=8.84e+3]


Epoch 44/200 | Train: 8712.5246 | Val: 8693.2541 | Recon: 8577.3245 | KL: 135.2002


Epoch 45/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.51it/s, loss=9.28e+3]


Epoch 45/200 | Train: 8659.1318 | Val: 8438.5666 | Recon: 8522.6731 | KL: 136.4587


Epoch 46/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.43it/s, loss=1.07e+4]


Epoch 46/200 | Train: 8630.4078 | Val: 8635.8557 | Recon: 8492.9793 | KL: 137.4285


Epoch 47/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.52it/s, loss=8.5e+3]


Epoch 47/200 | Train: 8611.7368 | Val: 8505.2900 | Recon: 8473.6697 | KL: 138.0671


Epoch 48/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.41it/s, loss=9.62e+3]


Epoch 48/200 | Train: 8559.8689 | Val: 8545.6693 | Recon: 8419.5813 | KL: 140.2877


Epoch 49/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.47it/s, loss=9.23e+3]


Epoch 49/200 | Train: 8608.1771 | Val: 8408.9220 | Recon: 8467.7643 | KL: 140.4128


Epoch 50/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.58it/s, loss=8.21e+3]


Epoch 50/200 | Train: 8572.5275 | Val: 8465.5315 | Recon: 8431.6685 | KL: 140.8591


Epoch 51/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.51it/s, loss=8.53e+3]


Epoch 51/200 | Train: 8517.9131 | Val: 8580.1664 | Recon: 8375.9814 | KL: 141.9318


Epoch 52/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.16it/s, loss=9.33e+3]


Epoch 52/200 | Train: 8549.0744 | Val: 8459.6324 | Recon: 8406.5048 | KL: 142.5696


Epoch 53/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.67it/s, loss=9.02e+3]


Epoch 53/200 | Train: 8507.5245 | Val: 8469.1507 | Recon: 8363.8892 | KL: 143.6352


Epoch 54/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.47it/s, loss=8.51e+3]


Epoch 54/200 | Train: 8471.5992 | Val: 8419.5801 | Recon: 8328.0780 | KL: 143.5213


Epoch 55/200: 100%|█████████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.46it/s, loss=9e+3]


Epoch 55/200 | Train: 8445.8119 | Val: 8455.5841 | Recon: 8300.2733 | KL: 145.5386


Epoch 56/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.50it/s, loss=8.74e+3]


Epoch 56/200 | Train: 8476.6438 | Val: 8433.6244 | Recon: 8330.7287 | KL: 145.9150


Epoch 57/200: 100%|███████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.57it/s, loss=8.1e+3]


Epoch 57/200 | Train: 8476.4464 | Val: 8395.2746 | Recon: 8330.1810 | KL: 146.2655


Epoch 58/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.50it/s, loss=9.32e+3]


Epoch 58/200 | Train: 8439.3681 | Val: 8470.1136 | Recon: 8291.8784 | KL: 147.4897


Epoch 59/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.53it/s, loss=9.87e+3]


Epoch 59/200 | Train: 8383.5599 | Val: 8368.8463 | Recon: 8235.0890 | KL: 148.4709


Epoch 60/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.88it/s, loss=8.68e+3]


Epoch 60/200 | Train: 8426.1114 | Val: 8357.7555 | Recon: 8277.8567 | KL: 148.2547


Epoch 61/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.93it/s, loss=9.44e+3]


Epoch 61/200 | Train: 8354.7471 | Val: 8430.7053 | Recon: 8205.6487 | KL: 149.0983


Epoch 62/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.43it/s, loss=8.91e+3]


Epoch 62/200 | Train: 8379.1921 | Val: 8303.1207 | Recon: 8228.3963 | KL: 150.7959


Epoch 63/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.48it/s, loss=8.78e+3]


Epoch 63/200 | Train: 8358.2324 | Val: 8370.5507 | Recon: 8206.9770 | KL: 151.2555


Epoch 64/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.50it/s, loss=9.43e+3]


Epoch 64/200 | Train: 8320.8203 | Val: 8391.2129 | Recon: 8169.1061 | KL: 151.7143


Epoch 65/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.47it/s, loss=7.35e+3]


Epoch 65/200 | Train: 8328.4833 | Val: 8298.6297 | Recon: 8176.7301 | KL: 151.7532


Epoch 66/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.36it/s, loss=8.18e+3]


Epoch 66/200 | Train: 8295.6067 | Val: 8361.8933 | Recon: 8142.3179 | KL: 153.2889


Epoch 67/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.37it/s, loss=7.77e+3]


Epoch 67/200 | Train: 8315.9705 | Val: 8208.7062 | Recon: 8162.1141 | KL: 153.8565


Epoch 68/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.69it/s, loss=8.52e+3]


Epoch 68/200 | Train: 8303.0782 | Val: 8277.9551 | Recon: 8148.7831 | KL: 154.2950


Epoch 69/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.40it/s, loss=8.73e+3]


Epoch 69/200 | Train: 8266.6686 | Val: 8235.5483 | Recon: 8111.4781 | KL: 155.1905


Epoch 70/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.44it/s, loss=8.04e+3]


Epoch 70/200 | Train: 8258.7292 | Val: 8209.4373 | Recon: 8102.6958 | KL: 156.0335


Epoch 71/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.48it/s, loss=7.38e+3]


Epoch 71/200 | Train: 8289.6903 | Val: 8282.2230 | Recon: 8133.3912 | KL: 156.2991


Epoch 72/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.49it/s, loss=7.86e+3]


Epoch 72/200 | Train: 8204.0096 | Val: 8229.6247 | Recon: 8047.2303 | KL: 156.7794


Epoch 73/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.41it/s, loss=9.03e+3]


Epoch 73/200 | Train: 8214.4711 | Val: 8314.3177 | Recon: 8057.7372 | KL: 156.7340


Epoch 74/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.39it/s, loss=9.37e+3]


Epoch 74/200 | Train: 8226.5118 | Val: 8217.0132 | Recon: 8068.9993 | KL: 157.5125


Epoch 75/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.03it/s, loss=7.44e+3]


Epoch 75/200 | Train: 8225.9854 | Val: 8397.2737 | Recon: 8067.8473 | KL: 158.1381


Epoch 76/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.34it/s, loss=9.84e+3]


Epoch 76/200 | Train: 8217.6210 | Val: 8192.1732 | Recon: 8059.0253 | KL: 158.5958


Epoch 77/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.13it/s, loss=7.09e+3]


Epoch 77/200 | Train: 8197.6554 | Val: 8222.2061 | Recon: 8037.9663 | KL: 159.6891


Epoch 78/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.11it/s, loss=8.37e+3]


Epoch 78/200 | Train: 8241.6726 | Val: 8237.9151 | Recon: 8082.7637 | KL: 158.9089


Epoch 79/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.14it/s, loss=8.09e+3]


Epoch 79/200 | Train: 8192.4461 | Val: 8168.4052 | Recon: 8031.5704 | KL: 160.8757


Epoch 80/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.13it/s, loss=8.72e+3]


Epoch 80/200 | Train: 8148.3501 | Val: 8319.2927 | Recon: 7987.0767 | KL: 161.2734


Epoch 81/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.17it/s, loss=7.85e+3]


Epoch 81/200 | Train: 8182.3547 | Val: 8262.4202 | Recon: 8020.7627 | KL: 161.5920


Epoch 82/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.14it/s, loss=7.84e+3]


Epoch 82/200 | Train: 8125.4679 | Val: 8243.1386 | Recon: 7963.9848 | KL: 161.4830


Epoch 83/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.77it/s, loss=7.46e+3]


Epoch 83/200 | Train: 8172.0312 | Val: 8243.1402 | Recon: 8009.6564 | KL: 162.3747


Epoch 84/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.74it/s, loss=8.16e+3]


Epoch 84/200 | Train: 8141.8863 | Val: 8279.9007 | Recon: 7979.2595 | KL: 162.6269


Epoch 85/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.90it/s, loss=8.25e+3]


Epoch 85/200 | Train: 8116.1367 | Val: 8210.7160 | Recon: 7953.3993 | KL: 162.7374


Epoch 86/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.16it/s, loss=8.21e+3]


Epoch 86/200 | Train: 8157.9703 | Val: 8174.4878 | Recon: 7994.9205 | KL: 163.0498


Epoch 87/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.52it/s, loss=7.84e+3]


Epoch 87/200 | Train: 8080.4854 | Val: 8184.7280 | Recon: 7916.6106 | KL: 163.8748


Epoch 88/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.42it/s, loss=7.92e+3]


Epoch 88/200 | Train: 8116.3735 | Val: 8154.9814 | Recon: 7951.4638 | KL: 164.9097


Epoch 89/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.41it/s, loss=9.33e+3]


Epoch 89/200 | Train: 8100.4661 | Val: 8149.1865 | Recon: 7936.2344 | KL: 164.2317


Epoch 90/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.24it/s, loss=6.89e+3]


Epoch 90/200 | Train: 8086.0754 | Val: 8142.7520 | Recon: 7921.3567 | KL: 164.7187


Epoch 91/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.62it/s, loss=7.87e+3]


Epoch 91/200 | Train: 8104.0019 | Val: 8260.6730 | Recon: 7938.4179 | KL: 165.5841


Epoch 92/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.11it/s, loss=9.37e+3]


Epoch 92/200 | Train: 8088.1802 | Val: 8181.0137 | Recon: 7922.1092 | KL: 166.0710


Epoch 93/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.21it/s, loss=7.64e+3]


Epoch 93/200 | Train: 8143.4343 | Val: 8075.1566 | Recon: 7977.0120 | KL: 166.4223


Epoch 94/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.42it/s, loss=7.74e+3]


Epoch 94/200 | Train: 8105.9937 | Val: 8172.0955 | Recon: 7939.0141 | KL: 166.9797


Epoch 95/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.44it/s, loss=8.82e+3]


Epoch 95/200 | Train: 8088.8634 | Val: 8086.3217 | Recon: 7921.0710 | KL: 167.7924


Epoch 96/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.51it/s, loss=8.68e+3]


Epoch 96/200 | Train: 8081.3772 | Val: 8208.4985 | Recon: 7913.6723 | KL: 167.7048


Epoch 97/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.33it/s, loss=8.53e+3]


Epoch 97/200 | Train: 8024.1519 | Val: 8103.3200 | Recon: 7856.1774 | KL: 167.9745


Epoch 98/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.05it/s, loss=8.17e+3]


Epoch 98/200 | Train: 8008.6910 | Val: 8187.4930 | Recon: 7840.2797 | KL: 168.4113


Epoch 99/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.62it/s, loss=7.22e+3]


Epoch 99/200 | Train: 8053.0510 | Val: 8097.5591 | Recon: 7884.4708 | KL: 168.5802


Epoch 100/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.26it/s, loss=8.48e+3]


Epoch 100/200 | Train: 8043.3988 | Val: 8076.5629 | Recon: 7875.1493 | KL: 168.2494


Epoch 101/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.19it/s, loss=8.49e+3]


Epoch 101/200 | Train: 8069.5180 | Val: 8175.6441 | Recon: 7900.4732 | KL: 169.0448


Epoch 102/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.35it/s, loss=7.66e+3]


Epoch 102/200 | Train: 8023.6729 | Val: 8110.7523 | Recon: 7854.0508 | KL: 169.6221


Epoch 103/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.23it/s, loss=7.11e+3]


Epoch 103/200 | Train: 7977.1233 | Val: 7992.7525 | Recon: 7806.2664 | KL: 170.8569


Epoch 104/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.36it/s, loss=7.99e+3]


Epoch 104/200 | Train: 7949.3436 | Val: 8004.1922 | Recon: 7776.5438 | KL: 172.7999


Epoch 105/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.01it/s, loss=8.07e+3]


Epoch 105/200 | Train: 7846.9186 | Val: 8026.7666 | Recon: 7674.5624 | KL: 172.3562


Epoch 106/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.36it/s, loss=8.36e+3]


Epoch 106/200 | Train: 7885.3827 | Val: 7925.0880 | Recon: 7712.1205 | KL: 173.2623


Epoch 107/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.95it/s, loss=8.83e+3]


Epoch 107/200 | Train: 7883.1469 | Val: 7966.7802 | Recon: 7709.6548 | KL: 173.4922


Epoch 108/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.15it/s, loss=7.98e+3]


Epoch 108/200 | Train: 7883.3766 | Val: 7985.8630 | Recon: 7709.2178 | KL: 174.1588


Epoch 109/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.53it/s, loss=7.49e+3]


Epoch 109/200 | Train: 7864.9166 | Val: 7987.1902 | Recon: 7690.9253 | KL: 173.9913


Epoch 110/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.46it/s, loss=8.81e+3]


Epoch 110/200 | Train: 7872.5094 | Val: 7927.4912 | Recon: 7698.8935 | KL: 173.6158


Epoch 111/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.49it/s, loss=9.15e+3]


Epoch 111/200 | Train: 7830.9845 | Val: 8010.2299 | Recon: 7655.5608 | KL: 175.4237


Epoch 112/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.48it/s, loss=8.77e+3]


Epoch 112/200 | Train: 7850.4017 | Val: 7991.9929 | Recon: 7675.6444 | KL: 174.7573


Epoch 113/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.23it/s, loss=8.17e+3]


Epoch 113/200 | Train: 7854.6964 | Val: 8171.1346 | Recon: 7679.3370 | KL: 175.3593


Epoch 114/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.43it/s, loss=7.77e+3]


Epoch 114/200 | Train: 7818.8096 | Val: 7931.4020 | Recon: 7643.5538 | KL: 175.2558


Epoch 115/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.23it/s, loss=6.3e+3]


Epoch 115/200 | Train: 7849.0814 | Val: 7950.9296 | Recon: 7673.5515 | KL: 175.5299


Epoch 116/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.43it/s, loss=7.99e+3]


Epoch 116/200 | Train: 7829.2626 | Val: 7904.3657 | Recon: 7654.2494 | KL: 175.0132


Epoch 117/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.05it/s, loss=7.92e+3]


Epoch 117/200 | Train: 7804.8840 | Val: 7915.9467 | Recon: 7628.5884 | KL: 176.2955


Epoch 118/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.20it/s, loss=8.2e+3]


Epoch 118/200 | Train: 7776.8498 | Val: 7873.0255 | Recon: 7600.0886 | KL: 176.7612


Epoch 119/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.39it/s, loss=7.69e+3]


Epoch 119/200 | Train: 7783.2844 | Val: 7982.4762 | Recon: 7605.9141 | KL: 177.3703


Epoch 120/200: 100%|████████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.47it/s, loss=8e+3]


Epoch 120/200 | Train: 7814.5257 | Val: 7979.7563 | Recon: 7637.4262 | KL: 177.0995


Epoch 121/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.00it/s, loss=8.15e+3]


Epoch 121/200 | Train: 7802.4407 | Val: 7936.9296 | Recon: 7625.0023 | KL: 177.4384


Epoch 122/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.45it/s, loss=8.16e+3]


Epoch 122/200 | Train: 7805.9680 | Val: 7957.1814 | Recon: 7628.5670 | KL: 177.4010


Epoch 123/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.35it/s, loss=8.09e+3]


Epoch 123/200 | Train: 7786.4737 | Val: 7877.8349 | Recon: 7609.0145 | KL: 177.4591


Epoch 124/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.37it/s, loss=7.61e+3]


Epoch 124/200 | Train: 7746.6252 | Val: 7924.7102 | Recon: 7568.5283 | KL: 178.0969


Epoch 125/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.46it/s, loss=7.72e+3]


Epoch 125/200 | Train: 7758.6475 | Val: 7797.5907 | Recon: 7581.5500 | KL: 177.0975


Epoch 126/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.55it/s, loss=6.77e+3]


Epoch 126/200 | Train: 7775.8896 | Val: 7989.6342 | Recon: 7598.2800 | KL: 177.6097


Epoch 127/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.60it/s, loss=7.68e+3]


Epoch 127/200 | Train: 7751.7217 | Val: 7869.1654 | Recon: 7573.8786 | KL: 177.8431


Epoch 128/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.37it/s, loss=7.36e+3]


Epoch 128/200 | Train: 7710.5195 | Val: 7828.9436 | Recon: 7532.0182 | KL: 178.5013


Epoch 129/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.83it/s, loss=7.35e+3]


Epoch 129/200 | Train: 7729.3394 | Val: 7824.3855 | Recon: 7550.9996 | KL: 178.3399


Epoch 130/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.78it/s, loss=8.16e+3]


Epoch 130/200 | Train: 7751.5751 | Val: 7917.6695 | Recon: 7572.9836 | KL: 178.5915


Epoch 131/200: 100%|██████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.49it/s, loss=7.5e+3]


Epoch 131/200 | Train: 7753.2208 | Val: 7902.1613 | Recon: 7575.0567 | KL: 178.1641


Epoch 132/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.50it/s, loss=7.02e+3]


Epoch 132/200 | Train: 7748.1090 | Val: 7883.5633 | Recon: 7569.3984 | KL: 178.7106


Epoch 133/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.31it/s, loss=8.17e+3]


Epoch 133/200 | Train: 7780.9427 | Val: 7967.5254 | Recon: 7601.5921 | KL: 179.3505


Epoch 134/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.96it/s, loss=7.53e+3]


Epoch 134/200 | Train: 7743.5055 | Val: 7913.4812 | Recon: 7564.3347 | KL: 179.1708


Epoch 135/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.43it/s, loss=7.32e+3]


Epoch 135/200 | Train: 7700.0995 | Val: 7885.4835 | Recon: 7520.6012 | KL: 179.4983


Epoch 136/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 13.06it/s, loss=7.69e+3]


Epoch 136/200 | Train: 7696.5527 | Val: 7912.9671 | Recon: 7516.9742 | KL: 179.5786


Epoch 137/200: 100%|█████████████████████████████████████████████████████| 44/44 [00:03<00:00, 12.48it/s, loss=7.11e+3]


Epoch 137/200 | Train: 7680.1541 | Val: 7847.2088 | Recon: 7499.9279 | KL: 180.2262
Early stopping triggered.
Loaded best model from ./output\models\best_vae_model.pth


Evaluating:  80%|████████████████████████████████████████████████████████▊              | 8/10 [00:01<00:00,  4.24it/s]


Saved audio samples to ./output\audio
Computing t-SNE (this may take a while)...


  File "C:\Users\koust\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\koust\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\koust\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\koust\anaconda3\Lib\subprocess.


--- Reconstruction metrics (averages) ---
mse: 0.3187 (std 0.0766)
mae: 0.4427 (std 0.0561)
psnr: 5.1024 (std 1.1227)
snr: 5.1024 (std 1.1227)
ssim: 0.2672 (std 0.2427)
spectral_conv: 0.5603 (std 0.0695)

--- Anomaly detection ---
accuracy: 0.9636
precision: 1.0000
recall: 0.6000
f1_score: 0.7500
roc_auc: 1.0000
avg_precision: 1.0000

Playing original (first) and reconstructed (first) audio (notebook only):



Outputs saved to: C:\Users\koust\Koustab Projects\VAE_LVM\Speech\output
Total time (min): 9.142680438359578
