### Speaker Verification with Pretrained and Fine-Tuned Models
The first part of this assignment focuses on assessing a pretrained speaker verification model and subsequently improving its performance through fine-tuning. This process involves selecting an appropriate model, evaluating it on a standard dataset, and adapting it to enhance its discriminative capabilities.

In [37]:
import os
import torch
import torchaudio
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, WavLMModel
from sklearn.metrics import roc_curve, accuracy_score
from tqdm import tqdm


In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
model_name = "microsoft/wavlm-base-plus"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = WavLMModel.from_pretrained(model_name).to(device)
model.eval()

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

WavLMModel(
  (feature_extractor): WavLMFeatureEncoder(
    (conv_layers): ModuleList(
      (0): WavLMGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x WavLMNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x WavLMNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): WavLMFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): WavLMEncoder(
    (pos_conv_embed): WavLMPositionalConvEmbedding(
      (conv): Parametrized

In [None]:
def load_audio(filepath, target_sr=16000):
    waveform, sr = torchaudio.load(filepath)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
    return waveform.squeeze(0)  

In [None]:
def extract_embedding(audio_tensor):
    inputs = feature_extractor(audio_tensor, sampling_rate=16000, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)
    with torch.no_grad():
        outputs = model(input_values)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy()[0]

In [6]:
def cosine_similarity(vec1, vec2):
    vec1_norm = vec1 / np.linalg.norm(vec1)
    vec2_norm = vec2 / np.linalg.norm(vec2)
    return np.dot(vec1_norm, vec2_norm)

In [None]:
def compute_eer(labels, scores):
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    fnr = 1 - tpr
    abs_diffs = np.abs(fnr - fpr)
    idx_eer = np.nanargmin(abs_diffs)
    eer = fpr[idx_eer]  
    return eer * 100, thresholds[idx_eer]

In [None]:
def compute_tar_at_far(labels, scores, target_far=0.01):
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    valid_idxs = np.where(fpr <= target_far)[0]
    if len(valid_idxs) == 0:
        return 0.0
    tar = tpr[valid_idxs[-1]]
    return tar * 100

In [9]:
def compute_accuracy(labels, scores, threshold):
    predictions = (np.array(scores) >= threshold).astype(int)
    return accuracy_score(labels, predictions) * 100


In [None]:
vox1_audio_dir = "/kaggle/input/voxcelebdataset-su/vox1_test_wav/wav"  
trial_file_path = "/kaggle/input/vox1-trialpair/vox1_trialpair.txt"

In [None]:
embedding_cache = {}

In [43]:
def get_embedding_for_file(filename):
    if filename in embedding_cache:
        return embedding_cache[filename]
    audio_path = os.path.join(vox1_audio_dir, filename)
    audio = load_audio(audio_path)
    emb = extract_embedding(audio)
    embedding_cache[filename] = emb
    return emb

In [None]:
with open(trial_file_path, "r") as f:
    lines = f.readlines()

lines

In [None]:
trial_labels = []
trial_scores = []


for line in tqdm(lines, desc="Processing trials"):
    parts = line.strip().split()
    
    if len(parts) != 3:
        print(f"Skipping malformed line: {line.strip()}")
        continue
    
    try:
        label = int(parts[0]) 
        file1 = parts[1]
        file2 = parts[2]
    except ValueError as e:
        print(f"Error parsing line: {line.strip()} - {e}")
        continue
    
    emb1 = get_embedding_for_file(file1)
    emb2 = get_embedding_for_file(file2)
    score = cosine_similarity(emb1, emb2)
    
    trial_labels.append(label)
    trial_scores.append(score)

Processing trials: 100%|██████████| 37611/37611 [04:29<00:00, 139.41it/s] 


In [47]:
trial_labels = np.array(trial_labels)
trial_scores = np.array(trial_scores)

In [None]:
eer, eer_threshold = compute_eer(trial_labels, trial_scores)
tar_at_1_far = compute_tar_at_far(trial_labels, trial_scores, target_far=0.01)
verification_accuracy = compute_accuracy(trial_labels, trial_scores, threshold=eer_threshold)
print("Pretrained Model Verification Results:")
print(f"EER: {eer:.2f}% at threshold {eer_threshold:.4f}")
print(f"TAR@1%FAR: {tar_at_1_far:.2f}%")
print(f"Speaker Verification Accuracy: {verification_accuracy:.2f}%")


Pretrained Model Verification Results:
EER: 36.73% at threshold 0.8699
TAR@1%FAR: 7.39%
Speaker Verification Accuracy: 63.27%


In [49]:
!apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 129 not upgraded.


In [15]:
pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [53]:
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import glob
from pydub import AudioSegment
import imageio_ffmpeg as ffmpeg
import ffmpeg
import io
import random
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
import torchaudio
torchaudio.set_audio_backend("ffmpeg")

  torchaudio.set_audio_backend("ffmpeg")


### ArcFace 

In [78]:
class ArcMarginProduct(nn.Module):
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(torch.clamp(1.0 - torch.pow(cosine, 2), min=1e-6))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = torch.zeros(cosine.size(), device=input.device)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

### LoRA module for linear adaptation

In [79]:
class LoRALinear(nn.Module):
    def __init__(self, in_features, out_features, r=4, alpha=16.0, dropout=0.1):
        super(LoRALinear, self).__init__()
        self.r = r
        if r > 0:
            self.lora_A = nn.Linear(in_features, r, bias=False)
            self.lora_B = nn.Linear(r, out_features, bias=False)
            self.scaling = alpha / r
            self.dropout = nn.Dropout(dropout)
        else:
            self.lora_A = None
            self.lora_B = None

    def forward(self, x):
        if self.r > 0:
            return self.lora_B(self.dropout(self.lora_A(x))) * self.scaling
        else:
            return 0

### Fine-tuning model wrapper

In [None]:
class FineTuneModel(nn.Module):
    def __init__(self, pretrained_model, embedding_dim, num_classes, lora_r=4, lora_alpha=16):
        super(FineTuneModel, self).__init__()
        self.pretrained = pretrained_model
        for param in self.pretrained.parameters():
            param.requires_grad = False
        self.lora = LoRALinear(embedding_dim, embedding_dim, r=lora_r, alpha=lora_alpha)
        self.arcface = ArcMarginProduct(embedding_dim, num_classes)
    
    def forward(self, input_values, labels):
        outputs = self.pretrained(input_values)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        adapted_embeddings = embeddings + self.lora(embeddings)
        logits = self.arcface(adapted_embeddings, labels)
        return logits, adapted_embeddings
    
    def extract_embeddings(self, input_values):
        outputs = self.pretrained(input_values)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        adapted_embeddings = embeddings + self.lora(embeddings)
        return adapted_embeddings

### Define VoxCeleb2 Dataset for Fine-Tuning

In [None]:
import random

def crop_waveform(waveform, target_length):
    num_samples = waveform.shape[-1]
    if num_samples <= target_length:
        return waveform
    start = random.randint(0, num_samples - target_length)
    return waveform[..., start:start+target_length]

class VoxCeleb2Dataset(Dataset):
    def __init__(self, root_dir, identities, feature_extractor, max_duration=3, max_samples_per_speaker=300):
        self.samples = []
        self.root_dir = root_dir
        self.feature_extractor = feature_extractor
        self.max_duration = max_duration 
        for speaker in identities:
            speaker_files = []
            speaker_dir = os.path.join(root_dir, speaker)
            for subdir, dirs, files in os.walk(speaker_dir):
                for file in files:
                    if file.lower().endswith((".m4a", ".wav", ".mp3")):
                        audio_file = os.path.join(subdir, file)
                        speaker_files.append((audio_file, speaker))
            if len(speaker_files) > max_samples_per_speaker:
                speaker_files = random.sample(speaker_files, max_samples_per_speaker)
            self.samples.extend(speaker_files)
        self.speaker2label = {speaker: idx for idx, speaker in enumerate(sorted(identities))}

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        audio_path, speaker = self.samples[idx]
        if not isinstance(audio_path, str):
            print("DEBUG: audio_path is not a string:", audio_path, type(audio_path))
        waveform, sr = torchaudio.load(audio_path)
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        waveform = waveform.squeeze(0) 
        target_length = 16000 * self.max_duration  
        waveform = crop_waveform(waveform, target_length)
        label = self.speaker2label[speaker]
        return waveform, label


    
    


In [None]:
def vox2_collate_fn(batch):
        waveforms, labels = zip(*batch)
        waveforms_np = [w.numpy() for w in waveforms]
        inputs = feature_extractor(waveforms_np, sampling_rate=16000, return_tensors="pt", padding=True)
        return inputs.input_values, torch.tensor(labels)

In [83]:
vox2_dir = "/kaggle/input/voxcelebdataset-su/vox2_test_aac/aac"

In [None]:
first_folder = os.path.join(vox2_dir, os.listdir(vox2_dir)[0])
print(f"Directory tree for {first_folder}:\n")
for root, dirs, files in os.walk(first_folder):
    level = root.replace(first_folder, "").count(os.sep)
    indent = " " * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = " " * 4 * (level + 1)
    for file in files:
        print(f"{sub_indent}{file}")

In [None]:
all_identities = sorted(os.listdir(vox2_dir))
print(f"Found {len(all_identities)} identities in VoxCeleb2 dataset.")
print(f"Identities: {all_identities}")
train_identities = all_identities[:100]
print(f"Using {len(train_identities)} identities for training.")
print(f"Training Identities: {train_identities}")
num_classes = len(train_identities)
batch_size = 32
num_epochs = 10  
learning_rate = 1e-4

Found 118 identities in VoxCeleb2 dataset.
Identities: ['id00017', 'id00061', 'id00081', 'id00154', 'id00419', 'id00562', 'id00812', 'id00817', 'id00866', 'id00926', 'id01000', 'id01041', 'id01066', 'id01106', 'id01224', 'id01228', 'id01298', 'id01333', 'id01437', 'id01460', 'id01509', 'id01541', 'id01567', 'id01593', 'id01618', 'id01822', 'id01892', 'id01989', 'id02019', 'id02057', 'id02086', 'id02181', 'id02286', 'id02317', 'id02445', 'id02465', 'id02542', 'id02548', 'id02576', 'id02577', 'id02685', 'id02725', 'id02745', 'id03030', 'id03041', 'id03127', 'id03178', 'id03347', 'id03382', 'id03524', 'id03677', 'id03789', 'id03839', 'id03862', 'id03969', 'id03978', 'id03980', 'id03981', 'id04006', 'id04030', 'id04094', 'id04119', 'id04232', 'id04253', 'id04276', 'id04295', 'id04366', 'id04478', 'id04536', 'id04570', 'id04627', 'id04656', 'id04657', 'id04862', 'id04950', 'id05015', 'id05055', 'id05124', 'id05176', 'id05202', 'id05459', 'id05594', 'id05654', 'id05714', 'id05816', 'id05850'

In [None]:
train_dataset = VoxCeleb2Dataset(vox2_dir, train_identities, feature_extractor)
print(f"Number of samples in train_dataset: {len(train_dataset)}")

Number of samples in train_dataset: 23186


In [98]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=vox2_collate_fn)

In [None]:
embedding_dim = model.config.hidden_size  
num_classes = len(train_identities)  
finetune_model = FineTuneModel(model, embedding_dim, num_classes, lora_r=4, lora_alpha=16)
finetune_model.to(device)
finetune_model.train()

FineTuneModel(
  (pretrained): WavLMModel(
    (feature_extractor): WavLMFeatureEncoder(
      (conv_layers): ModuleList(
        (0): WavLMGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x WavLMNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x WavLMNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): WavLMFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): WavLMEncoder(
 

In [None]:
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, finetune_model.parameters()), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
from tqdm import tqdm
best_loss = float('inf') 
for epoch in range(num_epochs):
    epoch_loss = 0.0
    with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
        for batch_idx, (input_values, labels) in enumerate(train_dataloader):
            input_values = input_values.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            logits, _ = finetune_model(input_values=input_values, labels=labels)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            pbar.set_postfix(loss=loss.item())
            pbar.update(1)
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            model_path = f"best_finetune_model_epoch{epoch+1}.pt"
            torch.save(finetune_model.state_dict(), model_path)
            print(f" Model saved at {model_path} (New best loss: {best_loss:.4f})")
    
    print(f"Epoch {epoch+1} average loss: {epoch_loss/len(train_dataloader):.4f}")


Epoch 1/10: 100%|██████████| 725/725 [08:45<00:00,  1.52it/s, loss=18.3]

Epoch 1, Loss: 18.5720


Epoch 1/10: 100%|██████████| 725/725 [08:46<00:00,  1.38it/s, loss=18.3]


 Model saved at best_finetune_model_epoch1.pt (New best loss: 18.5720)
Epoch 1 average loss: 18.5720


Epoch 2/10: 100%|██████████| 725/725 [08:41<00:00,  1.53it/s, loss=17.4]

Epoch 2, Loss: 17.6918


Epoch 2/10: 100%|██████████| 725/725 [08:41<00:00,  1.39it/s, loss=17.4]


 Model saved at best_finetune_model_epoch2.pt (New best loss: 17.6918)
Epoch 2 average loss: 17.6918


Epoch 4/10: 100%|██████████| 725/725 [08:42<00:00,  1.53it/s, loss=15.5]

Epoch 4, Loss: 16.4117


Epoch 4/10: 100%|██████████| 725/725 [08:43<00:00,  1.39it/s, loss=15.5]


 Model saved at best_finetune_model_epoch4.pt (New best loss: 16.4117)
Epoch 4 average loss: 16.4117


Epoch 5/10: 100%|██████████| 725/725 [08:39<00:00,  1.52it/s, loss=13.4]

Epoch 5, Loss: 14.5869


Epoch 5/10: 100%|██████████| 725/725 [08:39<00:00,  1.39it/s, loss=13.4]


 Model saved at best_finetune_model_epoch5.pt (New best loss: 14.5869)
Epoch 5 average loss: 14.5869


Epoch 6/10: 100%|██████████| 725/725 [08:42<00:00,  1.52it/s, loss=11]  

Epoch 6, Loss: 11.0216


Epoch 6/10: 100%|██████████| 725/725 [08:42<00:00,  1.39it/s, loss=11]


 Model saved at best_finetune_model_epoch6.pt (New best loss: 11.0216)
Epoch 6 average loss: 11.0216


Epoch 7/10: 100%|██████████| 725/725 [08:43<00:00,  1.52it/s, loss=10.9]

Epoch 7, Loss: 10.9942


Epoch 7/10: 100%|██████████| 725/725 [08:43<00:00,  1.38it/s, loss=10.9]


 Model saved at best_finetune_model_epoch7.pt (New best loss: 10.9942)
Epoch 7 average loss: 10.9942


Epoch 8/10: 100%|██████████| 725/725 [08:40<00:00,  1.55it/s, loss=10.5]

Epoch 8, Loss: 10.8280


Epoch 8/10: 100%|██████████| 725/725 [08:40<00:00,  1.39it/s, loss=10.5]


 Model saved at best_finetune_model_epoch8.pt (New best loss: 10.8280)
Epoch 8 average loss: 10.8280


Epoch 9/10: 100%|██████████| 725/725 [08:39<00:00,  1.51it/s, loss=10.3]

Epoch 9, Loss: 10.7202


Epoch 9/10: 100%|██████████| 725/725 [08:39<00:00,  1.39it/s, loss=10.3]


 Model saved at best_finetune_model_epoch9.pt (New best loss: 10.7202)
Epoch 9 average loss: 10.7202


Epoch 10/10: 100%|██████████| 725/725 [08:42<00:00,  1.48it/s, loss=10.8]

Epoch 10, Loss: 10.6547


Epoch 10/10: 100%|██████████| 725/725 [08:43<00:00,  1.39it/s, loss=10.8]

 Model saved at best_finetune_model_epoch10.pt (New best loss: 10.6547)
Epoch 10 average loss: 10.6547





In [None]:
finetune_model.eval()

FineTuneModel(
  (pretrained): WavLMModel(
    (feature_extractor): WavLMFeatureEncoder(
      (conv_layers): ModuleList(
        (0): WavLMGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x WavLMNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x WavLMNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): WavLMFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): WavLMEncoder(
 

In [None]:
def extract_finetune_embedding(audio_tensor):
    inputs = feature_extractor(audio_tensor, sampling_rate=16000, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)
    with torch.no_grad():
        embeddings = finetune_model.extract_embeddings(input_values)
    return embeddings.cpu().numpy()[0]

In [None]:
embedding_cache_finetune = {}
def get_finetune_embedding_for_file(filename):
    if filename in embedding_cache_finetune:
        return embedding_cache_finetune[filename]
    audio_path = os.path.join(vox1_audio_dir, filename) 
    audio = load_audio(audio_path)
    emb = extract_finetune_embedding(audio)
    embedding_cache_finetune[filename] = emb
    return emb

In [None]:
trial_labels_ft = []
trial_scores_ft = []

for line in tqdm(lines, desc="Processing trials"):
    parts = line.strip().split()
    if len(parts) != 3:
        print(f"Skipping malformed line: {line.strip()}")
        continue
    try:
        label = int(parts[0])  
        file1 = parts[1]
        file2 = parts[2]
    except ValueError as e:
        print(f"Error parsing line: {line.strip()} - {e}")
        continue
    emb1 = get_finetune_embedding_for_file(file1)
    emb2 = get_finetune_embedding_for_file(file2)
    score = cosine_similarity(emb1, emb2)
    trial_labels_ft.append(label)
    trial_scores_ft.append(score)

Processing trials: 100%|██████████| 37611/37611 [03:36<00:00, 173.33it/s] 


In [106]:
eer_ft, eer_threshold_ft = compute_eer(trial_labels_ft, trial_scores_ft)
tar_at_1_far_ft = compute_tar_at_far(trial_labels_ft, trial_scores_ft, target_far=0.01)
verification_accuracy_ft = compute_accuracy(trial_labels_ft, trial_scores_ft, threshold=eer_threshold_ft)

print("\nFine-Tuned Model Verification Results:")
print(f"EER: {eer_ft:.2f}% at threshold {eer_threshold_ft:.4f}")
print(f"TAR@1%FAR: {tar_at_1_far_ft:.2f}%")
print(f"Speaker Verification Accuracy: {verification_accuracy_ft:.2f}%")


Fine-Tuned Model Verification Results:
EER: 15.01% at threshold 0.9895
TAR@1%FAR: 21.62%
Speaker Verification Accuracy: 84.99%
