In [None]:
%%capture
!pip install wandb
!pip install speechbrain

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! wandb login c07f9b9363b1d2736cf24c01a4747e245909f2cc

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import os
import numpy as np
import tarfile
from zipfile import ZipFile
import sklearn.metrics as metrics

import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchaudio as ta
import wandb

from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector, UniSpeechSatForXVector, HubertForSequenceClassification
from speechbrain.inference.speaker import EncoderClassifier

In [None]:
with tarfile.open("/content/drive/MyDrive/FinalSpeech/VoxCeleb1_subset.tar-tron’s MacBook Pro.gz") as tar:
  tar.extractall("/content/")

!mv -v /content/VoxCeleb1_subset/test/* /content/VoxCeleb1_subset/dev/

mv: cannot move '/content/VoxCeleb1_subset/test/id10270' to '/content/VoxCeleb1_subset/dev/id10270': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10271' to '/content/VoxCeleb1_subset/dev/id10271': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10272' to '/content/VoxCeleb1_subset/dev/id10272': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10273' to '/content/VoxCeleb1_subset/dev/id10273': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10274' to '/content/VoxCeleb1_subset/dev/id10274': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10275' to '/content/VoxCeleb1_subset/dev/id10275': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10276' to '/content/VoxCeleb1_subset/dev/id10276': Directory not empty
mv: cannot move '/content/VoxCeleb1_subset/test/id10278' to '/content/VoxCeleb1_subset/dev/id10278': Directory not empty
mv: cannot move '/content/VoxCel

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cosine_similarity = nn.CosineSimilarity(dim=-1)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
  def __init__(self, root_dir, txt_file, max_frames=32000):
    self.root_dir = root_dir
    self.txt_file = txt_file
    self.max_frames = max_frames
    self.data = self.read_file()

  def read_file(self):
    data = []
    with open(self.txt_file, 'r') as fil:
      for line in fil:
        label, first, second = line.strip().split()

        second_path = os.path.join(self.root_dir, second)
        first_path = os.path.join(self.root_dir, first)

        if os.path.exists(first_path) :
          if os.path.exists(second_path):
              data.append((label, first, second))

    return data

  def __len__(self):
    return len(self.data)

  def process_sample(self, path):
    filename, _ = os.path.splitext(path)
    path = filename + ".wav"
    file_path = os.path.join(self.root_dir, path)

    wav, sample_rate = ta.load(file_path)
    num_frames = wav.shape[1]

    if num_frames >= self.max_frames:
      wav = wav[:, :self.max_frames]
    else:
      pad_size = self.max_frames - num_frames
      wav = F.pad(wav, (0, pad_size), value=0)

    return wav, sample_rate

  def __getitem__(self, idx):
    label, first_path, second_path = self.data[idx]
    first_tensor, first_sample_rate = self.process_sample(first_path)
    second_tensor, second_sample_rate = self.process_sample(second_path)

    first_tensor = first_tensor.squeeze(0)
    second_tensor = second_tensor.squeeze(0)
    label = torch.tensor(np.array(int(label)))

    return first_tensor, second_tensor, label

In [None]:
def compute_eer(labels, preds):
    fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label=1)
    eer = np.min(np.abs(fpr - (1 - tpr)))
    return eer

In [None]:
def evaluate(model, test_loader, extractor, cos_sim):
    model.eval()
    total_eer = 0

    for batch_idx, (wav1, wav2, label) in enumerate(test_loader):
      wav1 = wav1.to(device)
      wav2 = wav2.to(device)

      with torch.inference_mode():
          audio1 = extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.squeeze(0).to(device)
          audio2 = extractor(wav2.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.squeeze(0).to(device)

          embeddings1 = F.normalize(model(input_values=audio1).embeddings,dim=-1).cpu()
          embeddings2 = F.normalize(model(input_values=audio2).embeddings,dim=-1).cpu()

          similarity = torch.sigmoid(cos_sim(embeddings1, embeddings2))
          eer = compute_eer(label, similarity)
          total_eer += eer

          if batch_idx % 50 == 0:
            print(f"{batch_idx+1}/{len(test_loader)} EER: {eer}")

    total_eer = total_eer / len(test_loader)
    return total_eer

In [None]:
def evaluate_ecapa(classifier, test_dataloader, cos_sim):
    total_eer = 0
    for batch_idx, (wav1, wav2, label) in enumerate(test_dataloader):
      wav1 = wav1.to(device)
      wav2 = wav2.to(device)

      with torch.inference_mode():
          embeddings1 = (F.normalize(classifier.encode_batch(wav1),dim=-1)).cpu()
          embeddings2 = (F.normalize(classifier.encode_batch(wav2),dim=-1)).cpu()

          similarity = torch.sigmoid(cos_sim(embeddings1, embeddings2))
          eer = compute_eer(label, similarity)
          total_eer += eer

          if batch_idx % 50 == 0:
            print(f"{batch_idx+1}/{len(test_dataloader)} EER: {eer}")

    total_eer = total_eer / len(test_dataloader)

    return total_eer

# VoxCeleb1-H

In [None]:
wandb.init(project="Speech Assignment Task 1", name="Voxceleb1-H")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
txt_file = "/content/VoxCeleb1_subset/list_test_hard.txt"
wav_dir = "/content/VoxCeleb1_subset/dev"

vox_test_dataset = CustomDataset(wav_dir, txt_file, 32000)
vox_test_loader = DataLoader(vox_test_dataset, batch_size=64, shuffle=True)

# 1. Ecapa TDNN

In [None]:
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device":"cuda"} )
vox_ecapa_eer = round(evaluate_ecapa(classifier,vox_test_loader,cosine_similarity),4)
wandb.log({'ECAPA VOX EER': vox_ecapa_eer})
print(f"Average EER(%): {vox_ecapa_eer * 100}%")

1/152 EER: 0.05726405090137862
51/152 EER: 0.011764705882352927
101/152 EER: 0.023809523809523767
151/152 EER: 0.013785790031813405
Average EER(%): 1.59%


# 2. Unispeech-sat-base

In [None]:
model = UniSpeechSatForXVector.from_pretrained('microsoft/unispeech-sat-base-sv').to(device)
extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/unispeech-sat-base-sv')
vox_uni_eer = round(evaluate(model,vox_test_loader,extractor,cosine_similarity),4)
wandb.log({f'Unispeech VOX EER': vox_uni_eer})
print(f"Average EER(%): {vox_uni_eer * 100}%")

Some weights of the model checkpoint at microsoft/unispeech-sat-base-sv were not used when initializing UniSpeechSatForXVector: ['unispeech_sat.encoder.pos_conv_embed.conv.weight_g', 'unispeech_sat.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing UniSpeechSatForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing UniSpeechSatForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of UniSpeechSatForXVector were not initialized from the model checkpoint at microsoft/unispeech-sat-base-sv and are newly initialized: ['unispeech_sat.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'unispeech_sat.encoder.pos_conv_embed.conv.parametrizati

1/152 EER: 0.04901960784313722
51/152 EER: 0.027571580063626727
101/152 EER: 0.0019607843137255387
151/152 EER: 0.015763546798029493
Average EER(%): 2.04%


# 3. Wavlm-base-plus

In [None]:
model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-plus-sv').to(device)
extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-plus-sv')
vox_wavlm_eer = round(evaluate(model,vox_test_loader,extractor,cosine_similarity),4)
wandb.log({f'Wavlm VOX EER': vox_wavlm_eer})
print(f"Average EER(%): {vox_wavlm_eer * 100}")

Some weights of the model checkpoint at microsoft/wavlm-base-plus-sv were not used when initializing WavLMForXVector: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForXVector were not initialized from the model checkpoint at microsoft/wavlm-base-plus-sv and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a d

1/152 EER: 0.016194331983805627
51/152 EER: 0.0
101/152 EER: 0.0
151/152 EER: 0.0009852216748768572
Average EER(%): 1.7500000000000002


In [None]:
# wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
ECAPA VOX EER,▁
Unispeech VOX EER,▁
Wavlm VOX EER,▁

0,1
ECAPA VOX EER,0.0159
Unispeech VOX EER,0.0204
Wavlm VOX EER,0.0175


# Kathbath Dataset

In [None]:
# wandb.init(project="Speech Assignment Task 1", name="Kathbath Dataset")

In [None]:
# os.makedirs("/content/kb_test_hi/", exist_ok=True)
# with ZipFile("/content/drive/MyDrive/FinalSpeech/kb_test_hi.zip", "r") as kb:
#   kb.extractall("/content/kb_test_hi/")

In [None]:
# # drive_folder = "/content/drive/MyDrive/"
# # kb_val_dir = "/content/kb_val_hi/wav"
# kb_test_dir = "/content/kb_test_hi"
# kb_test_pairs = "/content/drive/MyDrive/FinalSpeech/kb_test_pairs.txt"
# kb_val_pairs = "/content/drive/MyDrive/FinalSpeech/kb_valid_pairs.txt"

In [None]:
# kb_test_dataset = CustomDataset(kb_test_dir, kb_test_pairs, 32000)
# kb_test_loader = DataLoader(kb_test_dataset, batch_size=64, shuffle=True)

# 1. Ecapa TDNN

In [None]:
# classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device":"cuda"} )
# kb_ecapa_eer = round(evaluate_ecapa(classifier,kb_test_loader,cosine_similarity),4)
# wandb.log({'ECAPA KB EER': kb_ecapa_eer})
# print(f"Average EER(%): {kb_ecapa_eer * 100}%")

1/782 EER: 0.03503503503503508
51/782 EER: 0.03251231527093601
101/782 EER: 0.03125
151/782 EER: 0.0
201/782 EER: 0.0029325513196481467
251/782 EER: 0.01379310344827589
301/782 EER: 0.03174603174603169
351/782 EER: 0.0
401/782 EER: 0.005882352941176505
451/782 EER: 0.03323558162267837
501/782 EER: 0.007843137254901933
551/782 EER: 0.042510121457489836
601/782 EER: 0.0021645021645022022
651/782 EER: 0.04926108374384236
701/782 EER: 0.03503503503503502
751/782 EER: 0.0029325513196481467
Average EER(%): 2.31%


# 2. Unispeech-sat-base

In [None]:
# model = UniSpeechSatForXVector.from_pretrained('microsoft/unispeech-sat-base-sv').to(device)
# extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/unispeech-sat-base-sv')
# kb_uni_eer = round(evaluate(model,kb_test_loader,extractor,cosine_similarity),4)
# wandb.log({f'Unispeech KB EER': kb_uni_eer})
# print(f"Average EER(%): {kb_uni_eer * 100}%")

Some weights of the model checkpoint at microsoft/unispeech-sat-base-sv were not used when initializing UniSpeechSatForXVector: ['unispeech_sat.encoder.pos_conv_embed.conv.weight_g', 'unispeech_sat.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing UniSpeechSatForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing UniSpeechSatForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of UniSpeechSatForXVector were not initialized from the model checkpoint at microsoft/unispeech-sat-base-sv and are newly initialized: ['unispeech_sat.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'unispeech_sat.encoder.pos_conv_embed.conv.parametrizati

1/782 EER: 0.007881773399014746
51/782 EER: 0.0
101/782 EER: 0.007936507936507908
151/782 EER: 0.011904761904761918
201/782 EER: 0.02346041055718473
251/782 EER: 0.014778325123152636
301/782 EER: 0.04984093319194066
351/782 EER: 0.03743842364532024
401/782 EER: 0.03910068426197455
451/782 EER: 0.03125
501/782 EER: 0.015873015873015817
551/782 EER: 0.03128054740957964
601/782 EER: 0.0
651/782 EER: 0.014778325123152747
701/782 EER: 0.06206206206206211
751/782 EER: 0.014170040485829982
Average EER(%): 2.33%


# 3. Wavlm-base-plus

In [None]:
# model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-plus-sv').to(device)
# extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-plus-sv')
# kb_wavlm_eer = round(evaluate(model,kb_test_loader,extractor,cosine_similarity),4)
# wandb.log({f'Wavlm KB EER': kb_wavlm_eer})
# print(f"Average EER(%): {kb_wavlm_eer * 100}")

Some weights of the model checkpoint at microsoft/wavlm-base-plus-sv were not used when initializing WavLMForXVector: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForXVector were not initialized from the model checkpoint at microsoft/wavlm-base-plus-sv and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a d

1/782 EER: 0.032032032032031976
51/782 EER: 0.025490196078431393
101/782 EER: 0.05506883604505636
151/782 EER: 0.007881773399014746
201/782 EER: 0.051808406647116334
251/782 EER: 0.015873015873015872
301/782 EER: 0.050980392156862786
351/782 EER: 0.033333333333333326
401/782 EER: 0.022267206477732837
451/782 EER: 0.015873015873015817
501/782 EER: 0.009803921568627472
551/782 EER: 0.00396825396825401
601/782 EER: 0.02502502502502507
651/782 EER: 0.007881773399014746
701/782 EER: 0.049049049049049054
751/782 EER: 0.03125
Average EER(%): 2.46


In [None]:
# wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
ECAPA KB EER,▁
Unispeech KB EER,▁
Wavlm KB EER,▁

0,1
ECAPA KB EER,0.0231
Unispeech KB EER,0.0233
Wavlm KB EER,0.0246


# Fine Tuning

In [None]:
# !git clone https://github.com/speechbrain/speechbrain.git

Cloning into 'speechbrain'...
remote: Enumerating objects: 80014, done.[K
remote: Counting objects: 100% (2785/2785), done.[K
remote: Compressing objects: 100% (1818/1818), done.[K
remote: Total 80014 (delta 1543), reused 1846 (delta 880), pack-reused 77229[K
Receiving objects: 100% (80014/80014), 87.36 MiB | 18.01 MiB/s, done.
Resolving deltas: 100% (53270/53270), done.


In [None]:
# from zipfile import ZipFile

# # os.makedirs("/content/kb_test_hi/", exist_ok=True)

# with ZipFile("/content/drive/MyDrive/FinalSpeech/valid.zip", "r") as kb:
#   kb.extractall("/content/")

In [None]:
# !pip install -r ../content/speechbrain/requirements.txt
# !pip install -e .

Ignoring SoundFile: markers 'sys_platform == "win32"' don't match your environment
Collecting black==24.3.0 (from -r ../content/speechbrain/lint-requirements.txt (line 1))
  Downloading black-24.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting flake8==7.0.0 (from -r ../content/speechbrain/lint-requirements.txt (line 3))
  Downloading flake8-7.0.0-py2.py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isort==5.13.2 (from -r ../content/speechbrain/lint-requirements.txt (line 4))
  Downloading isort-5.13.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.3/92.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycodestyle==2.11.0 (from -r ../content/speechbrain/lint-require

In [14]:
# !python /content/speechbrain/recipes/VoxCeleb/SpeakerRec/train_speaker_embeddings.py /content/speechbrain/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml --data_folder=/content/valid

[34m[1mwandb[0m: Currently logged in as: [33mm23csa011[0m ([33mkushal1506[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20240412_180514-bw2h4n8i[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mrare-glade-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/kushal1506/FineTune_EPCCA[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/kushal1506/FineTune_EPCCA/runs/bw2h4n8i[0m
/content/valid/noise/data.zip exists. Skipping download
/content/valid/rir/data.zip exists. Skipping download
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/ecapa_augment/1986
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
speechbrain.core - Gradscaler enabled: False. Using precision: fp32.
speechbrain.c