# Library

In [2]:
cd /workdir/Speaker_Verification_version_1.0/Speaker-Verification

/workdir/Speaker_Verification_version_1.0/Speaker-Verification


In [3]:
from speaker_verification import transforms as T
from speaker_verification.dataset import SpeakingFacesDataset
from speaker_verification.dataset import ValidDataset
from speaker_verification.sampler import ProtoSampler
from speaker_verification.sampler import ValidSampler
from speaker_verification.models_handmade.resnet import ResNet34
from speaker_verification.models import SelfAttentivePool2d
from speaker_verification.loss import PrototypicalLoss
from speaker_verification.train import train_model

import torch
from torch.utils.data import DataLoader

from speaker_verification.utils import plot_sample
import timm
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [4]:
from tqdm import tqdm
from tqdm import trange
import gc
import torch
import torch.nn.functional as F

from speaker_verification.loss import PrototypicalLoss
from speaker_verification.metrics import EER_
from speaker_verification.metrics import accuracy_
from timeit import default_timer as timer

In [5]:
import speechbrain

# Solution 0.0

### General pipeline

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=T.audio_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=T.audio_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [6]:
# model
# model = ResNet34()
model = timm.create_model('resnet34', pretrained=True, num_classes=128, in_chans=1)
model = model.to(device)

In [7]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [8]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="resnet",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 12.77603850364685
Average train accuracy: 43.333336639404294


Eval (epoch = 0): 100%|██████████| 594/594 [01:16<00:00,  7.76it/s]



Average val eer: 40.396064017445006

Average val accuracy: 59.49687850729518
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.64s/it]

Best acc model saved at epoch 0
Time elapsed: 1.327319642699634  minutes





### timm REsNet + SAP

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=T.audio_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=T.audio_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [5]:
# model
# model = ResNet34()
model = timm.create_model('resnet34', pretrained=True, num_classes=128, in_chans=1)
model.global_pool = SelfAttentivePool2d()

In [6]:
model = model.to(device)

In [7]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [8]:
# train
model = train_model(model,
                    train_dataloader, 

                    
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="resnet_sap",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 16.273793411254882
Average train accuracy: 40.000003051757815


Eval (epoch = 0): 100%|██████████| 594/594 [01:16<00:00,  7.80it/s]



Average val eer: 39.018451303037835

Average val accuracy: 60.91820987654321
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.41s/it]

Best acc model saved at epoch 0
Time elapsed: 1.3235322367555151  minutes





### Try AST (Audio Spectrogram Transformer)

In [63]:
import torchaudio
import soundfile

In [64]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [65]:
from transformers import ASTFeatureExtractor
from transformers import AutoModelForAudioClassification

feature_extractor = ASTFeatureExtractor()
model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [66]:
path2wav1 = "/workdir/sf_pv/data_v2/sub_1/11/wav/574.wav"
path2wav2 = "/workdir/sf_pv/data_v2/sub_1/11/wav/603.wav"
path2wav3 = "/workdir/sf_pv/data_v2/sub_1/12/wav/673.wav"
data_wav_id1_1, sample_rate = soundfile.read(path2wav1) 
data_wav_id1_2, sample_rate = soundfile.read(path2wav2) 
data_wav_id2_1, sample_rate = soundfile.read(path2wav3) 

In [67]:
# compute attention masks and normalize the waveform if needed
inputs1 = feature_extractor(data_wav_id1_1, sampling_rate=16000, padding=True, return_tensors="pt")
inputs2 = feature_extractor(data_wav_id1_2, sampling_rate=16000, padding=True, return_tensors="pt")
inputs3 = feature_extractor(data_wav_id2_1, sampling_rate=16000, padding=True, return_tensors="pt")

In [70]:
model(inputs3.input_values)

SequenceClassifierOutput(loss=None, logits=tensor([[  1.9893,  -0.9755,  -3.4737,  -8.1023,  -3.6725,  -1.6364, -10.6159,
          -5.0053,  -9.0250,  -7.0192,  -6.4388,  -7.6345, -11.6691,  -9.4656,
          -8.6656,  -9.7345,  -7.3644, -10.0390,  -7.8844,  -8.0277,  -9.8606,
          -6.1820,  -9.0928,  -8.2744,  -6.3399,  -7.7557,  -6.7260,  -8.4375,
          -9.7831, -10.2545, -10.2305,  -8.4399,  -8.6328,  -9.8559, -10.6823,
         -11.9752, -10.3021, -10.4324,  -7.3540,  -7.7377,  -7.7613,  -6.9020,
          -9.7197,  -9.2440,  -4.5852,  -9.9752,  -4.9876,  -9.7073,  -5.3093,
          -6.9735, -10.4641,  -7.8191,  -7.2847,  -8.4210,  -8.9820,  -6.6953,
          -8.4857,  -8.5635,  -8.1789,  -8.9217,  -8.0305,  -9.7148,  -7.9259,
          -7.2844,  -9.4872,  -9.9177,  -9.0284,  -8.0984, -10.8437,  -8.1701,
          -8.6471, -12.7541,  -4.9552,  -6.0202,  -7.7936, -10.2052,  -8.7268,
          -9.1717,  -8.5424,  -8.8025,  -8.3987,  -6.3925,  -9.0607,  -6.5005,
         

In [74]:
output1 = model(**inputs1).logits
output2 = model(**inputs2).logits
output3 = model(**inputs3).logits

In [79]:

embeddings1 = torch.nn.functional.normalize(output1, dim=-1).cpu()
embeddings2 = torch.nn.functional.normalize(output2, dim=-1).cpu()
emb3 = torch.nn.functional.normalize(output3, dim=-1).cpu()

# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings1, emb3)
print(similarity)
threshold = 0.86  # the optimal threshold is dataset-dependent
if similarity < threshold:
    print("Speakers are not the same!")

tensor([0.9987], grad_fn=<SumBackward1>)


In [42]:
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [7]:
model = model.to(device)

In [8]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

In [9]:
def ast_transform(data_wav, sample_rate):
    data_wav = data_wav.squeeze().numpy()
    inputs = feature_extractor(data_wav, sampling_rate=sample_rate, padding="max_length", return_tensors="pt")
    input_values = inputs.input_values
    return input_values

In [10]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=ast_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=ast_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [11]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [12]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="AST",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [6, 1, 1024, 1, 128]

In [None]:
with torch.no_grad():

    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

list(last_hidden_states.shape)

In [13]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

### Try Evaluation as from Korean guys

In [9]:
import soundfile
import torchaudio

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=None, 
                            audio_transform=None)

# valid_dataloader = DataLoader(dataset=valid_dataset,
#                             batch_size=1,
#                             shuffle=True,
#                             num_workers=4, 
#                             pin_memory=True)

In [8]:
sample = valid_dataset[0]

/workdir/sf_pv/data_v2/sub_101/23/wav/139.wav
/workdir/sf_pv/data_v2/sub_101/25/wav/49.wav


1. Load audio from torchaudio and from soundfile

In [10]:
path2wav1 = "/workdir/sf_pv/data_v2/sub_101/23/wav/139.wav"
path2wav2 = "/workdir/sf_pv/data_v2/sub_101/25/wav/49.wav"

In [11]:
data_wav, sample_rate = torchaudio.load(path2wav1) 
audio1, sample_rate1 = soundfile.read(path2wav1)

In [13]:
print(sample_rate)
print(sample_rate1)

16000
16000


In [14]:
print(data_wav)

tensor([[0.0045, 0.0071, 0.0056,  ..., 0.0119, 0.0111, 0.0084]])


In [15]:
print(audio1)

[0.00445557 0.00714111 0.00561523 ... 0.01193237 0.01113892 0.00842285]


In [11]:
for batch in valid_dataloader:
    id1, id2, labels = batch

    wav_id1, rgb_id1, thr_id1, person_id1 = id1
    wav_id2, rgb_id2, thr_id2, person_id2 = id2

    break

In [13]:
print(f"Same person: {bool(labels)}")
print(labels)

print(f"First person id: {person_id1}")
print(f"Second person id: {person_id2}")

Same person: True
tensor([1])
First person id: tensor([113])
Second person id: tensor([113])


In [16]:
for sample in valid_dataset:
    id1, id2, label = sample

    print(f"Same person: {bool(label)}")

    wav_id1, rgb_id1, thr_id1, person_id1 = id1
    wav_id2, rgb_id2, thr_id2, person_id2 = id2

    print(f"First person id: {person_id1}")
    print(f"Second person id: {person_id2}")

    break

Same person: True
First person id: 101
Second person id: 101


### SpeechBrain embeddings

In [14]:
from speechbrain.pretrained import EncoderClassifier

In [15]:
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb")

In [18]:
classifier.encode_batch(wav_id1)

NotImplementedError: Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now

In [17]:
embeddings_id1 = classifier.encode_batch(wav_id1[0])
embeddings_id2 = classifier.encode_batch(wav_id2[0])

In [26]:
cos_sim = F.cosine_similarity(embeddings_id1, embeddings_id2, dim=1)

In [28]:
EER_(cos_sim, label)

AttributeError: 'int' object has no attribute 'cpu'

In [None]:
eer, scores = EER_(cos_sim, label)
accuracy = accuracy_(labels, scores)

In [None]:
from speechbrain.pretrained import SpeakerRecognition
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")

score, prediction = verification.verify_files(file1, file2)

print(score)
print(prediction) # True = same speaker, False=Different speakers

### Добавила варьирование параметров для transform

In [30]:
from speaker_verification.transforms import Audio_Transforms
from speaker_verification.transforms import Image_Transforms
from speaker_verification.models import Model

In [31]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [43]:
library="timm"
modality = "wav"
model_name = "resnet34"
pool="default"

In [44]:
model = Model(library=library, 
                pretrained_weights=True, 
                fine_tune=False, 
                embedding_size=128, 
                modality = modality,
                model_name = model_name,
                pool=pool)
model = model.to(device)

audio_T = Audio_Transforms(sample_rate=16000,
                            sample_duration=3, # seconds
                            n_fft=512, # from Korean code
                            win_length=400,
                            hop_length=160,
                            window_fn=torch.hamming_window,
                            n_mels=40)

image_T = Image_Transforms(model,
                            library=library,
                            model_name = model_name,
                            resize=128)

In [45]:
# dataset
ANNOTATIONS_FILE = "/workdir/Speaker_Verification_version_1.0/Speaker-Verification/annotations_file_short_SF.csv"
PATH2DATASET = "/workdir/sf_pv"
DATASET_DIR = f'{PATH2DATASET}/data_v2'

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=image_T.transform, 
                                    audio_transform=audio_T.transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=image_T.transform, 
                            audio_transform=audio_T.transform)

In [46]:
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [47]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [48]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/Speaker_Verification_version_1.0/results",
                    exp_name="chern",
                    modality=modality)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train (epoch = 0):   0%|          | 0/10 [00:00<?, ?it/s]


Average train loss: 8.036772561073303
Average train accuracy: 26.666668701171876


Eval (epoch = 0):   0%|          | 0/594 [00:00<?, ?it/s]


Average val eer: 38.135066470414856

Average val accuracy: 61.7555765993266
Best eer model saved at epoch 0
Best acc model saved at epoch 0
Time elapsed: 1.227491514896974  minutes


### SAP from Korean code

In [5]:
from speaker_verification.transforms import Audio_Transforms
from speaker_verification.transforms import Image_Transforms
from speaker_verification.models import Model
import torch.nn as nn

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
library="timm"
modality = "wav"
model_name = "resnet34"
pool="SAP"

# audio
sample_rate=16000
sample_duration=2 # seconds
n_fft=512 # from Korean code
win_length=400
hop_length=160
window_fn=torch.hamming_window
n_mels=40

In [8]:
model = Model(library=library, 
                pretrained_weights=True, 
                fine_tune=False, 
                embedding_size=128, 
                modality = modality,
                model_name = model_name,
                pool=pool)
model = model.to(device)

audio_T = Audio_Transforms(sample_rate=sample_rate,
                            sample_duration=sample_duration, # seconds
                            n_fft=n_fft, # from Korean code
                            win_length=win_length,
                            hop_length=hop_length,
                            window_fn=window_fn,
                            n_mels=n_mels)

image_T = Image_Transforms(model,
                            library=library,
                            model_name = model_name,
                            resize=128)

In [103]:
# dataset
ANNOTATIONS_FILE = "/workdir/Speaker_Verification_version_1.0/Speaker-Verification/annotations_file_short_SF.csv"
PATH2DATASET = "/workdir/sf_pv"
DATASET_DIR = f'{PATH2DATASET}/data_v2'

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=image_T.transform, 
                                    audio_transform=audio_T.transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=image_T.transform, 
                            audio_transform=audio_T.transform)

In [77]:
wav, _, _, _ = train_dataset[0]

In [78]:
wav.shape # [n_channels, time]

torch.Size([1, 43084])

In [104]:
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [105]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [106]:
pbar = tqdm(train_dataloader, desc=f'Train (epoch = {1})', leave=False)  

total_loss = 0
total_acc = 0
for batch in pbar:

    if modality == "rgb":
        # data_wav, data_rgb, data_thr, label
        _,rgb, _, _ = batch # we do not use labels from dataset
        data = rgb.to(device)
    elif modality == "thr":
        # data_wav, data_rgb, data_thr, label
        _,_, thr, _ = batch # we do not use labels from dataset
        data = thr.to(device)
    elif modality == "wav":
        wav, _, _, _ = batch # we do not use labels from dataset
        data = wav.to(device)

    break

                                                         

In [80]:
data.shape # [batch_size, n_channels, n_mels, number of frames]

torch.Size([6, 1, 40, 201])

#### implement step by step

In [24]:
# data = model(data)
# data.shape

torch.Size([6, 128])

In [69]:
data_try = torch.FloatTensor(6,512,40,201)

In [15]:
instancenorm = nn.InstanceNorm1d(n_mels)

$h_t = \tanh(Wx_t + b)$

In [44]:
x = instancenorm(data.squeeze()).unsqueeze(1).detach().cpu()
print(x.shape) # [batch_size, n_channels, n_mels, number of frames]
x = torch.mean(x, dim=2, keepdim=True)
print(x.shape)
x = x.permute(0,3,1,2) # [batch_size, number of frames,n_channels, n_mels]
print(x.shape)
x = x.squeeze(-1) # delete last dimension
print(x.shape) # [batch_size, number of frames, n_channels]

torch.Size([6, 1, 40, 201])
torch.Size([6, 1, 1, 201])
torch.Size([6, 201, 1, 1])
torch.Size([6, 201, 1])


In [45]:
W = nn.Linear(1,512)

In [46]:
W(x).shape

torch.Size([6, 201, 512])

In [47]:
h = torch.tanh(W(x))

In [48]:
h.shape # [batch_size, number of frames, n_channels]

torch.Size([6, 201, 512])

In [49]:
def new_parameter(*size):
    out = nn.Parameter(torch.FloatTensor(*size))
    nn.init.xavier_normal_(out)
    return out

In [63]:
u = new_parameter(512,1)

In [64]:
u.shape

torch.Size([512, 1])

In [65]:
w = torch.matmul(h, u)
print(w.shape)
w = w.squeeze(dim=2)
print(w.shape)
w = F.softmax(w, dim=1)
print(w.shape)
w = w.view(x.size(0), x.size(1), 1)
print(w.shape) # [batch_size, number of frames, n_channels]

torch.Size([6, 201, 1])
torch.Size([6, 201])
torch.Size([6, 201])
torch.Size([6, 201, 1])


In [66]:
e = torch.sum(x * w, dim=1)
print(e.shape)

torch.Size([6, 1])


In [67]:
e.shape

torch.Size([6, 1])

In [57]:
e.size()

torch.Size([6, 1])

In [56]:
e.view(e.size()[0], -1).shape

torch.Size([6, 1])

In [68]:
x_new = e.view(e.size()[0], -1)

In [70]:
from torchvision import models

In [71]:
model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT)

In [72]:
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
model.fc = nn.Linear(model.fc.in_features, 128)

In [62]:
print(model)

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [84]:
model_new = nn.Sequential(*list(model.children())[:-2])
model_new = model_new.to(device)

In [85]:
data_1 = instancenorm(data.squeeze()).unsqueeze(1).detach()
data_1 = model_new(data_1)
data_1.shape # [batch_size, n_channels, n_mels, number of frames]

torch.Size([6, 512, 2, 7])

In [86]:
x = torch.mean(data_1, dim=2, keepdim=True) # [batch_size, n_channels, n_mels, number of frames]
print(x.shape) 
x = x.permute(0,3,1,2) # [batch_size, number of frames,n_channels, n_mels]
print(x.shape)
x = x.squeeze(-1) # delete last dimension
print(x.shape) # [batch_size, number of frames, n_channels]

torch.Size([6, 512, 1, 7])
torch.Size([6, 7, 512, 1])
torch.Size([6, 7, 512])


In [88]:
W = nn.Linear(512,512)
W = W.to(device)
h = torch.tanh(W(x)) # [batch_size, number of frames, n_channels]
print(h.shape)

torch.Size([6, 7, 512])


In [90]:
u = new_parameter(512,1).to(device)
print(u.shape)
w = torch.matmul(h, u)
print(w.shape)
w = w.squeeze(dim=2)
print(w.shape)
w = F.softmax(w, dim=1)
print(w.shape)
w = w.view(x.size(0), x.size(1), 1)
print(w.shape) # [batch_size, number of frames, n_channels]

torch.Size([512, 1])
torch.Size([6, 7, 1])
torch.Size([6, 7])
torch.Size([6, 7])
torch.Size([6, 7, 1])


In [91]:
e = torch.sum(x * w, dim=1)
print(e.shape)

torch.Size([6, 512])


In [92]:
e.view(e.size()[0], -1).shape

# x = self.fc(x)

torch.Size([6, 512])

In [94]:
fc = nn.Linear(512,128).to(device)

In [95]:
x = fc(e)

In [96]:
x.shape

torch.Size([6, 128])

In [None]:
# [batch_size, n_channels, H, W]
# [batch_size, 512, 1, 1]
# [batch_size, 128]

#### wite SAP as nn.Module

In [111]:
class SelfAttentivePool2d(nn.Module):
    '''
    Based on this article: https://www.isca-speech.org/archive/pdfs/odyssey_2018/cai18_odyssey.pdf
    '''
    def __init__(self, input_dim=512):
        super(SelfAttentivePool2d, self).__init__()
        self.W = nn.Linear(input_dim, input_dim)

        self.u = nn.Parameter(torch.FloatTensor(input_dim, 1))
        nn.init.xavier_normal_(self.u)

    def preprocess(self,x):
        """
            x: [batch_size, n_channels, n_mels, number of frames] --> [batch_size, number of frames, n_channels]

            step 1: [batch_size, n_channels, n_mels, number of frames] --> [batch_size, n_channels, 1, number of frames]
            step 2: [batch_size, n_channels, 1, number of frames] --> [batch_size, number of frames, n_channels, 1]
            step 3: [batch_size, number of frames, n_channels, 1] --> [batch_size, number of frames, n_channels]
        """
        x = torch.mean(x, dim=2, keepdim=True) 
        x = x.permute(0,3,1,2)
        x = x.squeeze(-1)

        return x
        
    def forward(self, x):
        """
        h = tanh(Wx + b)
        w = Softmax(h @ u) * H
        e = sum(w*x)
        input:
            x : [batch_size, n_channels, n_mels, number of frames]
        
        return:
            e: size (batch_size, n_channels)
        """
        x = self.preprocess(x)

        h = torch.tanh(self.W(x))
        w = torch.matmul(h, self.u).squeeze(dim=2) # [batch_size, number of frames, n_channels=1] --> squeeze: [batch_size, number of frames]
        w = F.softmax(w, dim=1)
        w = w.view(x.size(0), x.size(1), 1) # [batch_size, number of frames, n_channels=1]
        e = torch.sum(x * w, dim=1) # utterance level representation e
        e = e.view(e.size()[0], -1) # flatten
        return e

In [112]:
from torchvision import models

In [113]:
model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT)
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
model.avgpool = SelfAttentivePool2d(model.fc.in_features)
model.fc = nn.Linear(model.fc.in_features, 128)

In [114]:
model = model.to(device)

In [115]:
x = instancenorm(data.squeeze()).unsqueeze(1).detach()
x = model(x)
x.shape # [batch_size, n_channels, n_mels, number of frames]

torch.Size([6, 128])

In [116]:
model = timm.create_model(model_name, pretrained=True, num_classes=128, in_chans=1)

In [117]:
model.global_pool = SelfAttentivePool2d()

In [118]:
model = model.to(device)

In [119]:
x = instancenorm(data.squeeze()).unsqueeze(1).detach()
x = model(x)
x.shape # [batch_size, n_channels, n_mels, number of frames]

torch.Size([6, 128])

### Add evaluation with divisions

In [191]:
from speaker_verification.transforms import Audio_Transforms
from speaker_verification.transforms import Image_Transforms
from speaker_verification.models import Model
import torch.nn as nn

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [192]:
library="timm"
modality = "wav"
model_name = "resnet34"
pool="SAP"

# audio
sample_rate=16000
sample_duration=2 # seconds
n_fft=512 # from Korean code
win_length=400
hop_length=160
window_fn=torch.hamming_window
n_mels=40

In [193]:
model = Model(library=library, 
                pretrained_weights=True, 
                fine_tune=False, 
                embedding_size=128, 
                modality = modality,
                model_name = model_name,
                pool=pool)
model = model.to(device)

audio_T = Audio_Transforms(sample_rate=sample_rate,
                            sample_duration=sample_duration, # seconds
                            n_fft=n_fft, # from Korean code
                            win_length=win_length,
                            hop_length=hop_length,
                            window_fn=window_fn,
                            n_mels=n_mels)

image_T = Image_Transforms(model,
                            library=library,
                            model_name = model_name,
                            resize=128)

# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [194]:
# dataset
ANNOTATIONS_FILE = "/workdir/Speaker_Verification_version_1.0/Speaker-Verification/annotations_file_short_SF.csv"
PATH2DATASET = "/workdir/sf_pv"
DATASET_DIR = f'{PATH2DATASET}/data_v2'

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=image_T.transform, 
                                    audio_transform=audio_T.transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=image_T.transform, 
                            audio_transform=None) #audio_T.transform)

In [None]:
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [186]:
def valid_transform(signal, sample_rate):
    sample_duration = 16000
    n_eval_cuts = 4
    # Maximum audio length
    max_audio = sample_duration * sample_rate
    # stereo --> mono
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)
    
    audiosize = signal.shape[1] # time
    if audiosize <= max_audio:
        shortage = max_audio - audiosize
        dim_padding = (0, shortage) # (left_pad, right_pad)
        # ex: dim_padding = (1,2) --> [1,1,1] -> [0,1,1,1,0,0]
        feat = torch.nn.functional.pad(signal, dim_padding, 'replicate') # shape: [n_channels, time]
        feat = feat.unsqueeze(dim=0) # shape: [n_eval_cuts=1, n_channels, time]
    else:
        feats = []
        startframe = torch.linspace(0,audiosize-max_audio,steps=n_eval_cuts)
        for asf in startframe:
            feats.append(signal[:, int(asf):int(asf)+max_audio])
        feat = torch.stack(feats,0) # shape: [n_eval_cuts, n_channels, time]

    return feat # shape: [n_eval_cuts, n_channels, time]

In [183]:
id1, id2, label = valid_dataset[0]
wav_id1, _, _, _ = id1

In [184]:
feat = valid_transform(sample_duration,  wav_id1, sample_rate, n_eval_cuts=4)

In [185]:
feat.shape

torch.Size([4, 1, 32000])

In [198]:
valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=image_T.transform, 
                            audio_transform=valid_transform) #audio_T.transform)

In [199]:
valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=1,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [200]:
for batch in valid_dataloader:

    id1, id2, labels = batch

    wav_id1, rgb_id1, thr_id1, _ = id1
    wav_id2, rgb_id2, thr_id2, _ = id2

    if modality == "rgb":
        data_id1 = rgb_id1.to(device)
        data_id2 = rgb_id2.to(device)

    elif modality == "thr":
        data_id1 = thr_id1.to(device)
        data_id2 = thr_id2.to(device)

    elif modality == "wav":
        data_id1 = wav_id1.to(device)
        data_id2 = wav_id2.to(device)

    break

ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 

RuntimeError: DataLoader worker (pid(s) 3616158) exited unexpectedly

In [197]:
data_id1.shape

torch.Size([1, 1, 83149])

In [None]:
    with torch.no_grad():
        id1_out = model(data_id1)
        id2_out = model(data_id2)

        cos_sim = F.cosine_similarity(id1_out, id2_out, dim=1)
        eer, scores = EER_(cos_sim, labels)
        accuracy = accuracy_(labels, scores)

        total_eer += eer
        total_accuracy += accuracy

# Solution 1.0

## Hugging Face Models

### library

In [9]:
from speaker_verification.transforms import Audio_Transforms
from speaker_verification.transforms import Image_Transforms
from speaker_verification.models import Model
import torch.nn as nn

from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
from transformers import ASTFeatureExtractor
from transformers import AutoModelForAudioClassification

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### AST (Audio Spectrogram Transformer)

In [11]:
feature_extractor = ASTFeatureExtractor()
model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [16]:
model.classifier.dense.weight.requires_grad

True

### WAvLM

In [17]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-sv')
model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-sv')

In [19]:
model.classifier

Linear(in_features=512, out_features=512, bias=True)

### Model class

In [71]:
class Model(nn.Module):
    def __init__(self, 
                library="pytorch", 
                pretrained_weights=True, 
                fine_tune=False, 
                embedding_size=128, 
                modality = "rgb",
                model_name = "resnet34",
                pool="default"):

        super(Model, self).__init__()

        if modality == "wav":
            in_channels = 1
        else:
            in_channels = 3
    
        if library == "pytorch":
            if model_name == "resnet34":
                if pretrained_weights:
                    weights = models.ResNet34_Weights.DEFAULT
                else:
                    weights = None

                self.model = models.resnet34(weights=weights)
                if modality == "wav":
                    self.model.conv1 = nn.Conv2d(in_channels, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
                self.model.fc = nn.Linear(self.model.fc.in_features, embedding_size)
                
                if pool == "SAP":
                    self.model.avgpool = SelfAttentivePool2d(self.model.fc.in_features)

            if fine_tune:
                for param in self.model.parameters():
                    param.requires_grad = True
            else:
                for param in self.model.parameters():
                    param.requires_grad = False
                
                self.model.fc.weight.requires_grad = True
                self.model.fc.bias.requires_grad = True

        elif library == "timm":
            self.model = timm.create_model(model_name, pretrained=pretrained_weights, num_classes=embedding_size, in_chans=in_channels)

            if pool == "SAP":
                self.model.global_pool = SelfAttentivePool2d()
            if fine_tune:
                for param in self.model.parameters():
                    param.requires_grad = True
            else:
                for param in self.model.parameters():
                    param.requires_grad = False

                self.model.get_classifier().weight.requires_grad = True
                self.model.get_classifier().bias.requires_grad = True
        
        elif library == "huggingface":
            if model_name == "WavLM":
                self.model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-sv')
                self.model.classifier = nn.Linear(self.model.classifier.in_features, embedding_size)

                if fine_tune:
                    for param in self.model.parameters():
                        param.requires_grad = True
                else:
                    for param in self.model.parameters():
                        param.requires_grad = False
                    
                    self.model.classifier.weight.requires_grad = True
                    self.model.classifier.bias.requires_grad = True
                
            elif model_name == "AST":
                self.model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
                self.model.classifier.dense = nn.Linear(self.model.classifier.dense.in_features, embedding_size)

                if fine_tune:
                    for param in self.model.parameters():
                        param.requires_grad = True
                else:
                    for param in self.model.parameters():
                        param.requires_grad = False
                    
                    self.model.classifier.dense.weight.requires_grad = True
                    self.model.classifier.dense.bias.requires_grad = True

        self.library = library
        self.model_name = model_name

    def forward(self, x):
        if self.library == "huggingface":
            if self.model_name == "WavLM":
                x = self.model(x).embeddings
            elif self.model_name == "AST":
                x = self.model(x).logits
        else:
            x = self.model(x)
        return x


### Transform

In [72]:
class Audio_Transforms:
    def __init__(self, 
                sample_rate,
                sample_duration, # seconds
                n_fft, # from Korean code
                win_length,
                hop_length,
                window_fn,
                n_mels,
                model_name=None):

        self.sample_rate = sample_rate
        self.sample_duration = sample_duration
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.window_fn = window_fn
        self.n_mels = n_mels
        
        if model_name == "WavLM":
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-sv')
        elif model_name == "AST":
            self.feature_extractor = ASTFeatureExtractor()

    def basic_transform(self, signal, sample_rate):

        # stereo --> mono
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        
        # sample_rate --> 16000
        if sample_rate != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
            signal = resampler(signal)

        # normalize duration --> 3 seconds (mean duration in dataset)
        sample_length_signal = self.sample_duration * self.sample_rate # sample length of the audio signal
        length_signal = signal.shape[1]
        if length_signal < sample_length_signal:
            num_missing_points = int(sample_length_signal - length_signal)
            dim_padding = (0, num_missing_points) # (left_pad, right_pad)
            # ex: dim_padding = (1,2) --> [1,1,1] -> [0,1,1,1,0,0]
            signal = torch.nn.functional.pad(signal, dim_padding)
        elif length_signal > sample_length_signal:
            middle_of_the_signal = length_signal // 2
            left_edge = int(middle_of_the_signal - sample_length_signal // 2)
            right_edge = int(middle_of_the_signal + sample_length_signal // 2)
            signal = signal[:,left_edge:right_edge]
            
        return signal

    def HF_transform(self, signal, sample_rate):
        signal = self.basic_transform(signal, sample_rate)
        signal = signal.squeeze()
        inputs = self.feature_extractor(signal, sampling_rate=sample_rate, padding=True, return_tensors="pt")
        return inputs.input_values

### Train

In [73]:
library="huggingface"
modality = "wav"
model_name = "AST"
pool="default"

# audio
sample_rate=16000
sample_duration=2 # seconds
n_fft=512 # from Korean code
win_length=400
hop_length=160
window_fn=torch.hamming_window
n_mels=40

In [80]:
audio_T = Audio_Transforms(sample_rate=16000,
                            sample_duration=3, # seconds
                            n_fft=512, # from Korean code
                            win_length=400,
                            hop_length=160,
                            window_fn=torch.hamming_window,
                            n_mels=40,
                            model_name=model_name)

In [81]:
# dataset
ANNOTATIONS_FILE = "/workdir/Speaker_Verification_version_1.0/Speaker-Verification/annotations_file_short_SF.csv"
PATH2DATASET = "/workdir/sf_pv"
DATASET_DIR = f'{PATH2DATASET}/data_v2'

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=None, 
                                    audio_transform=audio_T.HF_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=None, 
                            audio_transform=audio_T.HF_transform)

In [83]:
for i in range(0,5):
    wav, _, _, _ = train_dataset[i]
    print(wav.shape)

torch.Size([1, 1024, 128])
torch.Size([1, 1024, 128])
torch.Size([1, 1024, 128])
torch.Size([1, 1024, 128])
torch.Size([1, 1024, 128])


In [78]:
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [84]:
pbar = tqdm(train_dataloader, desc=f'Train (epoch = {1})', leave=False)  

total_loss = 0
total_acc = 0
for batch in pbar:

    if modality == "rgb":
        # data_wav, data_rgb, data_thr, label
        _,rgb, _, _ = batch # we do not use labels from dataset
        data = rgb.to(device)
    elif modality == "thr":
        # data_wav, data_rgb, data_thr, label
        _,_, thr, _ = batch # we do not use labels from dataset
        data = thr.to(device)
    elif modality == "wav":
        wav, _, _, _ = batch # we do not use labels from dataset
        data = wav.to(device)

                                                         

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
    return self.collate_fn(data)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 143, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 120, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 172, in collate_numpy_array_fn
    return collate([torch.as_tensor(b) for b in batch], collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 120, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


In [35]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [36]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="chern",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
    return self.collate_fn(data)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 143, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 120, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 172, in collate_numpy_array_fn
    return collate([torch.as_tensor(b) for b in batch], collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 120, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


In [7]:
path2wav1 = "/workdir/sf_pv/data_v2/sub_1/11/wav/574.wav"
path2wav2 = "/workdir/sf_pv/data_v2/sub_1/11/wav/603.wav"
path2wav3 = "/workdir/sf_pv/data_v2/sub_1/12/wav/673.wav"
path2wav4 = "/workdir/sf_pv/data_v2/sub_2/11/wav/38.wav"
data_wav_id1_1, sample_rate = soundfile.read(path2wav1) 
data_wav_id1_2, sample_rate = soundfile.read(path2wav2) 
data_wav_id1_3, sample_rate = soundfile.read(path2wav3) 
data_wav_id2, sample_rate = soundfile.read(path2wav4)

NameError: name 'soundfile' is not defined

In [8]:
data_wav, sr = torchaudio.load(path2wav4)

NameError: name 'torchaudio' is not defined

In [95]:
data_wav.shape

torch.Size([1, 60959])

In [106]:
data_wav = feature_extractor(data_wav.squeeze(), sampling_rate=16000, padding=True,return_tensors="pt")

In [107]:
data_wav.input_values.shape

torch.Size([1, 60959])

In [90]:
# compute attention masks and normalize the waveform if needed
inputs1 = feature_extractor(data_wav_id1_1, sampling_rate=16000, padding=True, return_tensors="pt")
inputs2 = feature_extractor(data_wav_id1_2, sampling_rate=16000, padding=True, return_tensors="pt")
inputs3 = feature_extractor(data_wav_id1_3, sampling_rate=16000, padding=True, return_tensors="pt")
inputs4 = feature_extractor(data_wav_id2, sampling_rate=16000, padding=True, return_tensors="pt")

In [36]:
def ast_transform(data_wav, sample_rate):
    data_wav = data_wav.squeeze()
    inputs = feature_extractor(data_wav, sampling_rate=sample_rate, padding=True, return_tensors="pt")
    input_values = inputs.input_values
    return input_values

In [109]:
out = model(**data_wav).embeddings

In [110]:
out.shape

torch.Size([1, 512])

In [91]:
output1 = model(**inputs1).embeddings
output2 = model(**inputs2).embeddings
output3 = model(**inputs3).embeddings
output4 = model(**inputs4).embeddings

In [92]:
emb1 = torch.nn.functional.normalize(output1, dim=-1).cpu()
emb2 = torch.nn.functional.normalize(output2, dim=-1).cpu()
emb3 = torch.nn.functional.normalize(output3, dim=-1).cpu()
emb4 = torch.nn.functional.normalize(output4, dim=-1).cpu()

In [93]:
# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings1, embeddings2)
threshold = 0.86  # the optimal threshold is dataset-dependent
print(cosine_sim(emb1, emb2))
print(cosine_sim(emb1, emb3))
print(cosine_sim(emb1, emb4))
if similarity < threshold:
    print("Speakers are not the same!")

tensor([0.8827], grad_fn=<SumBackward1>)
tensor([0.8955], grad_fn=<SumBackward1>)
tensor([0.8481], grad_fn=<SumBackward1>)
