### Library

In [1]:
cd /workdir/Speaker_Verification_version_1.0/Speaker-Verification

/workdir/Speaker_Verification_version_1.0/Speaker-Verification


In [2]:
from speaker_verification import transforms as T
from speaker_verification.dataset import SpeakingFacesDataset
from speaker_verification.dataset import ValidDataset
from speaker_verification.sampler import ProtoSampler
from speaker_verification.sampler import ValidSampler
from speaker_verification.models_handmade.resnet import ResNet34
from speaker_verification.models import SelfAttentivePool2d
from speaker_verification.loss import PrototypicalLoss
from speaker_verification.train import train_model

import torch
from torch.utils.data import DataLoader

from speaker_verification.utils import plot_sample
import timm
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from tqdm import tqdm
from tqdm import trange
import gc
import torch
import torch.nn.functional as F

from speaker_verification.loss import PrototypicalLoss
from speaker_verification.metrics import EER_
from speaker_verification.metrics import accuracy_
from timeit import default_timer as timer

In [4]:
import speechbrain

### Functions

### General pipeline

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=T.audio_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=T.audio_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [6]:
# model
# model = ResNet34()
model = timm.create_model('resnet34', pretrained=True, num_classes=128, in_chans=1)
model = model.to(device)

In [7]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [8]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="resnet",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 12.77603850364685
Average train accuracy: 43.333336639404294


Eval (epoch = 0): 100%|██████████| 594/594 [01:16<00:00,  7.76it/s]



Average val eer: 40.396064017445006

Average val accuracy: 59.49687850729518
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.64s/it]

Best acc model saved at epoch 0
Time elapsed: 1.327319642699634  minutes





### timm REsNet + SAP

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=T.audio_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=T.audio_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [5]:
# model
# model = ResNet34()
model = timm.create_model('resnet34', pretrained=True, num_classes=128, in_chans=1)
model.global_pool = SelfAttentivePool2d()

In [6]:
model = model.to(device)

In [7]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [8]:
# train
model = train_model(model,
                    train_dataloader, 

                    
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="resnet_sap",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 16.273793411254882
Average train accuracy: 40.000003051757815


Eval (epoch = 0): 100%|██████████| 594/594 [01:16<00:00,  7.80it/s]



Average val eer: 39.018451303037835

Average val accuracy: 60.91820987654321
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.41s/it]

Best acc model saved at epoch 0
Time elapsed: 1.3235322367555151  minutes





### Try AST (Audio Spectrogram Transformer)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:

from transformers import ASTFeatureExtractor

feature_extractor = ASTFeatureExtractor()

In [4]:

from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [7]:
model = model.to(device)

In [8]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

In [9]:
def ast_transform(data_wav, sample_rate):
    data_wav = data_wav.squeeze().numpy()
    inputs = feature_extractor(data_wav, sampling_rate=sample_rate, padding="max_length", return_tensors="pt")
    input_values = inputs.input_values
    return input_values

In [10]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=ast_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=ast_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [11]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [12]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="AST",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [6, 1, 1024, 1, 128]

In [None]:
with torch.no_grad():

    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

list(last_hidden_states.shape)

In [13]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

### Try Evaluation as from Korean guys

In [9]:
import soundfile
import torchaudio

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=None, 
                            audio_transform=None)

# valid_dataloader = DataLoader(dataset=valid_dataset,
#                             batch_size=1,
#                             shuffle=True,
#                             num_workers=4, 
#                             pin_memory=True)

In [8]:
sample = valid_dataset[0]

/workdir/sf_pv/data_v2/sub_101/23/wav/139.wav
/workdir/sf_pv/data_v2/sub_101/25/wav/49.wav


1. Load audio from torchaudio and from soundfile

In [10]:
path2wav1 = "/workdir/sf_pv/data_v2/sub_101/23/wav/139.wav"
path2wav2 = "/workdir/sf_pv/data_v2/sub_101/25/wav/49.wav"

In [11]:
data_wav, sample_rate = torchaudio.load(path2wav1) 
audio1, sample_rate1 = soundfile.read(path2wav1)

In [13]:
print(sample_rate)
print(sample_rate1)

16000
16000


In [14]:
print(data_wav)

tensor([[0.0045, 0.0071, 0.0056,  ..., 0.0119, 0.0111, 0.0084]])


In [15]:
print(audio1)

[0.00445557 0.00714111 0.00561523 ... 0.01193237 0.01113892 0.00842285]


In [11]:
for batch in valid_dataloader:
    id1, id2, labels = batch

    wav_id1, rgb_id1, thr_id1, person_id1 = id1
    wav_id2, rgb_id2, thr_id2, person_id2 = id2

    break

In [13]:
print(f"Same person: {bool(labels)}")
print(labels)

print(f"First person id: {person_id1}")
print(f"Second person id: {person_id2}")

Same person: True
tensor([1])
First person id: tensor([113])
Second person id: tensor([113])


In [16]:
for sample in valid_dataset:
    id1, id2, label = sample

    print(f"Same person: {bool(label)}")

    wav_id1, rgb_id1, thr_id1, person_id1 = id1
    wav_id2, rgb_id2, thr_id2, person_id2 = id2

    print(f"First person id: {person_id1}")
    print(f"Second person id: {person_id2}")

    break

Same person: True
First person id: 101
Second person id: 101


### SpeechBrain embeddings

In [14]:
from speechbrain.pretrained import EncoderClassifier

In [15]:
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb")

In [18]:
classifier.encode_batch(wav_id1)

NotImplementedError: Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now

In [17]:
embeddings_id1 = classifier.encode_batch(wav_id1[0])
embeddings_id2 = classifier.encode_batch(wav_id2[0])

In [26]:
cos_sim = F.cosine_similarity(embeddings_id1, embeddings_id2, dim=1)

In [28]:
EER_(cos_sim, label)

AttributeError: 'int' object has no attribute 'cpu'

In [None]:
eer, scores = EER_(cos_sim, label)
accuracy = accuracy_(labels, scores)

In [None]:
from speechbrain.pretrained import SpeakerRecognition
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")

score, prediction = verification.verify_files(file1, file2)

print(score)
print(prediction) # True = same speaker, False=Different speakers

### Добавила варьирование параметров для transform

In [30]:
from speaker_verification.transforms import Audio_Transforms
from speaker_verification.transforms import Image_Transforms
from speaker_verification.models import Model

In [31]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [43]:
library="timm"
modality = "wav"
model_name = "resnet34"
pool="default"

In [44]:
model = Model(library=library, 
                pretrained_weights=True, 
                fine_tune=False, 
                embedding_size=128, 
                modality = modality,
                model_name = model_name,
                pool=pool)
model = model.to(device)

audio_T = Audio_Transforms(sample_rate=16000,
                            sample_duration=3, # seconds
                            n_fft=512, # from Korean code
                            win_length=400,
                            hop_length=160,
                            window_fn=torch.hamming_window,
                            n_mels=40)

image_T = Image_Transforms(model,
                            library=library,
                            model_name = model_name,
                            resize=128)

In [45]:
# dataset
ANNOTATIONS_FILE = "/workdir/Speaker_Verification_version_1.0/Speaker-Verification/annotations_file_short_SF.csv"
PATH2DATASET = "/workdir/sf_pv"
DATASET_DIR = f'{PATH2DATASET}/data_v2'

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=image_T.transform, 
                                    audio_transform=audio_T.transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=image_T.transform, 
                            audio_transform=audio_T.transform)

In [46]:
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [47]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [48]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/Speaker_Verification_version_1.0/results",
                    exp_name="chern",
                    modality=modality)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train (epoch = 0):   0%|          | 0/10 [00:00<?, ?it/s]


Average train loss: 8.036772561073303
Average train accuracy: 26.666668701171876


Eval (epoch = 0):   0%|          | 0/594 [00:00<?, ?it/s]


Average val eer: 38.135066470414856

Average val accuracy: 61.7555765993266
Best eer model saved at epoch 0
Best acc model saved at epoch 0
Time elapsed: 1.227491514896974  minutes


## Evaluation from Korea

In [None]:
def loadWAV(filename, max_frames, evalmode=True, num_eval=10):

    # Maximum audio length
    max_audio = max_frames * 160 + 240

    # Read wav file and convert to torch tensor
    audio, sample_rate = soundfile.read(filename)

    audiosize = audio.shape[0]

    if audiosize <= max_audio:
        shortage    = max_audio - audiosize + 1 
        audio       = numpy.pad(audio, (0, shortage), 'wrap')
        audiosize   = audio.shape[0]

    if evalmode:
        startframe = numpy.linspace(0,audiosize-max_audio,num=num_eval)
    else:
        startframe = numpy.array([numpy.int64(random.random()*(audiosize-max_audio))])
    
    feats = []
    if evalmode and max_frames == 0:
        feats.append(audio)
    else:
        for asf in startframe:
            feats.append(audio[int(asf):int(asf)+max_audio])

    feat = numpy.stack(feats,axis=0).astype(numpy.float)

    return feat