### Library

In [1]:
from speaker_verification import transforms as T
from speaker_verification.dataset import SpeakingFacesDataset
from speaker_verification.dataset import ValidDataset
from speaker_verification.sampler import ProtoSampler
from speaker_verification.sampler import ValidSampler
from speaker_verification.models_handmade.resnet import ResNet34
from speaker_verification.models import SelfAttentivePool2d
from speaker_verification.loss import PrototypicalLoss
from speaker_verification.train import train_model

import torch
from torch.utils.data import DataLoader

from speaker_verification.utils import plot_sample
import timm
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from tqdm import tqdm
from tqdm import trange
import gc
import torch
import torch.nn.functional as F

from speaker_verification.loss import PrototypicalLoss
from speaker_verification.metrics import EER_
from speaker_verification.metrics import accuracy_
from timeit import default_timer as timer

### Functions

### General pipeline

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=T.audio_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=T.audio_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [6]:
# model
# model = ResNet34()
model = timm.create_model('resnet34', pretrained=True, num_classes=128, in_chans=1)
model = model.to(device)

In [7]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [8]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="resnet",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 12.77603850364685
Average train accuracy: 43.333336639404294


Eval (epoch = 0): 100%|██████████| 594/594 [01:16<00:00,  7.76it/s]



Average val eer: 40.396064017445006

Average val accuracy: 59.49687850729518
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.64s/it]

Best acc model saved at epoch 0
Time elapsed: 1.327319642699634  minutes





### timm REsNet + SAP

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=T.audio_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=T.audio_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [5]:
# model
# model = ResNet34()
model = timm.create_model('resnet34', pretrained=True, num_classes=128, in_chans=1)
model.global_pool = SelfAttentivePool2d()

In [6]:
model = model.to(device)

In [7]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [8]:
# train
model = train_model(model,
                    train_dataloader, 

                    
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="resnet_sap",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 16.273793411254882
Average train accuracy: 40.000003051757815


Eval (epoch = 0): 100%|██████████| 594/594 [01:16<00:00,  7.80it/s]



Average val eer: 39.018451303037835

Average val accuracy: 60.91820987654321
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.41s/it]

Best acc model saved at epoch 0
Time elapsed: 1.3235322367555151  minutes





### Try AST (Audio Spectrogram Transformer)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:

from transformers import ASTFeatureExtractor

feature_extractor = ASTFeatureExtractor()

In [4]:

from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [7]:
model = model.to(device)

In [8]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

In [9]:
def ast_transform(data_wav, sample_rate):
    data_wav = data_wav.squeeze().numpy()
    inputs = feature_extractor(data_wav, sampling_rate=sample_rate, padding="max_length", return_tensors="pt")
    input_values = inputs.input_values
    return input_values

In [10]:
# dataset
ANNOTATIONS_FILE = "/workdir/annotations_file_short.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"

train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=T.image_transform, 
                                    audio_transform=ast_transform)

valid_dataset = ValidDataset(PATH2DATASET,'valid',
                            image_transform=T.image_transform, 
                            audio_transform=ast_transform)
# sampler
train_sampler = ProtoSampler(labels = train_dataset.labels,
                                    n_batch = 10,
                                    n_ways = 3, # n_way
                                    n_support = 1, # n_shots
                                    n_query = 1)

# dataloader
train_dataloader = DataLoader(dataset=train_dataset, 
                          batch_sampler=train_sampler,
                          num_workers=4, pin_memory=True
                          )

valid_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=64,
                            shuffle=True,
                            num_workers=4, 
                            pin_memory=True)

In [11]:
# optimizer + scheduler
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='squared_euclidean')
criterion = criterion.to(device)

In [12]:
# train
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=1,
                    save_dir="/workdir/results",
                    exp_name="AST",
                    modality='wav')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [6, 1, 1024, 1, 128]

In [None]:
with torch.no_grad():

    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

list(last_hidden_states.shape)

In [13]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li