# Libraries

In [1]:
cd /workdir/Speaker_Verification_version_1.0/Speaker-Verification/

/workdir/Speaker_Verification_version_1.0/Speaker-Verification


In [2]:
from speaker_verification import transforms as T
from speaker_verification.dataset import SpeakingFacesDataset
from speaker_verification.dataset import ValidDataset
from speaker_verification.sampler import ProtoSampler
from speaker_verification.sampler import ValidSampler
from speaker_verification.models_handmade.resnet import ResNet34
from speaker_verification.models import ResNet
from speaker_verification.loss import PrototypicalLoss
from speaker_verification.train import train_model

import torch
from torch.utils.data import DataLoader

from speaker_verification.utils import plot_sample


In [3]:
import timm

In [4]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np

# Different models

## resnet1 (ResNet from PyTorch)

In [16]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [27]:
# Fine-tuning configuration of the project's ResNet wrapper (pretrained
# weights, from_torch=True). fine_tune must be True in this cell: every
# message printed below reports the fine_tune=True case, and this cell is the
# counterpart of the fine_tune=False cell that follows it.
model = ResNet(
    pretrained_weights=True,
    fine_tune=True,
    embedding_size=128,
    modality="rgb",
    filter_size="default",
    from_torch=True,
)
model = model.to(device)

# Count parameter elements, split by their requires_grad flag.
unfrozen = 0
frozen = 0
total = 0
for param in model.parameters():
    if param.requires_grad:
        unfrozen += param.numel()
    else:
        frozen += param.numel()
    total += param.numel()

print(f"Number of unfrozen parameters with fine_tune=True: {unfrozen}/{total} - {unfrozen/total * 100} %")
print(f"Number of frozen parameters with fine_tune=True: {frozen}/{total} - {frozen / total * 100} %")
print(f"Total number of parameters with fine_tune=True: {total}")

Number of unfrozen parameters with fine_tune=True: 21350336/21350336 - 100.0 %
Number of frozen parameters with fine_tune=True: 0/21350336 - 0.0 %
Total number of parameters with fine_tune=True: 21350336


Check whether the number of trainable (unfrozen) parameters changes when `fine_tune` is switched from `True` to `False`.

In [29]:
# Same ResNet wrapper as the fine-tuning cell, but with fine_tune=False.
model = ResNet(
    pretrained_weights=True,
    fine_tune=False,
    embedding_size=128,
    modality="rgb",
    filter_size="default",
    from_torch=True,
)
model = model.to(device)

# Walk the parameters once: accumulate the grand total and the part that
# still has requires_grad set; the frozen share is whatever remains.
total = 0
unfrozen = 0
for param in model.parameters():
    total += param.numel()
    if param.requires_grad:
        unfrozen += param.numel()
frozen = total - unfrozen

print(f"Number of unfrozen parameters with fine_tune=False: {unfrozen}/{total} - {unfrozen/total * 100} %")
print(f"Number of frozen parameters with fine_tune=False: {frozen}/{total} - {frozen / total * 100} %")
print(f"Total number of parameters with fine_tune=False: {total}")

Number of unfrozen parameters with fine_tune=False: 65664/21350336 - 0.307554878761627 %
Number of frozen parameters with fine_tune=False: 21284672/21350336 - 99.69244512123836 %
Total number of parameters with fine_tune=False: 21350336


In [17]:
# Data locations
ANNOTATIONS_FILE = "annotations_file_short_SF.csv"
DATASET_DIR = "/workdir/sf_pv/data_v2"
PATH2DATASET = "/workdir/sf_pv"

# Train / validation datasets share the same image and audio transforms.
train_dataset = SpeakingFacesDataset(
    ANNOTATIONS_FILE,
    DATASET_DIR,
    "train",
    image_transform=T.image_transform,
    audio_transform=T.audio_transform,
)
valid_dataset = ValidDataset(
    PATH2DATASET,
    "valid",
    image_transform=T.image_transform,
    audio_transform=T.audio_transform,
)

# Episodic sampler for training: 200 batches, 60 ways, 1 support + 1 query.
train_sampler = ProtoSampler(
    train_dataset.labels,
    n_batch=200,
    n_ways=60,    # n_way
    n_support=1,  # n_shots
    n_query=1,
)

# Training batches come from the sampler; validation uses fixed-size batches.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_sampler=train_sampler,
    num_workers=4,
    pin_memory=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Adam without weight decay; the learning rate is halved after every epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=1,
    gamma=0.5,
    last_epoch=-1,
)

# Prototypical loss using cosine similarity, placed on the training device.
criterion = PrototypicalLoss(dist_type="cosine_similarity")
criterion = criterion.to(device)

# Run training; `model` is rebound to whatever train_model returns.
model = train_model(
    model,
    train_dataloader,
    valid_dataloader,
    train_sampler,
    criterion,
    optimizer,
    scheduler,
    device,
    num_epochs=1,
    save_dir="/workdir/Speaker_Verification_version_1.0/results",
    exp_name="chern",
    modality="rgb",
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 4.09436205625534
Average train accuracy: 1.5500000739097595


Eval (epoch = 0): 100%|██████████| 594/594 [01:17<00:00,  7.69it/s]



Average val eer: 48.8092462548469

Average val accuracy: 50.56642817059483
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:48<00:00, 108.02s/it]

Best acc model saved at epoch 0
Time elapsed: 1.800370344084998  minutes





## resnet2 (from timm)

In [5]:
# CPU is the fallback; CUDA is selected whenever it is available.
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")

In [6]:
# timm ResNet-34 with pretrained weights, a 128-output classifier and
# 3 input channels, moved to the selected device.
model = timm.create_model(
    "resnet34",
    pretrained=True,
    num_classes=128,
    in_chans=3,
).to(device)

In [7]:
# Parameter census before any freezing: one pass over the parameters,
# tracking the total and the frozen part; the unfrozen part is the difference.
total = 0
frozen = 0
for param in model.parameters():
    total += param.numel()
    if not param.requires_grad:
        frozen += param.numel()
unfrozen = total - frozen

print(f"Number of unfrozen parameters with fine_tune=True: {unfrozen}/{total} - {unfrozen/total * 100} %")
print(f"Number of frozen parameters with fine_tune=True: {frozen}/{total} - {frozen / total * 100} %")
print(f"Total number of parameters with fine_tune=True: {total}")

Number of unfrozen parameters with fine_tune=True: 21350336/21350336 - 100.0 %
Number of frozen parameters with fine_tune=True: 0/21350336 - 0.0 %
Total number of parameters with fine_tune=True: 21350336


In [16]:
# Switch gradients off for every parameter of the model ...
for param in model.parameters():
    param.requires_grad_(False)

# ... then switch them back on for the classifier's weight and bias only.
model.get_classifier().weight.requires_grad_(True)
model.get_classifier().bias.requires_grad_(True)

In [17]:
# Parameter census after freezing: accumulate the total and the part that
# still has requires_grad set; the frozen count follows by subtraction.
total = 0
unfrozen = 0
for param in model.parameters():
    total += param.numel()
    unfrozen += param.numel() if param.requires_grad else 0
frozen = total - unfrozen

print(f"Number of unfrozen parameters with fine_tune=False: {unfrozen}/{total} - {unfrozen/total * 100} %")
print(f"Number of frozen parameters with fine_tune=False: {frozen}/{total} - {frozen / total * 100} %")
print(f"Total number of parameters with fine_tune=False: {total}")

Number of unfrozen parameters with fine_tune=False: 65664/21350336 - 0.307554878761627 %
Number of frozen parameters with fine_tune=False: 21284672/21350336 - 99.69244512123836 %
Total number of parameters with fine_tune=False: 21350336


In [26]:
# Where the annotations and the data live
ANNOTATIONS_FILE = "annotations_file_short_SF.csv"
DATASET_DIR = "/workdir/sf_pv/data_v2"
PATH2DATASET = "/workdir/sf_pv"

# Datasets: both splits use the same image and audio transforms.
train_dataset = SpeakingFacesDataset(
    ANNOTATIONS_FILE, DATASET_DIR, "train",
    image_transform=T.image_transform,
    audio_transform=T.audio_transform,
)
valid_dataset = ValidDataset(
    PATH2DATASET, "valid",
    image_transform=T.image_transform,
    audio_transform=T.audio_transform,
)

# Sampler: 200 episodes per epoch, 60 ways, one support and one query sample.
train_sampler = ProtoSampler(
    train_dataset.labels,
    n_batch=200,
    n_ways=60,    # n_way
    n_support=1,  # n_shots
    n_query=1,
)

# Loaders: sampler-driven batches for training, batches of 64 for validation.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_sampler=train_sampler,
    num_workers=4,
    pin_memory=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Optimisation: Adam (lr 1e-3, no weight decay), lr multiplied by 0.5 each epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=1, gamma=0.5, last_epoch=-1
)

# Loss: prototypical loss with cosine similarity, moved to the device.
criterion = PrototypicalLoss(dist_type="cosine_similarity")
criterion = criterion.to(device)

# Training run; the name `model` is rebound to train_model's return value.
model = train_model(
    model,
    train_dataloader,
    valid_dataloader,
    train_sampler,
    criterion,
    optimizer,
    scheduler,
    device,
    num_epochs=1,
    save_dir="/workdir/Speaker_Verification_version_1.0/results",
    exp_name="chern",
    modality="rgb",
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 4.09741884469986
Average train accuracy: 1.8833334314823151


Eval (epoch = 0): 100%|██████████| 594/594 [01:17<00:00,  7.64it/s]



Average val eer: 49.467722624598146

Average val accuracy: 50.57519640852974
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:48<00:00, 108.89s/it]

Best acc model saved at epoch 0
Time elapsed: 1.8148329654708504  minutes





## try different models

In [5]:
from speaker_verification.models import Model

In [6]:
def train_pipeline(model, device=None, num_epochs=1, exp_name="chern"):
    """Build the data pipeline, optimiser and loss, then run ``train_model``.

    The datasets, sampler settings (50 batches, 5 ways, 1 support, 1 query),
    Adam/StepLR configuration and squared-Euclidean prototypical loss are
    fixed inside this function; only the arguments below vary.

    Args:
        model: Network to train. It is moved to ``device`` before training.
        device: Device to train on (``torch.device`` or device string). When
            ``None`` the second GPU (``cuda:1``) is used if at least two CUDA
            devices are visible, ``cuda:0`` if only one is, and the CPU when
            CUDA is unavailable.
        num_epochs: Passed to ``train_model`` as ``num_epochs``.
        exp_name: Passed to ``train_model`` as ``exp_name``.

    Returns:
        Whatever ``train_model`` returns (presumably the trained model --
        confirm against ``speaker_verification.train``).
    """
    if device is None:
        if not torch.cuda.is_available():
            device = torch.device("cpu")
        elif torch.cuda.device_count() > 1:
            device = torch.device("cuda:1")
        else:
            # Only one GPU is visible, so "cuda:1" would be an invalid
            # ordinal and model.to(...) would fail; use the first GPU.
            device = torch.device("cuda:0")
    else:
        device = torch.device(device)
    model = model.to(device)

    # dataset
    ANNOTATIONS_FILE = "annotations_file_short_SF.csv"
    DATASET_DIR = '/workdir/sf_pv/data_v2'
    PATH2DATASET = "/workdir/sf_pv"

    # Dataset
    train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE, DATASET_DIR, 'train',
                                         image_transform=T.image_transform,
                                         audio_transform=T.audio_transform)
    valid_dataset = ValidDataset(PATH2DATASET, 'valid',
                                 image_transform=T.image_transform,
                                 audio_transform=T.audio_transform)

    # sampler
    train_sampler = ProtoSampler(train_dataset.labels,
                                 n_batch=50,
                                 n_ways=5,  # n_way
                                 n_support=1,  # n_shots
                                 n_query=1)

    # dataloader
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_sampler=train_sampler,
                                  num_workers=4,
                                  pin_memory=True)

    valid_dataloader = DataLoader(dataset=valid_dataset,
                                  batch_size=64,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True)

    # optimizer + scheduler (learning rate is halved after every epoch)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

    # loss
    criterion = PrototypicalLoss(dist_type='squared_euclidean')
    criterion = criterion.to(device)

    # train -- hand the result back to the caller instead of discarding it
    return train_model(model,
                       train_dataloader,
                       valid_dataloader,
                       train_sampler,
                       criterion,
                       optimizer,
                       scheduler,
                       device,
                       num_epochs=num_epochs,
                       save_dir="/workdir/Speaker_Verification_version_1.0/results",
                       exp_name=exp_name,
                       modality="rgb")
        

In [17]:
# resnet34 built through the "pytorch" library backend of the Model wrapper,
# with pretrained weights and fine_tune=False.
model = Model(
    library="pytorch",
    pretrained_weights=True,
    fine_tune=False,
    embedding_size=128,
    modality="rgb",
    model_name="resnet34",
    pool="default",
)

# Trailing semicolon: the cell shows only the training logs, never an echoed value.
train_pipeline(model);

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train (epoch = 0):   0%|          | 0/50 [00:00<?, ?it/s]


Average train loss: 10.1061541390419
Average train accuracy: 43.60000122070313


Eval (epoch = 0):   0%|          | 0/594 [00:00<?, ?it/s]


Average val eer: 35.600263328071065

Average val accuracy: 64.3755260942761
Best eer model saved at epoch 0
Best acc model saved at epoch 0
Time elapsed: 1.3073300561867653  minutes


In [19]:
# Same settings as the "pytorch" run, but with the "timm" library backend.
model = Model(
    library="timm",
    pretrained_weights=True,
    fine_tune=False,
    embedding_size=128,
    modality="rgb",
    model_name="resnet34",
    pool="default",
)

# Trailing semicolon: the cell shows only the training logs, never an echoed value.
train_pipeline(model);

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train (epoch = 0):   0%|          | 0/50 [00:00<?, ?it/s]


Average train loss: 10.644466586387717
Average train accuracy: 42.0000008392334


Eval (epoch = 0):   0%|          | 0/594 [00:00<?, ?it/s]


Average val eer: 33.319663053788055

Average val accuracy: 66.67631172839506
Best eer model saved at epoch 0
Best acc model saved at epoch 0
Time elapsed: 1.3062162003324678  minutes
