# Libraries

In [1]:
cd /workdir/Speaker_Verification_version_1.0/Speaker-Verification

/workdir/Speaker_Verification_version_1.0/Speaker-Verification


In [6]:
from speaker_verification import transforms as T
from speaker_verification.dataset import SpeakingFacesDataset
from speaker_verification.dataset import ValidDataset
from speaker_verification.sampler import ProtoSampler
from speaker_verification.sampler import ValidSampler
from speaker_verification.models_handmade.resnet import ResNet34
from speaker_verification.models import ResNet
from speaker_verification.loss import PrototypicalLoss
from speaker_verification.train import train_model

import torch
from torch.utils.data import DataLoader

from speaker_verification.utils import plot_sample


In [3]:
import timm

In [4]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np

# Different models

## resnet1 (ResNet from Hugging Face)

In [5]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [9]:
# model: project-level ResNet wrapper, RGB modality, 128-dimensional embedding.
# NOTE(review): the section title says "Hugging Face" while from_torch=True is
# passed here — confirm which weight source is actually intended.
resnet_config = dict(
    pretrained_weights=True,
    fine_tune=True,
    embedding_size=128,
    modality="rgb",
    filter_size="default",
    from_torch=True,
)
# Build the network and move it to the selected device in one expression.
model = ResNet(**resnet_config).to(device)

In [10]:
# Image preprocessing shared by the train and validation datasets below;
# it is the `image_transform` object exposed by the project's transforms module.
image_transform = T.image_transform

In [17]:
# dataset
# Paths and run settings for this experiment. ANNOTATIONS_FILE is a relative
# path, so it is resolved against the current working directory.
ANNOTATIONS_FILE = "annotations_file_short_SF.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"
SAVE_DIR = "/workdir/Speaker_Verification_version_1.0/results"
EXP_NAME = "chern"
NUM_EPOCHS = 1
NUM_WORKERS = 4
SEED = 42

# Reproducibility: fix the global torch and NumPy RNGs before any randomised
# component is built. DataLoader shuffling draws from the torch RNG;
# ProtoSampler presumably draws from one of the two — TODO confirm.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Page-locked host memory only speeds up host-to-GPU copies, so request it
# only when the selected device is CUDA (avoids a warning on CPU-only hosts).
PIN_MEMORY = device.type == "cuda"

# Dataset
train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=image_transform, 
                                    audio_transform=T.audio_transform)
valid_dataset = ValidDataset(PATH2DATASET,'valid',
                        image_transform=image_transform, 
                        audio_transform=T.audio_transform)

# sampler
# Episodic batches built from the training labels.
train_sampler = ProtoSampler(train_dataset.labels,
                            n_batch=200,
                            n_ways=60, # n_way
                            n_support=1, # n_shots
                            n_query=1)

# dataloader
# The training loader takes whole batches from the sampler (batch_sampler),
# so batch_size/shuffle are not passed for it.
train_dataloader = DataLoader(dataset=train_dataset, 
                        batch_sampler=train_sampler,
                        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
                        )

# NOTE(review): validation batches are shuffled; if the reported metrics are
# per-batch averages the batch composition matters — confirm this is intended.
valid_dataloader = DataLoader(dataset=valid_dataset,
                        batch_size=64,
                        shuffle=True,
                        num_workers=NUM_WORKERS, 
                        pin_memory=PIN_MEMORY)

# optimizer + scheduler
# StepLR with step_size=1 and gamma=0.5 multiplies the learning rate by 0.5
# on every scheduler step.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='cosine_similarity')
criterion = criterion.to(device)

# train
# `model` is rebound to whatever train_model returns.
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=NUM_EPOCHS,
                    save_dir=SAVE_DIR,
                    exp_name=EXP_NAME,
                    modality="rgb")

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 4.09436205625534
Average train accuracy: 1.5500000739097595


Eval (epoch = 0): 100%|██████████| 594/594 [01:17<00:00,  7.69it/s]



Average val eer: 48.8092462548469

Average val accuracy: 50.56642817059483
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:48<00:00, 108.02s/it]

Best acc model saved at epoch 0
Time elapsed: 1.800370344084998  minutes





## resnet2 (ResNet-34 from timm)

In [18]:
# Re-select the compute device for this experiment: GPU if CUDA is usable, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [19]:
# ResNet-34 from timm with pretrained weights and 3 input channels.
# num_classes=128 sizes the final layer to 128 outputs; presumably these serve
# as the embedding, matching embedding_size=128 in the resnet1 experiment —
# TODO confirm.
timm_options = dict(pretrained=True, num_classes=128, in_chans=3)
model = timm.create_model('resnet34', **timm_options)

# Same image preprocessing object as used for the project ResNet.
image_transform = T.image_transform

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34-43635321.pth" to /workdir/data/.torch/hub/checkpoints/resnet34-43635321.pth


In [20]:
# Collect names for inspection: named_parameters()/named_modules() yield
# (name, object) pairs, of which only the name (index 0) is kept.
param_name = [entry[0] for entry in model.named_parameters()]  # all parameter names
layer_name = [entry[0] for entry in model.named_modules()]  # all layer (module) names

In [23]:
# Full fine-tuning: flag every parameter of the model as trainable so that
# gradients are computed for all of them.
for param in model.parameters():
    param.requires_grad = True

In [25]:
# Move the model to the selected device before the optimizer is created from its parameters.
model=model.to(device)

In [26]:
# dataset
# Paths and run settings for this experiment. ANNOTATIONS_FILE is a relative
# path, so it is resolved against the current working directory.
ANNOTATIONS_FILE = "annotations_file_short_SF.csv"
DATASET_DIR = '/workdir/sf_pv/data_v2'
PATH2DATASET = "/workdir/sf_pv"
# NOTE(review): same save directory and experiment name as the resnet1
# experiment earlier in this notebook. If train_model derives checkpoint file
# names from them, this run presumably overwrites those checkpoints —
# confirm, and consider a distinct experiment name.
SAVE_DIR = "/workdir/Speaker_Verification_version_1.0/results"
EXP_NAME = "chern"
NUM_EPOCHS = 1
NUM_WORKERS = 4
SEED = 42

# Reproducibility: fix the global torch and NumPy RNGs before any randomised
# component is built. DataLoader shuffling draws from the torch RNG;
# ProtoSampler presumably draws from one of the two — TODO confirm.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Page-locked host memory only speeds up host-to-GPU copies, so request it
# only when the selected device is CUDA (avoids a warning on CPU-only hosts).
PIN_MEMORY = device.type == "cuda"

# Dataset
train_dataset = SpeakingFacesDataset(ANNOTATIONS_FILE,DATASET_DIR,'train',
                                    image_transform=image_transform, 
                                    audio_transform=T.audio_transform)
valid_dataset = ValidDataset(PATH2DATASET,'valid',
                        image_transform=image_transform, 
                        audio_transform=T.audio_transform)

# sampler
# Episodic batches built from the training labels.
train_sampler = ProtoSampler(train_dataset.labels,
                            n_batch=200,
                            n_ways=60, # n_way
                            n_support=1, # n_shots
                            n_query=1)

# dataloader
# The training loader takes whole batches from the sampler (batch_sampler),
# so batch_size/shuffle are not passed for it.
train_dataloader = DataLoader(dataset=train_dataset, 
                        batch_sampler=train_sampler,
                        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
                        )

# NOTE(review): validation batches are shuffled; if the reported metrics are
# per-batch averages the batch composition matters — confirm this is intended.
valid_dataloader = DataLoader(dataset=valid_dataset,
                        batch_size=64,
                        shuffle=True,
                        num_workers=NUM_WORKERS, 
                        pin_memory=PIN_MEMORY)

# optimizer + scheduler
# StepLR with step_size=1 and gamma=0.5 multiplies the learning rate by 0.5
# on every scheduler step.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)

# loss
criterion = PrototypicalLoss(dist_type='cosine_similarity')
criterion = criterion.to(device)

# train
# `model` is rebound to whatever train_model returns.
model = train_model(model,
                    train_dataloader, 
                    valid_dataloader,
                    train_sampler,
                    criterion,
                    optimizer,
                    scheduler,
                    device,
                    num_epochs=NUM_EPOCHS,
                    save_dir=SAVE_DIR,
                    exp_name=EXP_NAME,
                    modality="rgb")

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


Average train loss: 4.09741884469986
Average train accuracy: 1.8833334314823151


Eval (epoch = 0): 100%|██████████| 594/594 [01:17<00:00,  7.64it/s]



Average val eer: 49.467722624598146

Average val accuracy: 50.57519640852974
Best eer model saved at epoch 0


Epoch: 100%|██████████| 1/1 [01:48<00:00, 108.89s/it]

Best acc model saved at epoch 0
Time elapsed: 1.8148329654708504  minutes



