# Simple Example to EXTRACT EMBEDDING Vector from a wav file using a pre-trained VGGVox for model

* ## See VGGVOx_1.ipynb and also:

https://github.com/Derpimort/VGGVox-PyTorch

https://rzimmermann.com/coding/vggvox-pytorch


* ## Clonar el github: VGGVox PyTorch

In [1]:
! git clone https://github.com/Derpimort/VGGVox-PyTorch.git

Cloning into 'VGGVox-PyTorch'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 101 (delta 1), reused 5 (delta 1), pack-reused 93[K
Receiving objects: 100% (101/101), 64.81 MiB | 38.90 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [2]:
ls ./VGGVox-PyTorch/

[0m[01;34mdata[0m/    [01;34mmodels[0m/        README.md         results.txt      test.wav  vggm.py
LICENSE  model_test.py  requirements.txt  signal_utils.py  train.py


## * Change the current directory to /content/VGGVox-PyTorch

In [1]:
cd /content/VGGVox-PyTorch


/content/VGGVox-PyTorch


# Install packages
# <font color=red>WARNING!!! once installed, you have to reset the virtual machine (see the warning message)

In [4]:
! pip install -r requirements.txt

Collecting argparse
  Downloading https://files.pythonhosted.org/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl
Installing collected packages: argparse
Successfully installed argparse-1.4.0


# LET's see what data do we need

In [2]:
cd /content/VGGVox-PyTorch

/content/VGGVox-PyTorch


In [3]:
ls -al ./data/

total 5228
drwxr-xr-x 2 root root    4096 Dec 16 10:37 [0m[01;34m.[0m/
drwxr-xr-x 5 root root    4096 Dec 16 10:37 [01;34m..[0m/
-rw-r--r-- 1 root root 4912512 Dec 16 10:37 iden_split.txt
-rw-r--r-- 1 root root  387503 Dec 16 10:37 val.pkl
-rw-r--r-- 1 root root   40782 Dec 16 10:37 vox1_meta.csv


# Create a wav directory inside data

In [4]:
cd ./data

/content/VGGVox-PyTorch/data


In [5]:
! mkdir ./wav

# Mount our Google Drive and copy some wav files

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
ls /content/drive/'My Drive'/VGGVOX_PyTorch

id10270.zip                    VGGVox_Distance_Embedding.ipynb
id10275.zip                    VGGVox_Embeddings_EmoPCFBorja_wavDir_SVM.ipynb
[0m[01;34mloc1[0m/                          VGGVox_Embeddings_Simple.ipynb
[01;34mloc2[0m/                          VGGVox_Embeddings_wavDir.ipynb
MLLB_Keras_FontReco_CNN.ipynb  VGGVox_Embed_Female_Male_wavDir.ipynb
MLLB_OSA_Simple_PCA.ipynb      VGGVox_Embed_Female_Male_wavDir_SVM.ipynb
PFCBorjaGAPS.zip               wavEmo_females_males.zip
VGGVox_1.ipynb                 wav.zip


* # **Copiamos los audios de un locutor "loc1" que está en Google Drive en el directorio local**
/VGGVOX_PyTorch/loc1 /content/VGGVox-PyTorch/data/wav/

In [30]:
cp -rf /content/drive/'My Drive'/VGGVOX_PyTorch/loc1 /content/VGGVox-PyTorch/data/wav/.

In [31]:
ls /content/VGGVox-PyTorch/data/wav

[0m[01;34mloc1[0m/


In [32]:
ls /content/VGGVox-PyTorch/data/wav/loc1

00001.wav  00002.wav  00003.wav  00004.wav  00005.wav


* # Create a data frame with the wav file and other columns

In [8]:
import pandas as pd  
    
## NOTE: label must be an integer (to accomplish AudioDataset)
df_F = pd.DataFrame([[3, 'loc1/00001.wav', 1001]], 
                  columns = ['Set', 'Path', 'Label'])


In [9]:
df_F

Unnamed: 0,Set,Path,Label
0,3,loc1/00001.wav,1001


---
---


# Now we are going to extract the embeddings from this wav files

* ## **IT IS IMPORTANT TO MOVE to the VGGVox-PyTorch directory**

In [10]:
cd /content/VGGVox-PyTorch

/content/VGGVox-PyTorch


In [11]:
pwd

'/content/VGGVox-PyTorch'

In [12]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Subset, Dataset, DataLoader
from tqdm.auto import tqdm
from vggm import VGGM
import argparse
from train import AudioDataset, accuracy, ppdf, LOCAL_DATA_DIR, MODEL_DIR


* # NOTE the DATA_DIR is data/wav/ that will be concatenated with ./loc1

In [13]:
DATA_DIR = '/content/VGGVox-PyTorch/data/wav/'

In [14]:
ls /content/VGGVox-PyTorch/data/wav

* # Construimos el diccionario de Python Datasets, AudioDataset, y a partir de él creamos los Dataloaders : en este caso solo hay un Dataset (en el ejemplo de partida había tres: uno para train, otro validación, otro de test)

In [18]:
#Datasets={
#        "test":AudioDataset(df_F[df_F['Set']==3], DATA_DIR, is_train=False)}

Datasets={
        "test":AudioDataset(df_F, DATA_DIR, is_train=False)}

In [19]:
Dataloaders={i:DataLoader(Datasets[i], batch_size=1, shuffle=False, num_workers=2) for i in Datasets}

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [21]:
MODEL_DIR

'models/'

In [22]:
cd /content/VGGVox-PyTorch/

/content/VGGVox-PyTorch


In [23]:
ls ./models

VGGM300_BEST_140_81.99.pth


# Cargamos el modelo VGGVox pre entrenado : VGGM300_BEST_140_81.99.pth

In [24]:
model=VGGM(1251)
#model.load_state_dict(torch.load(DATA_DIR+"/VGGMVAL_BEST_149_80.84.pth", map_location=device))
model.load_state_dict(torch.load(MODEL_DIR+"VGGM300_BEST_140_81.99.pth", map_location=device))
model.to(device)
model.eval()

VGGM(
  (features): Sequential(
    (conv1): Conv2d(1, 96, kernel_size=(7, 7), stride=(2, 2), padding=(1, 1))
    (bn1): BatchNorm2d(96, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True)
    (relu1): ReLU()
    (mpool1): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(96, 256, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True)
    (relu2): ReLU()
    (mpool2): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv3): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn3): BatchNorm2d(384, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True)
    (relu3): ReLU()
    (conv4): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn4): BatchNorm2d(256, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True)
    (relu4): ReLU()
   

# EXTRACTION OF EMBEDDINGS !!!

* # NOTE that the network has two parts features and classification

In [25]:
list(VGGM().features)

[Conv2d(1, 96, kernel_size=(7, 7), stride=(2, 2), padding=(1, 1)),
 BatchNorm2d(96, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True),
 ReLU(),
 MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False),
 Conv2d(96, 256, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1)),
 BatchNorm2d(256, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True),
 ReLU(),
 MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False),
 Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 BatchNorm2d(384, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True),
 ReLU(),
 Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 BatchNorm2d(256, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True),
 ReLU(),
 Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 BatchNorm2d(256, eps=1e-05, momentum=0.5, affine=True, track_running_stats=True),
 ReLU(),
 MaxPool2d(kernel_siz

In [26]:
print(model.classifier)

Sequential(
  (fc7): Linear(in_features=4096, out_features=1024, bias=True)
  (relu7): ReLU()
  (fc8): Linear(in_features=1024, out_features=1251, bias=True)
)


## This is to extract an activation from one layer ...

In [27]:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

# Now we test it!

In [56]:
def test_embeddings(model, Dataloaders):

    for audio, labels in Dataloaders:
        audio = audio.to(device)
        labels = labels.to(device)
        model.classifier.fc7.register_forward_hook(get_activation('fc7'))
        outputs = model(audio)
        
        print('embedding vector :', activation['fc7'])

        print('\n Las salidas de la red, son las puntuaciones de los 1251 locutores de VoxCeleb que a nosotros no nos interesan  : \n', outputs)

        print('\n outputs length:', outputs.shape)

In [58]:
test_embeddings(model, Dataloaders['test'])

embedding vector : tensor([[-3.0984e-11, -5.8438e-11, -4.9172e-10,  ...,  6.2839e+00,
         -3.6131e-11, -3.0457e-11]], device='cuda:0')

 Las salidas de la red, son las puntuaciones de los 1251 locutores de VoxCeleb que a nosotros no nos interesan  : 
 tensor([[ 1.3943,  5.2511,  3.6531,  ...,  5.0330, -2.8473, -4.3761]],
       device='cuda:0', grad_fn=<AddmmBackward>)

 outputs length: torch.Size([1, 1251])


* # NEXT a simple Notebook to extract embeddings from TWO wav files and compare them

VGGVox_Distance_Embeddings.ipynb