In [21]:
import os
import numpy as np
import librosa
import pandas as pd
import aggregators
import embeddors
import data_utils
import tensorflow as tf
from tensorflow.keras.layers import Dense, InputLayer
from alibi_detect.cd import MMDDrift
from alibi_detect.cd.preprocess import UAE 

In [None]:
# Root directory of the VOiCES dataset.  Set the VOICES_ROOT environment
# variable to point at your own copy; when it is unset the path below is used.
DATASET_ROOT = os.environ.get('VOICES_ROOT', '/Users/jberkowitz/Datasets/VOiCES_devkit')


def add_root(x):
    """Return the path `x` joined onto DATASET_ROOT.

    Args:
        x: Path (relative to the dataset root) of a file or directory.

    Returns:
        The result of `os.path.join(DATASET_ROOT, x)`.
    """
    return os.path.join(DATASET_ROOT, x)

First we will load the dataframe for the `training` slice of VOiCES and perform a few cleaning operations.

In [6]:
# Read the index CSV for the training slice
full_index_df = pd.read_csv(add_root('references/train_index.csv'))
# Keep only the rows where the noisy recording's length equals the source audio's length
lengths_match = full_index_df['noisy_length'] == full_index_df['source_length']
trimmed_index = full_index_df.loc[lengths_match]
trimmed_index.head(2)

Unnamed: 0,index,chapter,degrees,distractor,filename,gender,mic,query_name,room,segment,source,speaker,transcript,noisy_length,noisy_sr,noisy_time,source_length,source_sr,source_time
0,0,9960,60,musi,distant-16k/speech/train/rm1/musi/sp0083/Lab41...,F,1,Lab41-SRI-VOiCES-rm1-musi-sp0083-ch009960-sg00...,rm1,42,source-16k/train/sp0083/Lab41-SRI-VOiCES-src-s...,83,the horrible glowing tool disappeared into the...,258880,16000,16.18,258880,16000,16.18
1,1,9960,60,musi,distant-16k/speech/train/rm1/musi/sp0083/Lab41...,F,5,Lab41-SRI-VOiCES-rm1-musi-sp0083-ch009960-sg00...,rm1,42,source-16k/train/sp0083/Lab41-SRI-VOiCES-src-s...,83,the horrible glowing tool disappeared into the...,258880,16000,16.18,258880,16000,16.18


In [19]:
# Count the retained rows whose distractor is 'musi'
is_musi = trimmed_index['distractor'] == 'musi'
int(is_musi.sum())

3200

Next we initialize the embeddors

In [14]:
# Wav2Vec embeddor, given the path of its weights file
w2v = embeddors.Wav2VecEmbeddor(
    weight_path='models/wav2vec_large.pt',
)
# TRILL embeddor, constructed with no arguments
trill = embeddors.TrillEmbeddor()

2020-12-06 13:23:05 | INFO | fairseq.models.wav2vec.wav2vec | Wav2VecModel(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(8,), stride=(4,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm

Initialize the aggregators

In [15]:
# One aggregator of each kind, both constructed with no arguments
mean_ag, pca_ag = aggregators.MeanAggregator(), aggregators.PCAAggregator()

We will also use untrained autoencoders (UAE) to do some preliminary dimensionality reduction, as in [Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift](https://arxiv.org/pdf/1810.11953.pdf). We will initialize a separate UAE for the TRILL and Wav2Vec embeddings, as they have different dimensionality.

In [23]:
tf.random.set_seed(0) # set seed for reproducibility
encoding_dim=10 # fixed dimensionality

# Input widths of the two encoders (one per embedding type)
W2V_EMBEDDING_DIM = 512
TRILL_EMBEDDING_DIM = 2048


def _build_uae_encoder(input_dim, output_dim, hidden_dim=100):
    """Build the encoder network wrapped by an untrained autoencoder (UAE).

    Args:
        input_dim: Length of each input vector.
        output_dim: Length of each encoded vector.
        hidden_dim: Width of the intermediate Dense layer.

    Returns:
        A `tf.keras.Sequential` made of
        InputLayer(input_dim) -> Dense(hidden_dim) -> Dense(output_dim).
    """
    # NOTE(review): neither Dense layer is given an activation, so the stack
    # is a purely linear projection -- confirm this is intended.
    return tf.keras.Sequential(
        [
            InputLayer(input_shape=(input_dim,)),
            Dense(hidden_dim,),
            Dense(output_dim,)
        ]
    )


# Wav2Vec is built before TRILL; presumably the construction order matters for
# reproducing the same seeded initial weights, so keep it fixed.
w2v_encoder_net = _build_uae_encoder(W2V_EMBEDDING_DIM, encoding_dim)
w2v_uae = UAE(w2v_encoder_net)

trill_encoder_net = _build_uae_encoder(TRILL_EMBEDDING_DIM, encoding_dim)
trill_uae = UAE(trill_encoder_net)