In [2]:
import os
import numpy as np
import librosa
import pandas as pd
import aggregators
import embeddors
import data_utils
import tensorflow as tf
from tensorflow.keras.layers import Dense, InputLayer
from alibi_detect.cd import MMDDrift
from alibi_detect.cd.preprocess import UAE 

2020-12-12 14:10:23 | ERROR | fbprophet.plot | Importing plotly failed. Interactive plots will not work.


In [1]:
# Root of the VOiCES dataset. Point the VOICES_ROOT environment variable at your
# local copy, or change the fallback path below.
DATASET_ROOT = os.environ.get('VOICES_ROOT', '/Users/jberkowitz/Datasets/VOiCES_devkit')


def add_root(x):
    """Convenience function to add the dataset root to a data path.

    Parameters
    ----------
    x : str
        Path relative to DATASET_ROOT (e.g. 'references/train_index.csv').

    Returns
    -------
    str
        `x` joined onto DATASET_ROOT with os.path.join.
    """
    return os.path.join(DATASET_ROOT, x)

First, we will load the index dataframe for the `training` slice of VOiCES and perform a few cleaning operations.

In [3]:
# Read the index table describing the VOiCES training slice
full_index_df = pd.read_csv(add_root('references/train_index.csv'))
# Keep only the rows whose noisy recording has exactly as many samples as its source audio
trimmed_index = full_index_df.loc[
    lambda frame: frame['noisy_length'] == frame['source_length']
]
trimmed_index.head(2)

Unnamed: 0,index,chapter,degrees,distractor,filename,gender,mic,query_name,room,segment,source,speaker,transcript,noisy_length,noisy_sr,noisy_time,source_length,source_sr,source_time
0,0,9960,60,musi,distant-16k/speech/train/rm1/musi/sp0083/Lab41...,F,1,Lab41-SRI-VOiCES-rm1-musi-sp0083-ch009960-sg00...,rm1,42,source-16k/train/sp0083/Lab41-SRI-VOiCES-src-s...,83,the horrible glowing tool disappeared into the...,258880,16000,16.18,258880,16000,16.18
1,1,9960,60,musi,distant-16k/speech/train/rm1/musi/sp0083/Lab41...,F,5,Lab41-SRI-VOiCES-rm1-musi-sp0083-ch009960-sg00...,rm1,42,source-16k/train/sp0083/Lab41-SRI-VOiCES-src-s...,83,the horrible glowing tool disappeared into the...,258880,16000,16.18,258880,16000,16.18


In [26]:
# Number of retained recordings whose distractor is 'babb'
int((trimmed_index['distractor'] == 'babb').sum())

3200

Next we initialize the embeddors

In [5]:
# Wav2Vec embeddor, pointed at the wav2vec_large checkpoint under models/
w2v = embeddors.Wav2VecEmbeddor(
    weight_path='models/wav2vec_large.pt',
)
# TRILL embeddor, constructed with its default arguments
trill = embeddors.TrillEmbeddor()

2020-12-12 14:12:29 | INFO | fairseq.models.wav2vec.wav2vec | Wav2VecModel(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(8,), stride=(4,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm

Initialize the aggregators

In [6]:
# One aggregator of each kind, both constructed with default arguments
mean_ag = aggregators.MeanAggregator()  # mean aggregation
pca_ag = aggregators.PCAAggregator()    # PCA aggregation

We will also use untrained autoencoders (UAE) to do some preliminary dimension reduction, as in [Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift](https://arxiv.org/pdf/1810.11953.pdf). We will initialize a separate UAE for the Trill and Wav2Vec embeddings, as they have different dimensionality.

In [7]:
# Fix TensorFlow's global seed so the randomly initialised encoders are reproducible
tf.random.set_seed(0)
# Output dimensionality shared by both encoders
encoding_dim = 10

# Encoder network for the Wav2Vec vectors (512 input features)
w2v_encoder_net = tf.keras.Sequential([
    InputLayer(input_shape=(512,)),
    Dense(units=100),
    Dense(units=encoding_dim),
])
w2v_uae = UAE(w2v_encoder_net)

# Encoder network for the TRILL vectors (2048 input features)
trill_encoder_net = tf.keras.Sequential([
    InputLayer(input_shape=(2048,)),
    Dense(units=100),
    Dense(units=encoding_dim),
])
trill_uae = UAE(trill_encoder_net)

Next we will vectorize the data and store it in arrays, both for the clean data and for each noise type in VOiCES (none, musi, babb, tele). We will define a convenience function here to automate the production and storage of all these arrays in dictionaries.

In [8]:
def make_vector_dict(file_list):
    """
    Vectorize a collection of wav files with every embeddor/aggregator pairing.

    Inputs
        file_list - a list of paths to the wav files to vectorize; each file is
                    loaded with librosa at a sampling rate of 16000.
    Outputs:
        vector_dict - a dictionary with keys w2v_mean,w2v_pca,trill_mean,trill_pca,
                      each holding the result of data_utils.pipeline for that pairing.
    """
    # librosa.load returns a pair; only its first element (the waveform) is kept
    waveforms = []
    for wav_path in file_list:
        audio, _ = librosa.load(wav_path, sr=16000)
        waveforms.append(audio)

    # (key, embeddor, aggregator) triples, listed in the order they are evaluated
    recipes = (
        ('w2v_mean', w2v, mean_ag),
        ('w2v_pca', w2v, pca_ag),
        ('trill_mean', trill, mean_ag),
        ('trill_pca', trill, pca_ag),
    )
    return {
        key: data_utils.pipeline(wav_list=waveforms, embeddor=emb, aggregator=agg)
        for key, emb, agg in recipes
    }

In [10]:
# NOTE(review): `source_vector_dict` is not assigned in any cell visible above (the cell with
# execution count 9 is absent) — presumably built with make_vector_dict; confirm the defining cell exists.
source_vector_dict

{'w2v_mean': array([[0.0225979 , 0.00788946, 0.04420952, ..., 0.01228718, 0.18554828,
         0.12250672],
        [0.0225979 , 0.00788946, 0.04420952, ..., 0.01228718, 0.18554828,
         0.12250672],
        [0.03045051, 0.01346586, 0.05660842, ..., 0.01263731, 0.16521999,
         0.2128439 ],
        [0.03045051, 0.01346586, 0.05660842, ..., 0.01263731, 0.16521999,
         0.2128439 ],
        [0.03135986, 0.01382883, 0.11795536, ..., 0.01408581, 0.23116378,
         0.16410196]], dtype=float32),
 'w2v_pca': array([[-2.7104260e-03, -9.5924057e-05, -1.1925279e-02, ...,
          3.8271321e-03, -6.2693387e-02,  1.2496358e-02],
        [-2.7104104e-03, -9.5921598e-05, -1.1925279e-02, ...,
          3.8271321e-03, -6.2693425e-02,  1.2496394e-02],
        [-1.8501817e-03, -2.1425122e-03, -1.9867809e-02, ...,
          3.0232358e-03, -4.2727299e-02, -3.7932701e-03],
        [-1.8502675e-03, -2.1425067e-03, -1.9867813e-02, ...,
          3.0232349e-03, -4.2727273e-02, -3.7933113e-03],
