In [1]:
## Standard python libraries
import numpy as np
import time
import sys
import matplotlib.pylab as plt
%matplotlib inline
# %pdb

## Magnolia data iteration
sys.path.append('../../')
from src.features.mixer import FeatureMixer
from src.features.wav_iterator import batcher
from src.features.supervised_iterator import SupervisedIterator, SupervisedMixer
from src.features.hdf5_iterator import Hdf5Iterator, SplitsIterator

## LibriSpeech Dev File

In [2]:
# Number of examples requested per get_batch call, and the `shape` argument
# handed to every mixer/iterator built from these settings.
batchsize, datashape = 1024, (64, 257)

# Processed LibriSpeech dev-clean file that the iterators below read from.
# NOTE(review): absolute, machine-specific path.
libridev = '/local_data/teams/magnolia/librispeech/processed_dev-clean.h5'
# Alternative source file, kept for reference:
# libridev='/local_data/teams/magnolia/processed_train-clean-100.h5'

### Feature Mixer

Unsupervised (Non-labeled) feature mixer declaration with several iterators from mixed sources

In [3]:
# Unlabeled feature mixer built from three iterators that all read the same
# dev file; their features are combined with mix_method='add'.
mixer = FeatureMixer([libridev, libridev, libridev], shape=datashape,
                     mix_method='add', diffseed=True, return_key=True)

# time.clock() was removed in Python 3.8. time.perf_counter() is the supported
# high-resolution timer; note that it measures elapsed wall-clock time.
ti = time.perf_counter()
data_batch = mixer.get_batch(batchsize)
tf = time.perf_counter()
print('Regular feature mixer with 3 libridev sources timed at ', (tf-ti), 'sec')

Regular feature mixer with 3 libridev sources timed at  3.282445 sec


### Supervised Feature Mixer
Feature mixer declaration with same number of iterators from mixed sources

In [4]:
# Single-source supervised iterator (not exercised by the timings in this cell).
libriter = SupervisedIterator(libridev, shape=datashape)
# Supervised mixer over three iterators that all read the same dev file.
mixerter = SupervisedMixer([libridev,libridev,libridev], shape=datashape,
                           mix_method='add', diffseed=True, return_key=True)

# Each case is (label printed next to the shapes, extra keyword arguments for
# get_batch). The first case leaves out_TF at its default; the '-1' label is
# carried over from the original cell.
# NOTE(review): confirm that -1 really is get_batch's default for out_TF.
timing_cases = [
    ('-1', {}),                                    # default out_TF
    ('[0,1,2,3,4,5]', {'out_TF': [0,1,2,3,4,5]}),  # subset of array
    ('FullSpec', {'out_TF': None}),                # full spectra
]

for out_tf_label, batch_kwargs in timing_cases:
    # time.perf_counter() replaces time.clock(), which was removed in Python 3.8;
    # it measures elapsed wall-clock time.
    ti = time.perf_counter()
    X, Y, I = mixerter.get_batch(batchsize, **batch_kwargs)
    tf = time.perf_counter()
    print('Supervised feature mixer with 3 libridev sources timed at ', (tf-ti), 'sec')
    # The label now matches the out_TF value actually passed; previously the
    # subset case was printed as "FullSpec" as well.
    print('Shapes [X,Y] for out_TF=' + out_tf_label + ' is [',X.shape,',',Y.shape,']')


Supervised feature mixer with 3 libridev sources timed at  3.2734059999999996 sec
Shapes [X,Y] for out_TF=-1 is [ (1024, 64, 257) , (1024, 3, 257) ]
Supervised feature mixer with 3 libridev sources timed at  3.436992 sec
Shapes [X,Y] for out_TF=FullSpec is [ (1024, 64, 257) , (1024, 3, 1542) ]
Supervised feature mixer with 3 libridev sources timed at  4.911866 sec
Shapes [X,Y] for out_TF=FullSpec is [ (1024, 64, 257) , (1024, 3, 16448) ]


### Splits Iterator
This section shows how to divide the data into training splits, optionally restricted to a specified list of speakers.

Speaker lists are stored in `data/librispeech/authors`:
```
dev-clean-F.txt  test-clean-F.txt  train-clean-100-F.txt
dev-clean-M.txt  test-clean-M.txt  train-clean-100-M.txt
```

For this example, let's use `speaker_keys = dev-clean-F.txt` (the female speakers in dev-clean). You can pass `speaker_keys` to both `Hdf5Iterator` and `SplitsIterator`.

#### Operating the splits
In the iterator class `SplitsIterator`, there is a variable called `split_list` that is a list of lists. Each of the lists in `split_list` has the names of the wav files in that split. So, `split_list[0]` is the $0^{th}$ split, which has the names of all the files in that list.

To set the split number, you must call `set_split`, a method in class `SplitsIterator`. For example, if I want split $0$, then I would call:

```
iterator = SplitsIterator( [0.8, 0.1, 0.1], file_name, **kwargs )
iterator.set_split(0)
next(iterator)
```

The above code will: 

1. create a splits iterator with presumably a training, dev, and test set where each speaker has 80% of his files in the training set, 10% in the development set, and the remainder in the testing set.
2. and set the split to the training split

In [5]:
# Fraction of items assigned to each split (split 0, split 1, split 2).
split_ratio = [0.8, 0.1, 0.1]

# Read the speaker list one key per line. The context manager closes the file
# handle, which the original one-liner left open.
with open('../../data/librispeech/authors/dev-clean-F.txt', 'r') as speaker_file:
    speaker_keys = speaker_file.read().splitlines()

# For reference, a plain iterator over the whole file (no splits, no speaker filter)
iterator_all = Hdf5Iterator(libridev, shape=(10,257))

# Splits iterator restricted to the listed speakers, using the split ratio
iterator_split_keys = SplitsIterator(split_ratio, libridev, speaker_keys=speaker_keys, shape=(10,257))

print( 'There are ', len( iterator_all.h5_groups ), ' people in libri-dev.' )
print( 'of which ', len( iterator_split_keys.h5_groups ), ' are female.')
print( 'In total, the number of items to be used is: ' )

# Select each split in turn and report how many items it holds. The iterator
# is left on the last split, exactly as in the unrolled version.
for split_index in range(len(split_ratio)):
    iterator_split_keys.set_split(split_index)
    print( len(iterator_split_keys.h5_items), ' for split', split_index )

There are  40  people in libri-dev.
of which  20  are female.
In total, the number of items to be used is: 
1106  for split 0
146  for split 1
122  for split 2


### Supervised splits iterator and mixer