## Preprocess the data
### 1. Insert packages

In [13]:
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import tensorflow as tf
from os import path, makedirs
import numpy as np

from scipy.misc import comb
from scipy.spatial.distance import pdist

from collections import Counter
import re

### 2. Set global variables
    variables are stored under a Config() class

In [17]:
class Config(object):
    """Set up model for debugging."""

    trainfile = "./awe_data/swbd.train.npz"
    devfile = "./awe_data/swbd.dev.npz"
    batch_size = 32
    current_epoch = 0
    num_epochs = 100
    feature_dim = 39
    num_layers = 3
    hidden_size = 256
    bidirectional = True
    keep_prob = 0.7
    margin = 0.5
    max_same = 1
    max_diff = 5
    lr = 0.001
    mom = 0.9
    logdir = "./neural-acoustic-word-embeddings/logs/test"
    ckptdir = "./neural-acoustic-word-embeddings/ckpts/test"
    log_interval = 10
    ckpt = None
    debugmode = True

    makedirs(logdir, exist_ok=True)
    makedirs(ckptdir, exist_ok=True)


In [18]:
# set config
config = Config()
print(config.trainfile)

./awe_data/swbd.train.npz


### 3. Dissect DataSet Class

In [12]:
# getattr(object, name[, default]): Return the value of the named attribute of object
partition = "train"
data_scp = getattr(config, "%sfile" % partition)
print(data_scp)

../awe_data/swbd.train.npz


In [None]:
is_train = (partition == "train")


In [20]:
try_train = np.load(config.trainfile)

In [38]:
# set words and uwords
labels = try_train.keys()
words = [re.split("_", x)[0].strip("{").strip("}") for x in labels]
uwords = np.unique(words)
print(len(words), len(uwords))
print(uwords)

9971 1687
['abandoned' 'ability' 'absolute' ... "you're" 'younger' 'yourself']


In [41]:
word2id = {v: k for k, v in enumerate(uwords)} # dictionary unique word -> id; dim = 1687
ids = [word2id[w] for w in words] # list of ids; dim = 9971
#print(ids, len(ids))

[671, 787, 372, 594, 1233, 421, 1345, 1638, 1075, 1522, 1488, 879, 162, 1125, 143, 1279, 814, 495, 286, 814, 453, 845, 1586, 332, 1279, 1257, 1114, 1179, 484, 1273, 388, 49, 1306, 769, 635, 1279, 1423, 159, 1431, 536, 1538, 510, 1516, 274, 155, 969, 64, 583, 1642, 1030, 356, 1316, 1684, 381, 159, 1357, 1633, 596, 1335, 364, 569, 261, 703, 814, 158, 1033, 1607, 82, 962, 414, 220, 1153, 1188, 538, 1557, 1255, 596, 324, 767, 857, 544, 502, 481, 995, 264, 988, 733, 1315, 742, 1426, 3, 1084, 638, 1275, 1341, 180, 182, 484, 584, 1057, 510, 467, 150, 408, 1148, 575, 1188, 1263, 1246, 191, 1456, 148, 1301, 1497, 605, 605, 876, 588, 663, 771, 395, 579, 814, 1247, 81, 276, 1274, 1039, 931, 852, 389, 265, 297, 808, 1164, 883, 159, 753, 537, 1558, 100, 164, 1628, 948, 1337, 1429, 636, 305, 287, 1457, 1558, 814, 286, 753, 1259, 1434, 1444, 927, 657, 286, 1127, 1178, 1027, 1316, 562, 1374, 970, 627, 1170, 766, 1553, 1275, 1209, 592, 769, 356, 1245, 332, 1275, 70, 1411, 1092, 669, 1193, 792, 1429, 92

In [46]:
data = np.array([try_train[label] for label in labels])
print("data shape = {}".format(data.shape))
print(data)

data shape = (9971, 39, 200)
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 

In [78]:
feature_mean, n = 0.0, 0
for x in data:
    feature_mean += np.sum(x)
    nonzero = np.count_nonzero(x)
    n += nonzero
    
print("feature_mean = {}, n = {}".format(feature_mean, n))
feature_mean = feature_mean / n
print(feature_mean)

feature_mean = -0.36783453636780905, n = 24603228
-1.4950661611062135e-08


In [79]:
md_data = np.array([x - feature_mean for x in data])
ids = np.array(ids, dtype=np.int32) #vectorize ids
id_counts = Counter(ids) #create counter to count number of occurence for each id key -> # occurence
num_classes = len(id_counts)
num_examples = len(ids)

In [80]:
print(id_counts)

Counter({1279: 149, 159: 136, 143: 98, 814: 93, 1233: 80, 1275: 78, 286: 66, 583: 65, 1100: 60, 1257: 59, 1188: 59, 995: 57, 1132: 57, 1431: 56, 605: 56, 579: 56, 1558: 55, 1190: 54, 1411: 50, 769: 45, 1191: 43, 285: 42, 1551: 41, 818: 40, 1624: 39, 1337: 39, 1607: 39, 1426: 37, 435: 36, 29: 36, 1276: 35, 982: 35, 569: 34, 81: 34, 604: 33, 1148: 33, 3: 33, 1114: 33, 1520: 33, 1209: 32, 1374: 32, 269: 31, 1553: 31, 51: 31, 576: 30, 64: 30, 539: 30, 1133: 29, 171: 29, 148: 28, 498: 27, 1433: 27, 1193: 26, 458: 26, 138: 26, 180: 26, 70: 26, 636: 25, 197: 25, 785: 25, 1044: 25, 538: 24, 1429: 24, 1139: 24, 1263: 24, 1366: 24, 1168: 24, 1661: 23, 1577: 23, 1564: 23, 994: 23, 408: 23, 1554: 22, 1273: 22, 1186: 22, 1096: 22, 1550: 21, 244: 21, 1117: 21, 1349: 21, 1084: 21, 562: 21, 158: 21, 83: 20, 935: 20, 393: 20, 356: 20, 1423: 20, 284: 20, 311: 20, 463: 19, 254: 19, 689: 19, 521: 18, 275: 18, 1128: 18, 1101: 17, 627: 17, 1245: 17, 324: 17, 78: 17, 1359: 17, 1686: 17, 362: 16, 1391: 16, 37

In [81]:
print(num_classes, num_examples)

1687 9971


In [77]:
#def shuffle(num_examples, md_data, ids):
shuffled_indices = np.random.permutation(num_examples)
md_data = md_data[shuffled_indices]
ids = ids[shuffled_indices]
print("shuffled_indices", shuffled_indices, len(shuffled_indices))
print("md_data", md_data)
print("ids", ids)

shuffled_indices [5640 8399 6100 ... 4445 3084 6928] 9971
md_data [[[4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  ...
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]]

 [[4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  ...
  [4.729543

In [73]:
shuffle(9971, md_data, ids)

shuffled_indices [4047 5582 8191 ... 9485 9320 5114] 9971
md_data [[[4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  ...
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]]

 [[4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  [4.72954306e-09 4.72954306e-09 4.72954306e-09 ... 4.72954306e-09
   4.72954306e-09 4.72954306e-09]
  ...
  [4.729543

#### - labels, data = zip(*read_mat_scp(data_scp))
- read_mat_scp(data_scp) is a kaldi method and should be replaced
- the code does the following things:
    1. open the source file
    2. seem to zip it into list of tuples, using read_mat

In [83]:
indices = np.arange(5)
lens = np.array([np.count_nonzero(d[0]) for d in data[indices]], dtype=np.int32)
print(indices)
print(lens)

[0 1 2 3 4]
[58 80 93 63 50]


In [88]:
b = len(indices)
padded = np.zeros((b, max(lens), feature_dim))
print(padded.shape)

(5, 93, 39)


In [96]:
for i, (x, l) in enumerate(zip(data[indices], lens)):
    x = x[x!=0]
    lx = len(x)
    x = x.reshape(lx // 39, 39)
    padded[i, :l] = x
print(padded)

[[[-0.43178879 -0.22322403 -0.20718085 ...  0.69124998  0.61103309
    0.62707628]
  [ 0.53081573  0.43455566  0.41851247 ... -0.18130909 -1.61426436
   -2.06617986]
  [-2.88784467 -3.33976089 -3.17542802 ...  0.74055184  1.55659855
    0.65246219]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[-1.54688131  0.49162734  1.09864486 ...  0.9974481   0.9974481
    0.95949901]
  [ 0.90890039  0.53217662 -0.49506508 ... -0.38693415 -0.30583607
   -0.22473799]
  [ 0.04558862  0.00503934  0.00597985 ...  0.53272648  0.6062261
    0.67972554]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[-1.54795825 -1.50857817  0.05571855

In [97]:
batch_ids = ids[indices]
print(batch_ids)

[ 599  818 1558  786 1588]


In [None]:
# input: partition \in ["train", "dev", "test"] config

class Dataset(object):
    def __init__(self, partition, config):
        # set whether this data is used for training
        self.is_train = (partition == "train")
        # set feature dimension
        self.feature_dim = config.feature_dim  # 39 in acoustic case
        # check if the path is set in config
        data_scp = getattr(config, "%sfile" % partition)
        
        #labels, data = zip(*read_mat_scp(data_scp))
        #words = [re.split("_", x)[0] for x in labels]
        #uwords = np.unique(words)
        
        
            