In [1]:
#for using in google colab
from google.colab import drive
drive.mount('/content/drive')
!unzip "drive/MyDrive/train.zip" -d "/content/data/"
!unzip "drive/MyDrive/test.zip" -d "/content/data/"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: /content/data/test/images/clip_4492064f9.png  
 extracting: /content/data/test/images/clip_354fd98e1.png  
 extracting: /content/data/test/images/clip_474bb33e4.png  
 extracting: /content/data/test/images/clip_d3f5bd0b1.png  
  inflating: /content/data/test/images/clip_9dfb60c79.png  
 extracting: /content/data/test/images/clip_5cdec871a.png  
 extracting: /content/data/test/images/clip_0a9c798a3.png  
 extracting: /content/data/test/images/clip_0a1297c45.png  
 extracting: /content/data/test/images/clip_4e6a948b2.png  
 extracting: /content/data/test/images/clip_d4814bab6.png  
 extracting: /content/data/test/images/clip_f737ea6e5.png  
 extracting: /content/data/test/images/clip_667e681a0.png  
 extracting: /content/data/test/images/clip_0ddda4868.png  
  inflating: /content/data/test/images/clip_0984f52ff.png  
 extracting: /content/data/test/images/clip_7454e2dbb.png  
 extracting: /content/data/test/ima

In [2]:
import os 
import imageio
from IPython.display import display, Image
from sklearn.preprocessing import OneHotEncoder
import numpy as np 
import random
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
# Input locations: the images/ directories hold spectrograms
# (use the mfcc/ directories instead to train on MFCCs)
train_path = 'data/train/images/'
test_path = 'data/test/images/'

# every visible entry of the training directory is treated as a class folder;
# dot-prefixed entries such as .ipynb_checkpoints are dropped
classes = list(filter(lambda entry: not entry.startswith('.'), os.listdir(train_path)))

# label vocabulary; a label's integer id is its position in this list
targets = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]

# images are 128x87 with 1 channel;
# the dimensions are flipped here so that time is the first axis
image_shape = (87, 128)
num_classes = len(targets)

In [4]:
def get_image_names():
  """Collect every training image path together with its integer label.

  Walks each folder listed in ``classes`` under ``train_path``. A folder whose
  name appears in ``targets`` gets that target's index, the
  ``_background_noise_`` folder is labelled as 'silence', and every other
  folder is labelled as 'unknown'.

  Dot-prefixed entries inside a class folder (e.g. ``.ipynb_checkpoints`` or
  ``.DS_Store``) are skipped: they are not images and would otherwise be
  handed to the image loader. This matches how test images are filtered.

  Returns:
    A list with one element per class folder (same order as ``classes``);
    each element is a list of ``[image_path, label_index]`` pairs.
  """
  images = []
  for c in classes:
    if c in targets:
      label = targets.index(c)
    elif c == '_background_noise_':
      label = targets.index('silence')
    else:
      label = targets.index('unknown')
    class_path = train_path + c + '/'
    labeled = [[class_path + name, label]
               for name in os.listdir(class_path)
               if not name.startswith('.')]
    images.append(labeled)
  return images

In [5]:
def split_names(image_list, num_batches=2):
  """Shuffle each class and deal its entries out across ``num_batches`` batches.

  Every entry of every class ends up in exactly one batch. Slice boundaries
  are computed proportionally (``k * n // num_batches``), so batch sizes
  within a class differ by at most one and a class with fewer entries than
  ``num_batches`` is still kept (some batches simply get an empty list for it)
  instead of being dropped entirely.

  The input lists are not modified; a shuffled copy of each class is split.

  Args:
    image_list: list of per-class lists (e.g. the output of get_image_names()).
    num_batches: number of batches to produce; must be at least 1.

  Returns:
    A list of ``num_batches`` batches. Each batch is a list holding one
    sub-list per class, in the same class order as ``image_list``.

  Raises:
    ValueError: if ``num_batches`` is smaller than 1.
  """
  if num_batches < 1:
    raise ValueError("num_batches must be at least 1")
  batches = [[] for _ in range(num_batches)]
  for class_items in image_list:
    # shuffle a copy so the caller's list keeps its order
    shuffled = list(class_items)
    random.shuffle(shuffled)
    num = len(shuffled)
    for k in range(num_batches):
      lo = k * num // num_batches
      hi = (k + 1) * num // num_batches
      batches[k].append(shuffled[lo:hi])
  return batches

In [6]:
def make_training_batch(image_names):
  """Load one batch of training images and build the matching one-hot labels.

  Each image is read from disk and transposed so that the first axis is the
  time axis. Clips labelled 'silence' that are at least ``image_shape[0]``
  steps long contribute 100 randomly positioned windows of that length; every
  other clip (including silence clips too short to window) is forced to
  ``image_shape`` with ``np.resize``.

  Note: ``np.resize`` does not zero-pad. It repeats the flattened data
  cyclically when the target is larger and truncates it when smaller.

  Labels are one-hot encoded against the full label vocabulary, so the result
  always has ``num_classes`` columns and column ``i`` always corresponds to
  ``targets[i]``, even when some labels are absent from this batch.

  Args:
    image_names: list of per-class lists of ``[image_path, label_index]``.

  Returns:
    ``(train_x, train_y)``: an array of images and a float array of shape
    ``(n_samples, num_classes)`` with the one-hot labels.
  """
  train_x, train_y = [], []
  window = image_shape[0]
  # training images are grouped by class
  for c in image_names:
    for i in c:
      # load the image as a np array, time steps on the first axis
      x = np.transpose(np.array(imageio.imread(i[0])))
      if targets[i[1]] == 'silence' and x.shape[0] >= window:
        # long background-noise recordings are cut into random windows
        num_samples = 100
        for _ in range(num_samples):
          start = random.randint(0, x.shape[0] - window)
          train_x.append(x[start : start + window, :])
          train_y.append(i[1])
      else:
        # bring the clip to the uniform size expected by the model
        train_x.append(np.resize(x, image_shape))
        train_y.append(i[1])
  # convert outer lists to np arrays
  train_x = np.array(train_x)
  labels = np.array(train_y, dtype=int)
  # row `label` of the identity matrix is that label's one-hot vector;
  # unlike an encoder fitted on this batch, the width never depends on
  # which labels happen to be present
  train_y = np.eye(num_classes)[labels]
  return (train_x, train_y)


In [7]:
def training_loop(num_times=1, num_batches=1, num_epochs=50, model=None):
  """Train a model on the training images, optionally in several chunks.

  Args:
    num_times: how many passes to make over the whole chunking procedure;
      the data is re-shuffled and re-split at the start of each pass.
    num_batches: number of chunks the training data is split into. Each chunk
      is loaded and fitted separately (some large models ran out of RAM when
      the full data was loaded at once).
    num_epochs: maximum number of epochs for each ``fit`` call.
    model: a compiled Keras model. When omitted, the CNN + LSTM model from
      build_best() is created and compiled with Adam (learning rate 1e-4),
      categorical cross-entropy and the accuracy metric.

  Returns:
    The trained model (the one passed in, or the one created here). The
    checkpoint callback also writes the best weights to 'best_model.h5'.
  """
  # build and compile a default model if the caller did not supply one
  if model is None:
    model = build_best()
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
                  loss="categorical_crossentropy", metrics=['accuracy'])

  # callbacks used in training; both watch validation accuracy
  early_stop = EarlyStopping(monitor='val_accuracy',min_delta=.001, patience=10, restore_best_weights=True)
  checkpoint = ModelCheckpoint('best_model.h5',monitor='val_accuracy', save_best_only=True,mode='max')

  # get filenames of training images, labeled with their classes
  name_list = get_image_names()
  for e in range(num_times):
    print("Loop {}".format(e))
    # divide training data into smaller chunks if needed
    name_batches = split_names(name_list, num_batches)

    # perform training on each chunk, using a validation split of 20% and early stopping
    for k in range(num_batches):
      print("Batch {}".format(k))
      train_x, train_y = make_training_batch(name_batches[k])
      # samples come out grouped by class; shuffle before fitting so the
      # validation split is not drawn from a single run of classes
      shuffler = np.random.permutation(len(train_x))
      train_x = train_x[shuffler]
      train_y = train_y[shuffler]
      model.fit(train_x,train_y,batch_size=50,epochs=num_epochs, callbacks=[early_stop,checkpoint], validation_split = 0.2)

  return model

In [8]:
# baseline architecture: a plain two-layer LSTM classifier
def build_baseline():
  """Return the (uncompiled) baseline model.

  Input of shape ``image_shape`` is fed through an LSTM that emits the full
  sequence, a second LSTM that emits only its final output, and a softmax
  layer with one unit per class.
  """
  return keras.Sequential([
      layers.Input(shape=image_shape),
      # sequence-to-sequence LSTM: extracts higher-level features per step
      layers.LSTM(96, activation='tanh', return_sequences=True),
      # sequence-to-vector LSTM: summarises the feature sequence
      layers.LSTM(64, activation='tanh', return_sequences=False),
      # class probabilities
      layers.Dense(units=num_classes, activation='softmax'),
  ])

In [9]:
# best-performing variant of the LSTM approach: CNN front end + BiLSTM stack
def build_best():
  """Return the (uncompiled) CNN + bidirectional-LSTM model.

  Layout, in order:
    * two strided 1D convolutions along the time axis (128 filters each,
      kernel/stride 4 then 2), which keep the data a time series while
      shortening it;
    * three bidirectional LSTM layers (256, 128, 96 units) with dropout 0.5,
      the last one returning only its final output;
    * a softmax layer with one unit per class.

  Batch normalization follows every convolution and every LSTM layer.
  L2 weight regularization is applied to the convolution kernels (1e-5) and
  to the LSTM kernels (1e-5) and biases (1e-4) to help with overfitting.
  """
  l2 = keras.regularizers.l2

  def conv_block(size):
    # kernel size and stride are equal, so windows do not overlap
    return layers.Conv1D(filters=128, kernel_size=size, strides=size,
                         kernel_regularizer=l2(1e-5))

  def bilstm_block(units, keep_sequence):
    recurrent = layers.LSTM(units, activation='tanh',
                            return_sequences=keep_sequence, dropout=0.5,
                            kernel_regularizer=l2(1e-5),
                            bias_regularizer=l2(1e-4))
    return layers.Bidirectional(recurrent)

  stack = [
      layers.Input(shape=image_shape),
      # feature extraction
      conv_block(4),
      layers.BatchNormalization(),
      conv_block(2),
      layers.BatchNormalization(),
      # sequence modelling
      bilstm_block(256, True),
      layers.BatchNormalization(),
      bilstm_block(128, True),
      layers.BatchNormalization(),
      bilstm_block(96, False),
      layers.BatchNormalization(),
      # class probabilities
      layers.Dense(units=num_classes, activation='softmax'),
  ]

  model = keras.Sequential()
  for layer in stack:
    model.add(layer)
  return model

In [10]:
# instantiate the CNN + BiLSTM network and prepare it for training
model = build_best()
# Adam with a learning rate of 1e-4
opt = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)
# print the layer-by-layer overview
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 21, 128)           65664     
_________________________________________________________________
batch_normalization (BatchNo (None, 21, 128)           512       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 10, 128)           32896     
_________________________________________________________________
batch_normalization_1 (Batch (None, 10, 128)           512       
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 512)           788480    
_________________________________________________________________
batch_normalization_2 (Batch (None, 10, 512)           2048      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 256)           6

In [None]:
# train the compiled model on the whole training set in one chunk, for one
# pass of up to 100 epochs (training_loop uses early stopping)
training_loop(num_times=1,num_batches=1,num_epochs=100,model=model)

Loop 0
Batch 0
Epoch 1/100

In [None]:
def read_test_images(num=2000,image_names=None):
  """Load up to ``num`` test images from ``test_path``.

  Only the first ``num`` entries of ``image_names`` are considered. Empty
  names, dot-prefixed names and names with no file under ``test_path`` are
  skipped, so fewer than ``num`` images may be returned. Each image is
  transposed (time on the first axis) and forced to ``image_shape`` with
  ``np.resize``.

  Args:
    num: maximum number of names to look at.
    image_names: sequence of file names relative to ``test_path``;
      defaults to no names.

  Returns:
    ``(names, images)``: the names that were actually loaded and an array
    with the corresponding images, in the same order.
  """
  if image_names is None:
    image_names = []
  images = []
  names = []
  for name in image_names[:max(num, 0)]:
    # skip blanks and hidden entries such as .ipynb_checkpoints
    if not name or name.startswith("."):
      continue
    path = test_path + name
    if not os.path.exists(path):
      continue
    names.append(name)
    x = np.transpose(np.array(imageio.imread(path)))
    images.append(np.resize(x, image_shape))
  return (names,np.array(images))

def predict_labels(model, test_x):
  """Run the model on ``test_x`` and translate each prediction into a label.

  For every row returned by ``model.predict`` the position of the largest
  score is looked up in ``targets``.

  Returns:
    A list of label strings, one per prediction row.
  """
  scores = model.predict(test_x)
  return [targets[np.argmax(row)] for row in scores]

def evaluate_model(model, batch_size=2000, output_path="predictions.csv"):
  """Predict a label for every test image and write the results to a CSV file.

  The visible (non dot-prefixed) entries of ``test_path`` are processed in
  sorted order, ``batch_size`` names at a time. Batches are taken by position
  in the name list, so a file skipped by read_test_images() can neither cause
  another file to be predicted twice nor leave trailing files unprocessed.

  The CSV has the header ``fname,label`` and one row per image; a trailing
  ``.png`` in the file name is replaced by ``.wav``.

  Args:
    model: trained model, passed through to predict_labels().
    batch_size: number of file names handled per prediction batch (>= 1).
    output_path: where the CSV is written.

  Raises:
    ValueError: if ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")
  test_images = sorted(f for f in os.listdir(test_path) if not f.startswith('.'))
  # the context manager closes the file even if prediction fails midway
  with open(output_path, "w") as pred_file:
    pred_file.write("fname,label\n")
    for n in range(0, len(test_images), batch_size):
      image_names, image_batch = read_test_images(batch_size, test_images[n : n + batch_size])
      print(n)
      # nothing loadable in this slice: do not call the model on an empty batch
      if len(image_names) == 0:
        continue
      label_batch = predict_labels(model, image_batch)
      for name, label in zip(image_names, label_batch):
        # only swap the extension; "png" elsewhere in the name is left alone
        if name.endswith(".png"):
          name = name[:-len(".png")] + ".wav"
        pred_file.write("{},{}\n".format(name, label))

In [None]:
# label every test image with the trained model and write predictions.csv
evaluate_model(model)