<a href="https://colab.research.google.com/github/MoltenMuffins/IndoorAudioClassifier/blob/master/AudioAsImage_TransferLearningVGG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



Only the newly added top (fully-connected) layers are trained, using a self-created database of sounds from freesound

Classes: Speech, Music, Water, Door Sounds, Car horn, Glass Breaking

## 0. Boilerplate Code

In [0]:
# Fail fast unless the runtime exposes the first GPU device
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [0]:
#imports here
import os
import librosa
from tensorflow.keras.models import Model,load_model,Sequential
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K

import tensorflow as tf

tf.VERSION

'1.12.0'

## 1. Download Dataset

We use a small labeled dataset generated via the freesound api and passed through an audio tokeniser to shorten exceedingly long sound files.

It has the following file structure: `Data/Train/{CLASS_LABEL}/{FILENAME}.png`

In [0]:
# Fetch the zipped dataset from Dropbox (-qq keeps wget quiet)
!wget -qq https://www.dropbox.com/s/rv7xzjyvae0nabt/Data.zip

In [0]:
# Extract the archive quietly, delete the zip, then list the working directory
!unzip -qq Data.zip
!rm Data.zip
!ls

Data  __MACOSX	sample_data


## 2. Prepare Dataset for Model

We use glob to get lists of the files in the directories and then convert them into dataframes and add in class numbers.

We then split each class so that 10% is held out as a validation set and the remaining 90% is used for training.

Finally, we randomly shuffle both sets.

In [0]:
# Dataset root and the training-image directory beneath it
path = "./Data/"
train_data_dir = path + "Train/"

In [0]:
import pandas as pd
import glob

# Class labels, in class-id order (0 = car horn ... 5 = water)
names = ['car horn', 'door', 'glass break', 'music', 'speech', 'water']

# One list of PNG paths per class, globbed from that class's training folder
(filenames_n0, filenames_n1, filenames_n2,
 filenames_n3, filenames_n4, filenames_n5) = [
    glob.glob(pattern)
    for pattern in (
        './Data/Train/car horn/*.png',
        './Data/Train/door/*.png',
        './Data/Train/glass break/*.png',
        './Data/Train/music/*.png',
        './Data/Train/speech/*.png',
        './Data/Train/water/*.png',
    )
]

# Number of files found for the first class
len(filenames_n0)

53

In [0]:
def _split_class_frame(class_filenames, class_id, train_fraction):
  """Build the (filename, class) frame for one class and split it positionally.

  Args:
    class_filenames: list of file paths belonging to a single class.
    class_id: integer label written into the 'class' column of every row.
    train_fraction: share of rows (0..1) assigned to the training part.

  Returns:
    (train_part, val_part): the first int(len * train_fraction) rows and the
    remaining rows. Rows keep their original order and index.
  """
  class_df = pd.DataFrame(class_filenames, columns = ["filename"])
  class_df['class'] = class_id  # scalar assignment labels every row
  cutoff = int(len(class_df) * train_fraction)
  return class_df[:cutoff], class_df[cutoff:]


# Fraction of each class used for training; the remainder is the validation set
train_set_percentage = .9

# Position in this list is the class id (0..5)
per_class_filenames = [
    filenames_n0, filenames_n1, filenames_n2,
    filenames_n3, filenames_n4, filenames_n5,
]

# NOTE(review): each class is split in glob order without shuffling first —
# confirm that ordering does not bias the validation set.
class_splits = [
    _split_class_frame(class_filenames, class_id, train_set_percentage)
    for class_id, class_filenames in enumerate(per_class_filenames)
]

# Stack the per-class parts into one training frame and one validation frame
df_new_train = pd.concat([train_part for train_part, _ in class_splits])
df_new_val = pd.concat([val_part for _, val_part in class_splits])

In [0]:
# Shuffle both splits. A fixed seed keeps the row order (and therefore the
# batches fed to the model) repeatable when the notebook is re-run.
SHUFFLE_SEED = 42
df = df_new_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_val = df_new_val.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

print('number of train files:', len(df))
print('number of val files:', len(df_val))
df.head(10)

number of train files: 911
number of val files: 104


Unnamed: 0,filename,class
0,./Data/Train/music/Touch Down Music EDM_clip0.png,3
1,./Data/Train/glass break/Casse03.png,2
2,./Data/Train/music/Ambient Piano Music #3_clip...,3
3,./Data/Train/water/Sink Drumming and Water_cli...,5
4,"./Data/Train/speech/""Tengo hambre"".png",4
5,./Data/Train/glass break/Glass Break 1_clip0.png,2
6,./Data/Train/speech/cello-phrase_stftMorph1_cl...,4
7,./Data/Train/water/washing-machine_clip3.png,5
8,./Data/Train/speech/pod_bay_doors_test1.png,4
9,./Data/Train/speech/South Jersey Control-Radio...,4


In [0]:
def _paths_and_labels(frame):
  """Return the 'filename' column and the int32-cast 'class' column as two lists."""
  return frame["filename"].tolist(), frame["class"].astype('int32').tolist()

# Parallel path / label lists for the training and validation splits
train_filenames_list, train_labels_list = _paths_and_labels(df)
val_filenames_list, val_labels_list = _paths_and_labels(df_val)

# Number of target classes
num_classes = 6

df.shape

(911, 2)

In [0]:
# Show the first five training file paths
train_filenames_list[:5]

['./Data/Train/music/Touch Down Music EDM_clip0.png',
 './Data/Train/glass break/Casse03.png',
 './Data/Train/music/Ambient Piano Music #3_clip0.png',
 './Data/Train/water/Sink Drumming and Water_clip0.png',
 './Data/Train/speech/"Tengo hambre".png']

## 3. VGG16

### 3.1 Create Data Pipeline for VGG16

In [0]:
# Spatial size every image is resized to before being fed to the network
img_rows, img_cols = 224,224

def _parse_function(filename, label):
  """Read an image file, decode and resize it, and one-hot encode its label.

  Args:
    filename: string tensor holding the path of one image file.
    label: integer class id of that image.

  Returns:
    (image_resized, label): the image resized to [img_rows, img_cols] with
    3 channels, and the label one-hot encoded over num_classes.
  """
  image_string = tf.read_file(filename)
  # The dataset files are PNGs, so use the PNG decoder.
  # channels=3 ensures the image comes out as [224, 224, 3] after resizing.
  image_decoded = tf.image.decode_png(image_string, channels=3)
  image_resized = tf.image.resize_images(image_decoded, [img_rows, img_cols])
  # NOTE(review): no VGG16-specific input preprocessing is applied to the
  # pixels here — confirm that feeding raw resized values is intended.
  label = tf.one_hot(label, num_classes)
  return image_resized, label

In [0]:
# Training split as constant tensors: file paths and their integer class ids
filenames, labels = tf.constant(train_filenames_list), tf.constant(train_labels_list)

# Validation split, built the same way
val_filenames, val_labels = tf.constant(val_filenames_list), tf.constant(val_labels_list)

### 3.2 Assemble Data Pipeline using tf.data for VGG16

In [0]:
def _build_pipeline(paths, class_ids):
  """Slice (path, label) pairs, parse each one, repeat the stream, and batch by 32."""
  pipeline = tf.data.Dataset.from_tensor_slices((paths, class_ids))
  return pipeline.map(_parse_function).repeat().batch(32)

# Same pipeline for both splits; only the source tensors differ
train_dataset = _build_pipeline(filenames, labels)
valid_dataset = _build_pipeline(val_filenames, val_labels)

In [0]:
# Pre-trained VGG16 convolutional base: ImageNet weights, no classifier head.
# The input size reuses img_rows / img_cols so it always matches the size the
# data pipeline resizes images to.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(img_rows, img_cols, 3))
base_model.summary()

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584   

In [0]:
# Flatten the convolutional feature maps of the base model into a vector
x = base_model.output
x = Flatten()(x)

# Fully-connected hidden layer
x = Dense(512, activation='relu')(x)

# Softmax output layer with one unit per class
predictions = Dense(num_classes, activation='softmax')(x)

In [0]:
# Model to be trained: from the VGG16 input through to the new prediction layer
model = Model(
    inputs=base_model.input,
    outputs=predictions,
)

In [0]:
# Freeze every layer of the VGG16 base so that only the newly added
# (randomly initialised) top layers are updated during training
for vgg_layer in base_model.layers:
    vgg_layer.trainable = False
    
#for layer in model.layers:
#    print(layer.name)
#    print(layer.trainable)

In [0]:
# Optional: print the layer-by-layer summary of the assembled model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [0]:
# Adam optimizer with a small learning rate
opt = tf.train.AdamOptimizer(learning_rate=1e-5)

# Compile only after the base layers have been marked non-trainable
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

### 3.3 Transfer Learning on VGG16

In [0]:
# Batches per training epoch, batches per validation pass, and number of epochs
# (originally annotated "60 20 20 rule").
# NOTE(review): the datasets are batched by 32 and repeated, so 270 / 90 steps
# cover far more than one pass over the 911 training / 104 validation files
# reported earlier — confirm these values are intended.
train_steps, val_steps, epochs = 270, 90, 10

In [0]:
# Print both dataset objects to check their element shapes and dtypes
print(train_dataset)
print(valid_dataset)

<BatchDataset shapes: ((?, 224, 224, 3), (?, 6)), types: (tf.float32, tf.float32)>
<BatchDataset shapes: ((?, 224, 224, 3), (?, 6)), types: (tf.float32, tf.float32)>


In [0]:
# Train on the training pipeline and evaluate on the validation pipeline
history = model.fit(
    train_dataset,
    epochs=epochs,
    steps_per_epoch=train_steps,
    validation_data=valid_dataset,
    validation_steps=val_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
#@title
#epochs = 3

# # # Train the model with validation 
#history = model.fit( train_dataset, steps_per_epoch = train_steps,
#                   epochs = epochs,
#                   validation_data = valid_dataset,
#                   validation_steps = val_steps)

In [0]:
#@title
#metrics = model.evaluate(valid_dataset,
#                   steps = val_steps)
#print("model accuracy:",metrics[1])

In [0]:
#@title
#image_check = 'content/Data/Train/glass break/01537 breaking glass 1.png'
#
#print(img_check)
#img = image.load_img(img_path, target_size=(224, 224))
#x = image.img_to_array(img)
#x = np.expand_dims(x, axis=0)
#x = preprocess_input(x)
#print('Input image shape:', x.shape)
#preds = model.predict(x)
#print('Predicted:', preds)