***Import all the necessary librarys***

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from random import randrange, shuffle
from os import path
import librosa
from librosa.display import specshow
import IPython.display as ipd
from math import floor
import kapre
import time

***Check to see if GPU is available***

In [2]:
if (len(tf.config.experimental.list_physical_devices("GPU")) > 0):
    print("GPU is available!")
    gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.75)
    sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

GPU is available!


Open and read the csv with all the audio information
***Set all the paths and basic formats***

In [3]:
csvPath    = r"F:\labeled_audio_data.csv"
specPath   = r"C:\Users\cboen\Documents\Programmierungen\AudioDataScience\Labeled_Specs"
audioPath  = r"F:\Labeled_Audio"
modelPath  = r"F:\Models"
logPath    = r"F:\Logs"

version      = r"5.4" #When you make changes, update the version!!
creationTime = time.strftime("%d.%m.%H.%M.%S", time.localtime())

dataFrame = pd.read_csv(csvPath)
dataDictionary = dataFrame.to_dict()
print(dataDictionary.keys())

sampleRate      = 16000
audioFileLength = 3
spec_hopSize    = 256
spec_blockSize  = 1024
audioFileSize   = (sampleRate * audioFileLength, 1)

dict_keys(['Unnamed: 0', 'filename', 'mainSound', 'length', 'sampleRate', 'quality', 'isCut', 'isMixed', 'isChecked', 'threat', 'salience', 'importance'])


***Create the first batch of training data***
The training and test data will be a list of data containing following informations:
filename, mainSound, audioData, targetValue

In [4]:
numAudiosPerBatch = 1500 #Number of audiofiles to train with per Batch

allFileNames = list(dataDictionary['filename'].values())

training_data = []

dataIndex = 0
numBatches   = floor(len(allFileNames) / numAudiosPerBatch)
testDataSize = len(allFileNames) % numAudiosPerBatch
print("TrainBatches:", numBatches, ", Testdata:", testDataSize)

ids = list(range(0, len(allFileNames)))
shuffle(ids)

def createTrainingData():
    global dataIndex
    print("Get new training data: ", int(dataIndex/numAudiosPerBatch), "/", (numBatches-1), sep = "")
    for i in range(numAudiosPerBatch):
        perc = int( 20 * i / numAudiosPerBatch)
        bar = "[" + "=" * (perc) + ">" + "-" * (19-perc) + "]"
        print(bar, end="\r")
        audioFile = ids[i+dataIndex]
        filename = allFileNames[audioFile]
        #Load the spectrogram
        try:
            audioFilePath = path.join(audioPath, (filename+".wav"))
            audioData, sr = librosa.load(audioFilePath, sr=16000, duration = audioFileLength)
            #Load the target values
            mainSound  = dataDictionary['mainSound'][audioFile]
            targetValue = [[0 for i in range(10)], [0 for i in range(10)], [0 for i in range(10)], ]
            targetValue[0][dataDictionary['threat'][audioFile]] = 1
            targetValue[1][dataDictionary['salience'][audioFile]] = 1
            targetValue[2][dataDictionary['importance'][audioFile]] = 1
            training_data.append([filename, mainSound, audioData, targetValue])
        except Exception as e:
            pass
    dataIndex += numAudiosPerBatch
    print()
    print("Done!")

test_data = []

def createTestData():
    for i in range(len(allFileNames)-testDataSize, len(allFileNames)):
        audioFile = ids[i]
        filename = allFileNames[audioFile]
        #Load the spectrogram
        try:
            audioFilePath = path.join(audioPath, (filename+".wav"))
            audioData, sr = librosa.load(audioFilePath, sr=16000, duration = audioFileLength)
            #Load the target values
            mainSound  = dataDictionary['mainSound'][audioFile]
            targetValue = [[0 for i in range(10)], [0 for i in range(10)], [0 for i in range(10)], ]
            targetValue[0][dataDictionary['threat'][audioFile]] = 1
            targetValue[1][dataDictionary['salience'][audioFile]] = 1
            targetValue[2][dataDictionary['importance'][audioFile]] = 1
        
            test_data.append([filename, mainSound, audioData, targetValue])
        except:
            pass

createTrainingData()
createTestData()

TrainBatches: 2 , Testdata: 130
Get new training data: 0/1
Done!


***Roughly describe the dataset:***

In [None]:
def describeTrainingData():
    print("Number of audiofiles:", len(training_data))
    print("Lets get the percentage of each Class:")
    categorys = set(dataDictionary['mainSound'].values())
    trainingDataCategorys = [data[1] for data in training_data] 
    for key in categorys:
        print("Key:", key.ljust(17, " "), ", perc.:", round(trainingDataCategorys.count(key)/len(training_data), 3))
    del categorys, trainingDataCategorys

describeTrainingData()

Number of audiofiles: 1500
Lets get the percentage of each Class:
Key: interior          , perc.: 0.14
Key: animal            , perc.: 0.117
Key: emergency_vehicle , perc.: 0.002
Key: human             , perc.: 0.095
Key: natural           , perc.: 0.098
Key: exterior          , perc.: 0.155
Key: siren             , perc.: 0.393


***Sort the list of training data information into input and output data**

In [6]:
train_x=[]
train_y=[]
def sortTrainingData():
    global train_x, train_y
    train_x = [np.pad(data[2], (0, (audioFileSize[0]-len(data[2])))) for data in training_data]
    #train_y = [[data[3], data[4], data[5]] for data in training_data]
    train_y = [data[-1] for data in training_data]
    train_x = np.asarray(train_x).reshape(-1, audioFileSize[0])
    train_y = np.array(train_y)

sortTrainingData()

Now the really fun part!
***Create a Convolutional network**
...that can get one audiofile as input data and outputs three levels for threat, salience and importance!

In [7]:
model = keras.models.Sequential()
print(audioFileSize)
mel_layer = kapre.composed.get_melspectrogram_layer(input_shape=audioFileSize, n_fft=spec_blockSize, hop_length=spec_hopSize, sample_rate=sampleRate, n_mels=64, mel_f_max=8000.0, return_decibel=True, output_data_format='default', name='melspectrogram')
vgg_layer = tf.keras.applications.VGG16(input_shape=(184,64,1),weights=None, classes=30)# 
vgg_layer.layers[-1].activation=None

model.add(mel_layer)
model.add(vgg_layer)
model.add(keras.layers.Reshape((3,10)))
model.add(keras.layers.Activation(keras.activations.softmax))

modelName = "stft_vgg16_activation-{}-".format(creationTime) + version
print("Name:", modelName)
print(model.summary())

optimizer = keras.optimizers.SGD(learning_rate = 0.1)
tensorboard = tf.keras.callbacks.TensorBoard(path.join(logPath, modelName), histogram_freq=1)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

(48000, 1)
Name: stft_vgg16_activation-15.03.09.29.41-5.4
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melspectrogram (Sequential)  (None, 184, 64, 1)        0         
_________________________________________________________________
vgg16 (Functional)           (None, 30)                52593374  
_________________________________________________________________
reshape (Reshape)            (None, 3, 10)             0         
_________________________________________________________________
activation (Activation)      (None, 3, 10)             0         
Total params: 52,593,374
Trainable params: 52,593,374
Non-trainable params: 0
_________________________________________________________________
None


Now lets get the ***training session*** done!

In [8]:
for batch in range((numBatches-1)):
    try:
        model.fit(train_x, train_y, validation_split=0.05, epochs=10, callbacks=[tensorboard])
    except Exception as e:
        print("Something went wrong!")
        print(e)
    training_data = []
    createTrainingData()
    sortTrainingData()

try:
    model.fit(train_x, train_y, validation_split=0.05, epochs=10, callbacks=[tensorboard])
except Exception as e:
    print("Something went wrong!")
    print(e)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Get new training data: 1/1

  return f(*args, **kwargs)


Done!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


***Lets let the model predict some data!***

In [17]:
i = randrange(dataIndex, len(allFileNames), 1)
audioFile = ids[i]
filename = allFileNames[audioFile]
audioFilePath = path.join(audioPath, (filename+".wav"))
audioData, sr = librosa.load(audioFilePath, sr=16000, duration = audioFileLength)
audioData = np.pad(audioData, (0, (audioFileSize[0]-len(audioData))))
test_x = np.array(audioData).reshape(-1, audioFileSize[0])
mainSound = dataDictionary['mainSound'][ids[i]]
targetValue = [[0 for i in range(10)], [0 for i in range(10)], [0 for i in range(10)], ]
targetValue[0][dataDictionary['threat'][audioFile]] = 1
targetValue[1][dataDictionary['salience'][audioFile]] = 1
targetValue[2][dataDictionary['importance'][audioFile]] = 1
test_y = np.array(targetValue)
print("Infos: ", end="")
print( filename, dataDictionary['mainSound'][ids[i]], sep=" | ")
#print(path.join(audioPath, (filename+".wav")))
print("Goal:  ", [np.argmax(cat) for cat in test_y])
prediction = model(test_x)
print("Result: ", end="")
#print([round(val, 2) for val in prediction.numpy()[0]])
#print("max Val:", np.argmax(prediction.numpy()[0]))
print([np.argmax(cat) for cat in prediction.numpy()[0]])
ipd.Audio(path.join(audioPath, (filename+".wav")))

Infos: 157866-8-0-26 | siren
Goal:   [8, 9, 9]
Result: [0, 5, 0]


***Test the model on unseen testing data***

In [32]:
training_data = test_data[:]
sortTrainingData()
model.evaluate(train_x, train_y)

Last but not least:
***Save the model.***

In [34]:
model.save(path.join(modelPath, modelName))

INFO:tensorflow:Assets written to: F:\Models\stft_vgg16_activation-15.03.04.13.57-5.4\assets
