### Models ###

This is the main notebook, in which all the processing for the defined project is combined. The notebook starts by importing the necessary libraries and then executes the pre-processing functions, which are defined in the notebook 'preprocessing_features.ipynb'. Once all feature vectors have been created and stored, the models are defined and a grid search is performed on each model in order to find the best hyperparameter settings. Note that this grid search is very limited: due to computational limitations, only a few hyperparameters and values are explored. For each model, the accuracy and the AUC value on the test set are reported. 

Please note that this notebook is specifically designed for the audio dataset considered in the research. This data could not be shared publicly. 

In [None]:
#Import all libraries

import preprocessing_feature_extraction

import pandas as pd
import numpy as np

import os
from os import listdir
from os.path import isfile, join

import IPython.display as ipd

import moviepy.editor as mp
from moviepy.editor import *

import mutagen
from mutagen.mp3 import MP3

import opensmile
import audiofile
import time

from pydub import AudioSegment
from pydub.utils import make_chunks

from keras.models import Sequential, Model
from keras.layers import GRU, LSTM, Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf 
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

In [9]:
# Directories where the data is, or will be, stored.
# All locations hang off one root folder on the local D: drive.
_data_root = 'D:\\Data'
_audio_root = _data_root + '\\Audiofiles'

dir_video = _data_root + '\\Videofiles'        # raw video recordings
dir_audio_train = _audio_root + '\\Train'      # audio of the training split
dir_audio_val = _audio_root + '\\Val'          # audio of a validation split
dir_audio_test = _audio_root + '\\Test'        # audio of the test split
dir_chunk = _data_root + '\\Chunked'           # temporary per-file audio chunks

In [10]:
# Collect the model inputs (x) and their labels (y) from the video directory.
# presumably x holds the video file names (the conversion logs print .mp4 names) — confirm in get_xy
x, y = preprocessing_feature_extraction.get_xy(dir_video)

In [11]:
#create training and test set from all data
# 80/20 split; the fixed random_state makes the partition reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, random_state=666)

In [12]:
# Convert the training-split items to audio; dir_audio_train is passed as the target directory.
# (The helper prints one 'done with converting' line per video.)
preprocessing_feature_extraction.convert_pitch_audio(X_train,dir_audio_train)

done with converting:  Startups1920_1_8FindIT_allcams_informedconsent_minoptionC.mp4
done with converting:  Startups1920_2_1Ar-T-ficial_allcams_informedconsent_minoptionD.mp4
done with converting:  Startups1920_2_2Recipe-Me_allcams_informedconsent_minoptionC.mp4
done with converting:  Startups2021_3_4APlaceForNow_AsafKedem_informedconsent_optionE.mp4
done with converting:  Startups1920_1_5wAIste_allcams_informedconsent_minoptionC.mp4
done with converting:  Startups1819_1LittleSister_allcams_shortenedversion_informedconsent_minoptionC.mp4
done with converting:  Startups1920_2_4Peech_allcams_informedconsent_minoptionD.mp4
done with converting:  Startups1920_1_3SmArt_allcams_informedconsent_minoptionC.mp4
done with converting:  Startups2021_1_3SOLOPE_informedconsent.mp4
done with converting:  Startups2021_2_5VintageSurprise_MirjamHament_informedconsent_optionC.mp4
done with converting:  Startups2021_4_4Calculytics_GemKua_informedconsent_optionC.mp4
done with converting:  Startups1920_1_7C

In [13]:
# Convert the test-split items to audio; dir_audio_test is passed as the target directory.
# (The helper prints one 'done with converting' line per video.)
preprocessing_feature_extraction.convert_pitch_audio(X_test,dir_audio_test)

done with converting:  DEiAIII1920_SM1_3YoungBoosters_allcams_informedconsent_minoptionD.mp4
done with converting:  Startups1920_1_6SchwiftyShopping_allcams_informedconsent_minoptionB.mp4
done with converting:  Startups2021_1_4BookFlixDelivery_informedconsent.mp4
done with converting:  Startups2021_3_1OutBusy_BartHermsen_informedconsent_optionC.mp4
done with converting:  Startups1819_5HOTIDY_allcams_shortenedversion_informedconsent_minoptionC.mp4
done with converting:  Startups1920_2_3Salix_allcams_informedconsent_minoptionC.mp4
done with converting:  Startups1819_6FitPoint_allcams_shortenedversion_informedconsent_minoptionC.mp4
done with converting:  DEiAIII1920_SM1_4Whitebox_allcams_informedconsent_minoptionD.mp4
done with converting:  DEiAIII1920_SM1_2PREA_allcams_informedconsent_minoptionD.mp4


In [17]:
#create final feature vector VGGish and HaF for training files
# Result: vector_VGGish_HaF with one entry per audio file, stacked along axis 0
# (the recorded run produced shape (34, 75, 369)).
#
# NOTE(review): files are visited in directory-listing order, whereas Y_train keeps the
# order returned by train_test_split. Unless convert_pitch_audio names its output so that
# both orders coincide, features and labels are misaligned — confirm before training.
# NOTE(review): dir_chunk is re-listed for every file; this assumes creating_chunks
# overwrites/clears the chunks of the previous file — confirm.
all_files = [f for f in listdir(dir_audio_train) if isfile(join(dir_audio_train, f))]
per_file_features = []
for file in all_files:
    preprocessing_feature_extraction.creating_chunks(dir_audio_train, file)

    # os.listdir returns entries in arbitrary order; sort so that the chunks form a
    # proper time sequence (lexicographic order matches the zero-padded chunk_NN names).
    all_chunks = sorted(f for f in listdir(dir_chunk) if isfile(join(dir_chunk, f)))

    spect_rows = []
    HaF_rows = []
    for chunk in all_chunks:
        HaF_rows.append(preprocessing_feature_extraction.HaF_opensmile(chunk))
        spect_rows.append(preprocessing_feature_extraction.vggish_features(chunk).flatten())

    # One row per chunk; deep (VGGish) and hand-crafted (HaF) features side by side.
    spect = np.vstack(spect_rows)
    HaF = np.vstack(HaF_rows)
    features = np.concatenate((spect, HaF), axis=1)

    per_file_features.append(features)
    print('done with', file)

# Stack once at the end: always 3-D (files, chunks, features), even for a single file.
vector_VGGish_HaF = np.stack(per_file_features, axis=0)

exporting D:\Data\Chunked\chunk_00.wav
exporting D:\Data\Chunked\chunk_01.wav
exporting D:\Data\Chunked\chunk_02.wav
exporting D:\Data\Chunked\chunk_03.wav
exporting D:\Data\Chunked\chunk_04.wav
exporting D:\Data\Chunked\chunk_05.wav
exporting D:\Data\Chunked\chunk_06.wav
exporting D:\Data\Chunked\chunk_07.wav
exporting D:\Data\Chunked\chunk_08.wav
exporting D:\Data\Chunked\chunk_09.wav
exporting D:\Data\Chunked\chunk_10.wav
exporting D:\Data\Chunked\chunk_11.wav
exporting D:\Data\Chunked\chunk_12.wav
exporting D:\Data\Chunked\chunk_13.wav
exporting D:\Data\Chunked\chunk_14.wav
exporting D:\Data\Chunked\chunk_15.wav
exporting D:\Data\Chunked\chunk_16.wav
exporting D:\Data\Chunked\chunk_17.wav
exporting D:\Data\Chunked\chunk_18.wav
exporting D:\Data\Chunked\chunk_19.wav
exporting D:\Data\Chunked\chunk_20.wav
exporting D:\Data\Chunked\chunk_21.wav
exporting D:\Data\Chunked\chunk_22.wav
exporting D:\Data\Chunked\chunk_23.wav
exporting D:\Data\Chunked\chunk_24.wav
exporting D:\Data\Chunked

In [18]:
# Sanity check of the stacked training features; the recorded run printed (34, 75, 369).
print(vector_VGGish_HaF.shape)

(34, 75, 369)


In [19]:
# Persist the training features and labels so the model cells can reload them with np.load.
# np.save appends '.npy' when missing, so the labels end up in 'Y_train.npy'.
np.save('vector_VGGish_HaF.npy',vector_VGGish_HaF)
np.save('Y_train', Y_train)

In [20]:
#create final feature vector VGGish and HaF for test files
# Result: vector_VGGish_HaF_test with one entry per audio file, stacked along axis 0
# (the recorded run produced shape (9, 75, 369)).
#
# NOTE(review): files are visited in directory-listing order, whereas Y_test keeps the
# order returned by train_test_split. Unless convert_pitch_audio names its output so that
# both orders coincide, features and labels are misaligned — confirm before evaluating.
# NOTE(review): dir_chunk is re-listed for every file; this assumes creating_chunks
# overwrites/clears the chunks of the previous file — confirm.
all_files = [f for f in listdir(dir_audio_test) if isfile(join(dir_audio_test, f))]
per_file_features = []
for file in all_files:
    preprocessing_feature_extraction.creating_chunks(dir_audio_test, file)

    # os.listdir returns entries in arbitrary order; sort so that the chunks form a
    # proper time sequence (lexicographic order matches the zero-padded chunk_NN names).
    all_chunks = sorted(f for f in listdir(dir_chunk) if isfile(join(dir_chunk, f)))

    spect_rows = []
    HaF_rows = []
    for chunk in all_chunks:
        HaF_rows.append(preprocessing_feature_extraction.HaF_opensmile(chunk))
        spect_rows.append(preprocessing_feature_extraction.vggish_features(chunk).flatten())

    # One row per chunk; deep (VGGish) and hand-crafted (HaF) features side by side.
    spect = np.vstack(spect_rows)
    HaF = np.vstack(HaF_rows)
    features = np.concatenate((spect, HaF), axis=1)

    per_file_features.append(features)
    print('done with', file)

# Stack once at the end: always 3-D (files, chunks, features), even for a single file.
vector_VGGish_HaF_test = np.stack(per_file_features, axis=0)

exporting D:\Data\Chunked\chunk_00.wav
exporting D:\Data\Chunked\chunk_01.wav
exporting D:\Data\Chunked\chunk_02.wav
exporting D:\Data\Chunked\chunk_03.wav
exporting D:\Data\Chunked\chunk_04.wav
exporting D:\Data\Chunked\chunk_05.wav
exporting D:\Data\Chunked\chunk_06.wav
exporting D:\Data\Chunked\chunk_07.wav
exporting D:\Data\Chunked\chunk_08.wav
exporting D:\Data\Chunked\chunk_09.wav
exporting D:\Data\Chunked\chunk_10.wav
exporting D:\Data\Chunked\chunk_11.wav
exporting D:\Data\Chunked\chunk_12.wav
exporting D:\Data\Chunked\chunk_13.wav
exporting D:\Data\Chunked\chunk_14.wav
exporting D:\Data\Chunked\chunk_15.wav
exporting D:\Data\Chunked\chunk_16.wav
exporting D:\Data\Chunked\chunk_17.wav
exporting D:\Data\Chunked\chunk_18.wav
exporting D:\Data\Chunked\chunk_19.wav
exporting D:\Data\Chunked\chunk_20.wav
exporting D:\Data\Chunked\chunk_21.wav
exporting D:\Data\Chunked\chunk_22.wav
exporting D:\Data\Chunked\chunk_23.wav
exporting D:\Data\Chunked\chunk_24.wav
exporting D:\Data\Chunked

In [21]:
# Sanity check of the stacked test features; the recorded run printed (9, 75, 369).
print(vector_VGGish_HaF_test.shape)

(9, 75, 369)


In [22]:
# Persist the test features and labels so the model cells can reload them with np.load.
# np.save appends '.npy' when missing, so the labels end up in 'Y_test.npy'.
np.save('vector_VGGish_HaF_test.npy',vector_VGGish_HaF_test)
np.save('Y_test', Y_test)

In [23]:
#creating the feature vectors including deep audio features for the training set
# Result: vector_VGGish with one entry per audio file, stacked along axis 0
# (the recorded run produced shape (34, 75, 256)).
#
# NOTE(review): files are visited in directory-listing order, whereas Y_train keeps the
# order returned by train_test_split. Unless convert_pitch_audio names its output so that
# both orders coincide, features and labels are misaligned — confirm before training.
# NOTE(review): dir_chunk is re-listed for every file; this assumes creating_chunks
# overwrites/clears the chunks of the previous file — confirm.
all_files = [f for f in listdir(dir_audio_train) if isfile(join(dir_audio_train, f))]
per_file_spect = []
for file in all_files:
    preprocessing_feature_extraction.creating_chunks(dir_audio_train, file)

    # os.listdir returns entries in arbitrary order; sort so that the chunks form a
    # proper time sequence (lexicographic order matches the zero-padded chunk_NN names).
    all_chunks = sorted(f for f in listdir(dir_chunk) if isfile(join(dir_chunk, f)))

    spect_rows = [
        preprocessing_feature_extraction.vggish_features(chunk).flatten()
        for chunk in all_chunks
    ]
    # One row of flattened VGGish features per chunk.
    spect = np.vstack(spect_rows)

    per_file_spect.append(spect)
    print('done with', file)

# Stack once at the end: always 3-D (files, chunks, features), even for a single file.
vector_VGGish = np.stack(per_file_spect, axis=0)

exporting D:\Data\Chunked\chunk_00.wav
exporting D:\Data\Chunked\chunk_01.wav
exporting D:\Data\Chunked\chunk_02.wav
exporting D:\Data\Chunked\chunk_03.wav
exporting D:\Data\Chunked\chunk_04.wav
exporting D:\Data\Chunked\chunk_05.wav
exporting D:\Data\Chunked\chunk_06.wav
exporting D:\Data\Chunked\chunk_07.wav
exporting D:\Data\Chunked\chunk_08.wav
exporting D:\Data\Chunked\chunk_09.wav
exporting D:\Data\Chunked\chunk_10.wav
exporting D:\Data\Chunked\chunk_11.wav
exporting D:\Data\Chunked\chunk_12.wav
exporting D:\Data\Chunked\chunk_13.wav
exporting D:\Data\Chunked\chunk_14.wav
exporting D:\Data\Chunked\chunk_15.wav
exporting D:\Data\Chunked\chunk_16.wav
exporting D:\Data\Chunked\chunk_17.wav
exporting D:\Data\Chunked\chunk_18.wav
exporting D:\Data\Chunked\chunk_19.wav
exporting D:\Data\Chunked\chunk_20.wav
exporting D:\Data\Chunked\chunk_21.wav
exporting D:\Data\Chunked\chunk_22.wav
exporting D:\Data\Chunked\chunk_23.wav
exporting D:\Data\Chunked\chunk_24.wav
exporting D:\Data\Chunked

In [24]:
# Sanity check of the stacked training features; the recorded run printed (34, 75, 256).
print(vector_VGGish.shape)

(34, 75, 256)


In [25]:
# Persist the VGGish-only training features; the model cells reload this file with np.load.
np.save('vector_VGGish.npy',vector_VGGish)

In [26]:
#creating the feature vectors including deep audio features for the test set
# Result: vector_VGGish_test with one entry per audio file, stacked along axis 0
# (the recorded run produced shape (9, 75, 256)).
#
# NOTE(review): files are visited in directory-listing order, whereas Y_test keeps the
# order returned by train_test_split. Unless convert_pitch_audio names its output so that
# both orders coincide, features and labels are misaligned — confirm before evaluating.
# NOTE(review): dir_chunk is re-listed for every file; this assumes creating_chunks
# overwrites/clears the chunks of the previous file — confirm.
all_files = [f for f in listdir(dir_audio_test) if isfile(join(dir_audio_test, f))]
per_file_spect = []
for file in all_files:
    preprocessing_feature_extraction.creating_chunks(dir_audio_test, file)

    # os.listdir returns entries in arbitrary order; sort so that the chunks form a
    # proper time sequence (lexicographic order matches the zero-padded chunk_NN names).
    all_chunks = sorted(f for f in listdir(dir_chunk) if isfile(join(dir_chunk, f)))

    spect_rows = [
        preprocessing_feature_extraction.vggish_features(chunk).flatten()
        for chunk in all_chunks
    ]
    # One row of flattened VGGish features per chunk.
    spect = np.vstack(spect_rows)

    per_file_spect.append(spect)
    print('done with', file)

# Stack once at the end: always 3-D (files, chunks, features), even for a single file.
vector_VGGish_test = np.stack(per_file_spect, axis=0)

exporting D:\Data\Chunked\chunk_00.wav
exporting D:\Data\Chunked\chunk_01.wav
exporting D:\Data\Chunked\chunk_02.wav
exporting D:\Data\Chunked\chunk_03.wav
exporting D:\Data\Chunked\chunk_04.wav
exporting D:\Data\Chunked\chunk_05.wav
exporting D:\Data\Chunked\chunk_06.wav
exporting D:\Data\Chunked\chunk_07.wav
exporting D:\Data\Chunked\chunk_08.wav
exporting D:\Data\Chunked\chunk_09.wav
exporting D:\Data\Chunked\chunk_10.wav
exporting D:\Data\Chunked\chunk_11.wav
exporting D:\Data\Chunked\chunk_12.wav
exporting D:\Data\Chunked\chunk_13.wav
exporting D:\Data\Chunked\chunk_14.wav
exporting D:\Data\Chunked\chunk_15.wav
exporting D:\Data\Chunked\chunk_16.wav
exporting D:\Data\Chunked\chunk_17.wav
exporting D:\Data\Chunked\chunk_18.wav
exporting D:\Data\Chunked\chunk_19.wav
exporting D:\Data\Chunked\chunk_20.wav
exporting D:\Data\Chunked\chunk_21.wav
exporting D:\Data\Chunked\chunk_22.wav
exporting D:\Data\Chunked\chunk_23.wav
exporting D:\Data\Chunked\chunk_24.wav
exporting D:\Data\Chunked

In [35]:
# Sanity check of the stacked test features; the recorded run printed (9, 75, 256).
print(vector_VGGish_test.shape)

(9, 75, 256)


In [28]:
# Persist the VGGish-only test features; the model cells reload this file with np.load.
np.save('vector_VGGish_test.npy',vector_VGGish_test)

### Models ###

In [13]:
#hyperparameters to be tuned
# Candidate values per hyperparameter; each list becomes one axis of the grid search.
# NOTE: a later cell reassigns all four names (nodes=[128], epochs=[50]) before any model
# is fitted, so when run top to bottom these wider grids are not the ones searched.
dropout = [0, 0.1, 0.2]
nodes = [64, 128, 256]
learning_rate = [0.01, 0.001, 0.02]
epochs= [10, 20, 50]

#### Model a. CNN+LSTM ####

In [10]:
# Reload the stored VGGish-only features and the labels for model a.
training_data, training_labels, test_data, test_labels = [
    np.load(npy_file)
    for npy_file in ('vector_VGGish.npy', 'Y_train.npy', 'vector_VGGish_test.npy', 'Y_test.npy')
]
print(training_data.shape, training_labels.shape, test_data.shape, test_labels.shape)

(34, 75, 256) (34,) (9, 75, 256) (9,)


In [20]:
#hyperparameters to be tuned
# Reduced grid: only dropout and learning rate vary; nodes and epochs are fixed to a
# single value each. These assignments replace the wider lists defined earlier.
dropout = [0, 0.1, 0.2]
nodes = [128]
learning_rate = [0.01, 0.001, 0.02]
epochs= [50]

In [21]:
#LSTM model
def LSTM_model_base(dropout, nodes, lr):
    """Build and compile the LSTM classifier for the VGGish-only features (model a).

    Args:
        dropout: Rate of the Dropout layer placed after the LSTM layer.
        nodes: Number of units in the LSTM layer.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model that takes (75, 256) inputs per sample and
        outputs a 2-way softmax.
    """
    model = Sequential()
    # No Embedding layer: that is typical for NLP token input, not for these feature vectors.
    model.add(LSTM(nodes, input_shape = (75,256)))
    model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))

    optimizer = Adam(learning_rate = lr)
    # The labels are a 1-D vector (one class per sample), which KerasClassifier encodes as
    # class indices. A 2-unit softmax therefore needs the sparse categorical loss;
    # 'binary_crossentropy' would broadcast the single label against both outputs and
    # push both softmax units toward the same target.
    model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return model

In [22]:
# Grid of hyperparameter values to search for model a.
param_grid = {'nodes': nodes, 'lr': learning_rate, 'dropout': dropout, 'epochs': epochs}
# scikit-learn wrapper around the model builder; epochs=10 is only the default and is
# overridden by the 'epochs' entry of the grid.
model_LSTM_base = KerasClassifier(build_fn=LSTM_model_base, epochs=10, verbose=0)
grid_LSTM_base = GridSearchCV(estimator=model_LSTM_base, param_grid=param_grid)

In [None]:
# Run the grid search for model a on the training data (GridSearchCV default cv and scoring).
grid_result_a = grid_LSTM_base.fit(training_data, training_labels)

In [None]:
# Report the best cross-validated score and the parameter combination that achieved it.
print("Best: %f using %s" % (grid_result_a.best_score_, grid_result_a.best_params_))

In [None]:
# Predict test-set classes with the best parameter combination found by the grid search.
y_pred_a = grid_result_a.predict(test_data)

In [None]:
# Test-set accuracy and AUC for model a.
# NOTE(review): roc_auc_score is given the hard class predictions from predict(), not
# class probabilities, so the AUC is computed from a single operating point — confirm
# this is intended.
print('Accuracy Score model a: ' + str(accuracy_score(test_labels,y_pred_a)))
print('AUC Score model a: ' + str(roc_auc_score(test_labels,y_pred_a)))

#### Model b. CNN+HaF+LSTM ####

In [14]:
# Reload the stored VGGish+HaF features and the labels for model b.
training_data, training_labels, test_data, test_labels = [
    np.load(npy_file)
    for npy_file in ('vector_VGGish_HaF.npy', 'Y_train.npy', 'vector_VGGish_HaF_test.npy', 'Y_test.npy')
]
print(training_data.shape, training_labels.shape, test_data.shape, test_labels.shape)

(34, 75, 369) (34,) (9, 75, 369) (9,)


In [None]:
#LSTM model
def LSTM_model(dropout, nodes, lr):
    """Build and compile the LSTM classifier for the VGGish+HaF features (model b).

    Args:
        dropout: Rate of the Dropout layer placed after the LSTM layer.
        nodes: Number of units in the LSTM layer.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model that takes (75, 369) inputs per sample and
        outputs a 2-way softmax.
    """
    model = Sequential()
    # No Embedding layer: that is typical for NLP token input, not for these feature vectors.
    model.add(LSTM(nodes, input_shape = (75,369)))
    model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))

    optimizer = Adam(learning_rate = lr)
    # The labels are a 1-D vector (one class per sample), which KerasClassifier encodes as
    # class indices. A 2-unit softmax therefore needs the sparse categorical loss;
    # 'binary_crossentropy' would broadcast the single label against both outputs and
    # push both softmax units toward the same target.
    model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return model

In [None]:
# Grid of hyperparameter values to search for model b.
param_grid = {'nodes': nodes, 'lr': learning_rate, 'dropout': dropout, 'epochs': epochs}
# scikit-learn wrapper around the model builder; epochs=10 is only the default and is
# overridden by the 'epochs' entry of the grid.
model_LSTM = KerasClassifier(build_fn=LSTM_model, epochs=10, verbose=0)
grid_LSTM = GridSearchCV(estimator=model_LSTM, param_grid=param_grid)

In [None]:
# Run the grid search for model b on the training data (GridSearchCV default cv and scoring).
grid_result_b = grid_LSTM.fit(training_data, training_labels)

In [None]:
# Report the best cross-validated score and the parameter combination that achieved it.
print("Best: %f using %s" % (grid_result_b.best_score_, grid_result_b.best_params_))

In [None]:
# Predict test-set classes with the best parameter combination found by the grid search.
y_pred_b = grid_result_b.predict(test_data)

In [None]:
# Test-set accuracy and AUC for model b.
# NOTE(review): roc_auc_score is given the hard class predictions from predict(), not
# class probabilities, so the AUC is computed from a single operating point — confirm
# this is intended.
print('Accuracy Score model b: ' + str(accuracy_score(test_labels,y_pred_b)))
print('AUC Score model b: ' + str(roc_auc_score(test_labels,y_pred_b)))

#### Model c. CNN+GRU ####

In [None]:
# Reload the stored VGGish-only features and the labels for model c.
training_data, training_labels, test_data, test_labels = [
    np.load(npy_file)
    for npy_file in ('vector_VGGish.npy', 'Y_train.npy', 'vector_VGGish_test.npy', 'Y_test.npy')
]
print(training_data.shape, training_labels.shape, test_data.shape, test_labels.shape)

In [None]:
def GRU_model_base(dropout, nodes, lr):
    """Build and compile the GRU classifier for the VGGish-only features (model c).

    Args:
        dropout: Rate of the Dropout layer placed after the GRU layer.
        nodes: Number of units in the GRU layer.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model that takes (75, 256) inputs per sample and
        outputs a 2-way softmax.
    """
    model = Sequential()
    # No Embedding layer: that is typical for NLP token input, not for these feature vectors.
    model.add(GRU(nodes, input_shape = (75,256)))
    model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))

    optimizer = Adam(learning_rate = lr)
    # The labels are a 1-D vector (one class per sample), which KerasClassifier encodes as
    # class indices. A 2-unit softmax therefore needs the sparse categorical loss;
    # 'binary_crossentropy' would broadcast the single label against both outputs and
    # push both softmax units toward the same target.
    model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return model

In [None]:
# scikit-learn wrapper around the GRU builder for model c.
model_GRU_base = KerasClassifier(build_fn=GRU_model_base, epochs=10, verbose=0)
# 'epochs' is part of the grid, as it is for the LSTM models, so that all models are
# trained for the same number of epochs; without it the wrapper default of 10 would apply.
param_grid = dict(nodes=nodes, lr = learning_rate, dropout=dropout, epochs=epochs)
grid_GRU = GridSearchCV(estimator=model_GRU_base, param_grid=param_grid)

In [None]:
# Run the grid search for model c on the training data (GridSearchCV default cv and scoring).
grid_result_c = grid_GRU.fit(training_data, training_labels)

In [None]:
# Report the best cross-validated score and the parameter combination that achieved it.
print("Best: %f using %s" % (grid_result_c.best_score_, grid_result_c.best_params_))

In [None]:
# Predict test-set classes with the best parameter combination found by the grid search.
y_pred_c = grid_result_c.predict(test_data)

In [None]:
# Test-set accuracy and AUC for model c, computed from model c's own predictions
# (previously the predictions of model a were scored here by mistake).
# NOTE(review): roc_auc_score is given hard class predictions, not class probabilities,
# so the AUC is computed from a single operating point — confirm this is intended.
print('Accuracy Score model c: ' + str(accuracy_score(test_labels,y_pred_c)))
print('AUC Score model c: ' + str(roc_auc_score(test_labels,y_pred_c)))

#### Model d. CNN+HaF+GRU ####

In [None]:
# Reload the stored VGGish+HaF features and the labels for model d.
training_data, training_labels, test_data, test_labels = [
    np.load(npy_file)
    for npy_file in ('vector_VGGish_HaF.npy', 'Y_train.npy', 'vector_VGGish_HaF_test.npy', 'Y_test.npy')
]
print(training_data.shape, training_labels.shape, test_data.shape, test_labels.shape)

In [None]:
def GRU_model(dropout, nodes, lr):
    """Build and compile the GRU classifier for the VGGish+HaF features (model d).

    Args:
        dropout: Rate of the Dropout layer placed after the GRU layer.
        nodes: Number of units in the GRU layer.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model that takes (75, 369) inputs per sample and
        outputs a 2-way softmax.
    """
    model = Sequential()
    # No Embedding layer: that is typical for NLP token input, not for these feature vectors.
    model.add(GRU(nodes, input_shape = (75,369)))
    model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))

    optimizer = Adam(learning_rate = lr)
    # The labels are a 1-D vector (one class per sample), which KerasClassifier encodes as
    # class indices. A 2-unit softmax therefore needs the sparse categorical loss;
    # 'binary_crossentropy' would broadcast the single label against both outputs and
    # push both softmax units toward the same target.
    model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return model

In [None]:
# scikit-learn wrapper around the GRU builder for model d.
model_GRU = KerasClassifier(build_fn=GRU_model, epochs=10, verbose=0)
# 'epochs' is part of the grid, as it is for the LSTM models, so that all models are
# trained for the same number of epochs; without it the wrapper default of 10 would apply.
param_grid = dict(nodes=nodes, lr = learning_rate, dropout=dropout, epochs=epochs)
grid_GRU = GridSearchCV(estimator=model_GRU, param_grid=param_grid)

In [None]:
# Run the grid search for model d on the training data (GridSearchCV default cv and scoring).
grid_result_d = grid_GRU.fit(training_data, training_labels)

In [None]:
# Report the best cross-validated score and the parameter combination that achieved it.
print("Best: %f using %s" % (grid_result_d.best_score_, grid_result_d.best_params_))

In [None]:
# Predict test-set classes with the best parameter combination found by the grid search.
y_pred_d = grid_result_d.predict(test_data)

In [None]:
# Test-set accuracy and AUC for model d, computed from model d's own predictions
# (previously the predictions of model a were scored here by mistake).
# NOTE(review): roc_auc_score is given hard class predictions, not class probabilities,
# so the AUC is computed from a single operating point — confirm this is intended.
print('Accuracy Score model d: ' + str(accuracy_score(test_labels,y_pred_d)))
print('AUC Score model d: ' + str(roc_auc_score(test_labels,y_pred_d)))