In [14]:
import numpy as np
import pandas as pd
import wave
import librosa
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam, Adadelta
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error

In [None]:
# Root folder that holds the split tables and the per-participant folders.
prefix = 'data/'

# The file named "dev_split" is loaded as the held-out test set.
train_split_df = pd.read_csv(prefix + 'train_split_Depression_AVEC2017.csv')
test_split_df = pd.read_csv(prefix + 'dev_split_Depression_AVEC2017.csv')

# Parallel lists in file order: participant ids ...
train_split_num = train_split_df['Participant_ID'].tolist()
test_split_num = test_split_df['Participant_ID'].tolist()

# ... and the matching PHQ8_Score label for each participant.
train_split_clabel = train_split_df['PHQ8_Score'].tolist()
test_split_clabel = test_split_df['PHQ8_Score'].tolist()

In [None]:
def extract_features(number, audio_features, target, audio_targets, mode):
    """Append mel-spectrogram features for one participant to the given lists.

    Reads ``{number}_P/{number}_TRANSCRIPT.csv`` (tab separated) and
    ``{number}_P/{number}_AUDIO.wav`` below the module-level ``prefix``,
    concatenates the audio of every ``Participant`` transcript row (rows whose
    ``value`` contains ``'scrubbed_entry'`` are skipped) and computes 80-band
    mel spectrograms over 15-second clips of that concatenated signal.

    Args:
        number: Participant id used to build both file paths.
        audio_features: List that receives one spectrogram per emitted clip.
        target: Label of this participant. In ``'train'`` mode a value
            ``>= 10`` triggers oversampling (several consecutive clips).
        audio_targets: List that receives ``target`` once per emitted clip.
        mode: ``'train'`` enables the oversampling branch; any other value
            emits exactly one clip taken from the start of the signal.

    Side effects:
        Mutates ``audio_features`` and ``audio_targets`` in place, increments
        the module-level ``counter_train`` once per oversampled clip, and
        prints a progress line.
    """
    global counter_train
    # A fresh kernel has no counter yet; start at zero instead of raising
    # NameError on the first oversampled participant.
    if 'counter_train' not in globals():
        counter_train = 0

    transcript = pd.read_csv(
        prefix + '{0}_P/{0}_TRANSCRIPT.csv'.format(number), sep='\t'
    ).fillna('')

    # The context manager closes the file handle even if reading fails.
    with wave.open(prefix + '{0}_P/{0}_AUDIO.wav'.format(number), 'rb') as wavefile:
        sr = wavefile.getframerate()
        # Raw bytes are interpreted as np.short samples
        # (assumes 16-bit mono PCM -- TODO confirm against the dataset).
        wave_data = np.frombuffer(
            wavefile.readframes(wavefile.getnframes()), dtype=np.short
        )

    # Collect the participant's speech segments and join them once at the end;
    # growing an array with hstack inside the loop is quadratic.
    segments = []
    for row in transcript.itertuples():
        if row.speaker != 'Participant':
            continue
        if 'scrubbed_entry' in row.value:
            continue
        start_sample = int(row.start_time * sr)
        stop_sample = int(row.stop_time * sr)
        # np.float was removed from NumPy; float64 is what it aliased.
        segments.append(wave_data[start_sample:stop_sample].astype(np.float64))

    signal = np.concatenate(segments) if segments else np.zeros(0, dtype=np.float64)

    clip = sr * 15  # number of samples in one 15-second clip

    if target >= 10 and mode == 'train':
        # Oversample: up to 3 consecutive clips while fewer than 48 oversampled
        # clips have been produced so far, afterwards up to 2.
        times = 3 if counter_train < 48 else 2
        for i in range(times):
            if clip * (i + 1) > len(signal):
                # Not enough audio left for a full clip; nothing is emitted
                # for this index.
                continue
            # The raw mel power spectrogram is stored; no log scaling is applied.
            melspec = librosa.feature.melspectrogram(
                y=signal[clip * i:clip * (i + 1)], n_mels=80, sr=sr
            )
            audio_features.append(melspec)
            audio_targets.append(target)
            counter_train += 1
    else:
        # Single clip from the start of the signal (shorter if less audio exists).
        melspec = librosa.feature.melspectrogram(y=signal[:clip], n_mels=80, sr=sr)
        audio_features.append(melspec)
        audio_targets.append(target)

    print('{}_P feature done'.format(number))

In [None]:
# Accumulators that extract_features fills in place.
audio_features_train = []
audio_ctargets_train = []

audio_features_test = []
audio_ctargets_test = []

# extract_features counts oversampled training clips in this module-level
# variable. Resetting it here makes the cell work on a fresh kernel and give
# the same result every time it is re-run.
counter_train = 0

for participant_id, label in zip(train_split_num, train_split_clabel):
    extract_features(participant_id, audio_features_train, label, audio_ctargets_train, 'train')

for participant_id, label in zip(test_split_num, test_split_clabel):
    extract_features(participant_id, audio_features_test, label, audio_ctargets_test, 'test')

print(np.shape(audio_ctargets_train), np.shape(audio_ctargets_test))

In [None]:
# Persist the extracted features and labels. Each archive holds one
# positional array, which np.savez stores under the key 'arr_0'.
for npz_path, payload in (
    ('train_samples_cla.npz', audio_features_train),
    ('test_samples_cla.npz', audio_features_test),
    ('train_labels_cla.npz', audio_ctargets_train),
    ('test_labels_cla.npz', audio_ctargets_test),
):
    np.savez(npz_path, payload)

In [7]:
# Reload the cached classification arrays. Each archive is opened in a
# context manager so the underlying zip file handle is closed after reading.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives you created yourself.
with np.load('train_samples_cla.npz', allow_pickle=True) as archive:
    features_train = archive['arr_0']
with np.load('test_samples_cla.npz', allow_pickle=True) as archive:
    features_test = archive['arr_0']
with np.load('train_labels_cla.npz', allow_pickle=True) as archive:
    targets_train = archive['arr_0']
with np.load('test_labels_cla.npz', allow_pickle=True) as archive:
    ctargets_test = archive['arr_0']

In [8]:
# Copy the loaded data into the X_* (features) / Y_* (labels) arrays that
# the remaining cells work on.
X_train, Y_train = np.array(features_train), np.array(targets_train)
X_test, Y_test = np.array(features_test), np.array(ctargets_test)

In [9]:
# Min-max scale every spectrogram to [0, 1] independently.
# `or 1.0` replaces a zero value range (constant spectrogram) so that such a
# sample becomes all zeros instead of NaN from a 0/0 division.
X_train = np.array([(spec - spec.min()) / ((spec.max() - spec.min()) or 1.0) for spec in X_train])
X_test = np.array([(spec - spec.min()) / ((spec.max() - spec.min()) or 1.0) for spec in X_test])

In [10]:
# Cast the feature arrays to single precision.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

In [11]:
# Cast the label arrays to single precision as well.
Y_train, Y_test = Y_train.astype(np.float32), Y_test.astype(np.float32)

In [12]:
# Show the shape of each array, one per line: train X, train Y, test X, test Y.
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, sep='\n')

(153, 80, 469)
(153,)
(35, 80, 469)
(35,)


In [13]:
# One-hot encode the labels for the 2-unit softmax classifier.
# num_classes is fixed so both splits always get two columns; without it
# to_categorical infers the width from the largest label present, which can
# differ between splits or from the model's output size.
# NOTE(review): this expects labels in {0, 1}. The extraction cell reads
# PHQ8_Score, so confirm the cached *_cla label files hold binary labels.
train_y = to_categorical(Y_train, num_classes=2)
test_y = to_categorical(Y_test, num_classes=2)

In [26]:
# One spectrogram: 80 mel bands x 469 frames x 1 channel.
input_shape = (80, 469, 1)

# Two conv/pool stages whose (1, 7) kernels span a single mel band and seven
# frames, followed by two dense layers, dropout and a 2-way softmax output.
model = Sequential([
    Conv2D(32, (1, 7), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(4, 3), strides=(1, 3)),
    Conv2D(32, (1, 7), activation='relu'),
    MaxPooling2D(pool_size=(1, 3), strides=(1, 3)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adadelta(learning_rate=1),
    metrics=['accuracy'],
)

In [3]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 80, 463, 32)       256       
                                                                 
 max_pooling2d (MaxPooling2  (None, 77, 154, 32)       0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 77, 148, 32)       7200      
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 77, 49, 32)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 120736)            0         
                                                                 
 dense (Dense)               (None, 128)               1

In [None]:
# Train for 30 epochs with mini-batches of 4 samples.
model.fit(x=X_train, y=train_y, batch_size=4, epochs=30)

# Score on the held-out split; evaluate returns the loss followed by the
# metrics given to compile (here: accuracy).
loss, accuracy = model.evaluate(x=X_test, y=test_y)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [19]:
# Per-class scores for the test split.
y_pred = model.predict(X_test)
# Predict class 1 only when its score is strictly greater than class 0's.
predicted_1 = (y_pred[:, 1] > y_pred[:, 0]).astype(int).tolist()
print(classification_report(Y_test, predicted_1))

              precision    recall  f1-score   support

         0.0       0.95      0.78      0.86        23
         1.0       0.69      0.92      0.79        12

    accuracy                           0.83        35
   macro avg       0.82      0.85      0.82        35
weighted avg       0.86      0.83      0.83        35



In [None]:
# Save the trained classifier to disk in the .keras format.
model.save("audio_classification_model.keras")

# Regression

In [28]:
# Load the cached regression arrays. Each archive is opened in a context
# manager so the underlying zip file handle is closed after reading.
# NOTE(review): no visible cell writes the *_reg.npz files -- confirm how
# they are produced before relying on Restart & Run All.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives you created yourself.
with np.load('train_samples_reg.npz', allow_pickle=True) as archive:
    features_train = archive['arr_0']
with np.load('test_samples_reg.npz', allow_pickle=True) as archive:
    features_test = archive['arr_0']
with np.load('train_labels_reg.npz', allow_pickle=True) as archive:
    targets_train = archive['arr_0']
with np.load('test_labels_reg.npz', allow_pickle=True) as archive:
    ctargets_test = archive['arr_0']

In [29]:
# Copy the regression data into the X_* (features) / Y_* (targets) arrays,
# replacing the classification arrays of the same names.
X_train, Y_train = np.array(features_train), np.array(targets_train)
X_test, Y_test = np.array(features_test), np.array(ctargets_test)

In [30]:
# Min-max scale every spectrogram to [0, 1] independently.
# `or 1.0` replaces a zero value range (constant spectrogram) so that such a
# sample becomes all zeros instead of NaN from a 0/0 division.
X_train = np.array([(spec - spec.min()) / ((spec.max() - spec.min()) or 1.0) for spec in X_train])
X_test = np.array([(spec - spec.min()) / ((spec.max() - spec.min()) or 1.0) for spec in X_test])

In [32]:
# One spectrogram: 80 mel bands x 469 frames x 1 channel.
input_shape = (80, 469, 1)

# Same convolutional stack as the classifier, but ending in a single linear
# unit so the network outputs one continuous value per sample.
model = Sequential()
model.add(Conv2D(32, (1, 7), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(4, 3), strides=(1, 3)))
model.add(Conv2D(32, (1, 7), activation='relu'))
model.add(MaxPooling2D(pool_size=(1, 3), strides=(1, 3)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='linear'))

model.compile(optimizer=Adadelta(learning_rate=1), loss='mean_squared_error', metrics=['mean_absolute_error'])

history = model.fit(X_train, Y_train, epochs=30, batch_size=4)

# Store the metric under its own name: assigning it to `mean_absolute_error`
# would overwrite the sklearn function of that name imported at the top of
# the notebook and break any later call to it.
loss, test_mae = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}, Test Mean Absolute Error: {test_mae}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30




Test Loss: 38.2661018371582, Test Mean Absolute Error: 5.092613697052002


In [33]:
# Root-mean-squared error of the regression predictions on the test split.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print("RMSE = ", rmse)

RMSE =  6.1859601576648675
