In [1]:
import os, glob
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from pyannote.audio import Model, Inference
warnings.filterwarnings("ignore")

c:\Users\Legion\.conda\envs\tf210\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\Legion\.conda\envs\tf210\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


In [7]:
# Hugging Face credentials for the "pyannote/embedding" checkpoint.
# SECURITY: an access token used to be hard-coded in this cell. Anything that
# was committed or shared with it must be considered leaked -- revoke that
# token in the Hugging Face account settings and issue a new one.
# The token is now taken from the HF_TOKEN environment variable; when it is
# unset, `True` asks the hub client to fall back to the token cached by
# `huggingface-cli login`.
hf_auth_token = os.environ.get("HF_TOKEN") or True

embedding_model = Model.from_pretrained(
    "pyannote/embedding",
    use_auth_token=hf_auth_token,
)

# window="whole": run the model once over the entire file rather than over
# sliding windows (load_dataset stores each result as a single 512-d row).
embedding_inference = Inference(
    embedding_model,
    window="whole",
)

# Maps each class folder name to its integer training label; the label is
# simply the position of the folder name in this tuple.
class_dict = {
    stage_name: stage_index
    for stage_index, stage_name in enumerate(('Stage 1', 'Stage 2'))
}

In [12]:
def load_dataset(audio_dir='data/speech_therapy'):
    """Embed every ``<audio_dir>/<class folder>/*.wav`` file and label it.

    The label of a file is looked up in the module-level ``class_dict`` using
    the name of the folder that directly contains it. Files whose embedding
    cannot be computed are reported on stdout and left out of the result.

    Parameters
    ----------
    audio_dir : str
        Root directory holding one sub-folder per class.

    Returns
    -------
    embeddings : numpy.ndarray
        Array of shape ``(n_ok, 512)`` with one row per successfully
        embedded file.
    labels : numpy.ndarray
        Array of shape ``(n_ok,)`` with the matching integer labels.

    Raises
    ------
    KeyError
        If a ``.wav`` file sits in a folder that is not a key of
        ``class_dict``.
    """
    # Normalise Windows separators so the folder name can be split on '/',
    # and sort because glob order is arbitrary -- sorting keeps the row order
    # reproducible between runs and machines.
    voice_files = sorted(
        voice_file.replace('\\', '/')
        for voice_file in glob.glob(f'{audio_dir}/*/*.wav')
    )

    folder_names = [voice_file.split('/')[-2] for voice_file in voice_files]
    labels = np.array([class_dict[folder_name] for folder_name in folder_names])

    embeddings = np.zeros((len(voice_files), 512))
    errorneous_idxs = []
    for i, voice_file in enumerate(voice_files):
        try:
            embeddings[i] = embedding_inference(voice_file)
        except Exception as exc:
            # Only ordinary errors are skipped: a bare `except:` would also
            # swallow KeyboardInterrupt/SystemExit and make the loop
            # impossible to interrupt. The reason is printed so that broken
            # files can actually be diagnosed.
            errorneous_idxs.append(i)
            print('Errorneous file: ', voice_file, '-', repr(exc))

    # Drop the rows (still all-zero) that belong to files that failed.
    embeddings = np.delete(embeddings, errorneous_idxs, axis=0)
    labels = np.delete(labels, errorneous_idxs, axis=0)
    return embeddings, labels

In [13]:
# Build the training arrays, then report their shapes as a quick sanity
# check that features and labels line up.
dataset = load_dataset()
embeddings, labels = dataset

for caption, array in (
    ("Embedding shape: ", embeddings),
    ("labels    shape: ", labels),
):
    print(caption, array.shape)

Embedding shape:  (14, 512)
labels    shape:  (14,)


In [14]:
def build_model():
    """Create and compile the binary classifier that consumes 512-d embeddings.

    Architecture: Dense(256, relu), followed by three blocks of
    Dense -> BatchNormalization -> ReLU -> Dropout(0.2) with 128, 64 and 32
    units, and a single sigmoid output unit named ``detection``.

    Returns
    -------
    tf.keras.Model
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy / precision / recall / AUC metrics.
    """
    layers = tf.keras.layers
    metrics = tf.keras.metrics

    inputs = tf.keras.Input(shape=(512,))
    hidden = layers.Dense(256, activation='relu')(inputs)

    # Layers are created in the same order as writing the blocks out by hand,
    # so Keras assigns the same auto-generated layer names.
    for units in (128, 64, 32):
        hidden = layers.Dense(units)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation('relu')(hidden)
        hidden = layers.Dropout(0.2)(hidden)

    outputs = layers.Dense(1, activation='sigmoid', name='detection')(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            metrics.BinaryAccuracy(name='accuracy'),
            metrics.Precision(name='precision'),
            metrics.Recall(name='recall'),
            metrics.AUC(name='auc'),
        ],
    )
    return model

In [15]:
# Instantiate the classifier and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 128)               0         
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                             

In [17]:
# Early stopping watches the *training* loss (no validation data is passed to
# fit): training halts after 10 epochs without improvement and the weights of
# the best epoch are restored.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)

model.fit(
    x=embeddings,
    y=labels,
    epochs=100,
    batch_size=8,
    callbacks=[early_stopping],
)

# Persist the trained classifier in HDF5 format for the inference section.
model.save('feature_store/speech therapy.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100


# Inference

In [18]:
# Stage name -> integer label.
class_dict_speech = dict(zip(('Stage 1', 'Stage 2'), range(2)))

# Integer label -> stage name, derived from the forward mapping so the two
# dictionaries cannot drift apart.
class_dict_speech_rev = {
    label: stage for stage, label in class_dict_speech.items()
}

# Reload the classifier from the HDF5 file that the training cell saved.
model_speech_therapy = tf.keras.models.load_model('feature_store/speech therapy.h5')

In [19]:
def inference_stage_sentiment(audio_file):
    """Predict the therapy stage for a single audio recording.

    The recording is embedded with ``embedding_inference``, scored by the
    reloaded classifier ``model_speech_therapy``, and the rounded sigmoid
    output is mapped back to a stage name via ``class_dict_speech_rev``.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to classify.

    Returns
    -------
    str
        The predicted stage name (a value of ``class_dict_speech_rev``).
    """
    # Add the batch axis Keras expects: (512,) -> (1, 512).
    embedding = np.expand_dims(embedding_inference(audio_file), axis=0)

    # Use the model loaded from disk, not the in-memory `model` left over from
    # the training section; otherwise the saved artifact is never exercised
    # and inference silently depends on training-time kernel state.
    probability = np.squeeze(model_speech_therapy.predict(embedding))

    predicted_label = int(np.round(probability))
    return class_dict_speech_rev[predicted_label]

In [20]:
# Smoke-test the inference helper on one recording from the "Stage 1" folder.
# NOTE(review): this path lies under the default directory that load_dataset
# globs for training data, so it is not a held-out sample.
response = inference_stage_sentiment('data/speech_therapy/Stage 1/s 2 48000 1-[AudioTrimmer.com].wav')
response



'Stage 1'