### WaveNet 모델을 이용해 영아 울음소리 분류 모델을 만들어보자.

Code: https://github.com/mjpyeon/wavenet-classifier

In [1]:
import os
import sys

import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras import metrics
from keras import optimizers
from keras.callbacks import History, ModelCheckpoint
from keras.layers import Input, Dense, Lambda, Flatten, Reshape, Activation, Dropout, Add, TimeDistributed, Multiply, Conv1D, Conv2D, MaxPooling1D, AveragePooling1D
from keras.models import Model, Sequential, load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Put the project root on the import path so the local `utils` and `constant`
# packages can be imported below.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
sys.path.append('/Users/jaewone/developer/tensorflow/baby-cry-classification')

# NOTE(review): `main_path`, used by later cells, is not defined anywhere in this
# notebook; presumably it is provided by one of these star imports - confirm.
from utils.sound import *
from utils.os import *
from constant.os import *

In [3]:
# Both the metadata CSV and the directory of audio clips live directly under main_path.
info_csv_path = os.path.join(main_path, 'sample_data.csv')
sample_data_path = os.path.join(main_path, 'sample_data')

In [4]:
# Keep only n clips per state: the full dataset is large enough to make
# training slow, so the number of samples is reduced.
df = pd.read_csv(info_csv_path, index_col=0)
df = (
    df.groupby('state')
    .apply(lambda group: group.sample(n=6, random_state=42))
    .reset_index(drop=True)
)

# Labels and full audio file paths, in the same row order.
class_labels = df['state'].tolist()
audio_files = [os.path.join(sample_data_path, name) for name in df['file']]

In [5]:
# Targets are the state labels; inputs are the raw waveforms resampled to 16 kHz.
# librosa.load returns (samples, sample_rate) and only the samples are kept.
y = class_labels
X = [librosa.load(path, sr=16000)[0] for path in audio_files]



In [6]:
# Encode the class labels as integers, then one-hot encode them.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)
y_one_hot = tf.keras.utils.to_categorical(y_encoded, num_classes=num_classes)

# Hold out 20% of the data for testing, then 20% of what remains for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_one_hot, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1)

# Convert every split to a NumPy array before handing it to the model.
X_train, X_test, X_val = (np.array(part) for part in (X_train, X_test, X_val))
y_train, y_test, y_val = (np.array(part) for part in (y_train, y_test, y_val))

# Report the split shapes, one per line.
print(X_train.shape, y_test.shape, X_val.shape, sep='\n')

(26, 80000)
(9, 7)
(7, 80000)


In [7]:
# WaveNet model
class WaveNetClassifier():
    """Classifier built from a stack of gated, dilated causal 1-D convolutions.

    The Keras model is either constructed from the given hyperparameters or,
    when ``load`` is true, restored from a checkpoint previously written by
    :meth:`fit` with ``save=True``.
    """

    # File names used for persistence. The checkpoint keeps its historical
    # (misspelled) name so models saved by earlier runs can still be loaded.
    _MODEL_FILENAME = "saved_wavenet_clasifier.h5"
    _HISTORY_FILENAME = 'wavenet_classifier_training_history.csv'

    def __init__(self, input_shape, output_shape, kernel_size=2, dilation_depth=9, n_filters=40, load=False, load_dir='./'):
        """
        Parameters:
          input_shape: (tuple) tuple of input shape. (e.g. If input is 6s raw waveform with sampling rate = 16kHz, (96000,) is the input_shape)
          output_shape: (tuple)tuple of output shape. (e.g. If we want classify the signal into 100 classes, (100,) is the output_shape)
          kernel_size: (integer) kernel size of convolution operations in residual blocks
          dilation_depth: (integer) type total depth of residual blocks
          n_filters: (integer) # of filters of convolution operations in residual blocks
          load: (bool) load previous WaveNetClassifier or not
          load_dir: (string) the directory where the previous model exists

        Raises:
          ValueError: if input_shape or output_shape is not a tuple of length 1 or 2.
        """
        self.activation = 'softmax'
        self.scale_ratio = 1

        # save input info: a 1-tuple input has no channel axis, so one is added
        # by a Reshape layer in construct_model()
        if len(input_shape) not in (1, 2):
            raise ValueError(
                'wrong input shape: expected a tuple of length 1 or 2, got %r' % (input_shape,))
        self.expand_dims = len(input_shape) == 1
        self.input_shape = input_shape

        # save output info: a 2-tuple output selects the time-distributed head
        if len(output_shape) not in (1, 2):
            raise ValueError(
                'wrong output shape: expected a tuple of length 1 or 2, got %r' % (output_shape,))
        self.time_distributed = len(output_shape) == 2
        self.output_shape = output_shape

        # save hyperparameters of WaveNet
        self.kernel_size = kernel_size
        self.dilation_depth = dilation_depth
        self.n_filters = n_filters
        self.manual_loss = None

        self.history = None
        if load is True:
            # os.path.join works whether or not load_dir ends with a separator.
            self.model = load_model(
                os.path.join(load_dir, self._MODEL_FILENAME), custom_objects={'tf': tf})
            self.prev_history = pd.read_csv(
                os.path.join(load_dir, self._HISTORY_FILENAME))
            # One CSV row per completed epoch: resume counting from there.
            self.start_idx = len(self.prev_history)
        else:
            self.model = self.construct_model()
            self.start_idx = 0
            self.prev_history = None

    def residual_block(self, x, i):
        """Apply the i-th gated residual block to tensor ``x``.

        Two causal convolutions with dilation ``kernel_size ** i`` (tanh and
        sigmoid activations) are multiplied element-wise, then passed through
        a 1x1 convolution.

        Returns:
          (res, skip): ``skip`` is the 1x1 convolution output and ``res`` is
          ``skip + x``, the input to the next block.
        """
        dilation = self.kernel_size ** i
        tanh_out = Conv1D(self.n_filters,
                          self.kernel_size,
                          dilation_rate=dilation,
                          padding='causal',
                          name='dilated_conv_%d_tanh' % (dilation),
                          activation='tanh'
                          )(x)
        sigm_out = Conv1D(self.n_filters,
                          self.kernel_size,
                          dilation_rate=dilation,
                          padding='causal',
                          name='dilated_conv_%d_sigm' % (dilation),
                          activation='sigmoid'
                          )(x)
        z = Multiply(name='gated_activation_%d' % (i))([tanh_out, sigm_out])
        skip = Conv1D(self.n_filters, 1, name='skip_%d' % (i))(z)
        res = Add(name='residual_block_%d' % (i))([skip, x])
        return res, skip

    def construct_model(self):
        """Build the Keras model, print its summary and return it.

        The skip outputs of ``dilation_depth`` residual blocks are summed and
        passed through a convolution/average-pooling head whose form depends
        on ``self.time_distributed``.
        """
        x = Input(shape=self.input_shape, name='original_input')
        if self.expand_dims:
            # Add the channel axis that Conv1D requires.
            x_reshaped = Reshape(self.input_shape + (1,),
                                 name='reshaped_input')(x)
        else:
            x_reshaped = x
        skip_connections = []
        out = Conv1D(self.n_filters, 2, dilation_rate=1,
                     padding='causal', name='dilated_conv_1')(x_reshaped)
        for i in range(1, self.dilation_depth + 1):
            out, skip = self.residual_block(out, i)
            skip_connections.append(skip)
        out = Add(name='skip_connections')(skip_connections)
        out = Activation('relu')(out)
        # NOTE(review): the layer names below (5ms, 200Hz, 500ms, 2Hz) appear
        # to assume a 16 kHz input - confirm before using another rate.
        out = Conv1D(self.n_filters, 80, strides=1, padding='same',
                     name='conv_5ms', activation='relu')(out)
        out = AveragePooling1D(
            80, padding='same', name='downsample_to_200Hz')(out)
        if self.time_distributed:
            # prev_len / x = target_len => x = prev_len / target_len
            target_kernel_size = int(
                self.input_shape[0] / 80 / self.output_shape[0])
            out = Conv1D(self.n_filters, target_kernel_size, padding='same',
                         name='conv_fit_to_target', activation='relu')(out)
            out = Conv1D(
                self.output_shape[1], target_kernel_size, padding='same', name='conv_final')(out)
            out = AveragePooling1D(target_kernel_size, padding='same')(out)
            out = TimeDistributed(Activation(self.activation))(out)
        else:
            out = Conv1D(self.n_filters, 100, padding='same',
                         activation='relu', name='conv_500ms')(out)
            out = Conv1D(self.output_shape[0], 100, padding='same',
                         activation='relu', name='conv_500ms_target_shape')(out)
            out = AveragePooling1D(100, padding='same',
                                   name='downsample_to_2Hz')(out)
            # After pooling by 80 and then 100 the sequence is 1/8000 of the
            # input length; the final pooling collapses what is left.
            final_size = int(self.input_shape[0] / 8000)
            out = Conv1D(self.output_shape[0], final_size,
                         padding='same', name='final_conv')(out)
            out = AveragePooling1D(final_size, name='final_pooling')(out)
            out = Reshape(self.output_shape)(out)
            out = Activation(self.activation)(out)
        if self.scale_ratio != 1:
            out = Lambda(lambda t: t * self.scale_ratio,
                         name='output_reshaped')(out)
        model = Model(x, out)
        model.summary()
        return model

    def get_model(self):
        """Return the underlying Keras model."""
        return self.model

    def add_loss(self, loss):
        """Use ``loss`` instead of the default loss in subsequent fit() calls."""
        self.manual_loss = loss

    def fit(self, X, Y, validation_data=None, epochs=100, batch_size=32, optimizer='adam', save=False, save_dir='./'):
        """Compile and train the model.

        Parameters:
          X, Y: training inputs and targets, passed to the Keras model's fit()
          validation_data: optional validation data passed to Keras; when
            given, the checkpoint monitors 'val_loss', otherwise 'loss'
          epochs, batch_size, optimizer: forwarded to Keras
          save: (bool) write the best-model checkpoint and the training
            history CSV into save_dir
          save_dir: (string) directory for those files; created if missing

        Returns:
          The History object returned by the Keras model's fit().

        With save=True the history CSV is written both after a successful run
        and when training is interrupted or fails; the exception is then
        propagated unchanged.
        """
        # set default losses if not defined
        if self.manual_loss is not None:
            loss = self.manual_loss
            metric_list = None
        else:
            loss = 'categorical_crossentropy'
            metric_list = ['accuracy']

        # set callback functions
        callbacks = None
        history_cb = None
        history_path = None
        if save:
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            # Joining (rather than concatenating) keeps the files inside
            # save_dir even when it has no trailing separator.
            model_path = os.path.join(save_dir, self._MODEL_FILENAME)
            history_path = os.path.join(save_dir, self._HISTORY_FILENAME)
            monitor = 'loss' if validation_data is None else 'val_loss'
            checkpointer = ModelCheckpoint(
                filepath=model_path, monitor=monitor, verbose=1, save_best_only=True)
            history_cb = History()
            callbacks = [history_cb, checkpointer]

        # compile the model; keyword arguments are used because the third
        # positional parameter of compile() is not `metrics` in every Keras version
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metric_list)
        try:
            self.history = self.model.fit(X, Y, shuffle=True, batch_size=batch_size, epochs=epochs,
                                          validation_data=validation_data, callbacks=callbacks, initial_epoch=self.start_idx)
        finally:
            # Runs on success, on KeyboardInterrupt and on errors alike.
            if save:
                self._save_history(history_cb, history_path)
        return self.history

    def _save_history(self, history_cb, history_path):
        """Write the epochs recorded by ``history_cb`` to ``history_path`` as CSV."""
        records = getattr(history_cb, 'history', None)
        if not records:
            # Nothing was recorded (training failed before the first epoch
            # finished); keep whatever history file already exists.
            return
        df = pd.DataFrame.from_dict(records)
        if self.prev_history is not None:
            # Keep the epochs of the loaded run so that len(history), which is
            # used as the starting epoch on the next load, stays correct.
            df = pd.concat([self.prev_history, df], ignore_index=True)
        df.to_csv(history_path, encoding='utf-8', index=False)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)

In [8]:
# Build a new, untrained classifier (load=False) for 5-second clips at 16 kHz.
model = WaveNetClassifier(
    load=False,
    input_shape=(5 * 16000,),  # seconds * sample_rate
    output_shape=(7,),         # number of labels
    n_filters=40,
    dilation_depth=9,
    kernel_size=2,
)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-08-03 13:13:35.225785: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-08-03 13:13:35.225900: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 original_input (InputLayer)    [(None, 80000)]      0           []                               
                                                                                                  
 reshaped_input (Reshape)       (None, 80000, 1)     0           ['original_input[0][0]']         
                                                                                                  
 dilated_conv_1 (Conv1D)        (None, 80000, 40)    120         ['reshaped_input[0][0]']         
                                                                                                  
 dilated_conv_2_tanh (Conv1D)   (None, 80000, 40)    3240        ['dilated_conv_1[0][0]']         
                                                                                              

In [9]:
# Checkpoints and the training-history CSV belong in <main_path>/model/history.
# The trailing '' makes os.path.join end the path with a separator, so the
# directory is honoured even if the file name is appended to save_dir by plain
# string concatenation (without it the files end up in `model/` with a
# "history" file-name prefix). The directory is created up front so saving
# cannot fail because it is missing.
history_dir = os.path.join(main_path, 'model', 'history', '')
os.makedirs(history_dir, exist_ok=True)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100,
          batch_size=32, optimizer='adam', save=True, save_dir=history_dir)

Epoch 1/100


2023-08-03 13:13:35.767746: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-08-03 13:13:37.132567: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-08-03 13:14:13.903460: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_loss improved from inf to 1.85323, saving model to /Users/jaewone/developer/tensorflow/baby-cry-classification/model/historysaved_wavenet_clasifier.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 1.85323
Epoch 3/100
Epoch 3: val_loss did not improve from 1.85323
Epoch 4/100
Epoch 4: val_loss did not improve from 1.85323
Epoch 5/100
Epoch 5: val_loss did not improve from 1.85323
Epoch 6/100
Epoch 6: val_loss did not improve from 1.85323
Epoch 7/100
Epoch 7: val_loss did not improve from 1.85323
Epoch 8/100
Epoch 8: val_loss did not improve from 1.85323
Epoch 9/100
Epoch 9: val_loss did not improve from 1.85323
Epoch 10/100
Epoch 10: val_loss did not improve from 1.85323
Epoch 11/100
Epoch 11: val_loss did not improve from 1.85323
Epoch 12/100
Epoch 12: val_loss did not improve from 1.85323
Epoch 13/100
Epoch 13: val_loss did not improve from 1.85323
Epoch 14/100
Epoch 14: val_loss did not improve from 1.85323
Epoch 15/100
Epoch 15: val_loss did not improve from 1.853

KeyboardInterrupt: 

In [None]:
# Print, for every test clip, the true label, the predicted label and the
# probability the model assigned to its prediction.
y_pred = model.predict(X_test)
for truth, probs in zip(y_test, y_pred):
    best = np.argmax(probs)
    real = label_encoder.classes_[np.argmax(truth)]
    pred_label = label_encoder.classes_[best]
    print(f'Real: {real:>10} | Predict: {pred_label:>10} with {probs[best]*100:.2f}%')