In [42]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import librosa
import sys

sys.path.append('/Users/jaewone/developer/tensorflow/baby-cry-classification')

In [43]:
from constant.os import *
from utils.os import *
from utils.sound import *


In [44]:
from scipy.stats import skew
from scipy.stats import kurtosis, median_abs_deviation

In [45]:
# Training / feature-extraction configuration shared by the cells below.
EPOCHS = 5000  # number of epochs passed to model.fit
SAMPLING_FREQ = 16000  # sampling rate (Hz) audio files are resampled to on load
MFCC_COEF_RETAIN = 25  # number of leading MFCC coefficients kept per file
MFCC_COEF = 40  # number of MFCC coefficients computed (librosa n_mfcc)
MFCC_WINDOW_DURATION = 0.0232  # in seconds (23.2 ms); multiplied by the sampling rate to obtain n_fft
OPTIMIZER = tf.keras.optimizers.Adam()  # Adam with default hyperparameters, used when compiling the model

In [46]:
def _mfcc_statistics(mfccs):
    """Summarise every MFCC coefficient (row of ``mfccs``) along the time axis.

    Returns an array with 12 rows and one column per coefficient. The rows
    are, in order: min, max, median, mean, variance, skewness, kurtosis,
    median absolute deviation, mean and variance of the first-order
    difference, mean and variance of the second-order difference.
    """
    # Frame-to-frame differences approximate the first and second time derivatives.
    first_diff = np.diff(mfccs, n=1, axis=1)
    second_diff = np.diff(mfccs, n=2, axis=1)

    return np.vstack((
        mfccs.min(axis=1),
        mfccs.max(axis=1),
        np.median(mfccs, axis=1),
        np.mean(mfccs, axis=1),
        np.var(mfccs, axis=1),
        skew(mfccs, axis=1),
        kurtosis(mfccs, axis=1),
        median_abs_deviation(mfccs, axis=1),
        np.mean(first_diff, axis=1),
        np.var(first_diff, axis=1),
        np.mean(second_diff, axis=1),
        np.var(second_diff, axis=1),
    ))


def get_mfcc(file_path):
    """Extract normalised MFCCs and per-coefficient summary statistics from an audio file.

    Processing steps:

    1. Load the file as mono audio resampled to ``SAMPLING_FREQ`` and
       peak-normalise the waveform with ``librosa.util.normalize``.
    2. Compute ``MFCC_COEF`` MFCC coefficients per frame.
    3. Standardise the whole MFCC matrix with its global mean and standard
       deviation (zero mean, unit variance over all coefficients and frames).
    4. Keep only the first ``MFCC_COEF_RETAIN`` coefficients.
    5. Compute 12 summary statistics per retained coefficient.

    Notes on the ``librosa.feature.mfcc`` parameters used here:

    * ``sr`` - sampling rate. librosa's default is 22050 Hz; because the audio
      was loaded at ``SAMPLING_FREQ``, the same rate must be passed explicitly.
    * ``n_mfcc`` - number of MFCC coefficients returned (librosa's default is
      20). This function requests ``MFCC_COEF`` to capture more detail.
    * ``n_fft`` - frame length in samples. The analysis window defaults to the
      same length; a shorter window would be zero-padded up to ``n_fft``, so
      ``n_fft`` must be at least the window size. It is derived as
      ``MFCC_WINDOW_DURATION * sr``. For comparison, a common speech setting
      is 25 ms, i.e. 16000 * 0.025 = 400 samples at 16 kHz.
    * ``hop_length`` - number of samples the window advances between frames.
      Half a window is used here, so consecutive frames overlap by 50 %. For
      comparison, a common speech setting is 10 ms, i.e. 160 samples at 16 kHz.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.

    Returns
    -------
    pandas.Series
        Two-element Series:

        * element 0 - standardised MFCC matrix restricted to the first
          ``MFCC_COEF_RETAIN`` coefficients (rows are coefficients, columns
          are frames);
        * element 1 - statistics matrix with one row per retained coefficient
          and 12 columns (see ``_mfcc_statistics`` for the column order).
    """
    # Load the audio file and normalise its amplitude.
    wave, sr = librosa.load(file_path, mono=True, sr=SAMPLING_FREQ)
    wave = librosa.util.normalize(wave)

    # Frame geometry: window of MFCC_WINDOW_DURATION seconds, hop of half a window.
    n_fft = int(MFCC_WINDOW_DURATION * sr)
    hop_length = int(MFCC_WINDOW_DURATION * sr / 2.0)
    mfccs = librosa.feature.mfcc(
        y=wave, sr=sr, n_mfcc=MFCC_COEF, hop_length=hop_length, n_fft=n_fft)

    # Standardise: zero mean, unit standard deviation. The statistics are taken
    # over the full matrix, i.e. before the coefficients are truncated below.
    mfccs = (mfccs - np.mean(mfccs)) / np.std(mfccs)

    # Keep the first MFCC_COEF_RETAIN coefficients.
    mfccs = mfccs[:MFCC_COEF_RETAIN, :]

    mfccs_stats = _mfcc_statistics(mfccs)

    # Transpose so each row of the statistics matrix describes one coefficient.
    return pd.Series([mfccs, mfccs_stats.transpose()])

In [47]:
# Extract MFCC data example
# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
file_path = '/Users/jaewone/developer/tensorflow/baby-cry-classification/sample_data/diaper_121.wav'
# Displays the two-element Series returned by get_mfcc (MFCC matrix, statistics matrix).
get_mfcc(file_path)

0    [[-3.3499403, -3.4310021, -3.427462, -2.942319...
1    [[-7.959745, -2.8374531, -6.3970804, -5.843464...
dtype: object

In [48]:
from tqdm import tqdm

# Load csv, keeping only the label ('state') and file-name ('file') columns
# NOTE(review): main_path is not defined in the visible cells; presumably it
# comes from one of the star imports above - confirm.
df = pd.read_csv(os.path.join(main_path, 'sample_data.csv'), index_col=0)[['state', 'file']]

# Get MFCC data
# tqdm.pandas() makes progress_apply available on pandas objects.
tqdm.pandas()
# get_mfcc returns a two-element Series per row; the two elements are stored
# in the new 'mfccs' and 'mfccs_stats' columns.
df[['mfccs', 'mfccs_stats']] = df.progress_apply(lambda x: get_mfcc(os.path.join(main_path, 'sample_data', x['file'])), axis=1)
df.tail(3)

100%|██████████| 697/697 [00:04<00:00, 150.53it/s]


Unnamed: 0,state,file,mfccs,mfccs_stats
694,uncomfortable,uncomfortable_134.wav,"[[-4.60427, -6.3953624, -6.6822557, -6.5885105...","[[-10.103623, -1.5321517, -5.2921014, -5.49550..."
695,uncomfortable,uncomfortable_33.wav,"[[-4.816056, -4.6024766, -4.8375535, -4.782134...","[[-8.788574, -2.870194, -5.5996184, -5.7096386..."
696,uncomfortable,uncomfortable_121.wav,"[[-7.337755, -6.7571588, -6.6625886, -6.617184...","[[-7.337755, -2.3285367, -6.6171846, -5.985762..."


In [49]:
# Convert the 'state' column to a categorical dtype, then store its integer
# category codes in a new 'state_code' column.
df.state = df.state.astype('category')
df = df.assign(state_code=df.state.cat.codes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 0 to 696
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   state        697 non-null    category
 1   file         697 non-null    object  
 2   mfccs        697 non-null    object  
 3   mfccs_stats  697 non-null    object  
 4   state_code   697 non-null    int8    
dtypes: category(1), int8(1), object(3)
memory usage: 23.5+ KB


In [50]:
def _features_and_labels(frame: pd.DataFrame):
    """Return ``(x, y)`` arrays built from one frame's feature and label columns."""
    # Each row holds one 2-D statistics matrix; stacking them yields a 3-D array.
    x = np.array(frame['mfccs_stats'].to_list())
    if x.ndim != 3:
        raise ValueError(
            "'mfccs_stats' must hold equally shaped 2-D arrays; "
            f"stacked array has shape {x.shape}")

    # Labels as a column vector of shape (n_samples, 1).
    y = np.array(frame['state_code'].to_list()).reshape(-1, 1)
    return x, y


def split_xy(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Convert train/test DataFrames into feature and label arrays.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an 'mfccs_stats' column (one 2-D array per row, all of
        the same shape) and a 'state_code' column (one integer label per row).

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, x_test, y_train, y_test)``. The ``x`` arrays are 3-D:
        sample index first, followed by the two axes of each row's
        'mfccs_stats' array. The ``y`` arrays have shape ``(n_samples, 1)``.

    Raises
    ------
    ValueError
        If the stacked 'mfccs_stats' values do not form a 3-D array, e.g. the
        frame is empty or the per-row arrays are not 2-D.
    """
    x_train, y_train = _features_and_labels(train_df)
    x_test, y_test = _features_and_labels(test_df)

    return x_train, x_test, y_train, y_test

In [51]:
def get_lstm_model(num_classes, optimizer=None):
    """Build and compile a single-layer LSTM classifier.

    Architecture: LSTM(256, last output only) -> BatchNormalization ->
    Dropout(0.4) -> Dense(num_classes, softmax).

    Parameters
    ----------
    num_classes : int
        Number of output units of the final softmax layer.
    optimizer : tf.keras.optimizers.Optimizer, optional
        Optimizer to compile the model with. When omitted, a fresh optimizer
        is created with the same class and configuration as the module-level
        ``OPTIMIZER``.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with sparse categorical cross-entropy loss and the
        sparse categorical accuracy metric (integer class labels expected).
    """
    if optimizer is None:
        # An optimizer instance accumulates state for the model it trains, so
        # handing the shared OPTIMIZER to every model would couple them.
        # Rebuild an independent copy from its configuration instead.
        optimizer = type(OPTIMIZER).from_config(OPTIMIZER.get_config())

    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(256, return_sequences=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])

    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )

    return model

In [52]:
# Quick look at the last row to confirm the state_code column is present.
df.tail(1)

Unnamed: 0,state,file,mfccs,mfccs_stats,state_code
696,uncomfortable,uncomfortable_121.wav,"[[-7.337755, -6.7571588, -6.6625886, -6.617184...","[[-7.337755, -2.3285367, -6.6171846, -5.985762...",6


In [53]:
from sklearn.model_selection import train_test_split

# Fix the seed so the 80/20 partition is the same on every run. Weight
# initialisation and dropout are still random, so metrics can vary slightly.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

X_train, X_test, Y_train, Y_test = split_xy(train, test)

# Size the softmax layer from the labels instead of hard-coding a count:
# category codes run from 0 to max, so max + 1 output units cover every class
# without leaving unused units.
num_classes = int(df['state_code'].max()) + 1
lstm_model = get_lstm_model(num_classes=num_classes)

# NOTE: the test split doubles as validation data. No callbacks are
# registered, so it only affects the metrics recorded in `history`.
history = lstm_model.fit(X_train, Y_train, batch_size=128, epochs=EPOCHS, validation_data=(
    X_test, Y_test), callbacks=[], verbose=0)

# evaluate() returns [loss, sparse_categorical_accuracy], not a single score.
accuracy_score = lstm_model.evaluate(X_test, Y_test)

2023-08-04 21:13:08.284230: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-08-04 21:13:08.493693: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-08-04 21:13:08.662561: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-08-04 21:13:09.207329: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-08-04 21:13:09.292712: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [54]:
from sklearn.metrics import classification_report
# Softmax outputs of the model for every test sample.
lstm_test_preds = lstm_model.predict(X_test)
# Predicted class = index of the largest output per sample.
lstm_test_pred_classes = np.argmax(lstm_test_preds, axis=1)

# Per-class precision / recall / F1; rows are labelled by integer state_code.
print(classification_report(Y_test, lstm_test_pred_classes))

2023-08-04 21:24:44.740804: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-08-04 21:24:44.807854: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


              precision    recall  f1-score   support

           0       0.38      0.47      0.42        17
           1       0.65      0.65      0.65        26
           2       0.44      0.48      0.46        25
           3       0.73      0.41      0.52        27
           4       0.73      1.00      0.84         8
           5       0.50      0.63      0.56        19
           6       0.88      0.78      0.82        18

    accuracy                           0.59       140
   macro avg       0.62      0.63      0.61       140
weighted avg       0.61      0.59      0.59       140

