In [6]:
import librosa
import librosa.display
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import IPython.display as ipd
import random

def init_gpu():
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
        # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
                logical_gpus = tf.config.experimental.list_logical_devices('GPU')
                print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
        # Memory growth must be set before GPUs have been 
             print(e)

init_gpu()

data_path = os.getenv("HOME")+'/aiffel/speech_recognition/data/speech_wav_8000.npz'
speech_data = np.load(data_path)

print("Wave data shape : ", speech_data["wav_vals"].shape)
print("Label data shape : ", speech_data["label_vals"].shape)
print("✅")


target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

label_value = target_list
label_value.append('unknown')
label_value.append('silence')

print('LABEL : ', label_value)

new_label_value = dict()
for i, l in enumerate(label_value):
    new_label_value[l] = i
label_value = new_label_value

print('Indexed LABEL : ', new_label_value)


temp = []
for v in speech_data["label_vals"]:
    temp.append(label_value[v[0]])
label_data = np.array(temp)


def one_hot_label(wav, label):
    label = tf.one_hot(label, depth=12)
    return wav, label
print("✅")


def wav2spec(wav, fft_size=258): # spectrogram shape을 맞추기위해서 size 변형
    D = np.abs(librosa.stft(wav, n_fft=fft_size))
    return D

speech_data

1 Physical GPUs, 1 Logical GPUs
Wave data shape :  (50620, 8000)
Label data shape :  (50620, 1)
✅
LABEL :  ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'silence']
Indexed LABEL :  {'yes': 0, 'no': 1, 'up': 2, 'down': 3, 'left': 4, 'right': 5, 'on': 6, 'off': 7, 'stop': 8, 'go': 9, 'unknown': 10, 'silence': 11}
✅


<numpy.lib.npyio.NpzFile at 0x7fa0a05801d0>

In [7]:
spec = wav2spec(speech_data["wav_vals"][0])
print(spec.shape)

spec2 = wav2spec(speech_data["wav_vals"][1])
print(spec2.shape)

del spec, spec2

(130, 126)
(130, 126)


# 데이터 준비하기
이번 데이터는 크기가 매우 커서, 2만개만 썼다. 이 부분에서 자꾸 커널에 종료되어서 시간을 많이 끌었다. 그리고 list와 numpy array 간을 어떻게 넘나들지 매우 고민했는데 생각보다 쉬워서 황당했다. 

In [8]:
# 먼저 빈 리스트를 준비하고
spec_data=[]
s_data = speech_data['wav_vals'][:20000]
print(s_data.shape)
for wav in s_data: 
    stft = wav2spec(wav)
    # 일단 데이터를 담는다. 이 때 객체는 numpy array가 담기게 된다. 
    spec_data.append(stft)

# 둘러싼 리스트를 np.array() function을 사용해서 numpy array로 바꿔준다. 
    spec_data = np.array(spec_data)
spec_data.shape

(20000, 8000)


(20000, 130, 126)

In [9]:
s_label = label_data[:20000]

In [10]:
from sklearn.model_selection import train_test_split

sr = 8000
train_wav, test_wav, train_label, test_label = train_test_split(spec_data, 
                                                                s_label, 
                                                                test_size=0.1,
                                                                shuffle=True)
print(train_wav.shape)
train_wav = train_wav.reshape([-1, 130, 126, 1]) # add channel for CNN
test_wav = test_wav.reshape([-1,  130, 126, 1])

(18000, 130, 126)


`train_wav = train_wav.reshape([-1, 130, 126, 1])` -> 이 부분이 헷갈렸는데, reshape를 할 때 행의 크기를 정확히 주지 않고 생성하겠다는 의미가 되어서, 자동으로 크기를 인식하고 넣어준다. cnn을 적용하려면 채널 수대로 뒤에 컬럼이 하나 더 붙어야 해서 1을 넣는다. 

In [11]:
print("train data : ", train_wav.shape)
print("train labels : ", train_label.shape)
print("test data : ", test_wav.shape)
print("test labels : ", test_label.shape)
print("✅")

train data :  (18000, 130, 126, 1)
train labels :  (18000,)
test data :  (2000, 130, 126, 1)
test labels :  (2000,)
✅


In [12]:
batch_size = 32
max_epochs = 10

# the save point
checkpoint_dir = os.getenv('HOME')+'/aiffel/speech_recognition/models/wav'

checkpoint_dir

# for train
train_dataset = tf.data.Dataset.from_tensor_slices((train_wav, train_label))
train_dataset = train_dataset.map(one_hot_label)
train_dataset = train_dataset.repeat().batch(batch_size=batch_size)
print(train_dataset)

# for test
test_dataset = tf.data.Dataset.from_tensor_slices((test_wav, test_label))
test_dataset = test_dataset.map(one_hot_label)
test_dataset = test_dataset.batch(batch_size=batch_size)
print(test_dataset)
print("✅")

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: No module named 'tensorflow_core.estimator'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: No module named 'tensorflow_core.estimator'
<BatchDataset shapes: ((None, 130, 126, 1), (None, 12)), types: (tf.float32, tf.float32)>
<BatchDataset shapes: ((None, 130, 126, 1), (None, 12)), types: (tf.float32, tf.float32)>
✅


In [13]:
del speech_data
del spec_data

In [14]:
from tensorflow.keras import layers
input_shape = (130,126,1)
input_tensor = layers.Input(shape=input_shape)

x = layers.Conv2D(32, kernel_size=(3,3), padding='same', activation='relu')(input_tensor)
x = layers.Conv2D(32, kernel_size=(3,3), padding='same', activation='relu')(x)
skip_1 = layers.MaxPool2D()(x)

x = layers.Conv2D(64, kernel_size=(3,3), padding='same', activation='relu')(skip_1)
x = layers.Conv2D(64, kernel_size=(3,3), padding='same', activation='relu')(x)
# concat을 할 때 axis 값을 negative하게 주면 역순으로 데이터를 참조하게 된다.
x = tf.concat([x, skip_1], -1)
skip_2 = layers.MaxPool2D()(x)

x = layers.Conv2D(128, kernel_size=(3,3), padding='same', activation='relu')(skip_2)
x = layers.Conv2D(128, kernel_size=(3,3), padding='same', activation='relu')(x)
x = layers.Conv2D(128, kernel_size=(3,3), padding='same', activation='relu')(x)
x = tf.concat([x, skip_2], -1)
skip_3 = layers.MaxPool2D()(x)

x = layers.Conv2D(256, kernel_size=(3,3), padding='same', activation='relu')(skip_3)
x = layers.Conv2D(256, kernel_size=(3,3), padding='same', activation='relu')(x)
x = layers.Conv2D(256, kernel_size=(3,3), padding='same', activation='relu')(x)
x = tf.concat([x, skip_3], -1)
x = layers.MaxPool2D()(x)
x = layers.Dropout(0.3)(x)

x = layers.Flatten()(x)
x = layers.Dense(256)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)

output_tensor = layers.Dense(12)(x)

model_wav_skip = tf.keras.Model(input_tensor, output_tensor)

model_wav_skip.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 130, 126, 1) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 130, 126, 32) 320         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 130, 126, 32) 9248        conv2d[0][0]                     
__________________________________________________________________________________________________
max_pooling2d (MaxPooling2D)    (None, 65, 63, 32)   0           conv2d_1[0][0]                   
______________________________________________________________________________________________

# 모델 생성하기
Conv1D -> Conv2D로 바꾸기 위해서는 필터의 크기만 조절해주면 됬었다. 한번에 1차원, 길이 n의 필터를 보다가 필터를 2차원으로 만들되 크기가 n이면 된다. 

In [15]:
optimizer=tf.keras.optimizers.Adam(1e-4)
model_wav_skip.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             optimizer=optimizer,
             metrics=['accuracy'])
print("✅")

✅


In [16]:
checkpoint_dir = os.getenv('HOME')+'/aiffel/speech_recognition/models/wav_skip'

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_dir,
                                                 save_weights_only=True,
                                                 monitor='val_loss',
                                                 mode='auto',
                                                 save_best_only=True,
                                                 verbose=1)
print("✅")

✅


In [17]:
print(train_dataset)

<BatchDataset shapes: ((None, 130, 126, 1), (None, 12)), types: (tf.float32, tf.float32)>


In [20]:
#30분 내외 소요 (메모리 사용량에 주의해 주세요.)
history_wav = model_wav_skip.fit(train_dataset, epochs=10,
                    steps_per_epoch=len(train_wav) // batch_size,
                    validation_data=test_dataset,
                    validation_steps=len(test_wav) // batch_size,
                    callbacks=[cp_callback]
                    )
print("✅")

ERROR! Session/line number was not unique in database. History logging moved to new session 87
Train for 562 steps, validate for 62 steps
Epoch 1/10
Epoch 00001: val_loss improved from 0.16189 to 0.15659, saving model to /home/aiffel-dj15/aiffel/speech_recognition/models/wav_skip
Epoch 2/10
Epoch 00002: val_loss did not improve from 0.15659
Epoch 3/10
Epoch 00003: val_loss did not improve from 0.15659
Epoch 4/10
Epoch 00004: val_loss improved from 0.15659 to 0.14964, saving model to /home/aiffel-dj15/aiffel/speech_recognition/models/wav_skip
Epoch 5/10
Epoch 00005: val_loss did not improve from 0.14964
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.14964
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.14964
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.14964
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.14964
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.14964
✅


# 모델 학습시키기
여기서 애를 많이 먹었다. 계속 입력데이터의 차원과 모델이 원하는 데이터의 차원이 달라서 애를 먹었는데, GPU초기화를 해주니까 말끔히 사라졌다. 이게 바로 첫번째 cell에 소리소문없이 있던 gpu 초기화 코드가 하는 일이다. 10회만 학습시켰는데 정확도가 의외로 높게 나왔고, 사실 9회부터는 오버피팅이 진행되었다. 

# 회고
파이썬에 익숙하지 않아서 모델을 짜는 것도 일이었지만, 차원을 맞추고 데이터를 가공하는데에 더 애를 먹었다. 진짜로! 해당 노드에서 얻은 것은 skip connection이 무엇인지에 대해서 알게 되고, Conv1D->conv2D로 바꾸는 과정을 직접 해보면서 데이터의 학습 과정을 따라가는 경험을 해본 것이 의미있었다. 