In [1]:
#path 관련 라이브러리
import os
from os.path import isdir, join
from pathlib import Path

# Scientific Math 라이브러리  
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

# Visualization 라이브러리
import matplotlib.pyplot as plt
import tensorflow as tf

import IPython.display as ipd
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

%matplotlib inline

In [2]:
train_audio_path = 'DataSet/train/audio'

In [3]:
filename = '/no/012c8314_nohash_0.wav'
sample_rate, samples = wavfile.read(train_audio_path + filename)
ipd.Audio(samples, rate= sample_rate)

In [4]:
def log_specgram(audio, sample_rate, window_size=20, step_size = 10,
                eps = 1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                           fs = sample_rate,
                                           window='hann',
                                           nperseg = nperseg,
                                           noverlap=noverlap,
                                           detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [None]:
freqs, times, spectrogram = log_specgram(samples, sample_rate)
freqs_size = len(freqs)
times_size = len(times)

In [None]:
dirs = [f for f in os.listdir(train_audio_path) if isdir(join(train_audio_path, f))]
dirs.sort()
print('Number of labels: ' + str(len(dirs[1:])))
#라벨이 몇개 있는지 확인해보자.

print(dirs[1:])
al =[]
spec_all = []
target_all = []
target_value = {}
i=0;
for direct in dirs[1:]:
    waves = [f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav')]
    target_value[direct] = i
    i = i + 1
    print(str(i)+":" +str(direct) + " ", end="")
    for wav in waves:
        #파일을 읽어다가.
        sample_rate, samples = wavfile.read(train_audio_path+'/'+direct+'/'+wav)
#         print(sample_rate)
        #The sample_rate that we want to deal with is 16000(Voice)
        #BG is 72000
        if samples.shape[0] != 16000 :
            continue
        target_all.append(direct)
        #스펙트로그램도 그려보자.
        freqs, times, spec = log_specgram(samples,sample_rate)
        spec = (spec - spec.min())/(spec.max() - spec.min())
        #정규화 안해주면 로그스케일로 내린 값이 음수여서 relu통과할때 0 이된다 
        #스펙트로 그램도 spec_all 이라는 리스트에 더해주고
#         spec_all.append(np.reshape(spec,(freqs_size,times_size)))
        al.append([np.reshape(spec,(freqs_size,times_size)), direct])

Number of labels: 30
['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']
1:bed 2:bird 


invalid value encountered in true_divide



3:cat 

In [None]:
# 데이터를 섞습니다.
np.random.shuffle(al)
# 데이터를 스펙트로그램 데이터와 정답을 가지고 있는 라벨로 나눕니다.
spec_all = np.reshape(np.delete(al,1,1),(len(al)))
target_all = [i for i in np.delete(al,0,1).tolist()]

In [None]:
# 훈련 데이터에 대한 80% 비율의 인덱스를 가져옵니다.
train_indices = np.random.choice(len(target_all),
                                 round(len(target_all) * 0.8), replace=False)
# 훈련 데이터에 대한 80% 비율의 인덱스를 제외한 20%의 테스트 데이터의 인덱스를 가져옵니다.
test_indices = np.array(list(set(range(len(target_all)))
                                 - set(train_indices)))
#데이터들을 다루기 쉽게 정리합니다.
spec_vals = np.array([x for x in spec_all])
target_vals = np.array([x for x in target_all])

#데이터들을 훈련, 테스트에 맞게 변수에 할당합니다.
train_spec = spec_vals[train_indices][:]
train_target = target_vals[train_indices][:]
test_spec = spec_vals[test_indices][:]
test_target = target_vals[test_indices][:]

In [None]:
def cnn_model(input_images, batch_size,drop_out_rate=0.1, is_training=False, train_logical=True):
    def truncated_normal_var(name, shape, dtype):
        return(tf.get_variable(name=name, shape=shape, dtype=dtype, initializer = tf.truncated_normal_initializer(stddev=0.05)))

    def zero_var(name, shape, dtype):
        return(tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.constant_initializer(0.0)))
    
    with tf.variable_scope('conv1') as scope:
        conv1_kernel = truncated_normal_var(name='conv_kernel1',shape=[5,5,1,4], dtype= tf.float32)
        conv1 = tf.nn.conv2d(input_images, conv1_kernel, [1, 1, 1, 1], padding='SAME')
        conv1_bias = truncated_normal_var(name='conv_bias1', shape=[4], dtype=tf.float32)
        conv1_add_bias = tf.nn.bias_add(conv1,conv1_bias)
        norm1 = tf.layers.batch_normalization(conv1_add_bias, training=is_training, trainable = True, name='norm1')
        conv1_out = tf.nn.relu(norm1)
    
    with tf.variable_scope('conv2') as scope:
        conv2_kernel = truncated_normal_var(name='conv_kernel2', shape=[5, 5, 4, 8], dtype=tf.float32)
        conv2 = tf.nn.conv2d(conv1_out, conv2_kernel, [1, 1, 1, 1], padding='SAME')
        conv2_bias = truncated_normal_var(name='conv_bias2', shape=[8], dtype=tf.float32)
        conv2_add_bias = tf.nn.bias_add(conv2, conv2_bias)
        norm2 = tf.layers.batch_normalization(conv2_add_bias, training=is_training, trainable = True, name='norm2')
        conv2_out = tf.nn.relu(norm2)
    
    with tf.variable_scope('conv3') as scope:
        conv3_kernel = truncated_normal_var(name='conv_kernel3', shape=[5, 5, 8, 16], dtype=tf.float32)
        conv3 = tf.nn.conv2d(conv2_out, conv3_kernel, [1, 1, 1, 1], padding='SAME')
        conv3_bias = truncated_normal_var(name='conv_bias3', shape=[16], dtype=tf.float32)
        conv3_add_bias = tf.nn.bias_add(conv3, conv3_bias)
        norm3 = tf.layers.batch_normalization(conv3_add_bias, training=is_training, trainable = True, name='norm3')
        conv3_out = tf.nn.relu(norm3)

    with tf.variable_scope('conv4') as scope:
        conv4_kernel = truncated_normal_var(name='conv_kernel4', shape=[5, 5, 16, 32], dtype=tf.float32)
        conv4 = tf.nn.conv2d(conv3_out, conv4_kernel, [1, 1, 1, 1], padding='SAME')
        conv4_bias = truncated_normal_var(name='conv_bias4', shape=[32], dtype=tf.float32)
        conv4_add_bias = tf.nn.bias_add(conv4, conv4_bias)
        norm4 = tf.layers.batch_normalization(conv4_add_bias, training=is_training, trainable = True, name='norm4')
        conv4_out = tf.nn.relu(norm4)

    with tf.variable_scope('conv5') as scope:
        conv5_kernel = truncated_normal_var(name='conv_kernel5', shape=[3, 3, 32, 64], dtype=tf.float32)
        conv5 = tf.nn.conv2d(conv4_out, conv5_kernel, [1, 2, 2, 1], padding='SAME')
        conv5_bias = truncated_normal_var(name='conv_bias5', shape=[64], dtype=tf.float32)
        conv5_add_bias = tf.nn.bias_add(conv5, conv5_bias)
        norm5 = tf.layers.batch_normalization(conv5_add_bias, training=is_training, trainable = True, name='norm5')
        conv5_out = tf.nn.relu(norm5)

    with tf.variable_scope('conv6') as scope:
        conv6_kernel = truncated_normal_var(name='conv_kernel6', shape=[3, 3, 64, 64], dtype=tf.float32)
        conv6 = tf.nn.conv2d(conv5_out, conv6_kernel, [1, 2, 2, 1], padding='SAME')
        conv6_bias = truncated_normal_var(name='conv_bias6', shape=[64], dtype=tf.float32)
        conv6_add_bias = tf.nn.bias_add(conv6, conv6_bias)
        norm6 = tf.layers.batch_normalization(conv6_add_bias, training=is_training, trainable = True, name='norm6')
        conv6_out = tf.nn.relu(norm6)
        
    pool = tf.nn.max_pool(conv6_out, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding='SAME', name='pool_layer')
    reshaped_output = tf.reshape(pool, [batch_size, -1])
    reshaped_dim = reshaped_output.get_shape()[1].value

    with tf.variable_scope('dense') as scope:
        nn = tf.layers.dense(reshaped_output, 1024, activation=tf.nn.relu)
        nn = tf.layers.dense(nn, 256, activation=tf.nn.relu)
        nn = tf.layers.dense(nn, 128, activation=tf.nn.relu)
        nn = tf.layers.dense(nn, 64, activation=tf.nn.relu)
        nn = tf.layers.dense(nn, len(target_value), activation=tf.nn.sigmoid)

    return nn

In [None]:
# Parameters
lr = 0.0003
generations = 20000
num_gens_to_wait = 250
drop_out_rate = 0.00
batch_size = 32

In [None]:
# 레이블을 네트워크가 이해할 수 있는 숫자형태로 변경
temp = []
for v in train_target:
    temp.append(target_value[v[0]])
train_target = np.array(temp)

temp = []
for v in test_target:
    temp.append(target_value[v[0]])
test_target = np.array(temp)

In [None]:
#네트워크에 사용할 placeholder 정의
x_input_shape = (batch_size, freqs_size,times_size,1)
y_input_shape = (batch_size, )
x_input = tf.placeholder(tf.float32, shape=x_input_shape)
y_target = tf.placeholder(tf.int32, shape=y_input_shape)
eval_input_shape = (batch_size, freqs_size, times_size,1)
eval_input = tf.placeholder(tf.float32, shape=eval_input_shape)
eval_target = tf.placeholder(tf.int32, shape=y_input_shape)

In [None]:
config = tf.ConfigProto()
sess = tf.Session(config=config)
with tf.variable_scope('scope',reuse=tf.AUTO_REUSE) as scope:
    model_output = cnn_model(x_input,batch_size,drop_out_rate=drop_out_rate, is_training=True)
    test_model_output = cnn_model(eval_input,batch_size)


In [None]:
targets = tf.squeeze(tf.cast(y_target,tf.int32))

loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model_output, labels=y_target))
prediction = tf.nn.softmax(model_output)
test_prediction = tf.nn.softmax(test_model_output)

my_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

train_step = my_optimizer.minimize(loss)

init = tf.global_variables_initializer()
sess.run(init)

In [None]:
# Define def to calculate accuracy
def get_accuracy(logits, targets):
    batch_predictions = np.argmax(logits, axis=1)
    num_correct = np.sum(np.equal(batch_predictions, targets))
    return (100. * num_correct)/ batch_predictions.shape[0]


In [None]:
print('Train_Spectrogram Demension : ' + str(np.shape(train_spec)))

In [None]:
print('Train_Label Demension : ' + str(np.shape(train_target)))

In [None]:
print('Test_Spectrogram Demension : ' + str(np.shape(test_spec)))

In [None]:
print('Test_Label Demension : ' + str(np.shape(test_target)))

In [None]:
print('Number Of Labels : ' + str(len(target_value)))

In [None]:
train_loss = []
train_acc = []
test_acc = []
for i in range(generations):
    rand_index = np.random.choice(len(train_spec),size = batch_size)
    rand_x = train_spec[rand_index]
    rand_x = np.expand_dims(rand_x, -1)
    rand_y = train_target[rand_index]

    sess.run(train_step,feed_dict = {x_input: rand_x, y_target: rand_y})
    temp_train_loss, temp_train_preds = sess.run([loss, prediction], feed_dict={x_input: rand_x, y_target: rand_y})
    temp_train_acc = get_accuracy(temp_train_preds, rand_y)
    
    # logging temp result
    if (i + 1) % 50 == 0:
        eval_index = np.random.choice(len(test_spec), size=batch_size)
        eval_x = test_spec[eval_index]
        eval_x = np.expand_dims(eval_x, -1)
        eval_y = test_target[eval_index]

        test_preds = sess.run(test_prediction, feed_dict={eval_input: eval_x})
        temp_test_acc = get_accuracy(test_preds, eval_y)

        # Logging and Printing Results
        train_loss.append(temp_train_loss)
        train_acc.append(temp_train_acc)
        test_acc.append(temp_test_acc)
        acc_and_loss = [(i + 1), temp_train_loss, temp_train_acc,
                            temp_test_acc]
        acc_and_loss = [np.round(x, 10) for x in acc_and_loss]
        print('Generation # {}. Train Loss: {:.10f}. Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))