In [None]:
import os
import numpy as np
import librosa
from scipy import signal
from pydub import AudioSegment
from pydub.silence import split_on_silence
import matplotlib.pyplot as plt
import librosa.display
import re

In [None]:
# Target labels: ten command words followed by the 'silence' and 'unknown' buckets.
labels = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]

In [None]:
# Load a wave file (sr = 44100 by default) and normalise its amplitude.
def call_audio_librosa(path, sr = 44100):
    """Load an audio file with librosa and normalise it.

    Args:
        path: Location of the audio file, forwarded to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load`` (default 44100).

    Returns:
        A ``(samples, sampling_rate)`` tuple; ``samples`` has been passed
        through ``audio_regul``.
    """
    samples, rate = librosa.load(path, sr=sr)
    return (audio_regul(samples), rate)

# Amplitude normalisation.
def audio_regul(y):
    """Return ``y`` rescaled by ``librosa.util.normalize`` with its default arguments."""
    return librosa.util.normalize(y)

# Denoise a signal that has already been loaded with librosa.
def denoise(y):
    """Filter a waveform with ``scipy.signal.wiener`` in the STFT domain.

    Args:
        y: Audio samples in a form accepted by ``librosa.stft``.

    Returns:
        The waveform rebuilt by ``librosa.istft`` from the filtered STFT.
    """
    # NOTE(review): the Wiener filter is applied to the complex STFT matrix
    # itself (not its magnitude) — confirm this is intended.
    spectrum = librosa.stft(y)
    return librosa.istft(signal.wiener(spectrum))

# Load a wav file from `path` and return it already denoised.
def denoise_path(path):
    """Load the audio file at ``path`` and return its denoised waveform.

    Args:
        path: Location of the audio file, forwarded to ``call_audio_librosa``.

    Returns:
        The output of ``denoise`` applied to the loaded samples.
    """
    # call_audio_librosa returns a (samples, sampling_rate) tuple; only the
    # samples may be handed to the STFT, so the tuple must be unpacked first.
    y, _sr = call_audio_librosa(path)
    return denoise(y)

In [None]:
# Root directory of the raw audio; its entries are listed as class folders further down.
main_path = 'D:\\kaggle\\audio'
# Show the first three directory entries as the cell output.
os.listdir(main_path)[:3]

# class 별 데이터 수 

In [None]:
main_path = 'D:\\kaggle\\audio'
# Every entry directly under main_path is treated as one class folder.
classnames = os.listdir(main_path)

# Map each class name to the number of entries in its own folder.
# This is a per-class count, not a running total over the classes seen so far.
train_count_dict = {
    d: len(os.listdir(os.path.join(main_path, d)))
    for d in classnames
}
train_count_dict

In [None]:
output_path = 'D:\\kaggle\\train_image'
# Create one output folder per class. makedirs(..., exist_ok=True) also creates
# output_path itself when it is missing and lets this cell be re-run without
# failing on folders that already exist.
for class_name in classnames:
    os.makedirs(os.path.join(output_path, class_name), exist_ok=True)

# train_image 폴더를 만들고 word폴더를 만든 뒤, 이미지 생성

input_path = 'D:\\kaggle\\train'
output_path = 'D:\\kaggle\\train_image'

# Render the MFCC of every training clip as a PNG under output_path/<word>/.
for word in classnames:
    print(word)
    word_path = os.path.join(input_path, word)

    for audio in os.listdir(word_path):
        # Clips are loaded at 16 kHz, not 44.1 kHz.
        y, sr = librosa.load(os.path.join(word_path, audio), sr=16000)

        fig = plt.figure(figsize=(6, 4))
        MFCC = librosa.feature.mfcc(y=y, sr=sr)
        librosa.display.specshow(MFCC)

        # splitext removes only the final extension, so a file name that
        # contains extra dots keeps its full stem instead of being cut at
        # the first dot (which could make two clips overwrite each other).
        stem = os.path.splitext(audio)[0]
        save_dir = os.path.join(output_path, word, stem + ".png")
        fig.savefig(save_dir, bbox_inches='tight', pad_inches=0)
        # Close the figure so figures do not accumulate in memory over the loop.
        plt.close(fig)

In [None]:
input_path = 'D:\\kaggle\\train'
output_path = 'D:\\kaggle\\train_image'

# Render the MFCC of every training clip as a PNG under output_path/<word>/.
# NOTE(review): this cell performs the same export as the loop before it;
# one of the two can probably be removed.
for word in classnames:
    print(word)
    word_path = os.path.join(input_path, word)

    for audio in os.listdir(word_path):
        # Clips are loaded at 16 kHz, not 44.1 kHz.
        y, sr = librosa.load(os.path.join(word_path, audio), sr=16000)
        MFCC = librosa.feature.mfcc(y=y, sr=sr)

        fig = plt.figure(figsize=(6, 4))
        librosa.display.specshow(MFCC)

        # splitext removes only the final extension, so a file name that
        # contains extra dots keeps its full stem instead of being cut at
        # the first dot.
        stem = os.path.splitext(audio)[0]
        save_dir = os.path.join(output_path, word, stem + ".png")
        fig.savefig(save_dir, bbox_inches='tight', pad_inches=0)
        # Close the figure so figures do not accumulate in memory over the loop.
        plt.close(fig)

In [None]:
output_path = 'D:\\kaggle\\test_image'
# Create one output folder per class. makedirs(..., exist_ok=True) also creates
# output_path itself when it is missing and lets this cell be re-run without
# failing on folders that already exist.
for class_name in classnames:
    os.makedirs(os.path.join(output_path, class_name), exist_ok=True)

In [None]:
input_path = 'D:\\kaggle\\test'
output_path = 'D:\\kaggle\\test_image'

# Render the MFCC of every held-out clip as a PNG under output_path/<word>/.
for word in classnames:
    print(word)
    word_path = os.path.join(input_path, word)

    for audio in os.listdir(word_path):
        # Clips are loaded at 16 kHz, not 44.1 kHz.
        y, sr = librosa.load(os.path.join(word_path, audio), sr=16000)

        fig = plt.figure(figsize=(6, 4))
        MFCC = librosa.feature.mfcc(y=y, sr=sr)
        librosa.display.specshow(MFCC)

        # splitext removes only the final extension, so a file name that
        # contains extra dots keeps its full stem instead of being cut at
        # the first dot.
        stem = os.path.splitext(audio)[0]
        save_dir = os.path.join(output_path, word, stem + ".png")
        fig.savefig(save_dir, bbox_inches='tight', pad_inches=0)
        # Close the figure so figures do not accumulate in memory over the loop.
        plt.close(fig)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import load_model
%matplotlib inline
import matplotlib.pyplot as plt

## 이미지를 train 과 test로 나누는 코드 필요

In [None]:
# Dataset locations. Every backslash is escaped explicitly: a bare '\k' is not a
# valid escape sequence and only yields the intended path because Python keeps
# unknown escapes verbatim (while warning about them on newer versions).
raw_path = 'D:\\kaggle\\audio'
train_path = 'D:\\kaggle\\train_image'
test_path = 'D:\\kaggle\\test_image'

In [None]:
# Training images are only multiplied by 1/255; no augmentation is configured.
train_datagen = ImageDataGenerator(rescale=1./255)

# Stream labelled batches of 64 images from the per-class folders under train_path.
# NOTE(review): Keras documents target_size as (height, width) — confirm that
# (341, 224) matches the orientation of the saved PNGs.
train_generator = train_datagen.flow_from_directory(
        train_path,
        target_size=(341, 224),
        batch_size=64,
        class_mode='categorical')
# Original note: "with 65744 images and a batch size of 20, 3287 steps are needed to finish one epoch".
# NOTE(review): batch_size is 64 here, so that figure is stale (65744 / 64 is about 1028 steps).

In [None]:
# Validation images are only multiplied by 1/255, matching the training generator.
test_datagen = ImageDataGenerator(rescale=1./255)

# Stream labelled batches of 30 images from test_path; shuffle=False keeps the
# file order fixed, which the later comparison against test_generator.classes relies on.
test_generator = test_datagen.flow_from_directory(
        test_path,
        target_size=(341, 224),    
        batch_size=30,
        class_mode='categorical',
shuffle=False)

In [None]:
# Baseline CNN: four 3x3 ELU convolutions (8/16/32/64 filters), 3x3 max-pooling
# after the 2nd, 3rd and 4th, then a 128 -> dropout(0.5) -> 64 dense head and a
# softmax with one unit per training class.
model = Sequential([
    Conv2D(8, kernel_size=(3, 3), activation='elu', input_shape=(341, 224, 3),
           kernel_initializer='glorot_normal'),
    Conv2D(filters=16, kernel_size=(3, 3), strides=1, activation='elu',
           kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(32, (3, 3), activation='elu', strides=1, kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(64, (3, 3), activation='elu', strides=1, kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Flatten(),
    Dense(128, activation='elu', kernel_initializer='glorot_normal'),
    Dropout(0.5),
    Dense(64, activation='elu', kernel_initializer='glorot_normal'),
    Dense(len(train_generator.class_indices), activation='softmax'),
])
model.summary()

In [None]:
# Variant 2: same layout as the baseline but with 5x5 ReLU convolutions.
model2 = Sequential([
    Conv2D(8, kernel_size=(5, 5), activation='relu', input_shape=(341, 224, 3),
           kernel_initializer='glorot_normal'),
    Conv2D(filters=16, kernel_size=(5, 5), strides=1, activation='relu',
           kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(32, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(64, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Flatten(),
    Dense(128, activation='relu', kernel_initializer='glorot_normal'),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_initializer='glorot_normal'),
    Dense(len(train_generator.class_indices), activation='softmax'),
])
model2.summary()

In [None]:
# Variant 3: 5x5 ReLU convolutions with max-pooling after each of the first
# three (none after the last), and a lighter dropout of 0.3 in the dense head.
model3 = Sequential([
    Conv2D(8, kernel_size=(5, 5), activation='relu', input_shape=(341, 224, 3),
           kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(filters=16, kernel_size=(5, 5), strides=1, activation='relu',
           kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(32, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(64, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    Flatten(),
    Dense(128, activation='relu', kernel_initializer='glorot_normal'),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_initializer='glorot_normal'),
    Dense(len(train_generator.class_indices), activation='softmax'),
])
model3.summary()

In [None]:
# Variant 4: the 5x5 ReLU layout of model2 with an extra 128-filter convolution
# appended after the last pooling layer.
model4 = Sequential([
    Conv2D(8, kernel_size=(5, 5), activation='relu', input_shape=(341, 224, 3),
           kernel_initializer='glorot_normal'),
    Conv2D(filters=16, kernel_size=(5, 5), strides=1, activation='relu',
           kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(32, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(64, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(128, (5, 5), activation='relu', kernel_initializer='glorot_normal'),
    Flatten(),
    Dense(128, activation='relu', kernel_initializer='glorot_normal'),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_initializer='glorot_normal'),
    Dense(len(train_generator.class_indices), activation='softmax'),
])
model4.summary()

In [None]:
# Variant 5: the layout of model4 with 3x3 kernels instead of 5x5.
model5 = Sequential([
    Conv2D(8, kernel_size=(3, 3), activation='relu', input_shape=(341, 224, 3),
           kernel_initializer='glorot_normal'),
    Conv2D(filters=16, kernel_size=(3, 3), strides=1, activation='relu',
           kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(32, (3, 3), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(64, (3, 3), activation='relu', kernel_initializer='glorot_normal'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(128, (3, 3), activation='relu', kernel_initializer='glorot_normal'),
    Flatten(),
    Dense(128, activation='relu', kernel_initializer='glorot_normal'),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_initializer='glorot_normal'),
    Dense(len(train_generator.class_indices), activation='softmax'),
])
model5.summary()

In [None]:
# All five candidate networks share the same loss, optimiser and metric.
for network in (model, model2, model3, model4, model5):
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the baseline network (`model`).
# steps_per_epoch=500 draws 500 batches from train_generator per epoch.
# validation_steps=len(test_generator) scores every validation batch: the
# validation generator is unshuffled, so a fixed 25 steps only evaluated a
# 25 x 30 = 750-image slice of consecutive files rather than the whole set.
hist = model.fit_generator(
        train_generator,
        steps_per_epoch=500,
        epochs=100,
        validation_data=test_generator,
        validation_steps=len(test_generator))

In [None]:
# Train model2.
# steps_per_epoch=500 draws 500 batches from train_generator per epoch.
# validation_steps=len(test_generator) scores every validation batch: the
# validation generator is unshuffled, so a fixed 25 steps only evaluated a
# 25 x 30 = 750-image slice of consecutive files rather than the whole set.
hist2 = model2.fit_generator(
        train_generator,
        steps_per_epoch=500,
        epochs=100,
        validation_data=test_generator,
        validation_steps=len(test_generator))

In [None]:
# Train model3.
# steps_per_epoch=500 draws 500 batches from train_generator per epoch.
# validation_steps=len(test_generator) scores every validation batch: the
# validation generator is unshuffled, so a fixed 25 steps only evaluated a
# 25 x 30 = 750-image slice of consecutive files rather than the whole set.
hist3 = model3.fit_generator(
        train_generator,
        steps_per_epoch=500,
        epochs=100,
        validation_data=test_generator,
        validation_steps=len(test_generator))

In [None]:
# Train model4.
# steps_per_epoch=500 draws 500 batches from train_generator per epoch.
# validation_steps=len(test_generator) scores every validation batch: the
# validation generator is unshuffled, so a fixed 25 steps only evaluated a
# 25 x 30 = 750-image slice of consecutive files rather than the whole set.
hist4 = model4.fit_generator(
        train_generator,
        steps_per_epoch=500,
        epochs=100,
        validation_data=test_generator,
        validation_steps=len(test_generator))

In [None]:
# Train model5.
# steps_per_epoch=500 draws 500 batches from train_generator per epoch.
# validation_steps=len(test_generator) scores every validation batch: the
# validation generator is unshuffled, so a fixed 25 steps only evaluated a
# 25 x 30 = 750-image slice of consecutive files rather than the whole set.
hist5 = model5.fit_generator(
        train_generator,
        steps_per_epoch=500,
        epochs=100,
        validation_data=test_generator,
        validation_steps=len(test_generator))

In [None]:
# Persist the first four trained networks (model5 is saved separately).
for network, checkpoint_name in (
        (model, 'CNN1_2.h5'),
        (model2, 'CNN2_2.h5'),
        (model3, 'CNN3_2.h5'),
        (model4, 'CNN4_2.h5')):
    network.save(checkpoint_name)

In [None]:
# Persist the fifth network to its own checkpoint file.
model5.save('CNN5_2.h5')

In [None]:
# Reload four networks from disk.
# NOTE(review): these read "CNN<n>.h5", whereas the save cells write "CNN<n>_2.h5" —
# confirm which checkpoints are meant to be used for prediction.
# NOTE(review): this rebinds model2..model4 and introduces `model1` (the first
# network is called `model` when it is built and trained); model5 is not reloaded.
model1 = load_model("CNN1.h5")
model2 = load_model("CNN2.h5")
model3 = load_model("CNN3.h5")
model4 = load_model("CNN4.h5")

In [None]:
def loss_and_acc_plot(hist):
    """Plot loss and accuracy curves of a training run on twin y-axes.

    Loss is drawn against the left axis and accuracy against the right axis.
    Validation curves are drawn only when the history contains them.

    Args:
        hist: Object exposing a ``history`` dict of per-epoch metric lists,
            such as the value returned by ``fit_generator``.

    Returns:
        The result of ``plt.show()``.

    Raises:
        KeyError: If the history has no 'loss' entry, or has neither an 'acc'
            nor an 'accuracy' entry.
    """
    history = hist.history
    # Depending on the Keras version the accuracy metric is recorded as
    # 'acc' or 'accuracy'; accept either spelling.
    acc_key = 'acc' if 'acc' in history else 'accuracy'
    val_acc_key = 'val_' + acc_key

    fig, loss_ax = plt.subplots()
    acc_ax = loss_ax.twinx()

    loss_ax.plot(history['loss'], 'y', label='train loss')
    if 'val_loss' in history:
        loss_ax.plot(history['val_loss'], 'r', label='val loss')

    acc_ax.plot(history[acc_key], 'b', label='train acc')
    if val_acc_key in history:
        acc_ax.plot(history[val_acc_key], 'g', label='val acc')

    loss_ax.set_xlabel('epoch')
    loss_ax.set_ylabel('loss')
    acc_ax.set_ylabel('accuracy')

    loss_ax.legend(loc='upper left')
    acc_ax.legend(loc='lower left')

    return plt.show()

In [None]:
# Training curves of the baseline network (`model`).
loss_and_acc_plot(hist)

In [None]:
# Training curves of model2.
loss_and_acc_plot(hist2)

In [None]:
# Training curves of model3.
loss_and_acc_plot(hist3)

In [None]:
# Training curves of model4.
loss_and_acc_plot(hist4)

In [None]:
# Training curves of model5.
loss_and_acc_plot(hist5)

In [None]:
# Usage: model_ensemble(prob1, prob2, acc_list=[acc1, acc2])
def model_ensemble(*prob, acc_list, true_labels=None):
    """Combine per-model class probabilities with accuracy-rank weights.

    The models are ranked by their accuracy in ``acc_list``: the least accurate
    model gets weight 1 and the most accurate gets weight ``len(prob)``. Ties
    are broken in favour of the model passed later. Each probability matrix is
    multiplied by its weight, rounded to 5 decimals, and the results are summed.

    Args:
        *prob: One 2-D array-like of shape (samples, classes) per model.
        acc_list: One accuracy value per model, in the same order as ``prob``.
        true_labels: Optional 1-D sequence of true class indices used for the
            printed accuracy. Defaults to ``test_generator.classes``.

    Returns:
        The weighted sum of the probability matrices as a numpy array.

    Raises:
        ValueError: If no probabilities are given or ``acc_list`` does not
            contain exactly one accuracy per model.
    """
    if not prob:
        raise ValueError("at least one probability matrix is required")
    if len(prob) != len(acc_list):
        raise ValueError("acc_list must contain one accuracy per model")

    # Ascending order of (accuracy, position): the position in this ordering,
    # not the model's position in the argument list, determines the weight.
    ascending = sorted(range(len(acc_list)), key=lambda idx: (acc_list[idx], idx))
    weight_of = {idx: rank + 1 for rank, idx in enumerate(ascending)}

    # np.round works element-wise on whole matrices; the built-in round()
    # cannot be applied to a row of probabilities.
    weighted = [
        np.round(np.asarray(p, dtype=float) * weight_of[idx], 5)
        for idx, p in enumerate(prob)
    ]
    final_prob = np.sum(weighted, axis=0)

    if true_labels is None:
        true_labels = test_generator.classes
    final_score = np.mean(np.equal(np.argmax(final_prob, axis=1), true_labels))
    print('Final val accuracy : %4f' % final_score)

    return final_prob

In [None]:
# Folder read by the export loop below, and the folder intended for the rendered images.
input_path= 'D:\\kaggle\\test_set'
output_path = 'D:\\kaggle\\test_image_kaggle\\test_image'

In [None]:
# NOTE(review): this overrides the output_path set in the previous cell, so the
# export loop writes to ...\temp while the prediction generator reads
# ...\test_image — confirm which folder is intended.
output_path = 'D:\\kaggle\\test_image_kaggle\\temp'

In [None]:
# Render the MFCC of every clip in input_path as a PNG directly under output_path.
# The output folder is created on demand so a fresh run does not fail in savefig.
os.makedirs(output_path, exist_ok=True)

for audio in os.listdir(input_path):
    # Clips are loaded at 16 kHz, not 44.1 kHz.
    y, sr = librosa.load(os.path.join(input_path, audio), sr=16000)

    fig = plt.figure(figsize=(6, 4))
    MFCC = librosa.feature.mfcc(y=y, sr=sr)
    librosa.display.specshow(MFCC)

    # splitext removes only the final extension, so a file name that contains
    # extra dots keeps its full stem instead of being cut at the first dot.
    stem = os.path.splitext(audio)[0]
    save_dir = os.path.join(output_path, stem + ".png")
    fig.savefig(save_dir, bbox_inches='tight', pad_inches=0)
    # Close the figure so figures do not accumulate in memory over the loop.
    plt.close(fig)

In [None]:
# Rebind test_path / test_generator to the rendered submission images.
# From here on test_generator no longer refers to the validation set.
test_path = 'D:\\kaggle\\test_image_kaggle\\test_image'
test_datagen = ImageDataGenerator(rescale=1./255)

# shuffle=False keeps predictions aligned with test_generator.filenames.
# NOTE(review): flow_from_directory presumably expects the images inside at
# least one sub-folder of test_path — confirm the folder layout.
test_generator = test_datagen.flow_from_directory(
        test_path,
        target_size=(341, 224),    
        batch_size=100,
        class_mode='categorical',
shuffle=False)

In [None]:
# Class-probability matrix of each reloaded network over the whole test generator.
prob1, prob2, prob3, prob4 = [
    network.predict_generator(test_generator)
    for network in (model1, model2, model3, model4)
]

In [None]:
# Unweighted soft vote: add the four probability matrices and pick the
# highest-scoring class index for every sample.
prob = prob1 + prob2 + prob3 + prob4
# A single arg-max over the class axis already yields one index per sample;
# `result` is 1-D afterwards, so it must not be reduced over axis=1 again.
result = np.argmax(prob, axis=1)

In [None]:
# Class index (0-29) -> submission label. Every index starts as "unknown";
# the ten indices listed below are then overwritten with their command word.
speech = {index: "unknown" for index in range(30)}
speech.update({
    4: "down",
    8: "go",
    11: "left",
    14: "no",
    15: "off",
    16: "on",
    18: "right",
    22: "stop",
    26: "up",
    28: "yes",
})

In [None]:
import csv

# Write the submission file: a header row, then one (fname, label) row per prediction.
# The with-block guarantees the file is flushed and closed even if a clip fails to load.
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
    wr = csv.writer(f)
    wr.writerow(["fname", "label"])

    for i in range(len(result)):
        # Lightweight progress indicator.
        if i % 1000 == 0:
            print(i)

        # Take the image's base name whatever path separator the generator
        # used, and swap its extension back to the original .wav name.
        stem = os.path.splitext(os.path.basename(test_generator.filenames[i]))[0]
        filename = stem + ".wav"
        path = os.path.join('D:\\kaggle\\test_set', filename)
        y, sr = librosa.load(path, sr=16000)

        # A clip whose samples are all zero is labelled "silence". np.any checks
        # every sample, so positive and negative samples that happen to cancel
        # out in a sum are not mistaken for silence.
        if not np.any(y):
            wr.writerow([filename, "silence"])
        else:
            wr.writerow([filename, speech[result[i]]])