In [1]:
import tensorflow as tf
# List the devices TensorFlow can see; the recorded output shows one CPU and one GPU.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import pandas as pd
import numpy as np
import os
import shutil
from tqdm import tqdm
from glob import glob
import librosa
import warnings
warnings.filterwarnings("ignore")

import json

### 데이터 불러오기

In [14]:
# glob() gives no ordering guarantee, yet the audio list and the label list are
# paired purely by position further down, so both are sorted to make that
# pairing deterministic.
# NOTE(review): assumes the two directory trees mirror each other
# file-for-file - confirm.
# NOTE(review): the evaluation section of this notebook globs this same test/
# tree; confirm whether training was meant to read a separate train/ tree.
voice_paths = sorted(glob("./data/원천데이터/test/*/*/*/*.wav"))
labels_paths = sorted(glob("./data/라벨링데이터/test/*/*/*/*.json"))

### 데이터 전처리
librosa 라이브러리를 이용하여 wav파일을 전처리  
https://librosa.org/doc/latest/index.html  
librosa는 음악 및 오디오 분석용 파이썬 패키지

https://librosa.org/doc/latest/generated/librosa.feature.melspectrogram.html#librosa.feature.melspectrogram

In [15]:
def load_data(paths):
    """Decode every audio file in ``paths`` at a 16 kHz sampling rate.

    Parameters
    ----------
    paths : iterable of str
        Audio file paths; they are read in iteration order.

    Returns
    -------
    numpy.ndarray
        When every clip has the same number of samples, the clips stacked by
        ``np.array`` (unchanged behaviour). When the lengths differ, a 1-D
        ``dtype=object`` array whose elements are the individual waveforms.
    """
    # sr=16000 means every clip is resampled to 16,000 samples per second.
    waveforms = [librosa.load(path, sr = 16000)[0] for path in tqdm(paths)]

    # If memory is tight, a smaller dtype can be requested when stacking,
    # e.g. np.array(data, dtype = np.float32).
    if len({len(wave) for wave in waveforms}) <= 1:
        return np.array(waveforms)

    # np.array() on clips of different lengths raises ValueError on
    # NumPy >= 1.24, so the ragged container is filled element by element.
    ragged = np.empty(len(waveforms), dtype = object)
    for position, wave in enumerate(waveforms):
        ragged[position] = wave
    return ragged

In [16]:
def labeling(paths):
    """Turn each label JSON file into an integer class id.

    The class is decided from the ``Speaker`` entry of the JSON document:

    * ``0`` - ``Region == '00'`` and ``Dialect == '01'``
    * ``1`` - ``Region == '01'`` (the dialect is not inspected)
    * ``2`` - anything else

    Parameters
    ----------
    paths : iterable of str
        Paths of UTF-8 encoded JSON label files.

    Returns
    -------
    numpy.ndarray
        One class id per path, in input order.
    """
    def _class_of(speaker):
        # 'Dialect' is only read when the region is '00'.
        if speaker['Region'] == '00' and speaker['Dialect'] == '01':
            return 0
        if speaker['Region'] == '01':
            return 1
        return 2

    class_ids = []
    for json_path in tqdm(paths):
        with open(json_path, 'r', encoding="UTF-8") as handle:
            document = json.load(handle)
        class_ids.append(_class_of(document['Speaker']))
    return np.array(class_ids)

훈련데이터에 대해서

In [17]:
# Replace the list of label-file paths with the class ids parsed from those files.
labels_paths = labeling(labels_paths)

100%|██████████| 133928/133928 [09:22<00:00, 238.24it/s]


In [18]:
# Show which class ids occur among the parsed labels.
np.unique(labels_paths)

array([0, 1, 2])

데이터프레임으로 만들기

In [19]:
# One row per sample: the class id next to the audio file paired with it by position.
dataframe = pd.DataFrame(labels_paths, columns = ['type']).assign(file_path = voice_paths)

In [20]:
# Discard every row labelled 2 so that only classes 0 and 1 remain.
drop_index = dataframe.loc[dataframe['type'] == 2].index
dataframe = dataframe.drop(index = drop_index)
# Targets in the same row order as the remaining file paths.
train_y = dataframe['type'].to_numpy(copy = True)

In [38]:
# Check which class ids are left after the class-2 rows were dropped.
dataframe['type'].unique()

array([1, 0])

In [57]:
# Spot-check the stored path of one class-1 row. 129516 is an index label, so
# .loc raises KeyError if that row is absent or is not labelled 1.
dataframe[dataframe['type']==1]['file_path'].loc[129516]

'./data/원천데이터/test\\random\\2022-01-06\\3996\\C0691-3996M2111-106000_0-07970327.wav'

In [11]:
# Decode the audio of every remaining row, in dataframe order.
dataframevoice_paths = dataframe['file_path']
train_x = load_data(dataframevoice_paths)
# Disabled alternative: decode once, save the result to ./npy_data/train_npy,
# and use the decoded array as train_x.
#dataframevoice_paths = load_data(dataframevoice_paths)
#np.save("./npy_data/train_npy", dataframevoice_paths)
#train_x = dataframevoice_paths

100%|██████████| 191759/191759 [42:23<00:00, 75.39it/s]


훈련 데이터의 음성 길이 맞추기 및 mel-spectrogram 특징 추출

In [12]:
# The clips in this competition all have different lengths.
# The baseline crops every clip to the length of the shortest one.
def get_mini(data):
    """Return the length of the shortest item in ``data``.

    Parameters
    ----------
    data : iterable of sized objects
        Typically the decoded waveforms.

    Returns
    -------
    int
        The smallest ``len(item)``. For empty ``data`` the historical
        sentinel ``9999999`` is returned so existing callers see no change.
    """
    # min() has no upper bound, so clips longer than the old 9,999,999
    # sentinel are measured correctly instead of being reported as 9999999.
    return min((len(item) for item in data), default = 9999999)

# Bring every clip to a common length.
def set_length(data, d_mini):
    """Keep only the first ``d_mini`` samples of each item and stack the results.

    Parameters
    ----------
    data : iterable of sequences
        The clips to crop.
    d_mini : int
        Number of leading samples to keep from each clip.

    Returns
    -------
    numpy.ndarray
        The cropped clips collected with ``np.array``.
    """
    return np.array([clip[:d_mini] for clip in data])

# Build the model input features.
def get_feature(data, sr = 16000, n_fft = 256, win_length = 200, hop_length = 160, n_mels = 64):
    """Compute standardised log-mel spectrograms for a batch of clips.

    Parameters
    ----------
    data : iterable of 1-D arrays
        Waveforms; they are expected to share one length so the per-clip
        spectrograms can be stacked.
    sr : int
        Sampling rate of the waveforms.
    n_fft : int
        FFT size handed to librosa.
    win_length : int
        Size of each short frame the audio is cut into.
    hop_length : int
        Step between the starts of consecutive frames.
    n_mels : int
        Number of mel filters applied.

    Returns
    -------
    numpy.ndarray
        The stacked spectrograms, converted to dB relative to the maximum of
        the whole batch and then shifted/scaled by the batch mean and
        standard deviation. Those statistics come from ``data`` itself, so
        separate calls (for example train and test) are scaled independently.
    """
    mel = []
    for clip in tqdm(data):
        # The waveform is passed as y=...: librosa >= 0.10 no longer accepts
        # it positionally, and the keyword form also works on older releases.
        mel_ = librosa.feature.melspectrogram(y = clip, sr = sr, n_fft = n_fft,
                                              win_length = win_length,
                                              hop_length = hop_length,
                                              n_mels = n_mels)
        mel.append(mel_)
    mel = np.array(mel)
    mel = librosa.power_to_db(mel, ref = np.max)

    mel_mean = mel.mean()
    mel_std = mel.std()
    mel = (mel - mel_mean) / mel_std

    return mel

In [13]:
# Crop every training clip to the shortest clip's length, turn the clips into
# mel features, then append a trailing channel axis of size 1.
mini = get_mini(train_x)
train_x = np.expand_dims(get_feature(data = set_length(train_x, mini)), axis = -1)

100%|██████████| 191759/191759 [05:37<00:00, 568.94it/s]


### 분석모델

In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Convolution2D, BatchNormalization, Flatten,
                                     Dropout, Dense, AveragePooling2D, Add)
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [15]:
def block(input_, units = 32, dropout_rate = 0.5):
    """Stack three 3x3 convolution + batch-norm pairs with a skip connection.

    The output of the first pair is added to the output of the third pair;
    the sum is average-pooled and passed through dropout.

    Parameters
    ----------
    input_ : tensor
        Input to the first convolution.
    units : int
        Number of filters used by every convolution in the block.
    dropout_rate : float
        Rate of the final dropout layer.

    Returns
    -------
    tensor
        Output of the dropout layer.
    """
    def conv_bn(tensor):
        tensor = Convolution2D(units, 3, padding ="same", activation = "relu")(tensor)
        return BatchNormalization()(tensor)

    shortcut = conv_bn(input_)
    x = shortcut
    for _ in range(2):
        x = conv_bn(x)
    x = Add()([x, shortcut])
    x = AveragePooling2D()(x)
    return Dropout(rate=dropout_rate)(x)

def second_block(input_, units = 64, dropout_rate = 0.5):
    """Stack three bottleneck groups with a skip connection.

    Each group is a 1x1 convolution (``units`` filters), a 3x3 convolution
    (``units`` filters), a 1x1 convolution (``units * 4`` filters) and a
    batch-norm layer. The output of the first group is added to the output
    of the third; the sum is average-pooled and passed through dropout.

    Parameters
    ----------
    input_ : tensor
        Input to the first group.
    units : int
        Filter count of the narrow convolutions; the widening 1x1
        convolution uses four times as many.
    dropout_rate : float
        Rate of the final dropout layer.

    Returns
    -------
    tensor
        Output of the dropout layer.
    """
    def bottleneck(tensor):
        tensor = Convolution2D(units, 1, padding ="same", activation = "relu")(tensor)
        tensor = Convolution2D(units, 3, padding ="same", activation = "relu")(tensor)
        tensor = Convolution2D(units * 4, 1, padding ="same", activation = "relu")(tensor)
        return BatchNormalization()(tensor)

    shortcut = bottleneck(input_)
    x = bottleneck(shortcut)
    x = bottleneck(x)
    x = Add()([x, shortcut])
    x = AveragePooling2D()(x)
    return Dropout(rate=dropout_rate)(x)

In [16]:
def build_fn(input_shape = None):
    """Assemble the binary classifier (untrained and uncompiled).

    Layout: three ``block`` stages (32, 64, 128 filters), two
    ``second_block`` stages (64, 128 units), a flatten layer, two 128-unit
    dense layers joined by a skip connection, and a single sigmoid output.
    Every dropout layer uses a rate of 0.3.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input sample, without the batch axis. When omitted the
        shape is read from the notebook-level ``train_x`` array, which is
        what this function always did before the parameter existed.

    Returns
    -------
    Model
        The Keras model mapping the input tensor to the sigmoid output.
    """
    dropout_rate = 0.3

    if input_shape is None:
        # Fallback keeps the original behaviour; passing the shape explicitly
        # lets the model be built without the training array in memory.
        input_shape = train_x.shape[1:]

    in_ = Input(shape = input_shape)

    block_01 = block(in_, units = 32, dropout_rate = dropout_rate)
    block_02 = block(block_01, units = 64, dropout_rate = dropout_rate)
    block_03 = block(block_02, units = 128, dropout_rate = dropout_rate)

    block_04 = second_block(block_03, units = 64, dropout_rate = dropout_rate)
    block_05 = second_block(block_04, units = 128, dropout_rate = dropout_rate)

    x = Flatten()(block_05)

    x = Dense(units = 128, activation = "relu")(x)
    x = BatchNormalization()(x)
    # Saved before dropout so the skip connection carries the un-dropped activations.
    x_res = x
    x = Dropout(rate = dropout_rate)(x)

    x = Dense(units = 128, activation = "relu")(x)
    x = BatchNormalization()(x)
    x = Add()([x_res, x])
    x = Dropout(rate = dropout_rate)(x)

    model_out = Dense(units = 1, activation = 'sigmoid')(x)
    model = Model(in_, model_out)
    return model

### 모델 학습

In [None]:
# 5-fold stratified cross-validation; random_state fixes the fold assignment.
split = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 10)

# Containers for per-fold test predictions. They stay empty while the
# prediction lines at the end of the loop remain commented out.
pred = []
pred_ = []

for fold, (train_idx, val_idx) in enumerate(split.split(train_x, train_y), start = 1):
    x_train, y_train = train_x[train_idx], train_y[train_idx]
    x_val, y_val = train_x[val_idx], train_y[val_idx]

    # A fresh model per fold so no weights leak between folds.
    model = build_fn()
    model.compile(optimizer = keras.optimizers.Adam(0.002),
                 loss = keras.losses.BinaryCrossentropy(),
                 metrics = ['acc'])
    history = model.fit(x = x_train, y = y_train, validation_data = (x_val, y_val), epochs = 8)

    # Keep each fold's model in its own file. Previously every fold
    # overwrote 'voice.h5', so only the last fold's model survived.
    model.save(f'voice_fold{fold}.h5')
    # 'voice.h5' is still written every fold because the evaluation cell loads it.
    model.save('voice.h5')

    # Disabled: per-fold predictions on the test features (test_x is only
    # built in a later cell).
    #pred.append(model.predict(test_x))
    #pred_.append(np.argmax(model.predict(test_x), axis = 1))

Epoch 1/8


### 테스트 데이터 불러오기

In [None]:
# glob() gives no ordering guarantee, yet audio files and label files are
# paired by position when the dataframe is built, so both lists are sorted to
# make that pairing deterministic.
# NOTE(review): assumes the two directory trees mirror each other
# file-for-file - confirm.
test_voice_paths = sorted(glob("./data/원천데이터/test/*/*/*/*.wav"))
test_label_paths = sorted(glob("./data/라벨링데이터/test/*/*/*/*.json"))

In [None]:
def test_labeling(paths):
    result = []
    for path in tqdm(paths):
        # sr = 16000이 의미는 1초당 16000개의 데이터를 샘플링
        with open(path,'r',encoding="UTF-8") as f:
            json_data = json.load(f)
        if json_data['Speaker']['Region']=='00' and json_data['Speaker']['Dialect']=='01':
            result.append(0)
        elif json_data['Speaker']['Region']=='01'and json_data['Speaker']['Dialect']=='02':
            result.append(1)
        else:
            result.append(2)
    result = np.array(result) 
    # 메모리가 부족할 때는 데이터 타입을 변경 ex) np.array(data, dtype = np.float32)
    return result

In [None]:
# Parse the class id of every test label file and pair it, by position, with
# the audio path.
test_label_paths = test_labeling(test_label_paths)
dataframe = pd.DataFrame(test_label_paths, columns = ['type'])
dataframe['file_path'] = test_voice_paths

# Discard rows labelled 2 so only classes 0 and 1 are evaluated.
drop_index = dataframe[dataframe['type']==2].index
dataframe = dataframe.drop(drop_index)
test_y = np.array(dataframe['type'])

# Decode the audio of the remaining rows, in dataframe order.
test_voice_paths = dataframe['file_path']
test_voice_paths = load_data(test_voice_paths)

# Cache the decoded audio on disk. The folder is created first so np.save
# does not fail when ./npy_data does not exist yet.
os.makedirs("./npy_data", exist_ok = True)
np.save("./npy_data/test_npy", test_voice_paths)
# allow_pickle=True is needed to read the file back when the clips differ in
# length and were therefore stored as an object array. The file was written
# by the line above; do not point this at untrusted .npy files.
test_voice_paths = np.load("./npy_data/test_npy.npy", allow_pickle = True)

In [None]:
# Build the test features.
# The model's input layer was sized from the training features, so every test
# clip is brought to the training crop length `mini` (set in the cell that
# prepared train_x) instead of the shortest test clip. Cropping to a
# different length would give a feature shape that can differ from the
# model's input. Clips shorter than `mini` are zero-padded at the end.
# NOTE(review): this cell now needs the training feature cell to have run in
# the same kernel so that `mini` holds the training crop length.
test_x = np.array([
    np.pad(clip[:mini], (0, max(0, mini - len(clip))))
    for clip in test_voice_paths
])
test_x = get_feature(data = test_x)
# Append the trailing channel axis of size 1.
test_x = test_x.reshape(-1, test_x.shape[1], test_x.shape[2], 1)

In [None]:
# Reload the model written by the training loop and score it on the test
# features; evaluate() yields the loss followed by the 'acc' metric.
new_model = keras.models.load_model('voice.h5')
test_loss, test_acc = new_model.evaluate(x = test_x, y = test_y, verbose = 2)

### 예측하기

In [None]:
def cov_type(data):
    """Convert ``data`` to a built-in ``int``.

    ``np.int`` was only an alias of the built-in ``int`` and was removed in
    NumPy 1.24, where calling it raises ``AttributeError``. The built-in
    gives the same result on every NumPy version.
    """
    return int(data)

# As seen earlier, the test paths returned by glob are not ordered 1, 2, 3, 4, 5, ...
# like the ids in sample_submission.
# The test_ dataframe built earlier is used to line the predictions up with
# the sample_submission ids.
# NOTE(review): `test_` and `sample_submission` are not defined anywhere in the
# visible notebook, and `pred` is never filled because the appends in the
# training loop are commented out - confirm this cell is still meant to run.

# Average the per-fold predictions, attach them to test_, and drop the first column.
result = pd.concat([test_, pd.DataFrame(np.mean(pred, axis = 0))], axis = 1).iloc[:, 1:]
result["id"] = result["id"].apply(lambda x : cov_type(x))

# Merging on the sample_submission ids puts the rows in submission order.
result = pd.merge(sample_submission["id"], result)
result.columns = sample_submission.columns

In [17]:
# Display the assembled table (columns were renamed to match sample_submission).
result

Unnamed: 0,id,africa,australia,canada,england,hongkong,us
0,1,0.037994,0.006710,0.023825,0.290019,0.020778,0.620673
1,2,0.188218,0.014532,0.022077,0.535970,0.004920,0.234284
2,3,0.157294,0.026830,0.016824,0.574049,0.026685,0.198318
3,4,0.194869,0.067176,0.038670,0.554759,0.055354,0.089171
4,5,0.207199,0.026973,0.008716,0.332835,0.022489,0.401787
...,...,...,...,...,...,...,...
6095,6096,0.063617,0.053831,0.016850,0.282672,0.241308,0.341721
6096,6097,0.009056,0.009781,0.004995,0.324703,0.003501,0.647963
6097,6098,0.174698,0.019762,0.012561,0.624443,0.063158,0.105378
6098,6099,0.180881,0.010444,0.015629,0.359167,0.007196,0.426683
