In [None]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

import PyQt5 
%matplotlib qt

import warnings
warnings.filterwarnings(action='ignore') 

CFG = {
    'SR':16000,
    'N_MFCC':32, # MFCC 벡터를 추출할 개수
    'SEED':41
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

train_df = pd.read_csv('open/train_data.csv')
test_df = pd.read_csv('open/test_data.csv')

In [15]:
from PIL import Image
import matplotlib as mpl
import librosa.display

def get_mfcc_feature(df, data_type, save_path):
    
    cm_conv = mpl.cm.get_cmap('jet')
    
    # Data Folder path
    root_folder = './open'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        
        
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        
                
        print(uid, y.shape, mfcc.shape, S.shape)
        
        if uid % 3 == 0:
            
            SdB = 10 * np.log10(S) - 10*np.log10(np.max(S))
#             im = cm_hot(SdB).astype(float)
            
            SdB_n = (SdB - np.min(SdB)) / (np.max(SdB) - np.min(SdB))

            im = cm_conv(SdB_n).astype(float)
            
#             print(np.min(im), np.max(im))
#             im = np.uint8( (im - np.min(im))/(np.max(im)-np.min(im)) * 255)
            
            im = np.uint8(im * 255)
            im2 = Image.fromarray(im)
            im2.save('open/png/train/mfcc_'+ str(uid).zfill(5) +'.png')
            
            
            
            plt.figure()
            #librosa.display.specshow(mfcc, sr=16000, x_axis='time', vmin=-700, vmax=300)
            
            plt.subplot(121)
            librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
            plt.colorbar()
            
            plt.subplot(122)
            plt.imshow(SdB, cmap='hot')
            
            plt.title(f'id={uid}')
            
            plt.show()
#             plt.savefig( 'open/png/train/mfcc_'+ str(uid).zfill(5) +'.png')
        
        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [16]:
get_mfcc_feature(train_df, 'train', './train_mfcc_data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data.csv')

  0%|          | 0/3805 [00:00<?, ?it/s]

1 (78720,) (32, 154) (128, 154)
2 (39360,) (32, 77) (128, 77)
3 (151680,) (32, 297) (128, 297)
4 (155520,) (32, 304) (128, 304)
5 (129600,) (32, 254) (128, 254)
6 (152640,) (32, 299) (128, 299)
7 (156480,) (32, 306) (128, 306)
8 (101760,) (32, 199) (128, 199)
9 (130560,) (32, 256) (128, 256)
10 (157440,) (32, 308) (128, 308)


KeyboardInterrupt: 

## Data Pre-Processing 2

In [None]:
# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
train_df = pd.read_csv('./train_mfcc_data.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']

In [None]:
def onehot_encoding(ohe, x):
    # 학습데이터로 부터 fit된 one-hot encoder (ohe)를 받아 transform 시켜주는 함수
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0])
    x = pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    return x

In [None]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

## Train

In [None]:
model = MLPClassifier(random_state=CFG['SEED']) # Sklearn에서 제공하는 Multi-layer Perceptron classifier 사용
model.fit(train_x, train_y) # Model Train

## Inference

In [None]:
# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용
test_x = pd.read_csv('./test_mfcc_data.csv')
test_x = test_x.drop(columns=['id'])
# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x = onehot_encoding(ohe, test_x)

# Model 추론
preds = model.predict(test_x)

## Submission

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submit.csv', index=False)