In [2]:
'''

Audio COVID-19 AI

Modified from the [Baseline] code.

'''


import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import PyQt5 

from PIL import Image
import matplotlib as mpl
import librosa.display

%matplotlib qt

import warnings
warnings.filterwarnings(action='ignore') 

# Global settings shared by the feature-extraction and modelling cells.
CFG = dict(
    SR=16000,    # sampling rate handed to librosa.load
    N_MFCC=32,   # number of MFCC vectors to extract
    SEED=41,     # seed passed to seed_everything and the model's random_state
)

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Fix the seeds before anything else runs.
seed_everything(CFG['SEED'])

# Read the train and test tables (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train_data.csv', './test_data.csv')
)

In [3]:


def get_mfcc_feature(df, data_type, save_path, root_folder='./',
                     top_db=40, pad_sec=0.25, chunk_sec=4.09):
    """Extract mean-MFCC features for every wav file listed in ``df`` and save them.

    For each value in ``df['id']`` the file
    ``<root_folder>/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded at
    ``CFG['SR']``. Leading/trailing silence is located with
    ``librosa.effects.trim``, the detected interval is widened by ``pad_sec``
    on both sides (clamped to the signal), and the result is cut or
    zero-padded to exactly ``chunk_sec`` seconds. ``CFG['N_MFCC']`` MFCCs are
    computed on that fixed-length signal and each coefficient is averaged over
    time, giving one feature vector per file.

    The features are appended as columns ``mfcc_1`` .. ``mfcc_<N_MFCC>`` to
    ``df`` and the combined table is written to ``save_path`` as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``id`` column; its rows are paired positionally with the
        extracted features.
    data_type : str
        Sub-folder of ``root_folder`` holding the wav files (e.g. ``'train'``).
    save_path : str
        Destination CSV. If it already exists nothing is computed.
    root_folder : str, optional
        Folder containing the ``data_type`` sub-folder.
    top_db : float, optional
        Silence threshold forwarded to ``librosa.effects.trim``.
    pad_sec : float, optional
        Seconds of audio kept on each side of the detected non-silent interval.
    chunk_sec : float, optional
        Length in seconds of the fixed-size signal fed to the MFCC step. The
        original author's note says 4.09 was chosen "to match 128" --
        presumably 128 MFCC frames at the configured sampling rate; confirm.

    Returns
    -------
    None
        Results are written to ``save_path``; nothing is returned.
    """
    if os.path.exists(save_path):
        # Features were already extracted in an earlier run: skip the slow pass.
        print(f'{save_path} is exist.')
        return

    # The wav directory does not change per file, so build it once.
    wav_dir = os.path.join(root_folder, data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(wav_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file with librosa at the configured sampling rate.
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Find the non-silent interval, then keep `pad_sec` of context around it.
        _, (voiced_start, voiced_end) = librosa.effects.trim(y, top_db=top_db)
        margin = pad_sec * sr
        start = int(max(voiced_start - margin, 0))
        end = int(min(voiced_end + margin, len(y)))
        voiced = y[start:end]

        # Force a fixed length: truncate long clips, zero-pad short ones.
        n_samples = int(chunk_sec * sr)
        clip = voiced[:n_samples]
        fixed = np.zeros(n_samples)
        fixed[:len(clip)] = clip

        mfcc = librosa.feature.mfcc(y=fixed, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the time-average of each MFCC coefficient as the feature.
        features.append([np.mean(coeff) for coeff in mfcc])

    # Append the audio features to the existing per-sample information.
    mfcc_cols = ['mfcc_' + str(i) for i in range(1, CFG['N_MFCC'] + 1)]
    mfcc_df = pd.DataFrame(features, columns=mfcc_cols)
    # concat(axis=1) aligns on index labels; reset the index so rows are paired
    # by position even when `df` was filtered or shuffled beforehand.
    combined = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)
    combined.to_csv(save_path, index=False)
    print('Done.')
    

In [4]:
# Extract (or reuse cached) MFCC features, train split first and then test.
for split_df, split_name, split_csv in (
    (train_df, 'train', './train_mfcc_data3.csv'),
    (test_df, 'test', './test_mfcc_data3.csv'),
):
    get_mfcc_feature(split_df, split_name, split_csv)

./train_mfcc_data3.csv is exist.
./test_mfcc_data3.csv is exist.


In [5]:
# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
train_df = pd.read_csv('./train_mfcc_data3.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']

In [6]:
def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of ``x`` with its one-hot encoding.

    Parameters
    ----------
    ohe : fitted encoder
        Object exposing ``transform`` and ``categories_`` (e.g. a
        ``OneHotEncoder`` already fitted on the training data). It is only
        used to transform; it is never re-fitted here.
    x : pandas.DataFrame
        Table containing a ``gender`` column.

    Returns
    -------
    pandas.DataFrame
        ``x`` without ``gender``, followed by one column per entry of
        ``ohe.categories_[0]``. Row order and index of ``x`` are preserved.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1, 1))
    # An encoder left at its sparse default returns a sparse matrix; densify it
    # so it can be wrapped in a regular DataFrame.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # Reuse x's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and inject NaNs) for any x whose index
    # is not 0..n-1.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

In [7]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [8]:
# Alternatives tried earlier (scikit-learn multi-layer perceptron classifier):
#   MLPClassifier(random_state=CFG['SEED'])
#   MLPClassifier(hidden_layer_sizes=(85,), random_state=CFG['SEED'])
rf_params = dict(max_depth=3, random_state=CFG['SEED'])
model = RandomForestClassifier(**rf_params)

# Train the model; as the last expression, the fitted estimator is displayed.
model.fit(train_x, train_y)

RandomForestClassifier(max_depth=3, random_state=41)

In [9]:
# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용
test_x = pd.read_csv('./test_mfcc_data3.csv')
test_x = test_x.drop(columns=['id'])
# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x = onehot_encoding(ohe, test_x)

# Model 추론
preds = model.predict(test_x)

In [10]:
# Fill the sample submission's 'covid19' column with the predictions and save.
submission = pd.read_csv('./sample_submission.csv').assign(covid19=preds)
submission.to_csv('./submit.csv', index=False)