In [None]:
# Install PyCaret, then pin markupsafe to 2.0.1.
# NOTE(review): pycaret itself is not pinned; the later `setup(..., silent=True)` call
# presumably targets a 2.x release — confirm and pin the exact version.
!pip install pycaret
!pip install markupsafe==2.0.1

In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import librosa
import jinja2

from tqdm.auto import tqdm
from pycaret.classification import *
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore')

아래의 값들은 실험을 통해 결정했습니다.
- Sample Rate : {8000, 16000, 24000, 48000, ...} => 16000
- MFCC 벡터 추출 개수 : {24, 32, 39} => 39

In [None]:
# Experiment-wide settings read by the cells below.
CFG = {
    'SR':16000,  # sample rate passed to librosa.load
    'N_MFCC':39,  # number of MFCC coefficients requested from librosa.feature.mfcc
    'SEED':1209  # value passed to seed_everything
}

In [None]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with one value.

    Args:
        seed: integer seed; it is stored in the environment as its string form.
    """
    # NOTE(review): the environment variable is set after the interpreter has
    # started — confirm it has the intended effect on hashing.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED'])  # fix the random seeds

# Data Preprocessing

In [None]:
# Load the train / test metadata tables from Google Drive.
# The 'id' column of these tables is what the audio helpers below use to locate each <id>.wav file.
train_df = pd.read_csv('/content/drive/MyDrive/dacon_covid/train_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/dacon_covid/test_data.csv')

제공된 음향 데이터마다 녹음 시간이 다른 점을 확인하고, train data의 대부분 샘플이 8초 이하임을 확인했습니다. 

따라서 mfcc를 추출하기 전에 모든 데이터의 길이를 16000 * 8 = 128000로 고정했습니다.

In [None]:
def timeCheck(df, data_type, root_path):
    """Return the duration, in seconds, of every wav file referenced by ``df['id']``.

    Each file is read from ``<root_path>/<data_type>/<id zero-padded to 5 digits>.wav``
    and resampled to ``CFG['SR']`` while loading.

    Args:
        df: table with an ``id`` column.
        data_type: name of the sub-folder under ``root_path`` (e.g. ``'train'``).
        root_path: directory that contains the ``data_type`` folder.

    Returns:
        list of float: one duration per row, in the order of ``df['id']``.
    """
    root_folder = os.path.join(root_path, data_type)
    duration_list = []

    for uid in tqdm(df['id']):
        path = os.path.join(root_folder, str(uid).zfill(5)+'.wav')
        # Load the wav file with librosa at the configured sample rate
        y, sr = librosa.load(path, sr=CFG['SR'])
        # Pass the real sample rate explicitly: without it get_duration falls back
        # to its own default rate (22050 Hz) and misreports the length of audio
        # that was loaded at CFG['SR']. Keyword arguments also keep the call valid
        # on librosa versions where these parameters are keyword-only.
        duration_list.append(librosa.get_duration(y=y, sr=sr))

    return duration_list

def printTime(time_ary):
    """Print how many durations are at or below each threshold from 3 to 9 seconds.

    One line is printed per threshold, in increasing order.

    Args:
        time_ary: re-iterable collection (e.g. a list) of durations in seconds;
            it is scanned once per threshold.
    """
    for limit in range(3, 10):
        count = sum(1 for duration in time_ary if duration <= limit)
        print(f'duration {limit} sec 이하의 샘플 수 => {count}')
        
path = '/content/drive/MyDrive/dacon_covid'
# The duration scan is left disabled; uncomment the next two lines to re-run it.
#time_ary = timeCheck(train_df, 'train', path)
#printTime(time_ary)

# Output recorded from a previous run:
#samples with duration <= 3 sec => 317
#samples with duration <= 4 sec => 640
#samples with duration <= 5 sec => 983
#samples with duration <= 6 sec => 1233
#samples with duration <= 7 sec => 1645
#samples with duration <= 8 sec => 3795
#samples with duration <= 9 sec => 3796

In [None]:
def get_mfcc_feature(df, data_type, save_path,
                     root_path='/content/drive/MyDrive/dacon_covid', clip_seconds=8):
    """Extract mean MFCC features for every wav file in ``df`` and save them as CSV.

    For each ``id`` the file ``<root_path>/<data_type>/<id zero-padded to 5 digits>.wav``
    is loaded at ``CFG['SR']``, padded or truncated to ``clip_seconds`` seconds, and
    reduced to ``CFG['N_MFCC']`` values (the mean of each MFCC coefficient over the
    clip). The features are appended to ``df`` as columns ``mfcc_1`` ..
    ``mfcc_<N_MFCC>`` and the combined table is written to ``save_path``.

    If ``save_path`` already exists nothing is computed and the function returns early.

    Args:
        df: metadata table with an ``id`` column.
        data_type: sub-folder under ``root_path`` holding the wav files (e.g. ``'train'``).
        save_path: destination CSV path.
        root_path: dataset root directory; defaults to the notebook's Drive folder.
        clip_seconds: fixed clip length in seconds; the default of 8 gives
            16000 * 8 = 128000 samples at the configured sample rate.

    Returns:
        None. The result is written to ``save_path``.
    """
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # Data folder path
    root_folder = os.path.join(root_path, data_type)
    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(root_folder, str(uid).zfill(5)+'.wav')

        # Load the wav file with librosa
        y, sr = librosa.load(path, sr=CFG['SR'])
        # Force every clip to the same number of samples. `size` is passed by
        # keyword because newer librosa releases no longer accept it positionally.
        y = librosa.util.fix_length(y, size=int(sr * clip_seconds))

        # Extract the MFCC matrix with librosa
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC coefficient as the feature
        features.append([np.mean(coef) for coef in mfcc])

    # Append the audio features to the self-reported metadata. Reusing df's
    # index keeps the rows aligned even if df does not have a 0..n-1 index.
    columns = ['mfcc_'+str(x) for x in range(1, CFG['N_MFCC']+1)]
    mfcc_df = pd.DataFrame(features, columns=columns, index=df.index)
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
#get_mfcc_feature(train_df, 'train', '/content/drive/MyDrive/dacon_covid/train_mfcc_data(16000,39).csv')
#get_mfcc_feature(test_df, 'test', '/content/drive/MyDrive/dacon_covid/test_mfcc_data(16000,39).csv')

In [None]:
# Load the training table that combines the wav-file MFCC features with the status information.
train_df = pd.read_csv('/content/drive/MyDrive/dacon_covid/train_mfcc_data(16000,39).csv')

In [None]:
# Apply the fitted OneHotEncoder to the 'gender' column
def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of ``x`` with one column per encoder category.

    Args:
        ohe: an already-fitted encoder exposing ``transform`` and ``categories_``;
            ``transform`` must return a dense 2-D array.
        x: DataFrame containing a ``gender`` column.

    Returns:
        A new DataFrame: ``x`` without ``gender``, followed by the encoded columns
        named after ``ohe.categories_[0]``. Row order and index match ``x``.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    # Reuse x's index: concat(axis=1) aligns on index labels, so a fresh 0..n-1
    # index would misalign rows (and add NaN rows) whenever x is indexed differently.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    
# Fit the encoder on the training 'gender' values only, then encode the training table.
# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn releases —
# confirm against the version installed alongside pycaret.
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_df['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_df)

train_x.head()

Unnamed: 0,id,age,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,female,male,other
0,1,24,0,1,0,-358.46594,18.004765,-11.75107,-4.808286,-5.579647,...,-1.555619,-0.519989,-1.508037,0.871903,-1.092781,0.176559,-1.410955,1.0,0.0,0.0
1,2,51,0,0,0,-491.9249,16.11043,-0.018984,-3.385173,1.830499,...,-0.871181,0.284732,-0.976672,-0.269059,-1.455844,-0.243811,-0.965905,0.0,1.0,0.0
2,3,22,0,0,0,-412.43283,55.046696,-26.882256,-4.292815,-16.444223,...,0.108087,-0.841571,0.068139,1.668167,0.12683,-0.903964,-0.933764,0.0,1.0,0.0
3,4,29,1,0,0,-354.38486,46.824127,-7.460059,-2.061099,-21.868868,...,-2.161254,2.075511,-0.80784,1.637357,-1.791975,-0.283981,-2.395714,1.0,0.0,0.0
4,5,23,0,0,0,-534.09503,7.251156,-7.509023,2.261058,-5.360337,...,0.121541,1.323621,-1.292612,1.098826,-0.077346,0.059019,-0.364198,0.0,1.0,0.0


타 mfcc feature에 비해, mfcc_1의 값이 너무 커 모델에 악영향을 끼칠 것으로 판단하고 제거했습니다.

In [None]:
# Drop the row identifier and mfcc_1 (its magnitude is far larger than the other MFCC features).
# NOTE: this cell reassigns train_x, so running it a second time raises KeyError.
train_x = train_x.drop(columns=['id', 'mfcc_1'])
train_x.head(3)

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,female,male,other
0,24,0,1,0,18.004765,-11.75107,-4.808286,-5.579647,-5.24799,-20.034689,...,-1.555619,-0.519989,-1.508037,0.871903,-1.092781,0.176559,-1.410955,1.0,0.0,0.0
1,51,0,0,0,16.11043,-0.018984,-3.385173,1.830499,-0.523592,-4.827082,...,-0.871181,0.284732,-0.976672,-0.269059,-1.455844,-0.243811,-0.965905,0.0,1.0,0.0
2,22,0,0,0,55.046696,-26.882256,-4.292815,-16.444223,0.306596,-10.669117,...,0.108087,-0.841571,0.068139,1.668167,0.12683,-0.903964,-0.933764,0.0,1.0,0.0


# Pycaret

In [None]:
# Columns that are NOT treated as numeric inputs: the target plus the binary / one-hot flags.
cat_col = ['covid19', 'respiratory_condition', 'fever_or_muscle_pain', 'female', 'male', 'other']
# Every remaining column is declared numeric explicitly. train_size=0.999 keeps
# almost all rows in the training split. The session id reuses CFG['SEED'] so the
# notebook has a single source of truth for its seed.
clf  = setup(train_x, preprocess = False, train_size = 0.999,
             target = 'covid19', numeric_features=list(train_x.drop(columns = cat_col).columns),
             silent = True, session_id = CFG['SEED'])

Unnamed: 0,Description,Value
0,session_id,1209
1,Target,covid19
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(3805, 45)"
5,Missing Values,False
6,Numeric Features,39
7,Categorical Features,5
8,Transformed Train Set,"(3801, 44)"
9,Transformed Test Set,"(4, 44)"


In [None]:
# Compare the candidate classifiers with 5-fold CV, sorted by F1, and keep the top three (n_select=3).
top3_models = compare_models(fold = 5, round = 3, sort = 'F1', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.839,0.675,0.337,0.202,0.252,0.168,0.175,0.106
qda,Quadratic Discriminant Analysis,0.851,0.589,0.264,0.249,0.233,0.163,0.17,0.082
lda,Linear Discriminant Analysis,0.909,0.68,0.157,0.349,0.216,0.175,0.191,0.058
dt,Decision Tree Classifier,0.845,0.54,0.176,0.139,0.155,0.072,0.072,0.368
svm,SVM - Linear Kernel,0.767,0.0,0.274,0.349,0.126,0.07,0.114,0.106
ada,Ada Boost Classifier,0.913,0.609,0.052,0.25,0.087,0.063,0.084,1.848
gbc,Gradient Boosting Classifier,0.916,0.637,0.043,0.349,0.075,0.058,0.097,2.844
lr,Logistic Regression,0.917,0.673,0.023,0.26,0.041,0.03,0.055,2.166
et,Extra Trees Classifier,0.92,0.668,0.02,0.433,0.038,0.032,0.08,0.764
knn,K Neighbors Classifier,0.917,0.541,0.02,0.287,0.036,0.026,0.056,0.518


In [None]:
# Tune each of the selected models for F1; choose_better=True is passed so the
# comparison with the untuned model is left to tune_model.
models = [
    tune_model(candidate,
               optimize = 'F1',
               choose_better = True,
               n_iter = 500)
    for candidate in top3_models
]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9186,0.7603,0.1935,0.5,0.2791,0.2448,0.2761
1,0.9079,0.6615,0.1,0.2727,0.1463,0.1086,0.1241
2,0.9184,0.6471,0.2,0.4615,0.2791,0.2429,0.267
3,0.9132,0.6208,0.1,0.3333,0.1538,0.1218,0.1469
4,0.9211,0.677,0.1667,0.5,0.25,0.2192,0.2567
5,0.9053,0.7407,0.2258,0.3684,0.28,0.2324,0.2404
6,0.9079,0.6601,0.1935,0.375,0.2553,0.2115,0.2247
7,0.8974,0.6106,0.129,0.25,0.1702,0.1214,0.129
8,0.9211,0.7746,0.1613,0.5556,0.25,0.2214,0.2697
9,0.9053,0.689,0.1613,0.3333,0.2174,0.1734,0.1865


In [None]:
# Combine the tuned models into a single blended (voting) estimator, optimised for F1.
voting = blend_models(models, optimize = 'F1')
# Tune the blended estimator itself with the same search budget as the base models.
voting = tune_model(voting, 
                 optimize = 'F1', 
                 choose_better = True,
                 n_iter = 500)

# NOTE(review): finalize_model presumably refits on the full dataset (including the hold-out) — confirm.
voting = finalize_model(voting)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8924,0.754,0.2581,0.3077,0.2807,0.223,0.224
1,0.8842,0.63,0.2333,0.25,0.2414,0.1788,0.1789
2,0.8816,0.6813,0.3,0.2727,0.2857,0.2213,0.2216
3,0.8947,0.6194,0.1667,0.25,0.2,0.1461,0.1495
4,0.8816,0.65,0.2333,0.2414,0.2373,0.1731,0.1731
5,0.9,0.6935,0.3871,0.3871,0.3871,0.3327,0.3327
6,0.8974,0.68,0.2903,0.3462,0.3158,0.2608,0.262
7,0.8711,0.6515,0.2258,0.2188,0.2222,0.1519,0.152
8,0.9184,0.7715,0.2903,0.5,0.3673,0.327,0.3409
9,0.8737,0.6866,0.2903,0.2571,0.2727,0.2038,0.2043


In [None]:
# Apply the same preprocessing used for the training data to the test data
test_x = pd.read_csv('/content/drive/MyDrive/dacon_covid/test_mfcc_data(16000,39).csv')

# To avoid data leakage, reuse the encoder that was fitted on the training data only
test_x = onehot_encoding(ohe, test_x)
test_x = test_x.drop(columns=['id', 'mfcc_1'])

In [None]:
# Predict on the test set and write the submission file
submission = pd.read_csv('/content/drive/MyDrive/dacon_covid/sample_submission.csv')

# Predicted labels are written into the 'covid19' column of the sample submission
pred = voting.predict(test_x)
submission['covid19'] = pred
submission.to_csv('/content/drive/MyDrive/dacon_covid/final_submit.csv', index=False)