In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

## Library Imports

In [8]:
##
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

##
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

##
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import librosa, librosa.display

##
import glob
import pickle
from tqdm.auto import tqdm
import os
import random
import warnings
warnings.filterwarnings(action='ignore') 

## Data Loading

In [4]:
# Root directory of the competition data on the mounted Drive.
# NOTE: there is no trailing slash, so anything appended must start with '/'
# (e.g. data_path + '/raw/train_data.csv').
data_path = '/content/drive/MyDrive/Dacon/음향_데이터_covid-19_검출_AI_경진대회/data'

In [5]:
# Load the raw train/test tables from <data_path>/raw/.
# The 'id' column of each table is later used to locate the matching .wav file.
train_df = pd.read_csv(data_path + '/raw/train_data.csv')
test_df = pd.read_csv(data_path + '/raw/test_data.csv')

## Hyperparameter Setting

In [10]:
# Notebook-wide settings; read via CFG['<KEY>'] by the cells below.
CFG = dict(
    SR=16000,       # sample rate passed to librosa.load
    N_MFCC=32,      # number of MFCC vectors to extract per clip
    SEED=0xC0FFEE,  # value handed to seed_everything
)

## Fix the Random Seed

In [11]:
def seed_everything(seed):
    """Seed every random source this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed value applied to all three targets.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(CFG['SEED']) # Fix the random seeds for reproducibility

## MFCC (Data Pre-Processing)

In [12]:
def get_mfcc_feature(df, data_type, save_path):
    """Extract per-clip mean MFCC features and cache them, with ``df``, as a CSV.

    For every value in ``df['id']`` the file
    ``<data_path>/raw/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded
    with librosa at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCCs are computed, and the
    mean of each MFCC row becomes one feature. The features are appended to
    ``df`` as columns ``mfcc_1`` .. ``mfcc_<N_MFCC>`` and written to
    ``save_path``.

    If ``save_path`` already exists nothing is recomputed: a message is printed
    and the function returns immediately.

    Args:
        df: DataFrame with an ``'id'`` column identifying the audio clips.
        data_type: Sub-folder of ``<data_path>/raw`` holding the wav files
            (the notebook uses ``'train'`` and ``'test'``).
        save_path: Destination CSV path; also acts as the cache marker.

    Returns:
        None. The result is only written to ``save_path``.
    """
    # Cache hit: skip the (slow) extraction entirely.
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # Create the output folder up front so a missing directory fails (or is
    # fixed) before the long extraction loop rather than after it.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Bug fix: the original built `data_path + 'raw/'`, which drops the path
    # separator (data_path has no trailing slash) and pointed at '.../dataraw/'.
    audio_dir = os.path.join(data_path, 'raw', data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(audio_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file, resampled to the configured sample rate.
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Compute the MFCC matrix for this clip.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC row as the clip-level feature.
        features.append([np.mean(e) for e in mfcc])

    # Append the audio features to the existing self-report columns.
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    # Features are positional (one row per iterated id), so align on position:
    # a non-default index on `df` would otherwise misalign rows in the concat.
    df = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [13]:
# Extract (or reuse cached) MFCC features for both splits.
# Each call is skipped when its target CSV under <data_path>/extracted/ already exists.
get_mfcc_feature(train_df, 'train', data_path + '/extracted/train_mfcc_data.csv')
get_mfcc_feature(test_df, 'test', data_path + '/extracted/test_mfcc_data.csv')

/content/drive/MyDrive/Dacon/음향_데이터_covid-19_검출_AI_경진대회/data/extracted/train_mfcc_data.csv is exist.
/content/drive/MyDrive/Dacon/음향_데이터_covid-19_검출_AI_경진대회/data/extracted/test_mfcc_data.csv is exist.
