<a href="https://colab.research.google.com/github/Kompactss/Covid19/blob/main/%5BBaseline%5D_MFCC_%E1%84%80%E1%85%B5%E1%84%87%E1%85%A1%E1%86%AB_Feature_%E1%84%8E%E1%85%AE%E1%84%8E%E1%85%AE%E1%86%AF_%2B_MLP_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

## Drive Path Import

In [2]:
# Mount Google Drive at /content/drive (Colab-only module) so that the
# data files referenced below can be read from the Drive folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Base directory of the project data on the mounted Drive.
# Keep the trailing '/': file paths below are built as `dp + '<file name>'`.
dp = '/content/drive/MyDrive/Covid19/'

## Hyperparameter Setting

In [4]:
# Global settings shared by the notebook cells.
CFG = dict(
    SR=16000,    # presumably the audio sampling rate in Hz -- TODO confirm against the feature-extraction step
    N_MFCC=32,   # number of MFCC vectors to extract
    SEED=41,     # random seed used for seeding and for the model's random_state
)

## Fix Random Seed

In [5]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Exports ``PYTHONHASHSEED`` to the process environment and seeds both
    Python's ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # Fix the random seed

## Data Pre-Processing 2

In [18]:
# Load the training table that combines each wav file's MFCC features with
# the subject's status information.
# NOTE(review): the frame displayed below contains an 'Unnamed: 0' column (the
# index saved with the CSV). It stays in train_x and is therefore fed to the
# model as a feature; removing it requires the same change on the test data.
train_df = pd.read_csv(dp + 'train_mfcc_data.csv')

# Split into the label y ('covid19') and the model input x (everything except
# the identifier and the label).
train_y = train_df['covid19']
train_x = train_df.drop(columns=['id', 'covid19'])

In [19]:
train_x

Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,24,female,0,1,-274.93472,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,...,-2.320942,2.150005,-0.925417,2.116030,-0.192730,2.417784,-4.736650,1.237788,-1.600426,-1.462419
1,51,male,0,0,-311.56317,52.478150,-0.098957,-11.070889,5.932184,-1.739854,...,-6.494778,0.545812,-6.261986,-2.384402,-6.743353,0.255104,-0.966994,-2.113054,-2.433555,0.881178
2,22,male,0,0,-438.29000,46.588910,-22.689060,-3.607528,-13.873103,0.270997,...,-0.156510,-1.682014,2.618637,1.244486,-0.074025,-0.964130,-0.735731,-0.420304,0.795621,0.411339
3,29,female,1,0,-368.42612,46.939358,-7.443123,-3.694383,-20.511757,-9.271688,...,-0.155855,3.839285,-2.503368,2.750743,1.758510,2.094587,0.295868,1.737648,-0.654136,1.847976
4,23,male,0,0,-535.19446,7.165523,-7.422007,2.231186,-5.300425,-0.644981,...,-0.144311,-0.413284,-1.452623,0.235582,-0.973687,0.777570,-0.735323,1.141641,-0.497988,1.190929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.48767,68.190380,-40.383747,2.297682,0.355717,-30.347511,...,-0.168663,1.255295,-4.972386,-0.053485,1.204619,3.961083,-4.195477,2.746365,-4.138545,-2.572868
3801,25,male,0,0,-386.25732,73.931350,-12.114974,-0.500643,-16.141827,-15.249634,...,-2.914732,4.907058,1.142662,6.096552,2.885290,3.611389,-0.631856,3.847092,2.540035,5.938597
3802,26,female,0,0,-347.20593,58.544130,12.969810,27.973340,20.632845,10.184926,...,-2.396356,-2.741529,-2.629161,-1.906816,-3.183893,-1.446224,-0.021817,-1.601471,-1.630301,-1.382297
3803,27,female,0,0,-179.11195,70.697860,-14.571251,-10.143574,-25.649060,2.032261,...,-7.733275,5.680011,-2.057645,1.684632,-1.205147,-0.134177,-4.729319,0.557955,-5.249906,-0.278406


In [20]:
def onehot_encoding(ohe, x):
    """Replace the 'gender' column of ``x`` with one-hot indicator columns.

    Args:
        ohe: An already fitted one-hot encoder (fitted on the training data).
            Must provide ``transform`` and ``categories_``.
        x: DataFrame containing a 'gender' column. It is not modified.

    Returns:
        A new DataFrame with 'gender' removed and one column per category in
        ``ohe.categories_[0]`` appended, keeping the row index of ``x``.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    # An encoder configured for sparse output returns a scipy matrix; densify it
    # so that it can be wrapped in a DataFrame.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # Reuse x's index: with a default RangeIndex, concat(axis=1) would align on
    # labels and create NaN-filled rows whenever x is not indexed 0..n-1.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

In [21]:
# The 'gender' column is categorical and needs extra preprocessing -> apply OneHotEncoder.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising at inference time.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only accept the previous `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Fit on the training data only; the same fitted encoder is reused for the test data.
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [44]:
train_x

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32,female,male,other
0,24,0,1,-274.93472,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,-32.653940,...,2.116030,-0.192730,2.417784,-4.736650,1.237788,-1.600426,-1.462419,1.0,0.0,0.0
1,51,0,0,-311.56317,52.478150,-0.098957,-11.070889,5.932184,-1.739854,-15.766101,...,-2.384402,-6.743353,0.255104,-0.966994,-2.113054,-2.433555,0.881178,0.0,1.0,0.0
2,22,0,0,-438.29000,46.588910,-22.689060,-3.607528,-13.873103,0.270997,-9.013165,...,1.244486,-0.074025,-0.964130,-0.735731,-0.420304,0.795621,0.411339,0.0,1.0,0.0
3,29,1,0,-368.42612,46.939358,-7.443123,-3.694383,-20.511757,-9.271688,-10.894087,...,2.750743,1.758510,2.094587,0.295868,1.737648,-0.654136,1.847976,1.0,0.0,0.0
4,23,0,0,-535.19446,7.165523,-7.422007,2.231186,-5.300425,-0.644981,-6.101685,...,0.235582,-0.973687,0.777570,-0.735323,1.141641,-0.497988,1.190929,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,0,0,-328.48767,68.190380,-40.383747,2.297682,0.355717,-30.347511,-7.288190,...,-0.053485,1.204619,3.961083,-4.195477,2.746365,-4.138545,-2.572868,0.0,1.0,0.0
3801,25,0,0,-386.25732,73.931350,-12.114974,-0.500643,-16.141827,-15.249634,-19.629406,...,6.096552,2.885290,3.611389,-0.631856,3.847092,2.540035,5.938597,0.0,1.0,0.0
3802,26,0,0,-347.20593,58.544130,12.969810,27.973340,20.632845,10.184926,6.837224,...,-1.906816,-3.183893,-1.446224,-0.021817,-1.601471,-1.630301,-1.382297,1.0,0.0,0.0
3803,27,0,0,-179.11195,70.697860,-14.571251,-10.143574,-25.649060,2.032261,-11.038777,...,1.684632,-1.205147,-0.134177,-4.729319,0.557955,-5.249906,-0.278406,1.0,0.0,0.0


## MFCC Feature Regularization (tried, but it had no effect, so it is not applied)

## Train

In [45]:
# Train scikit-learn's multi-layer perceptron classifier.
# The activation function is changed from relu to logistic; it is passed to the
# constructor so the configuration is fixed in one place instead of being
# patched onto the estimator afterwards.
model = MLPClassifier(activation='logistic', random_state=CFG['SEED'])
model.fit(train_x, train_y) # Model training

MLPClassifier(activation='logistic', random_state=41)

## Inference

In [46]:
# Apply the same preprocessing to the test data as was applied to the training data.
test_x = pd.read_csv(dp + 'test_mfcc_data.csv')
test_x = test_x.drop(columns=['id'])
# To avoid data leakage, use the encoder that was fitted on the training data only.
test_x = onehot_encoding(ohe, test_x)
# Select exactly the training feature columns, in training order. A column
# missing from the test data raises a KeyError naming it, rather than the model
# receiving misordered or extra features.
test_x = test_x[train_x.columns]

# Model inference
preds = model.predict(test_x)

## Submission

In [47]:
# Fill the sample submission with the predicted labels and save it.
# `dp` already ends with '/', so the file name is appended without another
# slash (consistent with how the train/test CSV paths are built).
submission = pd.read_csv(dp + 'sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submit.csv', index=False)