In [7]:
'''

audio covid-19  AI

modified the [Baseline] code 

'''


import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import PyQt5 

from PIL import Image
import matplotlib as mpl
import librosa.display

%matplotlib qt

import warnings
warnings.filterwarnings(action='ignore') 

CFG = {
    'SR':16000,
    'N_MFCC':32, # MFCC 벡터를 추출할 개수
    'SEED':41
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

train_df = pd.read_csv('open/train_data.csv')
test_df = pd.read_csv('open/test_data.csv')

In [8]:

# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
train_df = pd.read_csv('./train_mfcc_data3.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']


In [9]:
def onehot_encoding(ohe, x):
    # 학습데이터로 부터 fit된 one-hot encoder (ohe)를 받아 transform 시켜주는 함수
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0])
    x = pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    return x

In [10]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [29]:

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import f1_score
import os, sys, time

X_train, X_test, y_train, y_test=  train_test_split(train_x, train_y, test_size=0.3, random_state=128)

print(X_train.shape, X_test.shape, train_x.shape)

tuned_parameters = {
    'activation': (['relu','logistic','tanh','identity']),
    'hidden_layer_sizes': ([ [5],[15],[25],[35],[45],[55],[65],[75],[85],[95],[105],[115],[125], [135],[145],[155]]),
    'alpha':     ([0.1,0.01, 0.001, 0.0001,0.0001]),
    'batch_size':         ['auto'],
    'learning_rate_init':    [0.01,0.001],
    'solver': ["adam"]
}

clf =  RandomizedSearchCV(MLPClassifier(), tuned_parameters, cv=5, n_jobs=1, scoring='f1_macro')  

st = time.time()
clf_ = clf.fit(X_train, y_train)

print('elapsed ... ', time.time()-st)

print(clf.best_params_)
a = clf.best_params_

clf2 = MLPClassifier(solver=a['solver'],learning_rate_init=a['learning_rate_init'], hidden_layer_sizes=a['hidden_layer_sizes'], 
                           batch_size=a['batch_size'], alpha=a['alpha'], activation=a['activation'])

clf_nn_opt = clf2.fit(X_train, y_train)

scores_nn = cross_val_score(clf_nn_opt, X_train, y_train, cv=5)
                
m = scores_nn.mean()    
s = scores_nn.std()
ci_l = m - 1.96 * s        
ci_u = m + 1.96 * s 
print('NN MLPClassifier, cross_val_score, accuracy: %0.4f (+/- %0.4f)' % (scores_nn.mean(), scores_nn.std()))
print('\t 95 percent confidence interval = [%5.4f, %5.4f]' % (ci_l, ci_u))

## testing 
y_pred = clf_nn_opt.predict(X_test)

res = f1_score(y_test, y_pred, average='macro')

print(f'test of train data: f1_macro scores = {res}')

(2663, 38) (1142, 38) (3805, 38)
elapsed ...  18.37578535079956
{'solver': 'adam', 'learning_rate_init': 0.01, 'hidden_layer_sizes': [35], 'batch_size': 'auto', 'alpha': 0.0001, 'activation': 'relu'}
NN MLPClassifier, cross_val_score, accuracy: 0.9042 (+/- 0.0089)
	 95 percent confidence interval = [0.8868, 0.9216]
test of train data: f1_macro scores = 0.4811449341208542


In [None]:
''' support vector machine '''
from sklearn import tree, svm
from skle
tuned_parameters = {
        'C':            ([0.1, 0.01, 0.001, 1, 10, 100]),
        'kernel':       ['linear', 'rbf','poly'],                
        'degree':       ([1,2,3,4,5,6]),
        'gamma':         [1, 0.1, 0.01, 0.001, 0.0001]
        #'tol':         [1, 0.1, 0.01, 0.001, 0.0001],
        }

clf =  RandomizedSearchCV(svm.SVC(), tuned_parameters, cv=5, n_jobs=1, scoring='f1_macro')   

st = time.time()

# hyper-parameter들을 try.

clf_svm_search = clf.fit(X_train, y_train)

print('elapsed ... ', time.time()-st)

print(clf_svm_search)

print(clf_svm_search.best_params_)
print(clf_svm_search.best_score_)
a = clf_svm_search.best_params_

clf = svm.SVC(gamma=a['gamma'], kernel=a['kernel'], degree=a['degree'], C=a['C'], random_state=3333)

print(clf)

clf_svm_opt = clf.fit(X_train, y_train)

print(clf_svm_opt)

scores_svm = cross_val_score(clf_svm_opt, X_train, y_train, cv=5)
print('SVM, cross_val_score, accuracy: %0.4f (+/- %0.4f)' % (scores_svm.mean(), scores_svm.std()))
print('\t 95 percent confidence interval = [%5.4f, %5.4f]' % (ci_l, ci_u))


## testing 
y_pred = clf_svm_opt.predict(X_test)

res = f1_score(y_test, y_pred, average='macro')

print(f'[Support Vector Machine model] test of train data: f1_macro scores = {res}')


In [18]:
''' Random forest '''
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import f1_score
import os, sys, time
from sklearn.ensemble import RandomForestClassifier
tuned_parameters = {
    'n_estimators': ([100,200,300,400,500,600,700]),
    'max_features': (['auto', 'sqrt', 'log2']),                   # precomputed,'poly', 'sigmoid'
    'max_depth':    ([10,15,20,25,30,None]),
    'criterion':    (['gini', 'entropy','log_loss']),
    'min_samples_split':  [2,4,6,8,10,12,14,16],
    'min_samples_leaf':   [2,4,6,8,10,12,14,16]
    }
st = time.time()
clf = RandomizedSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, n_jobs=1, scoring='f1_macro')     
print('elapsed ... ', time.time()-st)

st = time.time()
clf.fit(X_train, y_train)
print('elapsed ... ', time.time()-st)

print(clf.best_params_)
# print(clf.best_score_)
# print(clf.best_estimator_.score)
       
a = clf.best_params_

# use the above optimized parameters...    
clf = RandomForestClassifier(n_estimators=a['n_estimators'], max_depth=a['max_depth'],min_samples_split=a['min_samples_split'], 
                                min_samples_leaf=a['min_samples_leaf'], max_features=a['max_features'],criterion=a['criterion'], random_state=3333)
   
clf_rf_opt = clf.fit(X_train,y_train)    
    
scores_rf = cross_val_score(clf_rf_opt, X_train, y_train, cv=5)

print(scores_rf)
m = scores_rf.mean()
s = scores_rf.std()

# calculate 95% confidence interval
ci_l = m - 1.96 * s
ci_u = m + 1.96 * s

print('RFC, cross_val_score, accuracy: %0.4f (+/- %0.4f)' % (scores_rf.mean(), scores_rf.std()))
print('\t 95 percent confidence interval = [%5.4f, %5.4f]' % (ci_l, ci_u))      

## testing 
y_pred = clf_rf_opt.predict(X_test)
res = f1_score(y_test, y_pred, average='macro')
print(f'[Random Forest model] test of train data: f1_macro scores = {res}')

elapsed ...  0.0
elapsed ...  99.8425190448761
{'n_estimators': 200, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'entropy'}
[0.91557223 0.91557223 0.91557223 0.91541353 0.91541353]
RFC, cross_val_score, accuracy: 0.9155 (+/- 0.0001)
	 95 percent confidence interval = [0.9154, 0.9157]
[Random Forest model] test of train data: f1_macro scores = 0.48161597821152974


In [36]:
print(clf.best_params_)

{'solver': 'adam', 'learning_rate_init': 0.001, 'hidden_layer_sizes': [55], 'batch_size': 'auto', 'alpha': 0.001, 'activation': 'relu'}


In [22]:

# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용

model = clf_rf_opt
test_x = pd.read_csv('./test_mfcc_data3.csv')
test_x = test_x.drop(columns=['id'])

# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x = onehot_encoding(ohe, test_x)

# Model 추론
preds = model.predict(test_x)



In [23]:
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submit1.csv', index=False)