In [None]:
# Quietly (-q) extract the dataset archive into the current working directory.
# NOTE(review): the path looks like a mounted Google Drive folder - confirm Drive is mounted before running.
!unzip -q /content/drive/MyDrive/음향_데이터.zip

In [None]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

from imblearn.under_sampling import RandomUnderSampler

import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation and convergence warnings raised by the libraries used below).
warnings.filterwarnings(action='ignore')

In [None]:
# Notebook-wide settings.
CFG = {
    'SR':16000,  # sample rate passed to librosa.load when reading the wav files
    'N_MFCC':32, # number of MFCC coefficients to extract
    'SEED':41    # seed handed to seed_everything
}

In [None]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and export PYTHONHASHSEED.

    Args:
        seed: Value used for every seeding call; stored in the environment
            as its string form.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # fix the random seed

In [None]:
# Root folder that the CSV files and the per-split audio folders are read from.
data_path = '/content'

In [None]:
# Load the three metadata tables (labelled train, test, unlabeled) from data_path.
train_df, test_df, unlabeled_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train_data.csv', 'test_data.csv', 'unlabeled_data.csv')
)

In [None]:
def get_mfcc_feature(df, data_type, save_path):
    """Extract mean MFCC features for every row of `df` and cache them as CSV.

    For each value in ``df['id']`` the file ``<data_path>/<data_type>/<id zero-padded
    to 5 digits>.wav`` is loaded at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCCs are
    computed, and each coefficient is averaged into a single number. The
    resulting ``mfcc_1 .. mfcc_N`` columns are appended to `df` and the combined
    table is written to `save_path`.

    Args:
        df: DataFrame with an ``id`` column identifying the audio files.
        data_type: Name of the sub-folder of ``data_path`` holding the wav files.
        save_path: Destination CSV. If it already exists nothing is computed.

    Returns:
        None. The result is only written to `save_path`.
    """
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # Create the output folder up front so a missing directory cannot discard
    # the result of the (slow) extraction loop at the very end.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Folder holding the wav files of this split
    audio_dir = os.path.join(data_path, data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(audio_dir, str(uid).zfill(5)+'.wav')

        # Load the wav file with librosa
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Extract the MFCCs with librosa
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC coefficient as the feature
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # Append the extracted audio features to the dataframe that was passed in.
    # Reusing df.index keeps the rows aligned in the axis=1 concat even when
    # df does not carry a default 0..n-1 index.
    mfcc_columns = ['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)]
    mfcc_df = pd.DataFrame(features, columns=mfcc_columns, index=df.index)
    combined_df = pd.concat([df, mfcc_df], axis=1)
    combined_df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
!mkdir /content/temp


In [None]:
# Extract (or reuse cached) MFCC features for every split.
mfcc_jobs = (
    (train_df, 'train', 'temp/train_mfcc_data.csv'),
    (test_df, 'test', 'temp/test_mfcc_data.csv'),
    (unlabeled_df, 'unlabeled', 'temp/unlabeled_mfcc_data.csv'),
)
for split_df, split_name, csv_rel_path in mfcc_jobs:
    get_mfcc_feature(split_df, split_name, os.path.join(data_path, csv_rel_path))


  0%|          | 0/3805 [00:00<?, ?it/s]

Done.


  0%|          | 0/5732 [00:00<?, ?it/s]

Done.


In [None]:
# Same call as the last line of the previous cell. get_mfcc_feature returns early
# when the target CSV already exists, so this only does work if that file is missing.
get_mfcc_feature(unlabeled_df, 'unlabeled', os.path.join(data_path, 'temp/unlabeled_mfcc_data.csv'))


  0%|          | 0/1867 [00:00<?, ?it/s]

Done.


In [None]:
# Replace the metadata frames with the cached versions that also carry the mfcc_* columns.
train_df, test_df, unlabeled_df = (
    pd.read_csv(os.path.join(data_path, cached_csv))
    for cached_csv in (
        'temp/train_mfcc_data.csv',
        'temp/test_mfcc_data.csv',
        'temp/unlabeled_mfcc_data.csv',
    )
)

In [None]:
# Target column of the labelled split.
train_y_raw = train_df['covid19']
# Features: everything except the id, the target, mfcc_1 and mfcc_14..mfcc_32.
train_x = train_df.drop(
    columns=['id', 'covid19', 'mfcc_1', *(f'mfcc_{k}' for k in range(14, 33))]
)

def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of `x` with the encoder's indicator columns.

    Args:
        ohe: Fitted encoder exposing ``transform`` and ``categories_``; its
            output must be a dense 2-D array.
        x: DataFrame containing a ``gender`` column. It is not modified.

    Returns:
        A new DataFrame: `x` without ``gender``, followed by one column per
        entry of ``ohe.categories_[0]``, sharing `x`'s index.
    """
    gender_column = x['gender'].values.reshape(-1, 1)
    indicator_df = pd.DataFrame(
        ohe.transform(gender_column),
        columns=ohe.categories_[0],
        index=x.index,
    )
    remaining_df = x.drop(columns=['gender'])
    return pd.concat([remaining_df, indicator_df], axis=1)

# Fit the gender encoder on the training features only and reuse it for every split.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new keyword first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x_raw = onehot_encoding(ohe, train_x)

# Columns removed from the unlabeled/test frames: the id, mfcc_1 and mfcc_14..mfcc_32
# (these frames are not asked to drop a 'covid19' column, unlike the training frame).
unused_columns = ['id', 'mfcc_1']+['mfcc_'+str(x) for x in range(14,33)]

# unlabeled
unlabeled_x = onehot_encoding(ohe, unlabeled_df.drop(columns=unused_columns))

# test
test_x = onehot_encoding(ohe, test_df.drop(columns=unused_columns))

In [None]:
# Two-step training, repeated on the training part of every stratified fold:
#   step 1 - bagged MLPs trained on labelled data assign pseudo-labels to the unlabeled set
#   step 2 - bagged MLPs trained on labelled + pseudo-labelled data score the test set
# The per-fold test probabilities are averaged at the end.
N_BAGS = 10                    # number of undersampled models averaged per step
PSEUDO_LABEL_THRESHOLD = 0.75  # mean probability at or above this becomes pseudo-label 1


def _scale_frame(scaler, frame):
    """Apply a fitted scaler and return a DataFrame keeping `frame`'s columns and index."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


def _bagged_positive_proba(fit_x, fit_y, target_x, model_seed, show_progress=False):
    """Average column 1 of predict_proba over N_BAGS undersampled MLP classifiers.

    Each bag undersamples (fit_x, fit_y) with RandomUnderSampler seeded by the bag
    number, trains an MLPClassifier seeded with `model_seed`, and scores `target_x`.

    Returns:
        1-D array with one averaged probability per row of `target_x`.
    """
    bag_seeds = range(0, N_BAGS)
    if show_progress:
        bag_seeds = tqdm(bag_seeds)

    bag_probs = []
    for bag_seed in bag_seeds:
        # resampling
        sampler = RandomUnderSampler(random_state=bag_seed)
        x_rs, y_rs = sampler.fit_resample(fit_x, fit_y)

        # training
        model = MLPClassifier(random_state=model_seed, hidden_layer_sizes=(12,))
        model.fit(x_rs, y_rs)

        # inference
        bag_probs.append(model.predict_proba(target_x))

    # NOTE(review): column 1 is taken as the positive class - assumes labels are 0/1, confirm.
    return np.mean(np.array(bag_probs)[:,:,1], axis=0)


# Use the notebook-wide seed instead of repeating the literal.
kfold = StratifiedKFold(n_splits=5, random_state=CFG['SEED'], shuffle=True)

test_probs_fold = []

# The held-out part of each fold is never scored in this cell; the split only
# decides which labelled rows each fold's models are trained on.
for train_index, valid_index in kfold.split(train_x_raw, train_y_raw):
    train_x_fold, train_y_fold = train_x_raw.iloc[train_index], train_y_raw.iloc[train_index]

    ### step1 ###
    # scaling (fitted on the labelled training rows of this fold)
    scaler_ = MinMaxScaler()
    scaler_.fit(train_x_fold)
    train_x_sc = _scale_frame(scaler_, train_x_fold)
    unlabeled_x_sc = _scale_frame(scaler_, unlabeled_x)

    # pseudo-labeling of the unlabeled rows
    unlabeled_probs = _bagged_positive_proba(train_x_sc, train_y_fold, unlabeled_x_sc, model_seed=8)
    preds = np.where(unlabeled_probs < PSEUDO_LABEL_THRESHOLD, 0, 1)

    ### step2 ###
    # train + pseudo-labeled
    train_x_fold_2 = pd.concat([train_x_fold, unlabeled_x])
    train_y_fold_2 = pd.concat([train_y_fold, pd.Series(preds, index=unlabeled_x.index)])

    # scaling (refitted on labelled + pseudo-labelled rows)
    scaler_ = MinMaxScaler()
    scaler_.fit(train_x_fold_2)
    train_x_sc_2 = _scale_frame(scaler_, train_x_fold_2)
    test_x_sc = _scale_frame(scaler_, test_x)

    # inference on the test set
    test_probs_fold.append(
        _bagged_positive_proba(train_x_sc_2, train_y_fold_2, test_x_sc, model_seed=7, show_progress=True)
    )

test_probs = np.mean(np.array(test_probs_fold), axis=0)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Hard predictions for the test set at several probability cut-offs
# (a probability below the cut-off maps to 0, otherwise 1).
type_df = test_df
type_probs = test_probs

pred_cutoffs = {
    'pred_50': 0.50,
    'pred_55': 0.55,
    'pred_60': 0.60,
    'pred_65': 0.65,
    'pred_70': 0.70,
    'pred_75': 0.75,
    'pred_80': 0.80,
}

# Per-sample spread of the fold-level probabilities. Taking the std of the
# already fold-averaged 1-D vector would collapse to one scalar repeated on
# every row, so the per-fold predictions are used here.
fold_probs = np.array(test_probs_fold)

result = pd.DataFrame({
              'id':type_df['id'],
              **{column: np.where(type_probs < cutoff, 0, 1) for column, cutoff in pred_cutoffs.items()},
              'uncertainty':np.std(fold_probs, axis=0)
              }, index=type_df.index)

In [None]:
# Fill the sample submission with the predictions at the 0.80 cut-off and save it.
# The column assignment aligns on index labels, so `submission` and `result`
# are expected to share the same row index - TODO confirm row order matches by id.
submission_template_path = os.path.join(data_path, 'sample_submission.csv')
submission_output_path = os.path.join(data_path, 'submit-0711-OOF-8*10-p75-7*10-p80.csv')

submission = pd.read_csv(submission_template_path).assign(covid19=result['pred_80'])
submission.to_csv(submission_output_path, index=False)