In [None]:
!pip install pytorch-tabnet

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import librosa

from tqdm.auto import tqdm
from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
# Notebook-wide configuration. SEED is the value handed to seed_everything.
# SR and N_MFCC presumably are the sampling rate and MFCC count used to build
# the feature CSVs (they match the "(16000,39)" suffix of the file names) -
# TODO confirm.
CFG = dict(SR=16000, N_MFCC=39, SEED=1209)

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and, when present, all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for Python processes started afterwards; the hash seed of the
    # already running interpreter is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # The model is trained with PyTorch, so its generators need seeding too.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable

seed_everything(CFG['SEED']) # Fix the random seed for reproducibility

In [None]:
# Load the training data that combines the wav files' MFCC features with the
# status information.
train_csv_path = '/content/drive/MyDrive/dacon_covid/train_mfcc_data(16000,39).csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
def onehot_encoding(ohe, x):
    """One-hot encode the 'gender' column of ``x`` with an already fitted encoder.

    Args:
        ohe: Encoder fitted on the training data; must provide ``transform``
            and ``categories_``.
        x: DataFrame that contains a 'gender' column.

    Returns:
        A new DataFrame made of ``x`` without 'gender', followed by one column
        per category in ``ohe.categories_[0]``. The index of ``x`` is kept.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1, 1))
    # An encoder configured for sparse output returns a sparse matrix; densify
    # it so the DataFrame constructor below receives a plain 2-D array.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # Reuse x's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and add NaN rows) for any x whose index
    # is not 0..n-1, e.g. after filtering or a train/validation split.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

In [None]:
# The 'gender' column needs extra preprocessing -> apply a OneHotEncoder.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older releases (where the new keyword raises TypeError).
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(train_df['gender'].values.reshape(-1,1))
train_df = onehot_encoding(ohe, train_df)

# Remove the 'id' and 'mfcc_1' columns from the training table.
train_df = train_df.drop(columns=['id', 'mfcc_1'])

# TabNet

In [None]:
# Names of the columns that are handed to TabNet as categorical features.
cat_col = ['respiratory_condition', 'fever_or_muscle_pain', 'female', 'male', 'other']
# Every column of train_df except the 'covid19' target is a model input.
features = [name for name in train_df.columns if name != 'covid19']
# Positions of the categorical columns inside `features`.
cat_idxs = [pos for pos, name in enumerate(features) if name in cat_col]
# One entry per name in cat_col, each declared with 2 distinct values.
# NOTE(review): this lines up with cat_idxs only if every name in cat_col is
# actually present in train_df - confirm.
cat_dims = [2] * len(cat_col)

In [None]:
# Split the training table into the model inputs and the 'covid19' target,
# then expose both as arrays for TabNet.
feature_frame = train_df[features]
target_series = train_df['covid19']
x_train = feature_frame.values
y_train = target_series.values

In [None]:
# TabNet classifier: the columns at cat_idxs are declared categorical (with
# cat_dims values each, embedded into cat_emb_dim dimensions); the optimizer is
# Adam at lr=2e-2 with a StepLR schedule (x0.9 every 20 steps).
clf = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    momentum=0.2,
    scheduler_params={"step_size": 20, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    # Take the seed from CFG (same value, 1209) instead of repeating the
    # literal, so the model and seed_everything cannot drift apart.
    seed=CFG['SEED'],
)

Device used : cpu


In [None]:
max_epochs = 150

# Training options, collected in one place and unpacked into clf.fit below.
# NOTE(review): the only evaluation set is the training data itself, so the
# 'auc' monitored for early stopping (patience=10) is a training AUC - confirm
# this is intended.
fit_options = dict(
    X_train=x_train,
    y_train=y_train,
    eval_set=[(x_train, y_train)],
    eval_name=['train'],
    eval_metric=['auc'],
    max_epochs=max_epochs,
    patience=10,
    batch_size=512,
    virtual_batch_size=128,
    drop_last=True,
)
clf.fit(**fit_options)

In [None]:
# Apply the same preprocessing to the test data as was applied to the training
# data above.
test_csv_path = '/content/drive/MyDrive/dacon_covid/test_mfcc_data(16000,39).csv'
test_x = pd.read_csv(test_csv_path).drop(columns=['id', 'mfcc_1'])
# To avoid data leakage, use the encoder (ohe) that was fitted on the train
# data only.
test_x = onehot_encoding(ohe, test_x)

# Show the first rows of the preprocessed test table.
test_x.head()

In [None]:
# Probability above which a sample is labelled 1; everything else is labelled 0.
THRESHOLD = 0.1

# Select the columns by name so the matrix has exactly the feature order used
# for training, whatever the column order (or extra columns) of the test CSV.
pred = clf.predict_proba(test_x[features].values)

# Column 1 of `pred` is the probability of class index 1. Compare every row at
# once and convert to a plain list of Python ints (1 / 0).
predicted_class = (pred[:, 1] > THRESHOLD).astype(int).tolist()

In [None]:
# Write the predictions into the sample submission's 'covid19' column and save
# the result as a new CSV without the index column.
sample_submission_path = '/content/drive/MyDrive/dacon_covid/sample_submission.csv'
submission_output_path = '/content/drive/MyDrive/dacon_covid/submit_tabnet.csv'

submission = pd.read_csv(sample_submission_path)
submission['covid19'] = predicted_class
submission.to_csv(submission_output_path, index=False)