# Inference-only code

In [6]:
import os
import gc
import random
import pickle
from itertools import product

import numpy as np
import pandas as pd

import esm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler, Normalizer, RobustScaler, StandardScaler,MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA


from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric

# Expose only CUDA device 0, with devices enumerated in PCI bus order.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Run-wide settings, grouped by the pipeline stage that reads them.
CONFIG = dict(
    n_worker=16,
    # ESM embedding
    epitope_max_len=128,
    antigen_max_len=64,
    epitope_batch_size=50,
    antigen_batch_size=100,
    # Feature extraction (a PCA size of 0 means "skip PCA" for that feature set)
    CT_CTD_features=False,
    CNT_features=False,
    CT_CTD_PCA=0,
    CNT_PCA=0,
    # Tabnet model
    epochs=100,
    patience=20,
    learning_rate=2e-2,
    weight_decay=1e-5,
    threshold=0.5,
    seed=42,
    fold=5,
)

# seed setting
def seed_everything(seed: int):
    """Seed every random number generator this script uses.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic
    mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomization of interpreters started after this
    # point (e.g. worker subprocesses), not the current one.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not just the current one; this is a
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can vary
    # between runs, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(CONFIG['seed'])  # fix the seed

# TabNet hyperparameters; unpacked into TabNetClassifier(**tabnet_params).
tabnet_params = {
    "n_d": 64,   # 8 to 64
    "n_a": 128,  # n_d == n_a is usually a good choice
    "n_steps": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "n_independent": 2,
    "n_shared": 1,
    "optimizer_fn": optim.Adam,
    "optimizer_params": {
        "lr": CONFIG['learning_rate'],
        "weight_decay": CONFIG['weight_decay'],
    },
    "mask_type": "entmax",
    "scheduler_params": {
        "mode": "min",
        "patience": 5,
        "min_lr": 1e-5,
        "factor": 0.9,
    },
    "scheduler_fn": ReduceLROnPlateau,
    "seed": CONFIG["seed"],
    "verbose": 5,
}


In [7]:
def get_preprocessing(data_type, epitope_data, left_anti, right_anti, new_df):
    """Build the TabNet feature matrix (and labels) for one dataframe.

    Each row's feature vector is the flattened concatenation of, in order:
    the epitope embedding, the left-antigen embedding, the right-antigen
    embedding, and the one-hot encoding of the categorical columns.

    The three embedding sequences are ordered by their first element (the id)
    and then paired positionally with the rows of ``new_df``.
    NOTE(review): this assumes ``new_df`` is itself ordered by 'id'; an id
    mismatch is only reported via print, never corrected -- confirm upstream.

    Relies on the module-level fitted encoder ``enc``.

    Args:
        data_type: Split name. Any value other than 'test' also extracts
            the 'label' column.
        epitope_data: Sequence of items where ``item[0]`` is the row id and
            ``item[1]`` is the epitope embedding.
        left_anti: Same layout, for the left antigen context.
        right_anti: Same layout, for the right antigen context.
        new_df: DataFrame with an 'id' column and the categorical feature
            columns (plus 'label' unless ``data_type == 'test'``).

    Returns:
        ``(features, labels)`` where ``features`` is an array with one row
        per dataframe row and ``labels`` is an array of labels, or ``None``
        when ``data_type == 'test'``.

    Raises:
        ValueError: If an embedding sequence does not have exactly one entry
            per dataframe row (``zip`` would otherwise silently drop rows).
    """
    features_cols = ["assay_method_technique", "assay_group", "disease_type", "disease_state", "reference_date", "reference_journal", "reference_title"]
    feature_data = enc.transform(new_df[features_cols]).toarray()

    n_rows = len(new_df)
    for name, embeddings in (("epitope_data", epitope_data),
                             ("left_anti", left_anti),
                             ("right_anti", right_anti)):
        if len(embeddings) != n_rows:
            raise ValueError(
                f"{name} has {len(embeddings)} entries but the dataframe has {n_rows} rows"
            )

    # Sort by id only, on copies: the caller's lists are left untouched and
    # tied ids never fall through to comparing the embedding arrays.
    def by_id(pair):
        return pair[0]

    epitope_sorted = sorted(epitope_data, key=by_id)
    left_sorted = sorted(left_anti, key=by_id)
    right_sorted = sorted(right_anti, key=by_id)

    tab_net_feature_list = []
    rows = zip(new_df['id'], epitope_sorted, left_sorted, right_sorted, feature_data)
    for df_id, epi_embs, left_embs, right_embs, feature in tqdm(rows, total=n_rows):
        if df_id != epi_embs[0]:
            print("Not matched ID")
        # Flatten each part and join them in one pass instead of re-copying
        # the growing vector with repeated np.append calls.
        parts = (epi_embs[1], left_embs[1], right_embs[1], feature)
        tab_net_feature_list.append(np.concatenate([np.ravel(part) for part in parts]))

    label_list = None
    if data_type != 'test':
        label_list = np.array(list(new_df['label']))
    print(f'{data_type} dataframe preprocessing was done.')

    tab_net_feature_list = np.array(tab_net_feature_list)

    return tab_net_feature_list, label_list

In [8]:
# ============= Norm
def norm_transform(datatype, data, scaler_name='z-score', scaler=None):
    """Scale ``data``, fitting a new scaler unless ``datatype`` is "test".

    Args:
        datatype: "test" applies the supplied, already-fitted ``scaler``;
            any other value fits a fresh scaler on ``data``.
        data: Feature matrix to scale.
        scaler_name: Which scaler to fit when not in test mode. One of
            'z-score', 'minmax', 'maxabs', 'robust', 'norm'. Ignored in
            test mode.
        scaler: Fitted scaler exposing ``transform``; required in test mode
            and ignored otherwise.

    Returns:
        In test mode, the transformed data. Otherwise a tuple
        ``(scaled_data, fitted_scaler)``.

    Raises:
        ValueError: If ``datatype == "test"`` and no scaler was supplied.
        KeyError: If ``scaler_name`` is not a known scaler (fit mode only).
    """
    if datatype == "test":
        if scaler is None:
            raise ValueError(
                "norm_transform: a fitted scaler is required when datatype == 'test'"
            )
        return scaler.transform(data)

    # Map names to classes so only the requested scaler is ever constructed.
    scaler_classes = {
        'z-score': StandardScaler,
        'minmax': MinMaxScaler,
        'maxabs': MaxAbsScaler,
        'robust': RobustScaler,
        'norm': Normalizer,
    }
    scaler = scaler_classes[scaler_name]()
    scaled_train = scaler.fit_transform(data)
    return scaled_train, scaler

# ============= pca
def pca_transform(datatype, data, n_comp=300, pca=None):
    """Project ``data`` with PCA, fitting a new PCA unless in test mode.

    Args:
        datatype: "test" applies the supplied, already-fitted ``pca``;
            any other value fits a new PCA on ``data``.
        data: Feature matrix to project.
        n_comp: Number of components to fit. Ignored in test mode, where
            the supplied ``pca`` determines the output width.
        pca: Fitted PCA exposing ``transform``; required in test mode and
            ignored otherwise.

    Returns:
        In test mode, the projected data. Otherwise a tuple
        ``(projected_data, fitted_pca)``.

    Raises:
        ValueError: If ``datatype == "test"`` and no PCA was supplied.
    """
    if datatype == "test":
        if pca is None:
            raise ValueError(
                "pca_transform: a fitted pca is required when datatype == 'test'"
            )
        return pca.transform(data)

    pca = PCA(n_components=n_comp, random_state=CONFIG["seed"])
    pca_train = pca.fit_transform(data)
    print(f"with {n_comp} components, pca variance ratio : {sum(pca.explained_variance_ratio_)}")
    return pca_train, pca

In [9]:
# Input locations. (Earlier local runs used "../dataset/train.csv" and
# "../dataset/test.csv".)
TRAIN_DATA_PATH = "/data/train.csv"
TEST_DATA_PATH = "/data/dataset/test.csv"
EPITOPE_EMB_PATH = "./Embeddings/test_epitope_embeddings.pkl"
LEFT_EMB_PATH = "./Embeddings/test_left_embeddings.pkl"
RIGHT_EMB_PATH = "./Embeddings/test_right_embeddings.pkl"

MODEL_DIR_NAME = "./Tabnet_ESM_models"

# Train split is read only to fit the categorical encoder; test split is
# what gets scored.
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)

# One-hot encode the categorical feature columns, fitted on the train split.
# Categories unseen at fit time are ignored at transform time.
features_cols = ["assay_method_technique", "assay_group", "disease_type", "disease_state", "reference_date", "reference_journal", "reference_title"]
enc = OneHotEncoder(handle_unknown='ignore').fit(df_train[features_cols])

# Load the precomputed ESM embeddings for the test split.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
_embeddings = []
for _path in (EPITOPE_EMB_PATH, LEFT_EMB_PATH, RIGHT_EMB_PATH):
    with open(_path, "rb") as fr:
        _embeddings.append(pickle.load(fr))
epitope_data, left_anti_data, right_anti_data = _embeddings

# Assemble the embedding + one-hot feature matrix (labels are None for test).
test_tab_net_feature_list, label_list = get_preprocessing(
    'test',
    epitope_data,
    left_anti_data,
    right_anti_data,
    df_test,
)

# CT-CTD features ===
# Optional: prepend the scaled (and optionally PCA-reduced) CT-CTD features
# to the embedding feature matrix.
if CONFIG['CT_CTD_features']:
    CT_CTD_DIR = "CT_CTD_Features"
    TEST_CT_CTD_PATH = "test_epitope_antigen_CTCTD.pkl"
    CT_CTD_SCALER = "CTCTD_scaler.pkl"
    CT_CTD_PCA = "CTCTD_pca.pkl"

    test_ct_ctd_path = os.path.join(CT_CTD_DIR, TEST_CT_CTD_PATH)
    scaler_ct_ctd_path = os.path.join(CT_CTD_DIR, CT_CTD_SCALER)
    pca_ct_ctd__path = os.path.join(CT_CTD_DIR, CT_CTD_PCA)

    with open(test_ct_ctd_path, "rb") as fr:
        test_ct_ctd_features = pickle.load(fr)

    # Reuse the scaler that was pickled earlier instead of refitting on test.
    with open(scaler_ct_ctd_path, "rb") as fr:
        scaler = pickle.load(fr)

    test_ct_ctd_features = norm_transform(
        "test", test_ct_ctd_features, scaler_name="minmax", scaler=scaler
    )

    # PCA (a configured size of 0 disables it)
    if CONFIG['CT_CTD_PCA'] != 0:
        n_comps = CONFIG['CT_CTD_PCA']
        before_features = test_ct_ctd_features.shape[1]

        # The fitted PCA is loaded from disk, not refit here.
        with open(pca_ct_ctd__path, "rb") as fr:
            pca = pickle.load(fr)

        test_ct_ctd_features = pca_transform(
            "test", test_ct_ctd_features, n_comp=n_comps, pca=pca
        )
        print(f"CT_CTD feature decomposition finished {before_features} -> {n_comps}")

    test_tab_net_feature_list = np.concatenate(
        [test_ct_ctd_features, test_tab_net_feature_list], axis=1
    )

# CNT features ===
# Optional: prepend the scaled (and optionally PCA-reduced) CNT features to
# the feature matrix built so far.
if CONFIG['CNT_features']:
    TEST_CNT_PATH = "test_epitope_antigen_CNT.pkl"
    CNT_SCALER = "CNT_scaler.pkl"
    CNT_PCA = "CNT_pca.pkl"
    CNT_DIR = "CNT_Features"

    test_cnt_path = os.path.join(CNT_DIR, TEST_CNT_PATH)
    scaler_cnt_path = os.path.join(CNT_DIR, CNT_SCALER)
    pca_cnt_path = os.path.join(CNT_DIR, CNT_PCA)

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(test_cnt_path, "rb") as fr:
        test_cnt_features = pickle.load(fr)

    # Norm: reuse the previously pickled scaler instead of refitting on test.
    with open(scaler_cnt_path, "rb") as fr:
        scaler = pickle.load(fr)

    test_cnt_features = norm_transform("test", test_cnt_features, "minmax", scaler)

    # PCA (a configured size of 0 disables it)
    if CONFIG['CNT_PCA'] != 0:
        n_comps = CONFIG['CNT_PCA']
        before_features = test_cnt_features.shape[1]

        with open(pca_cnt_path, "rb") as fr:
            pca = pickle.load(fr)

        test_cnt_features = pca_transform("test", test_cnt_features, n_comps, pca)
        # before_features is already the column count (an int); calling
        # .shape on it, as the old code did, raises AttributeError.
        print(f"CNT feature decomposition finished {before_features} -> {n_comps}")

    test_tab_net_feature_list = np.concatenate((test_cnt_features, test_tab_net_feature_list), axis=1)


# Sum the per-fold predict_proba outputs (two columns, one per class), then
# average. Despite the name, preds_logits holds predict_proba outputs.
preds_logits = np.zeros((len(df_test), 2))

for fold in range(CONFIG['fold']):
    model_path = os.path.join(MODEL_DIR_NAME, f"tabnet_esm_model_fold{fold+1}.zip")
    tabnet_model = TabNetClassifier(**tabnet_params)
    tabnet_model.load_model(model_path)

    preds_logits += tabnet_model.predict_proba(test_tab_net_feature_list)

# Divide by the number of folds actually summed rather than a hard-coded 5,
# so the average stays correct when CONFIG['fold'] changes.
preds_logits /= CONFIG['fold']

# Column 1 is treated as the positive-class score and thresholded.
preds = preds_logits[:, 1]
pred_label = np.where(preds > CONFIG['threshold'], 1, 0)
uni, cnt = np.unique(pred_label, return_counts=True)
print(f"Predict Label Dist : {cnt}")

# Fill the sample submission's label column and write the result.
submit = pd.read_csv('/data/sample_submission.csv')
submit['label'] = pred_label
submit.to_csv('./tabnet_submit.csv', index=False)

0it [00:00, ?it/s]

test dataframe preprocessing was done.
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Predict Label Dist : [107563  13381]
