# import

In [1]:
import os
import gc
import random
import math
import time
import datetime
import shutil

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import log_loss

import category_encoders as ce
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import common as com
import pth_modeling
import pth_preprocessing
import pth_model
from config import IO_CFG
from config import MODEL_CFG

# Setting

In [2]:
############################################################################
# Fix the global seed (42) through the project helper
############################################################################
com.seed_everything(42)
############################################################################
# Configuration comes from the `config` module; only the logger is built here
############################################################################
# One log file per calendar day, written under the input root.
log_folder = f"{IO_CFG.input_root}/{datetime.date.today()}.log"
logger = com.setup_logger(log_folder, '00_baseline_MLP.ipynb')

In [3]:
############################################################################
# I/O locations
############################################################################
# Where the competition CSVs are read from.
INPUT_ROOT = IO_CFG.input_root
# Where artifacts of this run are written; checkpoints go in a sub-folder.
OUTPUT_ROOT = IO_CFG.output_root
MODEL_DIR = f'{OUTPUT_ROOT}/models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Keep a copy of the config file next to the results.
shutil.copy('./config.yaml', OUTPUT_ROOT)

'/media/hiroki/working/kaggle/Mechanisms-of-Action-Prediction/result/MLP/baseline/config.yaml'

# Data loading

In [4]:
# Read every competition CSV from INPUT_ROOT, in the same order as the names on the left.
(train_features,
 train_targets_scored,
 train_targets_nonscored,
 test_features,
 submission) = (
    pd.read_csv(f'{INPUT_ROOT}/{stem}.csv')
    for stem in ('train_features',
                 'train_targets_scored',
                 'train_targets_nonscored',
                 'test_features',
                 'sample_submission')
)

In [5]:
# ref: https://www.kaggle.com/c/lish-moa/discussion/180165
# Sanity check: count the positive labels per `cp_type`; the reference above
# says 'ctl_vehicle' rows should have none.
train = pd.merge(train_features, train_targets_scored, on='sig_id')
target_cols = [c for c in train_targets_scored.columns if c != 'sig_id']
cols = target_cols + ['cp_type']
train[cols].groupby('cp_type').sum().sum(axis=1)

cp_type
ctl_vehicle        0
trt_cp         16844
dtype: int64

In [6]:
# Construct train & test without the 'cp_type' == 'ctl_vehicle' rows
print(train_features.shape, test_features.shape)
is_treated_train = train['cp_type'] != 'ctl_vehicle'
is_treated_test = test_features['cp_type'] != 'ctl_vehicle'
# reset_index so that row labels equal row positions in the filtered frames
train = train.loc[is_treated_train].reset_index(drop=True)
test = test_features.loc[is_treated_test].reset_index(drop=True)
print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3624, 876)


# CV split

In [7]:
# Assign every training row to one of 5 multilabel-stratified folds.
folds = train.copy()
Fold = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_of_row = np.zeros(len(folds), dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[target_cols])):
    # val_index holds the row positions validated in fold n
    fold_of_row[val_index] = n
folds['fold'] = fold_of_row
print(folds.shape)

(21948, 1083)


# Dataset

In [8]:
# Categorical inputs are listed by hand; numeric inputs are every non-object
# column that is neither categorical nor a target.
cat_features = ['cp_time', 'cp_dose']
num_features = [
    c for c, dtype in train.dtypes.items()
    if dtype != 'object' and c not in cat_features and c not in target_cols
]
# Label matrix, taken before the target columns of `train` are touched again.
target = train[target_cols].values

def cate2num(df):
    """Return a copy of ``df`` with ``cp_time`` and ``cp_dose`` integer-encoded.

    ``cp_time`` 24/48/72 becomes 0/1/2 and ``cp_dose`` 'D1'/'D2' becomes 3/4.
    Values outside these mappings become NaN (``Series.map`` semantics), which
    is also what happens if an already-encoded frame is passed in again.

    The input frame is left untouched; callers must use the returned frame.
    """
    return df.assign(
        cp_time=df['cp_time'].map({24: 0, 48: 1, 72: 2}),
        cp_dose=df['cp_dose'].map({'D1': 3, 'D2': 4}),
    )

# Encode the categorical columns of both frames. Run this once per fresh
# load: encoding an already-encoded frame maps every value to NaN.
train, test = cate2num(train), cate2num(test)

In [9]:
len(num_features)

872

In [10]:
len(target_cols)

206

# Model

## define

In [11]:
def run_single_nn(cfg, train, test, folds, num_features, cat_features, target, device, fold_num=0, seed=42):
    """Train the tabular NN on one CV fold and predict the test set with its best epoch.

    Rows whose ``folds['fold']`` equals ``fold_num`` form the validation set;
    every other row is used for training. After each epoch the validation
    loss is compared with the best one so far; on improvement the model
    weights are saved and the validation predictions are stored as this
    fold's out-of-fold (OOF) predictions. The saved weights are then reloaded
    into a fresh model to predict ``test``.

    Parameters
    ----------
    cfg
        Model/training config. This function reads ``batch_size``, ``lr``,
        ``weight_decay``, ``epochs`` and ``target_cols`` (used as the number
        of output columns) and passes ``cfg`` to ``pth_model.TabularNN``.
    train, test : pandas.DataFrame
        Feature frames. ``train`` is selected with labels taken from
        ``folds.index`` while ``target`` is indexed with the same values as
        positions, so ``train`` and ``folds`` are assumed to share a
        0..n-1 index.
    folds : pandas.DataFrame
        Must contain a ``'fold'`` column.
    num_features, cat_features : list of str
        Column names forwarded to the dataset classes.
    target : numpy.ndarray
        Label matrix with one row per row of ``train``.
    device : torch.device
        Device the model is moved to.
    fold_num : int
        Fold used for validation.
    seed : int
        Seed passed to ``com.seed_everything``; also part of the checkpoint name.

    Returns
    -------
    oof : numpy.ndarray
        Shape ``(len(train), cfg.target_cols)``; zero everywhere except the
        validation rows, which hold the best epoch's predictions.
    predictions
        Whatever ``pth_modeling.inference_fn`` returns for the test loader.

    Raises
    ------
    RuntimeError
        If no epoch improved on the initial best loss (zero epochs, or a
        validation loss that is never a finite improvement), because then
        no checkpoint of this run exists to predict with.
    """
    # Set seed
    logger.info(f'Set seed {seed}')
    com.seed_everything(seed=seed)

    # loader
    trn_idx = folds[folds['fold'] != fold_num].index
    val_idx = folds[folds['fold'] == fold_num].index
    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)
    train_target = target[trn_idx]
    valid_target = target[val_idx]
    train_dataset = pth_preprocessing.TrainDataset(train_folds, num_features, cat_features, train_target)
    valid_dataset = pth_preprocessing.TrainDataset(valid_folds, num_features, cat_features, valid_target)
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True,
                              num_workers=4, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=False,
                              num_workers=4, pin_memory=True, drop_last=False)

    # model
    model = pth_model.TabularNN(cfg)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    # NOTE(review): OneCycleLR sets the optimizer's learning rate from max_lr /
    # div_factor, so cfg.lr given to Adam above looks overridden - confirm intent.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=cfg.epochs, steps_per_epoch=len(train_loader))

    # One checkpoint per (fold, seed); written on improvement, read back for inference.
    checkpoint_path = f"{IO_CFG.output_root}/models/fold{fold_num}_seed{seed}.pth"

    # train & validate
    best_loss = np.inf
    # Allocated once: only the validation rows are ever written, so overwriting
    # them on each improvement leaves the other rows at zero.
    oof = np.zeros((len(train), cfg.target_cols))
    checkpoint_saved = False
    for epoch in range(cfg.epochs):
        pth_modeling.train_fn(train_loader, model, optimizer, epoch, scheduler, device)
        valid_loss, val_preds = pth_modeling.validate_fn(valid_loader, model, device)
        if valid_loss < best_loss:
            logger.info(f'epoch{epoch} save best model... {valid_loss}')
            best_loss = valid_loss
            oof[val_idx] = val_preds
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

    if not checkpoint_saved:
        # Without this guard a stale checkpoint from an earlier run could be
        # loaded silently, or loading would fail with an unrelated error.
        raise RuntimeError(
            f'fold {fold_num} seed {seed}: no epoch improved the validation loss '
            f'(epochs={cfg.epochs}); no checkpoint was written'
        )

    # predictions
    test_dataset = pth_preprocessing.TestDataset(test, num_features, cat_features)
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False,
                             num_workers=4, pin_memory=True)
    model = pth_model.TabularNN(cfg)
    # map_location keeps loading valid when the weights were saved from another device
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    predictions = pth_modeling.inference_fn(test_loader, model, device)

    # del
    torch.cuda.empty_cache()

    return oof, predictions

In [12]:
def run_kfold_nn(cfg, train, test, folds, num_features, cat_features, target, device, n_fold=5, seed=42):
    """Run ``run_single_nn`` on folds ``0 .. n_fold-1`` and combine the results.

    Parameters
    ----------
    cfg
        Config forwarded to ``run_single_nn``; ``cfg.target_cols`` is used
        here as the number of output columns.
    train, test, folds, num_features, cat_features, target, device
        Forwarded unchanged to ``run_single_nn``.
    n_fold : int
        Number of folds to run; should match the number of distinct values
        in ``folds['fold']``, otherwise some OOF rows stay at zero.
    seed : int
        Seed forwarded to every fold.

    Returns
    -------
    oof : numpy.ndarray
        Sum of the per-fold OOF matrices (each row is written by one fold).
    predictions : numpy.ndarray
        Mean of the per-fold test predictions.
    """
    oof = np.zeros((len(train), cfg.target_cols))
    predictions = np.zeros((len(test), cfg.target_cols))

    for _fold in range(n_fold):
        logger.info("Fold {}".format(_fold))
        _oof, _predictions = run_single_nn(cfg,
                                           train,
                                           test,
                                           folds,
                                           num_features,
                                           cat_features,
                                           target,
                                           device,
                                           fold_num=_fold,
                                           seed=seed)
        oof += _oof
        predictions += _predictions / n_fold

    # CV score: mean over target columns of the per-column binary log loss.
    n_targets = target.shape[1]
    score = 0
    for i in range(n_targets):
        # labels=[0, 1] keeps log_loss defined when a column holds a single
        # class (e.g. a debugging subset); it raises ValueError otherwise.
        _score = log_loss(target[:, i], oof[:, i], labels=[0, 1])
        score += _score / n_targets
    logger.info(f"CV score: {score}")

    return oof, predictions

## run

In [13]:
# Seed averaging: repeat the full K-fold run for several seeds and average
# both the OOF predictions and the test predictions.
SEED = [0, 1, 2]
oof = np.zeros((len(train), MODEL_CFG.target_cols))
predictions = np.zeros((len(test), MODEL_CFG.target_cols))

for seed in SEED:
    _oof, _predictions = run_kfold_nn(MODEL_CFG,
                                      train, test, folds,
                                      num_features, cat_features, target,
                                      device,
                                      n_fold=5, seed=seed)
    oof += _oof / len(SEED)
    predictions += _predictions / len(SEED)

# Mean column-wise log loss of the seed-averaged OOF predictions.
score = 0
for i in range(target.shape[1]):
    # labels=[0, 1] keeps log_loss defined when a column holds a single class;
    # it raises ValueError otherwise.
    _score = log_loss(target[:,i], oof[:,i], labels=[0, 1])
    score += _score / target.shape[1]
logger.info(f"Seed Averaged CV score: {score}")

2020-10-19 17:44:23,524 - 00_baseline_MLP.ipynb - INFO - Fold 0
2020-10-19 17:44:23,525 - 00_baseline_MLP.ipynb - INFO - Set seed 0
2020-10-19 17:44:27,700 - 00_baseline_MLP.ipynb - INFO - epoch0 save best model... 0.020932637995155635
2020-10-19 17:44:30,268 - 00_baseline_MLP.ipynb - INFO - epoch1 save best model... 0.018820307481024152
2020-10-19 17:44:33,127 - 00_baseline_MLP.ipynb - INFO - epoch2 save best model... 0.018154521185081613
2020-10-19 17:44:36,082 - 00_baseline_MLP.ipynb - INFO - epoch3 save best model... 0.017682247374690475
2020-10-19 17:44:41,869 - 00_baseline_MLP.ipynb - INFO - epoch5 save best model... 0.017219796780165886
2020-10-19 17:44:54,677 - 00_baseline_MLP.ipynb - INFO - epoch10 save best model... 0.017118015102532975
2020-10-19 17:44:57,287 - 00_baseline_MLP.ipynb - INFO - epoch11 save best model... 0.01694158597261664
2020-10-19 17:45:02,624 - 00_baseline_MLP.ipynb - INFO - epoch13 save best model... 0.01666934386124847
2020-10-19 17:45:05,334 - 00_baseli

2020-10-19 17:48:53,835 - 00_baseline_MLP.ipynb - INFO - epoch14 save best model... 0.016680076671859743
2020-10-19 17:48:56,644 - 00_baseline_MLP.ipynb - INFO - epoch15 save best model... 0.01657240781856679
2020-10-19 17:48:59,403 - 00_baseline_MLP.ipynb - INFO - epoch16 save best model... 0.016366834597571697
2020-10-19 17:49:08,613 - 00_baseline_MLP.ipynb - INFO - CV score: 0.016228884300074107
2020-10-19 17:49:08,651 - 00_baseline_MLP.ipynb - INFO - Fold 0
2020-10-19 17:49:08,652 - 00_baseline_MLP.ipynb - INFO - Set seed 1
2020-10-19 17:49:11,539 - 00_baseline_MLP.ipynb - INFO - epoch0 save best model... 0.022060864399227574
2020-10-19 17:49:14,287 - 00_baseline_MLP.ipynb - INFO - epoch1 save best model... 0.018940565119028634
2020-10-19 17:49:17,071 - 00_baseline_MLP.ipynb - INFO - epoch2 save best model... 0.01801903213041247
2020-10-19 17:49:19,842 - 00_baseline_MLP.ipynb - INFO - epoch3 save best model... 0.017611182314727736
2020-10-19 17:49:22,617 - 00_baseline_MLP.ipynb - I

2020-10-19 17:53:21,549 - 00_baseline_MLP.ipynb - INFO - epoch10 save best model... 0.01716847196887643
2020-10-19 17:53:24,298 - 00_baseline_MLP.ipynb - INFO - epoch11 save best model... 0.017034428360313686
2020-10-19 17:53:27,010 - 00_baseline_MLP.ipynb - INFO - epoch12 save best model... 0.016970418482309298
2020-10-19 17:53:29,827 - 00_baseline_MLP.ipynb - INFO - epoch13 save best model... 0.016785129146150927
2020-10-19 17:53:32,623 - 00_baseline_MLP.ipynb - INFO - epoch14 save best model... 0.016588811026739692
2020-10-19 17:53:35,409 - 00_baseline_MLP.ipynb - INFO - epoch15 save best model... 0.016460155700472403
2020-10-19 17:53:38,165 - 00_baseline_MLP.ipynb - INFO - epoch16 save best model... 0.016441623265028515
2020-10-19 17:53:40,810 - 00_baseline_MLP.ipynb - INFO - epoch17 save best model... 0.016409069505951134
2020-10-19 17:53:47,173 - 00_baseline_MLP.ipynb - INFO - CV score: 0.016229429212447608
2020-10-19 17:53:47,204 - 00_baseline_MLP.ipynb - INFO - Fold 0
2020-10-1

2020-10-19 17:57:46,354 - 00_baseline_MLP.ipynb - INFO - epoch9 save best model... 0.01722458765032922
2020-10-19 17:57:51,787 - 00_baseline_MLP.ipynb - INFO - epoch11 save best model... 0.016949386746930518
2020-10-19 17:57:57,503 - 00_baseline_MLP.ipynb - INFO - epoch13 save best model... 0.016758754582917347
2020-10-19 17:58:00,187 - 00_baseline_MLP.ipynb - INFO - epoch14 save best model... 0.016631022515613565
2020-10-19 17:58:03,006 - 00_baseline_MLP.ipynb - INFO - epoch15 save best model... 0.01649507479926779
2020-10-19 17:58:06,049 - 00_baseline_MLP.ipynb - INFO - epoch16 save best model... 0.01648749009060796
2020-10-19 17:58:09,930 - 00_baseline_MLP.ipynb - INFO - epoch17 save best model... 0.016433175660009455
2020-10-19 17:58:15,913 - 00_baseline_MLP.ipynb - INFO - epoch19 save best model... 0.01643272296424321
2020-10-19 17:58:16,857 - 00_baseline_MLP.ipynb - INFO - CV score: 0.016232027489009572
2020-10-19 17:58:17,508 - 00_baseline_MLP.ipynb - INFO - Seed Averaged CV sco

In [14]:
# Store the predictions in the frames and dump them to CSV.
# NOTE: this overwrites the label columns of `train` with OOF predictions;
# the true labels remain in `target` and `train_targets_scored`.
for frame, values, csv_name in ((train, oof, 'oof.csv'),
                                (test, predictions, 'pred.csv')):
    frame[target_cols] = values
    frame[['sig_id'] + target_cols].to_csv(csv_name, index=False)

In [15]:
# Display `train`; after the previous cell its target columns hold the OOF predictions.
train

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,trt_cp,0,3,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,...,0.000126,0.000046,0.001006,0.000905,0.001055,0.000689,0.000051,0.001580,0.000007,0.002340
1,id_000779bfc,trt_cp,2,3,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,...,0.000803,0.001329,0.003145,0.001345,0.002158,0.000278,0.000299,0.001895,0.001350,0.003683
2,id_000a6266a,trt_cp,1,3,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,...,0.000179,0.000990,0.002546,0.000450,0.003742,0.001094,0.037674,0.001645,0.000206,0.001933
3,id_0015fd391,trt_cp,1,3,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,...,0.000410,0.022553,0.001278,0.461837,0.004621,0.000918,0.000066,0.000649,0.000008,0.000191
4,id_001626bd3,trt_cp,2,4,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,...,0.000627,0.000544,0.002267,0.000154,0.000825,0.001018,0.000072,0.001214,0.000089,0.002249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,trt_cp,2,3,0.1608,-1.0500,0.2551,-0.2239,-0.2431,0.4256,...,0.000230,0.000062,0.001756,0.000134,0.000339,0.000605,0.001395,0.001073,0.000845,0.001124
21944,id_fffb1ceed,trt_cp,0,4,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,...,0.000406,0.000128,0.001636,0.000537,0.002578,0.000363,0.000819,0.000912,0.001213,0.001476
21945,id_fffb70c0c,trt_cp,0,4,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,...,0.000091,0.002020,0.000281,0.000042,0.029316,0.000417,0.005079,0.000670,0.027484,0.002169
21946,id_fffcb9e7c,trt_cp,0,3,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,...,0.000034,0.000018,0.000190,0.000004,0.000471,0.000020,0.000078,0.000066,0.000115,0.001728


In [16]:
# Final result with 'cp_type'=='ctl_vehicle' data
result = train_targets_scored.drop(columns=target_cols)\
            .merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
y_true = train_targets_scored[target_cols].values
y_pred = result[target_cols].values
score = 0
for i in range(y_true.shape[1]):
    _score = log_loss(y_true[:,i], y_pred[:,i])
    score += _score / y_true.shape[1]
logger.info(f"Final result: {score}")

2020-10-19 17:58:30,673 - 00_baseline_MLP.ipynb - INFO - Final result: 0.01481483620880397


# Submit

In [17]:
# Build the submission: keep the ids of the sample submission, attach the test
# predictions, and fill rows without a prediction (dropped ctl_vehicle rows) with 0.
test_preds = test[['sig_id'] + target_cols]
sub = (
    submission
    .drop(columns=target_cols)
    .merge(test_preds, on='sig_id', how='left')
    .fillna(0)
)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000808,0.00131,0.001778,0.011422,0.030831,0.005141,0.001142,0.007376,8.3e-05,...,0.000513,0.000799,0.003594,0.000181,0.000342,0.000596,0.000237,0.001667,0.003253,0.001222
1,id_001897cda,0.000343,0.000667,0.001801,0.00143,0.001374,0.001664,0.007242,0.009013,0.011787,...,0.000658,0.000381,0.005166,8.9e-05,0.00919,0.000389,0.004172,0.000672,0.001604,0.003016
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.001066,0.000535,0.002702,0.017918,0.015066,0.004992,0.003166,0.003499,0.000127,...,0.000706,0.000458,0.003461,0.02783,0.009553,0.000523,0.001117,0.003328,0.000178,0.003792
4,id_0027f1083,0.001917,0.002952,0.002145,0.016164,0.021451,0.00551,0.00735,0.001573,0.000211,...,0.000886,0.000392,0.009714,0.006268,0.001083,0.000951,0.000708,0.001945,9.5e-05,0.00122
