In [1]:
import sys

sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
sys.path.append('../input/moa-mlp-maxout/modules')

# Lib

In [2]:
import os
import gc
import random
import math
import time
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import log_loss

import category_encoders as ce

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import common as com
from MLP_Dataset import TrainDataset, TestDataset
from MLP_Model import FCBlock, TabularNN
from trainer import train_fn, validate_fn, inference_fn, AverageMeter
from config import CFG

# util

In [3]:
out_base_path = "./"
model_path = "../input/moa-mlp-maxout/max_out_model/model"

In [4]:
logger = com.get_logger(out_base_path)
com.seed_everything(seed=42)

# loading

In [5]:
base_dir = '/kaggle/input'
#os.listdir('../input/lish-moa')
os.listdir(base_dir)

['lish-moa', 'iterative-stratification', 'moa-mlp-maxout']

In [6]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [7]:
train = train_features.merge(train_targets_scored, on='sig_id')
target_cols = [c for c in train_targets_scored.columns if c not in ['sig_id']]
cols = target_cols + ['cp_type']
train[cols].groupby('cp_type').sum().sum(1)

cp_type
ctl_vehicle        0
trt_cp         16844
dtype: int64

In [8]:
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3624, 876)


# split

In [9]:
folds = train.copy()
Fold = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[target_cols])):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
print(folds.shape)

(21948, 1083)


In [10]:
cat_features = ['cp_time', 'cp_dose']
num_features = [c for c in train.columns if train.dtypes[c] != 'object']
num_features = [c for c in num_features if c not in cat_features]
num_features = [c for c in num_features if c not in target_cols]
target = train[target_cols].values

def cate2num(df):
    df['cp_time'] = df['cp_time'].map({24: 0, 48: 1, 72: 2})
    df['cp_dose'] = df['cp_dose'].map({'D1': 3, 'D2': 4})
    return df

train = cate2num(train)
test = cate2num(test)

# MODEL

In [13]:
def run_single_nn(cfg, train, test, folds, num_features, cat_features, target, target_cols, device, model_path, fold_num=0, seed=42):
    oof = None
    log_df = None
    # Set seed
    logger.info(f'Set seed {seed}')
    com.seed_everything(seed=seed)
    
    # loader
    #trn_idx = folds[folds['fold'] != fold_num].index
    #val_idx = folds[folds['fold'] == fold_num].index
    #train_folds = train.loc[trn_idx].reset_index(drop=True)
    #valid_folds = train.loc[val_idx].reset_index(drop=True)
    #train_target = target[trn_idx]
    #valid_target = target[val_idx]
    #train_dataset = TrainDataset(train_folds, num_features, cat_features, train_target)
    #valid_dataset = TrainDataset(valid_folds, num_features, cat_features, valid_target)
    #train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, 
    #                          num_workers=4, pin_memory=True, drop_last=True)
    #valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=False, 
    #                          num_workers=4, pin_memory=True, drop_last=False)

    # model
    model = TabularNN(cfg, num_features, target_cols)
    model.to(device)
    #optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    #scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
    #                                          max_lr=1e-2, epochs=cfg.epochs, steps_per_epoch=len(train_loader))

    # log
    #log_df = pd.DataFrame(columns=(['EPOCH']+['TRAIN_LOSS']+['VALID_LOSS']) )

    # train & validate
    #best_loss = np.inf
    #early_stopping_cnt = 0
    #for epoch in range(cfg.epochs):
    #    train_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, device)
    #    valid_loss, val_preds = validate_fn(valid_loader, model, device)
    #    log_row = {'EPOCH': epoch, 
    #               'TRAIN_LOSS': train_loss,
    #               'VALID_LOSS': valid_loss,
    #              }
    #    log_df = log_df.append(pd.DataFrame(log_row, index=[0]), sort=False)
    #    #logger.info(log_df.tail(1))
    #    if valid_loss < best_loss:
    #        logger.info(f'epoch{epoch} save best model... tr_loss:{train_loss}, val_loss{valid_loss}')
    #        best_loss = valid_loss
    #        oof = np.zeros((len(train), len(target_cols)))
    #        oof[val_idx] = val_preds
    #        torch.save(model.state_dict(), model_path + f"/fold{fold_num}_seed{seed}.pth")
    #        best_epoch = epoch
    #        early_stopping_cnt = 0
    #    else:
    #        early_stopping_cnt += 1
    #        if early_stopping_cnt == cfg.early_stopping_rounds:
    #            logger.info(f'best epoch: epoch{best_epoch}')
    #            break

    # predictions
    test_dataset = TestDataset(test, num_features, cat_features)
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False, 
                             num_workers=4, pin_memory=True)
    model = TabularNN(cfg, num_features, target_cols)
    model.load_state_dict(torch.load(model_path + f"/fold{fold_num}_seed{seed}.pth"))
    model.to(device)
    predictions = inference_fn(test_loader, model, device)
    
    # del
    torch.cuda.empty_cache()

    return oof, predictions, log_df


def run_kfold_nn(cfg, train, test, folds, num_features, cat_features, target, target_cols, device, model_path, n_fold=5, seed=42):

    #oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    log_dfs = []
    for _fold in range(n_fold):
        logger.info("Fold {}".format(_fold))
        _oof, _predictions, log_df = run_single_nn(cfg,
                                                   train,
                                                   test,
                                                   folds,
                                                   num_features, 
                                                   cat_features,
                                                   target,
                                                   target_cols,
                                                   device,
                                                   model_path,
                                                   fold_num=_fold,
                                                   seed=seed)
        #oof += _oof
        predictions += _predictions / n_fold
        #log_dfs.append(log_df)

    score = 0
    for i in range(target.shape[1]):
        _score = log_loss(target[:,i], oof[:,i])
        score += _score / target.shape[1]
    #logger.info(f"CV score: {score}")
    
    return oof, predictions, log_dfs

In [14]:
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

#SEED = [0, 1, 2]
#for seed in SEED:
oof, predictions, log_dfs = run_kfold_nn(CFG,
                                         train, test, folds, 
                                         num_features, cat_features, target, target_cols,
                                         device,
                                         model_path=model_path,
                                         n_fold=5, seed=42)
#score = 0
#for i in range(target.shape[1]):
#    _score = log_loss(target[:,i], oof[:,i])
#    score += _score / target.shape[1]
#logger.info(f"Seed Averaged CV score: {score}")

Fold 0
Set seed 42
Fold 1
Set seed 42
Fold 2
Set seed 42
Fold 3
Set seed 42
Fold 4
Set seed 42
CV score: 0.12867376499702737


In [15]:
test[target_cols] = predictions
test[['sig_id']+target_cols].to_csv(out_base_path + '/pred.csv', index=False)

In [None]:
# Final result with 'cp_type'=='ctl_vehicle' data
#result = train_targets_scored.drop(columns=target_cols)\
#            .merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
#y_true = train_targets_scored[target_cols].values
#y_pred = result[target_cols].values
#score = 0
#for i in range(y_true.shape[1]):
#    _score = log_loss(y_true[:,i], y_pred[:,i])
#    score += _score / y_true.shape[1]
#logger.info(f"Final result: {score}")

# sub

In [16]:
sub = submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001106,0.002928,0.002517,0.020588,0.030557,0.00309,0.001616,0.008007,0.000451,...,0.003321,0.001241,0.005976,0.000701,0.00089,0.000914,0.000936,0.003557,0.004832,0.002749
1,id_001897cda,0.000409,0.00088,0.001383,0.001295,0.002897,0.003593,0.002744,0.015397,0.00307,...,0.000584,0.00094,0.002385,0.000442,0.009371,0.000428,0.005754,0.000531,0.000833,0.002043
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000812,0.000951,0.002484,0.00974,0.006423,0.008106,0.0038,0.002179,0.000648,...,0.000667,0.00083,0.013751,0.040342,0.013643,0.000697,0.002475,0.002737,0.00076,0.00561
4,id_0027f1083,0.001964,0.004205,0.003759,0.022235,0.038166,0.004897,0.012973,0.00322,0.000665,...,0.001348,0.001004,0.014565,0.001652,0.0011,0.001265,0.0022,0.002624,0.000682,0.002128
