# About this notebook

- PyTorch NN starter code
- MultilabelStratifiedKFold 5 folds 

If this notebook is helpful, feel free to upvote :)

# Library

In [1]:
#import sys
#sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
import os
import gc
import random
import math
import time
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import log_loss

import category_encoders as ce

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import common as com
from MLP_Dataset import TrainDataset, TestDataset
from MLP_Model import FCBlock, TabularNN
from trainer import train_fn, validate_fn, inference_fn, AverageMeter
from config import CFG

# Utils

In [3]:
out_base_path = "./result"
model_path = out_base_path + '/model'
os.makedirs(out_base_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

In [4]:
logger = com.get_logger(out_base_path+'/mlp_baseline')
com.seed_everything(seed=42)

# Data Loading

In [5]:
base_dir = '/media/hiroki/working/kaggle/Mechanisms-of-Action-Prediction/datasets'
#os.listdir('../input/lish-moa')
os.listdir(base_dir)

['sample_submission.csv',
 'test_features.csv',
 'train_features.csv',
 'train_targets_nonscored.csv',
 'train_targets_scored.csv']

In [6]:
train_features = pd.read_csv(base_dir + '/train_features.csv')
train_targets_scored = pd.read_csv(base_dir + '/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(base_dir + '/train_targets_nonscored.csv')
test_features = pd.read_csv(base_dir + '/test_features.csv')
submission = pd.read_csv(base_dir + '/sample_submission.csv')

#train_features = pd.read_csv('../input/lish-moa/train_features.csv')
#train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
#train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
#test_features = pd.read_csv('../input/lish-moa/test_features.csv')
#submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [7]:
# ref: https://www.kaggle.com/c/lish-moa/discussion/180165
# check if labels for 'ctl_vehicle' are all 0.
train = train_features.merge(train_targets_scored, on='sig_id')
target_cols = [c for c in train_targets_scored.columns if c not in ['sig_id']]
cols = target_cols + ['cp_type']
train[cols].groupby('cp_type').sum().sum(1)

cp_type
ctl_vehicle        0
trt_cp         16844
dtype: int64

- labels for 'ctl_vehicle' are all 0.

In [8]:
# constrcut train&test except 'cp_type'=='ctl_vehicle' data
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3624, 876)


In [59]:
# nonscored drop all zero column
train_targets_nonscored_all_zero = train_targets_nonscored.sum() == 0
all_zero_column = train_targets_nonscored_all_zero[train_targets_nonscored_all_zero].index

In [9]:
train = train.merge(train_targets_nonscored, on='sig_id')
scored_target_cols = target_cols
nonscored_target_cols = [c for c in train_targets_nonscored.columns if c not in ['sig_id']]
target_cols = target_cols + nonscored_target_cols

In [37]:
print("scored",len(scored_target_cols))
print("nonscored", len(nonscored_target_cols))
print("target", len(target_cols))

scored 206
nonscored 402
target 608


# CV split

In [10]:
folds = train.copy()
Fold = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[scored_target_cols])):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
print(folds.shape)

(21948, 1485)


# Dataset

In [11]:
cat_features = ['cp_time', 'cp_dose']
num_features = [c for c in train.columns if train.dtypes[c] != 'object']
num_features = [c for c in num_features if c not in cat_features]
num_features = [c for c in num_features if c not in target_cols]
target = train[target_cols].values

def cate2num(df):
    df['cp_time'] = df['cp_time'].map({24: 0, 48: 1, 72: 2})
    df['cp_dose'] = df['cp_dose'].map({'D1': 3, 'D2': 4})
    return df

train = cate2num(train)
test = cate2num(test)

# MODEL

In [12]:
def run_single_nn(cfg, train, test, folds, num_features, cat_features, target, target_cols, device, model_path, fold_num=0, seed=42):
    
    # Set seed
    logger.info(f'Set seed {seed}')
    com.seed_everything(seed=seed)

    # loader
    trn_idx = folds[folds['fold'] != fold_num].index
    val_idx = folds[folds['fold'] == fold_num].index
    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)
    train_target = target[trn_idx]
    valid_target = target[val_idx]
    train_dataset = TrainDataset(train_folds, num_features, cat_features, train_target)
    valid_dataset = TrainDataset(valid_folds, num_features, cat_features, valid_target)
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, 
                              num_workers=4, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=False, 
                              num_workers=4, pin_memory=True, drop_last=False)

    # model
    model = TabularNN(cfg, num_features, target_cols)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=1e-2, epochs=cfg.epochs, steps_per_epoch=len(train_loader))

    # log
    log_df = pd.DataFrame(columns=(['EPOCH']+['TRAIN_LOSS']+['VALID_LOSS']) )

    # train & validate
    best_loss = np.inf
    early_stopping_cnt = 0
    for epoch in range(cfg.epochs):
        train_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, device)
        valid_loss, val_preds = validate_fn(valid_loader, model, device)
        log_row = {'EPOCH': epoch, 
                   'TRAIN_LOSS': train_loss,
                   'VALID_LOSS': valid_loss,
                  }
        log_df = log_df.append(pd.DataFrame(log_row, index=[0]), sort=False)
        #logger.info(log_df.tail(1))
        if valid_loss < best_loss:
            logger.info(f'epoch{epoch} save best model... tr_loss:{train_loss}, val_loss{valid_loss}')
            best_loss = valid_loss
            oof = np.zeros((len(train), len(target_cols)))
            oof[val_idx] = val_preds
            torch.save(model.state_dict(), model_path + f"/fold{fold_num}_seed{seed}.pth")
            best_epoch = epoch
            early_stopping_cnt = 0
        else:
            early_stopping_cnt += 1
            if early_stopping_cnt == cfg.early_stopping_rounds:
                logger.info(f'best epoch: epoch{best_epoch}')
                break

    # predictions
    test_dataset = TestDataset(test, num_features, cat_features)
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False, 
                             num_workers=4, pin_memory=True)
    model = TabularNN(cfg, num_features, target_cols)
    model.load_state_dict(torch.load(model_path + f"/fold{fold_num}_seed{seed}.pth"))
    model.to(device)
    predictions = inference_fn(test_loader, model, device)
    
    # del
    torch.cuda.empty_cache()

    return oof, predictions, log_df


def run_kfold_nn(cfg, train, test, folds, num_features, cat_features, target, target_cols, device, model_path, n_fold=5, seed=42):

    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    log_dfs = []
    for _fold in range(n_fold):
        logger.info("Fold {}".format(_fold))
        _oof, _predictions, log_df = run_single_nn(cfg,
                                                   train,
                                                   test,
                                                   folds,
                                                   num_features, 
                                                   cat_features,
                                                   target,
                                                   target_cols,
                                                   device,
                                                   model_path,
                                                   fold_num=_fold,
                                                   seed=seed)
        oof += _oof
        predictions += _predictions / n_fold
        log_dfs.append(log_df)

    score = 0
    for i in range(target.shape[1]):
        _score = log_loss(target[:,i], oof[:,i])
        score += _score / target.shape[1]
    logger.info(f"CV score: {score}")
    
    return oof, predictions, log_dfs

In [49]:
len(target_cols)

608

In [13]:
# Seed Averaging for solid result
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

#SEED = [0, 1, 2]
#for seed in SEED:
oof, predictions, log_dfs = run_kfold_nn(CFG,
                                         train, test, folds, 
                                         num_features, cat_features, target, target_cols,
                                         device,
                                         model_path=model_path,
                                         n_fold=5, seed=42)

Fold 0
Set seed 42
epoch0 save best model... tr_loss:0.5707114813101553, val_loss0.5034225237939787
epoch1 save best model... tr_loss:0.33735543027193876, val_loss0.3601081656839418
epoch2 save best model... tr_loss:0.20523390473022948, val_loss0.26882318080422
epoch3 save best model... tr_loss:0.13083243493779298, val_loss0.19782382885931837
epoch4 save best model... tr_loss:0.08745044579960569, val_loss0.15546585834650894
epoch5 save best model... tr_loss:0.060994403192052876, val_loss0.12071375731115733
epoch6 save best model... tr_loss:0.0444098531003416, val_loss0.09686234960659218
epoch7 save best model... tr_loss:0.03323702918651113, val_loss0.07931060891827581
epoch8 save best model... tr_loss:0.02605218276802967, val_loss0.06336923855142604
epoch9 save best model... tr_loss:0.02112015485967489, val_loss0.051988163783908706
epoch10 save best model... tr_loss:0.017717104583611563, val_loss0.043814782609904276
epoch11 save best model... tr_loss:0.015427728904152874, val_loss0.037

epoch39 save best model... tr_loss:0.008350281914981612, val_loss0.01014173019651943
epoch40 save best model... tr_loss:0.008270231432478577, val_loss0.010071582743868334
epoch41 save best model... tr_loss:0.008201961284911219, val_loss0.009892506054804103
epoch42 save best model... tr_loss:0.00815207601684642, val_loss0.00985159734624665
epoch43 save best model... tr_loss:0.008064210561892684, val_loss0.009824991678068062
epoch44 save best model... tr_loss:0.00799133027240933, val_loss0.009692038679262093
epoch45 save best model... tr_loss:0.007930375945781542, val_loss0.009593168445831008
epoch46 save best model... tr_loss:0.007852840449756189, val_loss0.009529144200655485
epoch47 save best model... tr_loss:0.007772593000176342, val_loss0.00948336385360204
epoch48 save best model... tr_loss:0.007704967265799098, val_loss0.009469711201388498
epoch49 save best model... tr_loss:0.0076237925817768505, val_loss0.009350383110293495
epoch50 save best model... tr_loss:0.007569748846249804, v

epoch6 save best model... tr_loss:0.04447261248137394, val_loss0.09700529870995085
epoch7 save best model... tr_loss:0.03343374826639456, val_loss0.0804451860029225
epoch8 save best model... tr_loss:0.026053297387803123, val_loss0.06336974604458363
epoch9 save best model... tr_loss:0.021141251542326742, val_loss0.05248457340689075
epoch10 save best model... tr_loss:0.017750273764568524, val_loss0.04345115219202563
epoch11 save best model... tr_loss:0.015423573162678603, val_loss0.03826363347555619
epoch12 save best model... tr_loss:0.013804578127162735, val_loss0.031060486058609088
epoch13 save best model... tr_loss:0.012691630427827583, val_loss0.027410512673827672
epoch14 save best model... tr_loss:0.011869525246281367, val_loss0.023521469483376908
epoch15 save best model... tr_loss:0.011333050736545646, val_loss0.021370658491460227
epoch16 save best model... tr_loss:0.010926684051119878, val_loss0.019881740122914722
epoch17 save best model... tr_loss:0.010613576104578963, val_loss0.

epoch40 save best model... tr_loss:0.00833410248466271, val_loss0.009791003255499071
epoch43 save best model... tr_loss:0.008131498046029006, val_loss0.009688765022120257
epoch45 save best model... tr_loss:0.007971368859229732, val_loss0.009458972000992055
epoch46 save best model... tr_loss:0.007898897947199697, val_loss0.009375519762153057
epoch47 save best model... tr_loss:0.007835178012300256, val_loss0.009345778608709526
epoch49 save best model... tr_loss:0.0076460554697856745, val_loss0.009262802439626568
epoch50 save best model... tr_loss:0.007596215615592842, val_loss0.009166241057325764
epoch51 save best model... tr_loss:0.007534089119234768, val_loss0.00909156708460998
epoch53 save best model... tr_loss:0.007383594060840126, val_loss0.009051101563078657
epoch54 save best model... tr_loss:0.007290739956121538, val_loss0.008876483282042503
epoch55 save best model... tr_loss:0.007214549130141517, val_loss0.008871993477551547
epoch56 save best model... tr_loss:0.007169963713789726

ValueError: y_true contains only one label (0). Please provide the true labels explicitly through the labels argument.

In [None]:
train[target_cols] = oof
train[['sig_id']+target_cols].to_csv(out_base_path + '/oof.csv', index=False)

test[target_cols] = predictions
test[['sig_id']+target_cols].to_csv(out_base_path + '/pred.csv', index=False)

In [None]:
# Final result with 'cp_type'=='ctl_vehicle' data
result = train_targets_scored.drop(columns=target_cols)\
            .merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
y_true = train_targets_scored[target_cols].values
y_pred = result[target_cols].values
score = 0
for i in range(y_true.shape[1]):
    _score = log_loss(y_true[:,i], y_pred[:,i])
    score += _score / y_true.shape[1]
logger.info(f"Final result: {score}")

# eval

In [None]:
#model_path='/media/hiroki/working/kaggle/Mechanisms-of-Action-Prediction/working/experiment/MLP/baseline2/fold0_seed0.pth'

In [None]:
def run_single_nn_evaluation(cfg, model_path, test, num_features, cat_features, target, device):
    # predictions
    test_dataset = TestDataset(test, num_features, cat_features)
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False, 
                             num_workers=4, pin_memory=True)
    model = TabularNN(cfg)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    predictions = inference_fn(test_loader, model, device)
    
    # del
    torch.cuda.empty_cache()
    return predictions

In [None]:
# Seed Averaging for solid result
#oof = np.zeros((len(train), len(CFG.target_cols)))

prediction = run_single_nn_evaluation(cfg=CFG,
                                      model_path=model_path,
                                      test=test,
                                      num_features=num_features,
                                      cat_features=cat_features,
                                      target=target,
                                      device='cpu')

In [None]:
test[target_cols] = predictions
test[['sig_id']+target_cols].to_csv('pred.csv', index=False)

# Submit

In [None]:
sub = submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)
sub.head()