This notebook generates a submission using a model built on the ['MoA / PyTorch NN starter'](https://www.kaggle.com/yasufuminakama/moa-pytorch-nn-starter) baseline.  
Thanks to Y.Nakama for providing the baseline.

# Library

In [1]:
import os
import random

import numpy as np
import pandas as pd

import category_encoders as ce

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Utils

In [2]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this only
    # takes effect in processes spawned after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it
    # has to be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

# Load

In [3]:
# Only the test features and the sample submission are read here; the training
# CSVs (train_features.csv, train_targets_scored.csv, train_targets_nonscored.csv)
# are not loaded in this notebook.
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

# Preprocessing

In [4]:
# Drop the 'ctl_vehicle' rows before inference. They are absent from `test`,
# so the final left merge + fillna(0) leaves their predictions at 0.
test = test_features.loc[test_features['cp_type'].ne('ctl_vehicle')].reset_index(drop=True)

# Dataset

In [5]:
class TestDataset(Dataset):
    """Map-style dataset that serves feature rows of a DataFrame without labels.

    Args:
        df: DataFrame holding the feature columns.
        num_features: Names of the continuous feature columns.
        cat_features: Names of the (already integer-encoded) categorical columns.
    """

    def __init__(self, df, num_features, cat_features):
        # Pull both column groups out as NumPy arrays once, so that item
        # access is plain array indexing.
        self.cont_values, self.cate_values = (
            df[num_features].values,
            df[cat_features].values,
        )

    def __len__(self):
        """Return the number of rows."""
        return self.cont_values.shape[0]

    def __getitem__(self, idx):
        """Return ``(cont_x, cate_x)`` for row ``idx`` as float32 / int64 tensors."""
        return (
            torch.FloatTensor(self.cont_values[idx]),
            torch.LongTensor(self.cate_values[idx]),
        )

In [6]:
# Every submission column except the row identifier is a prediction target.
target_cols = [col for col in submission.columns if col != 'sig_id']
cat_features = ['cp_time', 'cp_dose']
# Continuous inputs: non-object columns of `test` that are neither categorical
# features nor targets (column order of `test` is preserved).
num_features = [
    col for col in test.columns
    if test.dtypes[col] != 'object'
    and col not in cat_features
    and col not in target_cols
]
# Values of the sample submission; only forwarded to run_single_nn_evaluation.
target = submission[target_cols].values

def cate2num(df):
    """Integer-encode the ``cp_time`` and ``cp_dose`` columns of ``df`` in place.

    ``cp_time`` 24/48/72 becomes 0/1/2 and ``cp_dose`` 'D1'/'D2' becomes 3/4,
    so the two columns use disjoint codes. Values that are not keys of the
    mapping become NaN (``Series.map`` semantics).

    Args:
        df: DataFrame with ``cp_time`` and ``cp_dose`` columns; it is modified.

    Returns:
        The same DataFrame object that was passed in.
    """
    encodings = {
        'cp_time': {24: 0, 48: 1, 72: 2},
        'cp_dose': {'D1': 3, 'D2': 4},
    }
    for column, codes in encodings.items():
        df[column] = df[column].map(codes)
    return df

# Encode the categorical columns of the test frame.
# NOTE: run this once per kernel session. The encoded values are not keys of the
# mapping, so applying cate2num to an already-encoded frame turns them into NaN.
test = test.pipe(cate2num)

# Model

In [7]:
class CFG:
    """Static configuration namespace shared by the model and the data loader."""

    # Network architecture (read by TabularNN). These have to agree with the
    # checkpoint, because its state dict is loaded into the model built from them.
    hidden_size = 512
    dropout = 0.5
    # (total_cate_size=5 and emb_size=4 are left disabled.)

    # Batch size of the inference DataLoader (read by run_single_nn_evaluation).
    batch_size = 32

    # Optimisation settings; not read by any cell visible in this notebook.
    lr = 1e-2
    weight_decay = 1e-6
    epochs = 20
    max_grad_norm = 1000
    gradient_accumulation_steps = 1

    # Column lists computed in the preceding cell.
    num_features = num_features
    cat_features = cat_features
    target_cols = target_cols

In [8]:
class TabularNN(nn.Module):
    """Feed-forward network over the continuous features.

    Two hidden blocks (Linear -> BatchNorm1d -> Dropout -> PReLU) are followed
    by a Linear output layer with one logit per target column.

    Args:
        cfg: Object exposing ``num_features``, ``target_cols``, ``hidden_size``
            and ``dropout``.
    """

    def __init__(self, cfg):
        super().__init__()
        n_inputs = len(cfg.num_features)
        n_hidden = cfg.hidden_size
        n_outputs = len(cfg.target_cols)

        def hidden_block(in_dim):
            # One Linear -> BatchNorm -> Dropout -> PReLU stage.
            return [
                nn.Linear(in_dim, n_hidden),
                nn.BatchNorm1d(n_hidden),
                nn.Dropout(cfg.dropout),
                nn.PReLU(),
            ]

        # The attribute name and the layer order determine the state-dict keys
        # (mlp.0 ... mlp.8), so both are kept fixed.
        layers = hidden_block(n_inputs) + hidden_block(n_hidden)
        layers.append(nn.Linear(n_hidden, n_outputs))
        self.mlp = nn.Sequential(*layers)

    def forward(self, cont_x, cate_x):
        """Return raw logits for ``cont_x``; ``cate_x`` is accepted but not used yet."""
        return self.mlp(cont_x)

# Evaluation

In [9]:
# Checkpoint file whose state dict is loaded into TabularNN by run_single_nn_evaluation.
# The file name suggests fold 0 / seed 0 of a training run — TODO confirm.
model_path = '../input/baseline-nn/fold0_seed0.pth'

In [10]:
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and collect probabilities.

    Args:
        test_loader: Iterable yielding ``(cont_x, cate_x)`` tensor pairs.
        model: Module called as ``model(cont_x, cate_x)`` and returning logits.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        NumPy array with the sigmoid of the logits of all batches, concatenated
        along the first axis in iteration order.

    Raises:
        ValueError: If ``test_loader`` yields no batches (nothing to concatenate).
    """
    # Evaluation mode: dropout is disabled and batch norm uses running statistics.
    model.eval()

    batch_probs = []
    with torch.no_grad():
        for cont_x, cate_x in test_loader:
            logits = model(cont_x.to(device), cate_x.to(device))
            batch_probs.append(torch.sigmoid(logits).detach().cpu().numpy())

    return np.concatenate(batch_probs)

In [11]:
def run_single_nn_evaluation(cfg, model_path, test, num_features, cat_features, target, device):
    """Load one checkpoint and predict on ``test``.

    Args:
        cfg: Configuration object; ``batch_size`` is read here and the object
            is passed on to ``TabularNN``.
        model_path: Path of the saved state dict to load.
        test: DataFrame with the feature columns.
        num_features: Names of the continuous feature columns.
        cat_features: Names of the encoded categorical feature columns.
        target: Unused; kept so existing callers keep working.
        device: Device to run inference on.

    Returns:
        NumPy array of predicted probabilities, one row per row of ``test``
        (the loader does not shuffle).
    """
    on_cuda = str(device).startswith('cuda')

    # predictions
    test_dataset = TestDataset(test, num_features, cat_features)
    # Pinned host memory only helps host-to-GPU copies; on CPU it just warns.
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False,
                             num_workers=4, pin_memory=on_cuda)
    model = TabularNN(cfg)
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # runtime; without it torch.load fails when CUDA is unavailable.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    predictions = inference_fn(test_loader, model, device)

    # Release cached GPU memory held by the allocator.
    torch.cuda.empty_cache()
    return predictions

In [12]:
# Predict with the single checkpoint at `model_path`. `prediction` is assigned to
# test[target_cols] in the next cell, i.e. one row per row of `test` and one
# column per target.
prediction = run_single_nn_evaluation(cfg=CFG,
                                      model_path=model_path,
                                      test=test,
                                      num_features=num_features,
                                      cat_features=cat_features,
                                      target=target,
                                      device=device)

In [13]:
# Attach the predictions to the (control-free) test frame.
test[target_cols] = prediction

# Re-align to the full list of ids in the sample submission. Ids with no
# prediction (the dropped control rows) come out of the left merge as NaN and
# are filled with 0.
sub = (
    submission
    .drop(columns=target_cols)
    .merge(test[['sig_id', *target_cols]], how='left', on='sig_id')
    .fillna(0)
)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000632,0.000998,0.002375,0.012234,0.017378,0.00819,0.001006,0.003046,2.3e-05,...,0.000642,0.001435,0.006101,0.000287,0.000221,0.000573,0.000336,0.000846,0.000888,0.000579
1,id_001897cda,0.000368,0.000597,0.00287,0.001922,0.001658,0.000784,0.005734,0.008512,0.003946,...,0.000807,0.000824,0.005752,9.5e-05,0.006063,0.000439,0.006928,0.001008,0.001566,0.004303
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000311,0.000278,0.00171,0.017883,0.010523,0.00451,0.002531,0.003876,3.9e-05,...,0.000397,0.00036,0.001353,0.0144,0.007123,0.000272,0.000452,0.001606,0.000226,0.002699
4,id_0027f1083,0.001637,0.002682,0.00298,0.009802,0.014779,0.002861,0.010281,0.000828,9.3e-05,...,0.00068,0.000813,0.010847,0.006458,0.000586,0.000872,0.000129,0.001595,8e-05,0.000453
