In [1]:
import os
import gc
import random
import math
import time
import numpy as np
import pandas as pd

import category_encoders as ce
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn import preprocessing
from sklearn import decomposition

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.metrics import Metric

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
train_features = pd.read_csv('Data/train_features.csv')
train_targets_scored = pd.read_csv('Data/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('Data/train_targets_nonscored.csv')
test_features = pd.read_csv('Data/test_features.csv')
submission = pd.read_csv('Data/sample_submission.csv')

In [3]:
train = train_features.merge(train_targets_scored, on='sig_id')
# constrcut train&test except 'cp_type'=='ctl_vehicle' data
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
#test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features

print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3982, 876)


In [4]:
train['cp_time'] = train['cp_time'].map({24: -1, 48: 0, 72: 1})
train['cp_dose'] = train['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

test['cp_time'] = test['cp_time'].map({24: -1, 48: 0, 72: 1})
test['cp_dose'] = test['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

In [5]:
train = train.to_numpy()
test = test.to_numpy()
dist_len = 99 + 771
for d in range(dist_len):
    train[::, 4+d]  = preprocessing.scale(train[::, 4+d])
    test[::, 4+d]  = preprocessing.scale(test[::, 4+d])
train = train[::, 2:].astype('float64') 
test = test[::, 2:].astype('float64')

In [6]:
batch_size = 100

val = train[:2000, :874]
train_d = train[2000:, :874]

lables_train = train[2000:, 874:]
lables_val = train[:2000, 874:]

dataset = torch.utils.data.TensorDataset( torch.Tensor(train_d), torch.Tensor(lables_train) )
validationset = torch.utils.data.TensorDataset( torch.Tensor(val), torch.Tensor(lables_val) )

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_loader = torch.utils.data.DataLoader(
    validationset, batch_size=batch_size, shuffle=True, pin_memory=True)

pred_loader = torch.utils.data.DataLoader(
    test, batch_size=batch_size, shuffle=True, pin_memory=True)

input_shape = train_d.shape[1]

Tabnet

In [9]:
class LogitsLogLoss(Metric):
    """
    LogLoss with sigmoid applied
    """

    def __init__(self):
        self._name = "val_loss"
        self._maximize = False

    def __call__(self, y_true, y_pred):

        logits = 1 / (1 + np.exp(-y_pred))
        aux = (1 - y_true) * np.log(1 - logits + 1e-15) + y_true * np.log(logits + 1e-15)
        return np.mean(-aux)

In [10]:
model = TabNetRegressor(n_d=24, n_a=24, n_steps=1, lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                                    optimizer_params=dict(lr=2e-2, weight_decay=1e-5), mask_type='entmax', 
                                    scheduler_params=dict(milestones=[50, 100, 150], gamma=0.9), 
                                    scheduler_fn=torch.optim.lr_scheduler.MultiStepLR)
model.fit(
  X_train=train_d, y_train=lables_train,
  eval_set=[(val, lables_val)],
  loss_fn = torch.nn.BCEWithLogitsLoss(),
  eval_metric = [LogitsLogLoss])

Device used : cuda
epoch 0  | loss: 0.39223 | val_0_logits_ll: 0.04578 |  0:00:01s
epoch 1  | loss: 0.03    | val_0_logits_ll: 0.02821 |  0:00:02s
epoch 2  | loss: 0.02427 | val_0_logits_ll: 0.0216  |  0:00:04s
epoch 3  | loss: 0.02154 | val_0_logits_ll: 0.02095 |  0:00:05s
epoch 4  | loss: 0.02092 | val_0_logits_ll: 0.02065 |  0:00:07s
epoch 5  | loss: 0.02061 | val_0_logits_ll: 0.02047 |  0:00:08s
epoch 6  | loss: 0.02037 | val_0_logits_ll: 0.02028 |  0:00:10s
epoch 7  | loss: 0.02017 | val_0_logits_ll: 0.02022 |  0:00:12s
epoch 8  | loss: 0.01997 | val_0_logits_ll: 0.02007 |  0:00:13s
epoch 9  | loss: 0.01967 | val_0_logits_ll: 0.01971 |  0:00:15s
epoch 10 | loss: 0.01932 | val_0_logits_ll: 0.01952 |  0:00:16s
epoch 11 | loss: 0.01885 | val_0_logits_ll: 0.0191  |  0:00:18s
epoch 12 | loss: 0.01857 | val_0_logits_ll: 0.01877 |  0:00:19s
epoch 13 | loss: 0.01828 | val_0_logits_ll: 0.0188  |  0:00:21s
epoch 14 | loss: 0.01802 | val_0_logits_ll: 0.01938 |  0:00:23s
epoch 15 | loss: 0.01

In [15]:
pred_encode = np.empty(shape = (test.shape[0], 206))
i = 1
for x in pred_loader:
    x = x.to(device)
    outputs = model.predict(x.float())
    pred_encode[((i-1)*(outputs.shape[0])):(i*(outputs.shape[0])), ::] = 1 / (1 + np.exp(-outputs))
    i += 1

In [16]:
# take a copy of all our training sig_ids for reference
test_sig_ids = test_features['sig_id'].copy()

# select all indices when 'cp_type' is 'ctl_vehicle'
test_ctl_vehicle_idx = (test_features['cp_type'] == 'ctl_vehicle')

# change all cp_type == ctl_vehicle predictions to zero
pred_encode[test_sig_ids[test_ctl_vehicle_idx].index.values] = 0
test_submission = pd.DataFrame({'sig_id' : test_sig_ids})
test_preds_df = pd.DataFrame(pred_encode, columns=train_targets_scored.columns[1:])
test_submission = pd.concat([test_submission, test_preds_df], axis=1)
test_submission.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001554,0.00112,0.001497,0.015675,0.02769,0.005411,0.005293,0.001565,0.000248,...,0.000528,0.000452,0.001787,0.002967,0.000658,0.000492,0.000308,0.00237,0.000235,0.001286
1,id_001897cda,0.001802,0.001347,0.001388,0.011787,0.03089,0.005393,0.002957,0.003811,0.000324,...,0.000636,0.000607,0.002128,0.001614,0.001229,0.000605,0.000737,0.002173,0.000295,0.001305
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
test_submission.to_csv('submission.csv', index=False)