In [1]:
import os
import gc
import random
import math
import time
import numpy as np
import pandas as pd

import category_encoders as ce
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn import preprocessing, decomposition
from sklearn.decomposition import KernelPCA

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.metrics import Metric
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#Hyperparameters
#PCA
pca_num = 20
#Autoencoder
autoencoder_batch_size = 100
autoenecoder_latents = 40 
autoencoder_val_size = 400
autoencoder_epochs = 60
autoencoder_learning_rate = 0.005
autoencoder_hidden_size_1 = 500
autoencoder_hidden_size_2 = 200
autoencoder_hidden_size_3 = 150
#TabNet
n_folds = 10
tabnet_batch_size = 100
tabnet_learning_rate = 2e-2
tabnet_weight_decay = 1e-5
decision_layer_size = 32
mask_attention_layer_size = 32
#Constants
feature_size = 874
lable_size = 206

In [2]:
train_features = pd.read_csv('Data/train_features.csv')
train_targets_scored = pd.read_csv('Data/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('Data/train_targets_nonscored.csv')
test_features = pd.read_csv('Data/test_features.csv')
submission = pd.read_csv('Data/sample_submission.csv')

In [3]:
train = train_features.merge(train_targets_scored, on='sig_id')
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features
print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3982, 876)


In [4]:
train['cp_time'] = train['cp_time'].map({24: -1, 48: 0, 72: 1})
train['cp_dose'] = train['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

test['cp_time'] = test['cp_time'].map({24: -1, 48: 0, 72: 1})
test['cp_dose'] = test['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

In [5]:
train = train.to_numpy()
test = test.to_numpy()
dist_len = 99 + 771
for d in range(dist_len):
    train[::, 4+d]  = preprocessing.scale(train[::, 4+d])
    test[::, 4+d]  = preprocessing.scale(test[::, 4+d])
train = train[::, 2:].astype('float64') 
test = test[::, 2:].astype('float64')

Feature Engineering 

Kernel PCA

In [6]:
transformer = KernelPCA(n_components=pca_num, kernel='linear')
X_transformed = transformer.fit_transform(train[::, :feature_size])
test_transformed = transformer.transform(test)

Autoencoder

In [None]:
mse = nn.MSELoss()

traningy = train[autoencoder_val_size:, :feature_size]
valdationy = train[:autoencoder_val_size, :feature_size]

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader_ae = torch.utils.data.DataLoader(
    traningy, batch_size=autoencoder_batch_size, shuffle=False, pin_memory=True)

test_loader_ae = torch.utils.data.DataLoader(
    valdationy, batch_size=autoencoder_batch_size, shuffle=False, pin_memory=True)

class AE(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.encoder_il = nn.Linear(feature_size, autoencoder_hidden_size_1)
        self.bnorm1 = nn.BatchNorm1d(num_features=autoencoder_hidden_size_1)
        self.encoder_hl1 = nn.Linear(autoencoder_hidden_size_1, autoencoder_hidden_size_2)
        self.bnorm2 = nn.BatchNorm1d(num_features=autoencoder_hidden_size_2)
        self.encoder_hl2 = nn.Linear(autoencoder_hidden_size_2, autoencoder_hidden_size_3)
        self.bnorm3 = nn.BatchNorm1d(num_features=autoencoder_hidden_size_3)
        self.encoder_ol = nn.Linear(autoencoder_hidden_size_3, autoenecoder_latents)
        
        self.bnorm4 = nn.BatchNorm1d(num_features=autoenecoder_latents)
        self.decoder_il = nn.Linear(autoenecoder_latents, autoencoder_hidden_size_3)
        self.bnorm5 = nn.BatchNorm1d(num_features=autoencoder_hidden_size_3)
        self.decoder_hl1 = nn.Linear(autoencoder_hidden_size_3, autoencoder_hidden_size_2)
        self.bnorm6 = nn.BatchNorm1d(num_features=autoencoder_hidden_size_2)
        self.decoder_hl2 = nn.Linear(autoencoder_hidden_size_2, autoencoder_hidden_size_1)
        self.bnorm7 = nn.BatchNorm1d(num_features=autoencoder_hidden_size_1)
        self.decoder_ol = nn.Linear(autoencoder_hidden_size_1, feature_size)
        
        self.elu = nn.ELU()
        
    def forward_encoder(self, x):
        x = self.encoder_il(x)
        x = self.elu(x)
        x = self.bnorm1(x)
        x = self.encoder_hl1(x)
        x = self.elu(x)
        x = self.bnorm2(x)
        x = self.encoder_hl2(x)
        x = self.elu(x)
        x = self.bnorm3(x)
        emb = self.encoder_ol(x)
        return emb
    
    def forward_decoder(self, emb):    
        x = self.bnorm4(emb)
        x = self.decoder_il(x)
        x = self.elu(x)
        x = self.bnorm5(x)
        x = self.decoder_hl1(x)
        x = self.elu(x)
        x = self.bnorm6(x)
        x = self.decoder_hl2(x)
        x = self.elu(x)
        x = self.bnorm7(x)
        x = self.decoder_ol(x)
        return x
    
model_ae = AE().to(device)
optimizer_ae = optim.Adam(model_ae.parameters(), lr=autoencoder_learning_rate)

epoch_list = []
val_list = []

for epoch in range(autoencoder_epochs):
    train_loss_en = 0
    train_loss_de = 0
    loss = 0
    
    for x in train_loader_ae:
        
        x = x.to(device)
        optimizer_ae.zero_grad()
        x = x.view((-1, feature_size))
        emb = model_ae.forward_encoder(x.float())
        rec = model_ae.forward_decoder(emb)
        # compute training reconstruction loss
        train_loss = mse(rec.double(), x)

        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer_ae.step()
 
        # add the mini-batch training loss to epoch loss
        loss += train_loss_en

    if (epoch % 1) == 0:
        val_loss_en = 0 
        val_loss_de = 0
        
        for x in test_loader_ae:
            x = x.to(device)
            
            x = x.view((-1, feature_size))
            emb = model_ae.forward_encoder(x.float())
            rec = model_ae.forward_decoder(emb)
            # compute training reconstruction loss
            val_loss = mse(rec.double(), x)
            
        val_loss = val_loss.cpu().detach().numpy()
        val_list.append(val_loss)
       
        epoch_list.append(epoch)
        
        print("Validation: epoch : {}/{}, loss = {:.4f}".format(epoch+1, autoencoder_epochs, val_loss))

In [31]:
model_ae.eval()
enc_ae = np.empty(shape = (train.shape[0], autoenecoder_latents))
for i in range(enc_ae.shape[0]):
    x = torch.from_numpy(np.asarray(train[i, :feature_size])).to(device).float()
    x = x.view(-1, feature_size)
    x = model_ae.forward_encoder(x)
    enc_ae[i, ::] = np.reshape(x.cpu().detach().numpy(), (autoenecoder_latents))
    
enc_ae_test = np.empty(shape = (test.shape[0], autoenecoder_latents))
for i in range(enc_ae_test.shape[0]):
    x = torch.from_numpy(np.asarray(test[i, :feature_size])).to(device).float()
    x = x.view(-1, feature_size)
    x = model_ae.forward_encoder(x)
    enc_ae_test[i, ::] = np.reshape(x.cpu().detach().numpy(), (autoenecoder_latents))

In [32]:
train_d = np.concatenate((train[::, :feature_size], X_transformed, enc_ae), axis = 1)

lables_train = train[::, feature_size:]

dataset = torch.utils.data.TensorDataset( torch.Tensor(train_d), torch.Tensor(lables_train) )

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=tabnet_batch_size, shuffle=True, pin_memory=True)

In [33]:
train_aug = np.concatenate((test[::, :feature_size], test_transformed, enc_ae_test), axis = 1)
pred_loader = torch.utils.data.DataLoader(train_aug, batch_size=tabnet_batch_size, shuffle=True, pin_memory=True)

Tabnet

In [34]:
class LogitsLogLoss(Metric):

    def __init__(self):
        self._name = "val_loss"
        self._maximize = False

    def __call__(self, y_true, y_pred):

        logits = 1 / (1 + np.exp(-y_pred))
        aux = (1 - y_true) * np.log(1 - logits + 1e-15) + y_true * np.log(logits + 1e-15)
        return np.mean(-aux)

In [36]:
NB_SPLITS = n_folds
mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)
pred_encode = np.empty(shape = (NB_SPLITS, test.shape[0], lable_shape))

for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train_d, lables_train)):
    print("FOLD: ", fold_nb + 1)
    
    train_split, lables_train_split = train_d[train_idx, ::], lables_train[train_idx, ::]
    val_split, lables_val_split = train_d[val_idx, ::], lables_train[val_idx, ::]
    ### Model ###
   
    model = TabNetRegressor(n_d=decision_layer_size, n_a=mask_attention_layer_size, n_steps=1, lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                                    optimizer_params=dict(lr=tabnet_learning_rate, weight_decay=tab_weight_decay), mask_type='entmax', 
                                    scheduler_params=dict(milestones=[50, 100, 150], gamma=0.9), 
                                    scheduler_fn=torch.optim.lr_scheduler.MultiStepLR)
    model.fit(X_train=train_split, y_train=lables_train_split,
              eval_set=[(val_split, lables_val_split)],
              loss_fn = torch.nn.BCEWithLogitsLoss(),
              eval_metric = [LogitsLogLoss])
    
    i = 1
    for x in pred_loader:
        x = x.to(device)
        outputs = model.predict(x.float())
        pred_encode[fold_nb,((i-1)*(outputs.shape[0])):(i*(outputs.shape[0])), ::] = 1 / (1 + np.exp(-outputs))
        i += 1    

FOLD:  1
Device used : cuda
epoch 0  | loss: 0.39589 | val_0_val_loss: 0.04804 |  0:00:01s
epoch 1  | loss: 0.02891 | val_0_val_loss: 0.02738 |  0:00:02s
epoch 2  | loss: 0.02323 | val_0_val_loss: 0.02188 |  0:00:03s
epoch 3  | loss: 0.0214  | val_0_val_loss: 0.0211  |  0:00:05s
epoch 4  | loss: 0.02082 | val_0_val_loss: 0.02065 |  0:00:06s
epoch 5  | loss: 0.02057 | val_0_val_loss: 0.02048 |  0:00:07s
epoch 6  | loss: 0.02036 | val_0_val_loss: 0.02025 |  0:00:08s
epoch 7  | loss: 0.02017 | val_0_val_loss: 0.02006 |  0:00:09s
epoch 8  | loss: 0.01999 | val_0_val_loss: 0.01985 |  0:00:11s
epoch 9  | loss: 0.01972 | val_0_val_loss: 0.01958 |  0:00:12s
epoch 10 | loss: 0.01945 | val_0_val_loss: 0.01931 |  0:00:14s
epoch 11 | loss: 0.01916 | val_0_val_loss: 0.01905 |  0:00:15s
epoch 12 | loss: 0.01894 | val_0_val_loss: 0.01921 |  0:00:16s
epoch 13 | loss: 0.01883 | val_0_val_loss: 0.02029 |  0:00:18s
epoch 14 | loss: 0.01866 | val_0_val_loss: 0.01887 |  0:00:19s
epoch 15 | loss: 0.01843 | 

  if __name__ == '__main__':


epoch 2  | loss: 0.0231  | val_0_val_loss: 0.02139 |  0:00:04s
epoch 3  | loss: 0.02116 | val_0_val_loss: 0.02079 |  0:00:05s
epoch 4  | loss: 0.02064 | val_0_val_loss: 0.02041 |  0:00:06s
epoch 5  | loss: 0.02038 | val_0_val_loss: 0.02019 |  0:00:07s
epoch 6  | loss: 0.02011 | val_0_val_loss: 0.01989 |  0:00:09s
epoch 7  | loss: 0.01983 | val_0_val_loss: 0.02009 |  0:00:10s
epoch 8  | loss: 0.01968 | val_0_val_loss: 0.01961 |  0:00:12s
epoch 9  | loss: 0.01949 | val_0_val_loss: 0.01936 |  0:00:13s
epoch 10 | loss: 0.01918 | val_0_val_loss: 0.01923 |  0:00:14s
epoch 11 | loss: 0.01892 | val_0_val_loss: 0.0201  |  0:00:16s
epoch 12 | loss: 0.01867 | val_0_val_loss: 0.02046 |  0:00:17s
epoch 13 | loss: 0.01838 | val_0_val_loss: 0.01901 |  0:00:18s
epoch 14 | loss: 0.01817 | val_0_val_loss: 0.01827 |  0:00:20s
epoch 15 | loss: 0.01793 | val_0_val_loss: 0.01884 |  0:00:21s
epoch 16 | loss: 0.01773 | val_0_val_loss: 0.01825 |  0:00:22s
epoch 17 | loss: 0.01768 | val_0_val_loss: 0.01875 |  0

  if __name__ == '__main__':


epoch 1  | loss: 0.03008 | val_0_val_loss: 0.02986 |  0:00:02s
epoch 2  | loss: 0.02406 | val_0_val_loss: 0.02214 |  0:00:04s
epoch 3  | loss: 0.02142 | val_0_val_loss: 0.02117 |  0:00:05s
epoch 4  | loss: 0.0207  | val_0_val_loss: 0.02073 |  0:00:06s
epoch 5  | loss: 0.0204  | val_0_val_loss: 0.02045 |  0:00:08s
epoch 6  | loss: 0.02014 | val_0_val_loss: 0.02026 |  0:00:09s
epoch 7  | loss: 0.01992 | val_0_val_loss: 0.02002 |  0:00:11s
epoch 8  | loss: 0.01961 | val_0_val_loss: 0.02073 |  0:00:12s
epoch 9  | loss: 0.01934 | val_0_val_loss: 0.01946 |  0:00:13s
epoch 10 | loss: 0.019   | val_0_val_loss: 0.01923 |  0:00:15s
epoch 11 | loss: 0.01871 | val_0_val_loss: 0.01899 |  0:00:16s
epoch 12 | loss: 0.01851 | val_0_val_loss: 0.01906 |  0:00:17s
epoch 13 | loss: 0.0183  | val_0_val_loss: 0.01873 |  0:00:19s
epoch 14 | loss: 0.01817 | val_0_val_loss: 0.02021 |  0:00:20s
epoch 15 | loss: 0.01792 | val_0_val_loss: 0.01847 |  0:00:22s
epoch 16 | loss: 0.01785 | val_0_val_loss: 0.01924 |  0

  if __name__ == '__main__':


epoch 2  | loss: 0.02417 | val_0_val_loss: 0.02178 |  0:00:04s
epoch 3  | loss: 0.0215  | val_0_val_loss: 0.02082 |  0:00:05s
epoch 4  | loss: 0.02082 | val_0_val_loss: 0.02059 |  0:00:06s
epoch 5  | loss: 0.02065 | val_0_val_loss: 0.02048 |  0:00:08s
epoch 6  | loss: 0.02049 | val_0_val_loss: 0.02035 |  0:00:09s
epoch 7  | loss: 0.02029 | val_0_val_loss: 0.02014 |  0:00:11s
epoch 8  | loss: 0.02003 | val_0_val_loss: 0.02    |  0:00:12s
epoch 9  | loss: 0.01985 | val_0_val_loss: 0.01985 |  0:00:13s
epoch 10 | loss: 0.01958 | val_0_val_loss: 0.0195  |  0:00:15s
epoch 11 | loss: 0.01925 | val_0_val_loss: 0.01928 |  0:00:16s
epoch 12 | loss: 0.01882 | val_0_val_loss: 0.0194  |  0:00:17s
epoch 13 | loss: 0.01853 | val_0_val_loss: 0.01876 |  0:00:19s
epoch 14 | loss: 0.01828 | val_0_val_loss: 0.01821 |  0:00:20s
epoch 15 | loss: 0.01803 | val_0_val_loss: 0.01821 |  0:00:21s
epoch 16 | loss: 0.01789 | val_0_val_loss: 0.01812 |  0:00:23s
epoch 17 | loss: 0.01769 | val_0_val_loss: 0.01831 |  0

  if __name__ == '__main__':


epoch 1  | loss: 0.03099 | val_0_val_loss: 0.03059 |  0:00:02s
epoch 2  | loss: 0.02546 | val_0_val_loss: 0.0227  |  0:00:04s
epoch 3  | loss: 0.02181 | val_0_val_loss: 0.02127 |  0:00:05s
epoch 4  | loss: 0.02096 | val_0_val_loss: 0.02075 |  0:00:07s
epoch 5  | loss: 0.02055 | val_0_val_loss: 0.02051 |  0:00:08s
epoch 6  | loss: 0.02032 | val_0_val_loss: 0.02033 |  0:00:10s
epoch 7  | loss: 0.02006 | val_0_val_loss: 0.02009 |  0:00:11s
epoch 8  | loss: 0.01982 | val_0_val_loss: 0.01992 |  0:00:12s
epoch 9  | loss: 0.01952 | val_0_val_loss: 0.01966 |  0:00:14s
epoch 10 | loss: 0.01919 | val_0_val_loss: 0.01938 |  0:00:15s
epoch 11 | loss: 0.01892 | val_0_val_loss: 0.01931 |  0:00:16s
epoch 12 | loss: 0.01862 | val_0_val_loss: 0.01908 |  0:00:17s
epoch 13 | loss: 0.0184  | val_0_val_loss: 0.01879 |  0:00:19s
epoch 14 | loss: 0.01815 | val_0_val_loss: 0.01896 |  0:00:20s
epoch 15 | loss: 0.01798 | val_0_val_loss: 0.01866 |  0:00:22s
epoch 16 | loss: 0.01776 | val_0_val_loss: 0.01877 |  0

epoch 19 | loss: 0.01739 | val_0_val_loss: 0.01908 |  0:00:27s
epoch 20 | loss: 0.01723 | val_0_val_loss: 0.01791 |  0:00:28s
epoch 21 | loss: 0.0173  | val_0_val_loss: 0.0182  |  0:00:30s
epoch 22 | loss: 0.01728 | val_0_val_loss: 0.01777 |  0:00:31s
epoch 23 | loss: 0.01713 | val_0_val_loss: 0.01916 |  0:00:32s
epoch 24 | loss: 0.01715 | val_0_val_loss: 0.01763 |  0:00:34s
epoch 25 | loss: 0.01693 | val_0_val_loss: 0.01903 |  0:00:36s
epoch 26 | loss: 0.0169  | val_0_val_loss: 0.01778 |  0:00:37s
epoch 27 | loss: 0.01688 | val_0_val_loss: 0.01915 |  0:00:39s
epoch 28 | loss: 0.01676 | val_0_val_loss: 0.01754 |  0:00:40s
epoch 29 | loss: 0.01678 | val_0_val_loss: 0.01758 |  0:00:42s
epoch 30 | loss: 0.01669 | val_0_val_loss: 0.01976 |  0:00:43s
epoch 31 | loss: 0.01654 | val_0_val_loss: 0.01761 |  0:00:45s
epoch 32 | loss: 0.01653 | val_0_val_loss: 0.01755 |  0:00:46s
epoch 33 | loss: 0.01661 | val_0_val_loss: 0.01743 |  0:00:48s
epoch 34 | loss: 0.01646 | val_0_val_loss: 0.01734 |  0

  if __name__ == '__main__':


epoch 1  | loss: 0.03014 | val_0_val_loss: 0.02916 |  0:00:02s
epoch 2  | loss: 0.02428 | val_0_val_loss: 0.02219 |  0:00:04s
epoch 3  | loss: 0.02149 | val_0_val_loss: 0.02088 |  0:00:05s
epoch 4  | loss: 0.02083 | val_0_val_loss: 0.02053 |  0:00:07s
epoch 5  | loss: 0.02046 | val_0_val_loss: 0.02027 |  0:00:08s
epoch 6  | loss: 0.02033 | val_0_val_loss: 0.02014 |  0:00:10s
epoch 7  | loss: 0.0201  | val_0_val_loss: 0.01996 |  0:00:11s
epoch 8  | loss: 0.01999 | val_0_val_loss: 0.0199  |  0:00:13s
epoch 9  | loss: 0.01982 | val_0_val_loss: 0.01977 |  0:00:14s
epoch 10 | loss: 0.01955 | val_0_val_loss: 0.01955 |  0:00:16s
epoch 11 | loss: 0.0193  | val_0_val_loss: 0.0194  |  0:00:17s
epoch 12 | loss: 0.01906 | val_0_val_loss: 0.01899 |  0:00:18s
epoch 13 | loss: 0.01872 | val_0_val_loss: 0.01968 |  0:00:20s
epoch 14 | loss: 0.0184  | val_0_val_loss: 0.01863 |  0:00:21s
epoch 15 | loss: 0.0182  | val_0_val_loss: 0.01866 |  0:00:22s
epoch 16 | loss: 0.01801 | val_0_val_loss: 0.01916 |  0

In [37]:
pred_mean = np.mean(pred_encode, axis = 0)
# take a copy of all our training sig_ids for reference
test_sig_ids = test_features['sig_id'].copy()

# select all indices when 'cp_type' is 'ctl_vehicle'
test_ctl_vehicle_idx = (test_features['cp_type'] == 'ctl_vehicle')

# change all cp_type == ctl_vehicle predictions to zero
pred_mean[test_sig_ids[test_ctl_vehicle_idx].index.values] = 0
test_submission = pd.DataFrame({'sig_id' : test_sig_ids})
test_preds_df = pd.DataFrame(pred_mean, columns=train_targets_scored.columns[1:])
test_submission = pd.concat([test_submission, test_preds_df], axis=1)
test_submission.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000958,0.000903,0.001403,0.007455,0.014341,0.003101,0.002029,0.005461,0.000422,...,0.000544,0.000959,0.001637,0.095493,0.00278,0.000554,0.002451,0.00159,0.001579,0.001294
1,id_001897cda,0.000891,0.001052,0.001741,0.009358,0.012656,0.003759,0.002646,0.004528,0.001849,...,0.000677,0.000977,0.002123,0.002072,0.002038,0.000644,0.001265,0.001665,0.001365,0.001814
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
test_submission.to_csv('submission.csv', index=False)