In [1]:
import os
import gc
import random
import math
import time
import numpy as np
import pandas as pd

import category_encoders as ce
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.decomposition import KernelPCA

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.metrics import Metric

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
train_features = pd.read_csv('Data/train_features.csv')
train_targets_scored = pd.read_csv('Data/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('Data/train_targets_nonscored.csv')
test_features = pd.read_csv('Data/test_features.csv')
submission = pd.read_csv('Data/sample_submission.csv')

In [3]:
train = train_features.merge(train_targets_scored, on='sig_id')
# constrcut train&test except 'cp_type'=='ctl_vehicle' data
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
#test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features

print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3982, 876)


In [4]:
train['cp_time'] = train['cp_time'].map({24: -1, 48: 0, 72: 1})
train['cp_dose'] = train['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

test['cp_time'] = test['cp_time'].map({24: -1, 48: 0, 72: 1})
test['cp_dose'] = test['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

In [5]:
train = train.to_numpy()
test = test.to_numpy()
dist_len = 99 + 771
for d in range(dist_len):
    train[::, 4+d]  = preprocessing.scale(train[::, 4+d])
    test[::, 4+d]  = preprocessing.scale(test[::, 4+d])
train = train[::, 2:].astype('float64') 
test = test[::, 2:].astype('float64')

Feature Engineering 

Kernel PCA

In [6]:
pca_num = 20

transformer = KernelPCA(n_components=pca_num, kernel='linear')
X_transformed = transformer.fit_transform(train[::, :874])
test_transformed = transformer.transform(test)

Autoencoder

In [23]:
mse = nn.MSELoss()

batch_size = 100
latents = 40 

traningy = train[400:, :874]
valdationy = train[:400, :874]

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader_ae = torch.utils.data.DataLoader(
    traningy, batch_size=batch_size, shuffle=False, pin_memory=True)

test_loader_ae = torch.utils.data.DataLoader(
    valdationy, batch_size=batch_size, shuffle=False, pin_memory=True)

class AE(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.encoder_il = nn.Linear(874, 500)
        self.bnorm1 = nn.BatchNorm1d(num_features=500)
        self.encoder_hl1 = nn.Linear(500, 200)
        self.bnorm2 = nn.BatchNorm1d(num_features=200)
        self.encoder_hl2 = nn.Linear(200, 150)
        self.bnorm3 = nn.BatchNorm1d(num_features=150)
        self.encoder_ol = nn.Linear(150, latents)
        
        self.bnorm4 = nn.BatchNorm1d(num_features=latents)
        self.decoder_il = nn.Linear(latents, 150)
        self.bnorm5 = nn.BatchNorm1d(num_features=150)
        self.decoder_hl1 = nn.Linear(150, 200)
        self.bnorm6 = nn.BatchNorm1d(num_features=200)
        self.decoder_hl2 = nn.Linear(200, 500)
        self.bnorm7 = nn.BatchNorm1d(num_features=500)
        self.decoder_ol = nn.Linear(500, 874)
        
        self.elu = nn.ELU()
        
    def forward_encoder(self, x):
        x = self.encoder_il(x)
        x = self.elu(x)
        x = self.bnorm1(x)
        x = self.encoder_hl1(x)
        x = self.elu(x)
        x = self.bnorm2(x)
        x = self.encoder_hl2(x)
        x = self.elu(x)
        x = self.bnorm3(x)
        emb = self.encoder_ol(x)
        return emb
    
    def forward_decoder(self, emb):    
        x = self.bnorm4(emb)
        x = self.decoder_il(x)
        x = self.elu(x)
        x = self.bnorm5(x)
        x = self.decoder_hl1(x)
        x = self.elu(x)
        x = self.bnorm6(x)
        x = self.decoder_hl2(x)
        x = self.elu(x)
        x = self.bnorm7(x)
        x = self.decoder_ol(x)
        return x
    
model_ae = AE().to(device)
optimizer_ae = optim.Adam(model_ae.parameters(), lr=0.005)
epochs = 50

epoch_list = []
val_list = []

for epoch in range(epochs):
    train_loss_en = 0
    train_loss_de = 0
    loss = 0
    
    for x in train_loader_ae:
        
        x = x.to(device)
        optimizer_ae.zero_grad()
        x = x.view((-1, 874))
        emb = model_ae.forward_encoder(x.float())
        rec = model_ae.forward_decoder(emb)
        # compute training reconstruction loss
        train_loss = mse(rec.double(), x)

        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer_ae.step()
 
        # add the mini-batch training loss to epoch loss
        loss += train_loss_en

    if (epoch % 1) == 0:
        val_loss_en = 0 
        val_loss_de = 0
        
        for x in test_loader_ae:
            x = x.to(device)
            
            x = x.view((-1, 874))
            emb = model_ae.forward_encoder(x.float())
            rec = model_ae.forward_decoder(emb)
            # compute training reconstruction loss
            val_loss = mse(rec.double(), x)
            
        val_loss = val_loss.cpu().detach().numpy()
        val_list.append(val_loss)
       
        epoch_list.append(epoch)
        
        print("Validation: epoch : {}/{}, loss = {:.4f}".format(epoch+1, epochs, val_loss))

Validation: epoch : 1/50, loss = 0.6902
Validation: epoch : 2/50, loss = 0.5890
Validation: epoch : 3/50, loss = 0.5633
Validation: epoch : 4/50, loss = 0.5500
Validation: epoch : 5/50, loss = 0.5334
Validation: epoch : 6/50, loss = 0.5319
Validation: epoch : 7/50, loss = 0.5167
Validation: epoch : 8/50, loss = 0.5199
Validation: epoch : 9/50, loss = 0.5172
Validation: epoch : 10/50, loss = 0.5165
Validation: epoch : 11/50, loss = 0.5078
Validation: epoch : 12/50, loss = 0.5089
Validation: epoch : 13/50, loss = 0.5069
Validation: epoch : 14/50, loss = 0.5010
Validation: epoch : 15/50, loss = 0.5028
Validation: epoch : 16/50, loss = 0.4949
Validation: epoch : 17/50, loss = 0.4930
Validation: epoch : 18/50, loss = 0.4909
Validation: epoch : 19/50, loss = 0.4955
Validation: epoch : 20/50, loss = 0.4913
Validation: epoch : 21/50, loss = 0.4899
Validation: epoch : 22/50, loss = 0.4865
Validation: epoch : 23/50, loss = 0.4843
Validation: epoch : 24/50, loss = 0.4855
Validation: epoch : 25/50

In [38]:
model_ae.eval()
enc_ae = np.empty(shape = (train.shape[0], latents))
for i in range(enc_ae.shape[0]):
    x = torch.from_numpy(np.asarray(train[i, :874])).to(device).float()
    x = x.view(-1, 874)
    x = model_ae.forward_encoder(x)
    enc_ae[i, ::] = np.reshape(x.cpu().detach().numpy(), (latents))
    
enc_ae_test = np.empty(shape = (test.shape[0], latents))
for i in range(enc_ae_test.shape[0]):
    x = torch.from_numpy(np.asarray(test[i, :874])).to(device).float()
    x = x.view(-1, 874)
    x = model_ae.forward_encoder(x)
    enc_ae_test[i, ::] = np.reshape(x.cpu().detach().numpy(), (latents))

In [31]:
batch_size = 100

train_no_lables = np.concatenate((train[::, :874], X_transformed, enc_ae), axis = 1)

val = train_no_lables[:2000, ::]
train_d = train_no_lables[2000:, ::]

lables_train = train[2000:, 874:]
lables_val = train[:2000, 874:]

dataset = torch.utils.data.TensorDataset( torch.Tensor(train_d), torch.Tensor(lables_train) )
validationset = torch.utils.data.TensorDataset( torch.Tensor(val), torch.Tensor(lables_val) )

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_loader = torch.utils.data.DataLoader(
    validationset, batch_size=batch_size, shuffle=True, pin_memory=True)

input_shape = train_d.shape[1]

Tabnet

In [32]:
class LogitsLogLoss(Metric):

    def __init__(self):
        self._name = "val_loss"
        self._maximize = False

    def __call__(self, y_true, y_pred):

        logits = 1 / (1 + np.exp(-y_pred))
        aux = (1 - y_true) * np.log(1 - logits + 1e-15) + y_true * np.log(logits + 1e-15)
        return np.mean(-aux)

In [33]:
model = TabNetRegressor(n_d=24, n_a=24, n_steps=1, lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                                    optimizer_params=dict(lr=2e-2, weight_decay=1e-5), mask_type='entmax', 
                                    scheduler_params=dict(milestones=[50, 100, 150], gamma=0.9), 
                                    scheduler_fn=torch.optim.lr_scheduler.MultiStepLR)
model.fit(
  X_train=train_d, y_train=lables_train,
  eval_set=[(val, lables_val)],
  loss_fn = torch.nn.BCEWithLogitsLoss(),
  eval_metric = [LogitsLogLoss])

Device used : cuda
epoch 0  | loss: 0.36775 | val_0_val_loss: 0.04888 |  0:00:01s
epoch 1  | loss: 0.02998 | val_0_val_loss: 0.02872 |  0:00:02s
epoch 2  | loss: 0.02448 | val_0_val_loss: 0.02233 |  0:00:04s
epoch 3  | loss: 0.02186 | val_0_val_loss: 0.02143 |  0:00:05s
epoch 4  | loss: 0.02123 | val_0_val_loss: 0.02114 |  0:00:07s
epoch 5  | loss: 0.02087 | val_0_val_loss: 0.02086 |  0:00:08s
epoch 6  | loss: 0.02058 | val_0_val_loss: 0.02074 |  0:00:09s
epoch 7  | loss: 0.02036 | val_0_val_loss: 0.02038 |  0:00:11s
epoch 8  | loss: 0.02005 | val_0_val_loss: 0.02004 |  0:00:12s
epoch 9  | loss: 0.01968 | val_0_val_loss: 0.02036 |  0:00:14s
epoch 10 | loss: 0.01934 | val_0_val_loss: 0.01943 |  0:00:15s
epoch 11 | loss: 0.019   | val_0_val_loss: 0.02098 |  0:00:16s
epoch 12 | loss: 0.01862 | val_0_val_loss: 0.01893 |  0:00:18s
epoch 13 | loss: 0.01842 | val_0_val_loss: 0.01869 |  0:00:19s
epoch 14 | loss: 0.01816 | val_0_val_loss: 0.01857 |  0:00:20s
epoch 15 | loss: 0.01804 | val_0_val

In [41]:
train_aug = np.concatenate((test[::, :874], test_transformed, enc_ae_test), axis = 1)
pred_loader = torch.utils.data.DataLoader(train_aug, batch_size=batch_size, shuffle=True, pin_memory=True)

In [42]:
pred_encode = np.empty(shape = (test.shape[0], 206))
i = 1
for x in pred_loader:
    x = x.to(device)
    outputs = model.predict(x.float())
    pred_encode[((i-1)*(outputs.shape[0])):(i*(outputs.shape[0])), ::] = 1 / (1 + np.exp(-outputs))
    i += 1

In [43]:
# take a copy of all our training sig_ids for reference
test_sig_ids = test_features['sig_id'].copy()

# select all indices when 'cp_type' is 'ctl_vehicle'
test_ctl_vehicle_idx = (test_features['cp_type'] == 'ctl_vehicle')

# change all cp_type == ctl_vehicle predictions to zero
pred_encode[test_sig_ids[test_ctl_vehicle_idx].index.values] = 0
test_submission = pd.DataFrame({'sig_id' : test_sig_ids})
test_preds_df = pd.DataFrame(pred_encode, columns=train_targets_scored.columns[1:])
test_submission = pd.concat([test_submission, test_preds_df], axis=1)
test_submission.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001438,0.0011,0.002067,0.009581,0.014255,0.003371,0.003292,0.002611,0.00064,...,0.000788,0.001059,0.002315,0.002329,0.002214,0.000668,0.002227,0.002663,0.00055,0.001774
1,id_001897cda,0.001079,0.001245,0.002902,0.000989,0.001462,0.001726,0.001479,0.001522,0.000446,...,0.000576,0.003834,0.004479,0.000525,0.001478,0.000585,0.002189,0.000776,0.004364,0.000884
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
test_submission.to_csv('submission.csv', index=False)