In [1]:
import os
import gc
import random
import math
import time
import numpy as np
import pandas as pd

import category_encoders as ce
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn import preprocessing, decomposition

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
import torchvision
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
#Hyperparameters
batch_size = 100
val_size = 2000
epochs = 60
learning_rate = 1e-4
hidden_size_1 = 600
hidden_size_2 = 400
#Constants
feature_size = 874
lable_size = 206

In [3]:
train_features = pd.read_csv('Data/train_features.csv')
train_targets_scored = pd.read_csv('Data/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('Data/train_targets_nonscored.csv')
test_features = pd.read_csv('Data/test_features.csv')
submission = pd.read_csv('Data/sample_submission.csv')

In [4]:
train = train_features.merge(train_targets_scored, on='sig_id')
# constrcut train&test except 'cp_type'=='ctl_vehicle' data
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
#test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features

print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3982, 876)


In [5]:
train['cp_time'] = train['cp_time'].map({24: -1, 48: 0, 72: 1})
train['cp_dose'] = train['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

test['cp_time'] = test['cp_time'].map({24: -1, 48: 0, 72: 1})
test['cp_dose'] = test['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

In [6]:
train = train.to_numpy()
test = test.to_numpy()
dist_len = 99 + 771
for d in range(dist_len):
    train[::, 4+d]  = preprocessing.scale(train[::, 4+d])
    test[::, 4+d]  = preprocessing.scale(test[::, 4+d])
train = train[::, 2:].astype('float64') 
test = test[::, 2:].astype('float64')

In [7]:
val = train[:val_size, :feature_size]
train_d = train[val_size:, :feature_size]

lables_train = train[val_size:, feature_size:]
lables_val = train[:val_size, feature_size:]

dataset = torch.utils.data.TensorDataset( torch.Tensor(train_d), torch.Tensor(lables_train) )
validationset = torch.utils.data.TensorDataset( torch.Tensor(val), torch.Tensor(lables_val) )

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_loader = torch.utils.data.DataLoader(
    validationset, batch_size=batch_size, shuffle=True, pin_memory=True)

pred_loader = torch.utils.data.DataLoader(
    test, batch_size=batch_size, shuffle=True, pin_memory=True)

In [8]:
class NN(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.ilayer = nn.Linear(in_features=feature_size, out_features= feature_size)
        self.bnorm1 = nn.BatchNorm1d(feature_size)
        self.drop1 = nn.Dropout(0.2)
        self.hlayer1 = nn.Linear(in_features= feature_size, out_features= hidden_size_1)
        self.bnorm2 = nn.BatchNorm1d(hidden_size_1)
        self.drop2 = nn.Dropout(0.5)
        self.hlayer2 = nn.Linear(in_features= hidden_size_1, out_features= hidden_size_2)
        self.bnorm3 = nn.BatchNorm1d(hidden_size_2)
        self.drop3 = nn.Dropout(0.5)
        self.olayer = nn.Linear(in_features= hidden_size_2, out_features= lable_size)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.ilayer(x)
        x = self.relu(x)
        x = self.bnorm1(x)
        x = self.drop1(x)
        x = self.hlayer1(x)
        x = self.relu(x)
        x = self.bnorm2(x)
        x = self.drop2(x)  
        x = self.hlayer2(x)
        x = self.relu(x)
        x = self.bnorm3(x)
        x = self.drop3(x)      
        x = self.olayer(x)
        return x

In [9]:
model = NN().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
epoch_list = []
val_list = []

for epoch in range(epochs):
    train_loss_en = 0
    train_loss_de = 0
    loss = 0
    
    for x, y in train_loader:
        
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        
        outputs = model.forward(x.float())
        # compute training reconstruction loss
        train_loss = nn.BCEWithLogitsLoss()(outputs, y)

        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer.step()
 
        # add the mini-batch training loss to epoch loss
        loss += train_loss_en

    if (epoch % 1) == 0:
        val_loss_en = 0 
        val_loss_de = 0

        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device)
            outputs = model.forward(x.float())
            val_loss = nn.BCEWithLogitsLoss()(outputs, y)


        val_loss = val_loss.cpu().detach().numpy()

        val_list.append(val_loss)
       
        epoch_list.append(epoch)
        
        if epoch >= 10:
            if all((i <= val_list[-1]) for i in val_list[-6:-1]):
                break 

        print("Validation: epoch : {}/{}, loss = {:.4f}".format(epoch, epochs, val_loss))


    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.4f}".format(epoch + 1, epochs, train_loss))

Validation: epoch : 0/60, loss = 0.7088
epoch : 1/60, loss = 0.7113
Validation: epoch : 1/60, loss = 0.6180
epoch : 2/60, loss = 0.6185
Validation: epoch : 2/60, loss = 0.4668
epoch : 3/60, loss = 0.4814
Validation: epoch : 3/60, loss = 0.2939
epoch : 4/60, loss = 0.2880
Validation: epoch : 4/60, loss = 0.1782
epoch : 5/60, loss = 0.1594
Validation: epoch : 5/60, loss = 0.1063
epoch : 6/60, loss = 0.1162
Validation: epoch : 6/60, loss = 0.0782
epoch : 7/60, loss = 0.0773
Validation: epoch : 7/60, loss = 0.0527
epoch : 8/60, loss = 0.0546
Validation: epoch : 8/60, loss = 0.0452
epoch : 9/60, loss = 0.0942
Validation: epoch : 9/60, loss = 0.0373
epoch : 10/60, loss = 0.0348
Validation: epoch : 10/60, loss = 0.0356
epoch : 11/60, loss = 0.0305
Validation: epoch : 11/60, loss = 0.0299
epoch : 12/60, loss = 0.0352
Validation: epoch : 12/60, loss = 0.0435
epoch : 13/60, loss = 0.0277
Validation: epoch : 13/60, loss = 0.0274
epoch : 14/60, loss = 0.0249
Validation: epoch : 14/60, loss = 0.024

In [11]:
model.eval()
pred_encode = np.empty(shape = (test.shape[0], lable_size))
i = 1
for x in pred_loader:
    x = x.to(device)
    outputs = model.forward(x.float())
    pred_encode[((i-1)*(outputs.shape[0])):(i*(outputs.shape[0])), ::] = outputs.sigmoid().cpu().detach().numpy()
    i += 1

In [12]:
# take a copy of all our training sig_ids for reference
test_sig_ids = test_features['sig_id'].copy()

# select all indices when 'cp_type' is 'ctl_vehicle'
test_ctl_vehicle_idx = (test_features['cp_type'] == 'ctl_vehicle')

# change all cp_type == ctl_vehicle predictions to zero
pred_encode[test_sig_ids[test_ctl_vehicle_idx].index.values] = 0
test_submission = pd.DataFrame({'sig_id' : test_sig_ids})
test_preds_df = pd.DataFrame(pred_encode, columns=train_targets_scored.columns[1:])
test_submission = pd.concat([test_submission, test_preds_df], axis=1)
test_submission.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001004,0.001001,0.001313,0.009323,0.010531,0.002885,0.00227,0.004002,0.000909,...,0.000711,0.001203,0.002053,0.00272,0.003334,0.000818,0.00899,0.001361,0.002092,0.001486
1,id_001897cda,0.001116,0.001299,0.001267,0.012988,0.021447,0.003822,0.003586,0.004904,0.000841,...,0.000734,0.001117,0.002497,0.004187,0.002766,0.000867,0.00367,0.001628,0.002159,0.001549
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
test_submission.to_csv('submission.csv', index=False)