In [2]:
import os
import gc
import random
import math
import time
import numpy as np
import pandas as pd

import category_encoders as ce
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn import preprocessing
from sklearn import decomposition

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
import torchvision

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
train_features = pd.read_csv('Data/train_features.csv')
train_targets_scored = pd.read_csv('Data/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('Data/train_targets_nonscored.csv')
test_features = pd.read_csv('Data/test_features.csv')
submission = pd.read_csv('Data/sample_submission.csv')

In [6]:
train = train_features.merge(train_targets_scored, on='sig_id')
# constrcut train&test except 'cp_type'=='ctl_vehicle' data
print(train_features.shape, test_features.shape)
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
#test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features

print(train.shape, test.shape)

(23814, 876) (3982, 876)
(21948, 1082) (3982, 876)


In [7]:
train['cp_time'] = train['cp_time'].map({24: -1, 48: 0, 72: 1})
train['cp_dose'] = train['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

test['cp_time'] = test['cp_time'].map({24: -1, 48: 0, 72: 1})
test['cp_dose'] = test['cp_dose'].map({'D1': -0.5, 'D2': 0.5})

In [8]:
train = train.to_numpy()
test = test.to_numpy()
dist_len = 99 + 771
for d in range(dist_len):
    train[::, 4+d]  = preprocessing.scale(train[::, 4+d])
    test[::, 4+d]  = preprocessing.scale(test[::, 4+d])
train = train[::, 2:].astype('float64') 
test = test[::, 2:].astype('float64')

In [9]:
batch_size = 100

val = train[:2000, :874]
train_d = train[2000:, :874]

lables_train = train[2000:, 874:]
lables_val = train[:2000, 874:]

dataset = torch.utils.data.TensorDataset( torch.Tensor(train_d), torch.Tensor(lables_train) )
validationset = torch.utils.data.TensorDataset( torch.Tensor(val), torch.Tensor(lables_val) )

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_loader = torch.utils.data.DataLoader(
    validationset, batch_size=batch_size, shuffle=True, pin_memory=True)

pred_loader = torch.utils.data.DataLoader(
    test, batch_size=batch_size, shuffle=True, pin_memory=True)

input_shape = train_d.shape[1]

In [10]:
class NN(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.ilayer = nn.Linear(in_features=kwargs["input_shape"], out_features= 874)
        
        self.bnorm1 = nn.BatchNorm1d(874)
        
        self.drop1 = nn.Dropout(0.2)
        
        self.hlayer1 = nn.Linear(in_features= 874, out_features= 600)
        
        self.bnorm2 = nn.BatchNorm1d(600)
        
        self.drop2 = nn.Dropout(0.5)
        
        self.hlayer2 = nn.Linear(in_features= 600, out_features= 400)
        
        self.bnorm3 = nn.BatchNorm1d(400)
        
        self.drop3 = nn.Dropout(0.5)
        
        self.olayer = nn.Linear(in_features= 400, out_features= 206)
        
        self.tanh = nn.Tanh()
        
        self.relu = nn.ReLU()

        
    def forward(self, x):
        x = self.ilayer(x)
        x = self.relu(x)
        x = self.bnorm1(x)
        x = self.drop1(x)

        x = self.hlayer1(x)
        x = self.relu(x)
        x = self.bnorm2(x)
        x = self.drop2(x)
        
        
        x = self.hlayer2(x)
        x = self.relu(x)
        x = self.bnorm3(x)
        x = self.drop3(x)
        
        x = self.olayer(x)
        
        return x

In [11]:
# use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create a model from `AE` autoencoder class
# load it to the specified device, either gpu or cpu
model = NN(input_shape=input_shape).to(device)

# create an optimizer object
# Adam optimizer with learning rate 1e-3
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [12]:
epochs = 100

epoch_list = []
val_list = []

for epoch in range(epochs):
    train_loss_en = 0
    train_loss_de = 0
    loss = 0
    
    for x, y in train_loader:
        
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        
        outputs = model.forward(x.float())
        # compute training reconstruction loss
        train_loss = nn.BCEWithLogitsLoss()(outputs, y)

        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer.step()
 
        # add the mini-batch training loss to epoch loss
        loss += train_loss_en

    if (epoch % 1) == 0:
        val_loss_en = 0 
        val_loss_de = 0

        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device)
            outputs = model.forward(x.float())
            val_loss = nn.BCEWithLogitsLoss()(outputs, y)


        val_loss = val_loss.cpu().detach().numpy()

        val_list.append(val_loss)
       
        epoch_list.append(epoch)
        
        if epoch >= 10:
            if all((i <= val_list[-1]) for i in val_list[-6:-1]):
                break 

        print("Validation: epoch : {}/{}, loss = {:.4f}".format(epoch, epochs, val_loss))


    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.4f}".format(epoch + 1, epochs, train_loss))

Validation: epoch : 0/100, loss = 0.6959
epoch : 1/100, loss = 0.7064
Validation: epoch : 1/100, loss = 0.6408
epoch : 2/100, loss = 0.6126
Validation: epoch : 2/100, loss = 0.4549
epoch : 3/100, loss = 0.4663
Validation: epoch : 3/100, loss = 0.2866
epoch : 4/100, loss = 0.3209
Validation: epoch : 4/100, loss = 0.1924
epoch : 5/100, loss = 0.1816
Validation: epoch : 5/100, loss = 0.1373
epoch : 6/100, loss = 0.1082
Validation: epoch : 6/100, loss = 0.0696
epoch : 7/100, loss = 0.0750
Validation: epoch : 7/100, loss = 0.0538
epoch : 8/100, loss = 0.0567
Validation: epoch : 8/100, loss = 0.0481
epoch : 9/100, loss = 0.0399
Validation: epoch : 9/100, loss = 0.0393
epoch : 10/100, loss = 0.0447
Validation: epoch : 10/100, loss = 0.0354
epoch : 11/100, loss = 0.0317
Validation: epoch : 11/100, loss = 0.0286
epoch : 12/100, loss = 0.0274
Validation: epoch : 12/100, loss = 0.0267
epoch : 13/100, loss = 0.0281
Validation: epoch : 13/100, loss = 0.0249
epoch : 14/100, loss = 0.0267
Validation:

In [None]:
preds = clf.predict(X_test)

In [13]:
model.eval()
pred_encode = np.empty(shape = (test.shape[0], 206))
i = 1
for x in pred_loader:
    x = x.to(device)
    outputs = model.forward(x.float())
    pred_encode[((i-1)*(outputs.shape[0])):(i*(outputs.shape[0])), ::] = outputs.sigmoid().cpu().detach().numpy()
    i += 1

In [14]:
# take a copy of all our training sig_ids for reference
test_sig_ids = test_features['sig_id'].copy()

# select all indices when 'cp_type' is 'ctl_vehicle'
test_ctl_vehicle_idx = (test_features['cp_type'] == 'ctl_vehicle')

# change all cp_type == ctl_vehicle predictions to zero
pred_encode[test_sig_ids[test_ctl_vehicle_idx].index.values] = 0
test_submission = pd.DataFrame({'sig_id' : test_sig_ids})
test_preds_df = pd.DataFrame(pred_encode, columns=train_targets_scored.columns[1:])
test_submission = pd.concat([test_submission, test_preds_df], axis=1)
test_submission.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.002686,0.00261,0.002865,0.008455,0.011161,0.003767,0.003519,0.004574,0.002628,...,0.00223,0.003022,0.00349,0.012452,0.004902,0.002637,0.012502,0.002925,0.003458,0.003006
1,id_001897cda,0.002368,0.00253,0.002663,0.008108,0.012531,0.003851,0.003524,0.004741,0.002631,...,0.002167,0.002688,0.003393,0.010049,0.004241,0.002447,0.009817,0.002816,0.003268,0.002802
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_submission.to_csv('submission.csv', index=False)