In [1]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import QuantileTransformer
from joblib import dump, load

In [2]:
train_features = pd.read_csv('D:\\Dataset\\MOA\\train_features.csv')
train_targets_scored = pd.read_csv('D:\\Dataset\\MOA\\train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('D:\\Dataset\\MOA\\train_targets_nonscored.csv')
drug = pd.read_csv('D://Dataset//MOA//train_drug.csv')

test_features = pd.read_csv('D:\\Dataset\\MOA\\test_features.csv')
sample_submission = pd.read_csv('D:\\Dataset\\MOA\\sample_submission.csv')

In [3]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [4]:
transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
transformer.fit(train_features.loc[:, GENES + CELLS])
train_features[GENES + CELLS] = transformer.transform(train_features.loc[:, GENES + CELLS])

dump(transformer, 'rank_1844_5fold.bin', compress=True)


test_features[GENES + CELLS] = transformer.transform(test_features.loc[:, GENES + CELLS])

In [5]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [6]:
# GENES
n_comp = 600  #<--Update

data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])
pca_genes = PCA(n_components=n_comp, random_state=42)
data2 = pca_genes.fit_transform(data[GENES])

dump(pca_genes, 'pca_1844_5fold_genes.bin', compress=True)
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [7]:
#CELLS
n_comp = 50  #<--Update

data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
pca_cells = PCA(n_components=n_comp, random_state=42)
data2 = pca_cells.fit_transform(data[CELLS])

dump(pca_cells, 'pca_1844_5fold_cells.bin', compress=True)


train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [8]:
from sklearn.feature_selection import VarianceThreshold


var_thresh = VarianceThreshold(0.8)  #<-- Update
data = train_features.append(test_features)
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

dump(var_thresh, 'var_1844_5fold.bin', compress=True)

train_features_transformed = data_transformed[ : train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0] : ]


train_features = pd.DataFrame(train_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                              columns=['sig_id','cp_type','cp_time','cp_dose'])

train_features = pd.concat([train_features, pd.DataFrame(train_features_transformed)], axis=1)


test_features = pd.DataFrame(test_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                             columns=['sig_id','cp_type','cp_time','cp_dose'])

test_features = pd.concat([test_features, pd.DataFrame(test_features_transformed)], axis=1)

train_features.shape

(23814, 1040)

In [9]:
from sklearn.cluster import KMeans
from joblib import dump , load

def fe_cluster(train, test, n_clusters_g = 35, n_clusters_c = 5, SEED = 123):
    
    features_g = list(train.columns[4:776])
    features_c = list(train.columns[776:876])
    
    def create_cluster(train, test, features, kind = 'g', n_clusters = n_clusters_g):
        #train_ = train[features].copy()
        #test_ = test[features].copy()
        #data = pd.concat([train_, test_], axis = 0)
        #kmeans = KMeans(n_clusters = n_clusters, random_state = SEED).fit(data)
        #dump(kmeans, f'Kmeans_feat{kind}.bin', compress=True)
        km = load(f'Kmeans_feat{kind}.bin')
        train[f'clusters_{kind}'] = km.labels_[:train.shape[0]]
        test[f'clusters_{kind}'] = km.labels_[train.shape[0]:]
        #print(test)
        train = pd.get_dummies(train, columns = [f'clusters_{kind}'])
        test = pd.get_dummies(test, columns = [f'clusters_{kind}'])
        return train, test
    
    train, test = create_cluster(train, test, features_g, kind = 'g', n_clusters = n_clusters_g)
    train, test = create_cluster(train, test, features_c, kind = 'c', n_clusters = n_clusters_c)
    return train, test

train_features ,test_features=fe_cluster(train_features,test_features)

In [10]:
print(train_features.shape, test_features.shape)

(23814, 1080) (3982, 1080)


In [11]:
print(train_features.shape, test_features.shape)

(23814, 1080) (3982, 1080)


In [12]:
train = train_features.merge(train_targets_scored, on='sig_id')

#train = train.merge(drug, on='sig_id')

train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

In [13]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [14]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()


In [115]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


def create_folds(num_starts, num_splits, trn):

    folds = []
    scored = trn.copy()
    #targets_ = scored.loc[:, train_targets_scored.columns].columns[1:].tolist()
    #train_cols = train_features.columns.tolist() + ['fold','drug_id']
    #train_cols = [col for col in train_cols if col!='cp_type']
    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index.sort_values()
    vc2 = vc.loc[vc > 18].index.sort_values()
    
    for seed in range(num_starts):
    
        # STRATIFY DRUGS 18X OR LESS
        dct1 = {}; dct2 = {}
        skf = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        tmp = scored.groupby('drug_id')[target_cols].mean().loc[vc1]
        for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_cols])):
            dd = {k:fold for k in tmp.index[idxV].values}
            dct1.update(dd)
    
        # STRATIFY DRUGS MORE THAN 18X
        skf = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop = True)
        for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_cols])):
            dd = {k:fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)
    
        # ASSIGN FOLDS
        scored['fold'] = scored.drug_id.map(dct1)
        scored.loc[scored.fold.isna(),'fold'] =\
            scored.loc[scored.fold.isna(),'sig_id'].map(dct2)
        scored.fold = scored.fold.astype('int8')
        folds.append(scored.fold.values)
        #train_cols = train_feats.columns.tolist() + ['fold','drug_id']
        #train_feats_main = train_feats_.merge(scored, on='sig_id', how='left')
        #train_feats_main['fold'] = train_feats_main['fold'].astype(int)
        
    return scored

In [15]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=5)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,24,D1,1.134849,0.907687,-0.416385,-0.966814,-0.254723,-1.017473,-1.364787,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,72,D1,0.119282,0.681738,0.272399,0.080113,1.205169,0.686517,0.313396,...,0,0,0,0,0,0,0,0,0,2
2,id_000a6266a,48,D1,0.779973,0.946463,1.425350,-0.132928,-0.006122,1.492493,0.235577,...,0,0,0,0,0,0,0,0,0,1
3,id_0015fd391,48,D1,-0.734910,-0.274641,-0.438509,0.759097,2.346330,-0.858153,-2.288417,...,0,0,0,0,0,0,0,0,0,2
4,id_001626bd3,72,D2,-0.452718,-0.477513,0.972316,0.970731,1.463427,-0.869555,-0.375501,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.237856,-1.228203,0.218376,-0.365976,-0.330177,0.569243,-0.150978,...,0,0,0,0,0,0,0,0,0,0
21944,id_fffb1ceed,24,D2,0.209361,-0.022389,-0.235888,-0.796989,-0.674009,0.919312,0.735603,...,0,0,0,0,0,0,0,0,0,4
21945,id_fffb70c0c,24,D2,-1.911021,0.587228,-0.588417,1.296405,-1.002640,0.850589,-0.304313,...,0,0,0,0,0,0,0,0,0,0
21946,id_fffcb9e7c,24,D1,0.816407,0.417618,0.431631,0.300617,1.070346,-0.024189,0.048942,...,0,0,0,0,0,0,0,0,0,1


In [16]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1285)
(21948, 1286)
(3624, 1079)
(21948, 207)
(3982, 207)


In [17]:
class Model(nn.Module):      # <-- Update
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.4)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.4)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = F.relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x

In [18]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct

In [19]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds

In [20]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets,self.weight)

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [21]:
def process_data(data):
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    return data

In [22]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

1081

In [23]:
dump(target_cols, '1844_5fold_target_cols.bin')
dump(feature_cols, '1844_5fold_feature_cols.bin')

['1844_5fold_feature_cols.bin']

In [24]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 5            #<-- Update
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=2048


In [25]:
def run_training(fold, seed):
    
    seed_everything(seed)
    
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    
    loss_fn = nn.BCEWithLogitsLoss()
    
    loss_tr = SmoothBCEwLogits(smoothing =0.001)
    
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler, loss_tr, trainloader, DEVICE)
        print(f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"D:\\MOA_Pretrained_Models\\MoA_Model_01844\\FOLD{fold}_{seed}.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,

    )
    
    model.load_state_dict(torch.load(f"D:\\MOA_Pretrained_Models\\MoA_Model_01844\\FOLD{fold}_{seed}.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions

In [26]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [27]:
# Averaging on multiple SEEDS

SEED = [0, 1, 2]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

train[target_cols] = oof
test[target_cols] = predictions

SEED: 0, FOLD: 0, EPOCH: 0, train_loss: 0.4936551513501268
SEED: 0 ,FOLD: 0, EPOCH: 0, valid_loss: 0.029317243237580573
SEED: 0, FOLD: 0, EPOCH: 1, train_loss: 0.025618759766760944
SEED: 0 ,FOLD: 0, EPOCH: 1, valid_loss: 0.02045668379536697
SEED: 0, FOLD: 0, EPOCH: 2, train_loss: 0.022699603513962982
SEED: 0 ,FOLD: 0, EPOCH: 2, valid_loss: 0.018587393686175346
SEED: 0, FOLD: 0, EPOCH: 3, train_loss: 0.021392317947702133
SEED: 0 ,FOLD: 0, EPOCH: 3, valid_loss: 0.017748841270804405
SEED: 0, FOLD: 0, EPOCH: 4, train_loss: 0.02056247222682704
SEED: 0 ,FOLD: 0, EPOCH: 4, valid_loss: 0.01754064065005098
SEED: 0, FOLD: 0, EPOCH: 5, train_loss: 0.020253552866262802
SEED: 0 ,FOLD: 0, EPOCH: 5, valid_loss: 0.017490540152149542
SEED: 0, FOLD: 0, EPOCH: 6, train_loss: 0.020219797160530437
SEED: 0 ,FOLD: 0, EPOCH: 6, valid_loss: 0.01720952128193208
SEED: 0, FOLD: 0, EPOCH: 7, train_loss: 0.020241124029068844
SEED: 0 ,FOLD: 0, EPOCH: 7, valid_loss: 0.0173496438190341
SEED: 0, FOLD: 0, EPOCH: 8, trai

SEED: 0, FOLD: 2, EPOCH: 17, train_loss: 0.019476176253047543
SEED: 0 ,FOLD: 2, EPOCH: 17, valid_loss: 0.01658906369869198
SEED: 0, FOLD: 2, EPOCH: 18, train_loss: 0.019159017677378397
SEED: 0 ,FOLD: 2, EPOCH: 18, valid_loss: 0.016364830040505953
SEED: 0, FOLD: 2, EPOCH: 19, train_loss: 0.01889975161791064
SEED: 0 ,FOLD: 2, EPOCH: 19, valid_loss: 0.01632181898291622
SEED: 0, FOLD: 2, EPOCH: 20, train_loss: 0.018506386499528005
SEED: 0 ,FOLD: 2, EPOCH: 20, valid_loss: 0.01620917400079114
SEED: 0, FOLD: 2, EPOCH: 21, train_loss: 0.018142445444844772
SEED: 0 ,FOLD: 2, EPOCH: 21, valid_loss: 0.01609649389450039
SEED: 0, FOLD: 2, EPOCH: 22, train_loss: 0.017747088544664606
SEED: 0 ,FOLD: 2, EPOCH: 22, valid_loss: 0.01608567155365433
SEED: 0, FOLD: 2, EPOCH: 23, train_loss: 0.017425535631406565
SEED: 0 ,FOLD: 2, EPOCH: 23, valid_loss: 0.016064309887588023
SEED: 0, FOLD: 2, EPOCH: 24, train_loss: 0.017244716087167246
SEED: 0 ,FOLD: 2, EPOCH: 24, valid_loss: 0.016077506914734842
SEED: 0, FOLD:

SEED: 1, FOLD: 0, EPOCH: 9, train_loss: 0.020312114416257195
SEED: 1 ,FOLD: 0, EPOCH: 9, valid_loss: 0.017160854754703386
SEED: 1, FOLD: 0, EPOCH: 10, train_loss: 0.02031999644215988
SEED: 1 ,FOLD: 0, EPOCH: 10, valid_loss: 0.01748760717787913
SEED: 1, FOLD: 0, EPOCH: 11, train_loss: 0.020276958676244038
SEED: 1 ,FOLD: 0, EPOCH: 11, valid_loss: 0.017477706287588393
SEED: 1, FOLD: 0, EPOCH: 12, train_loss: 0.020215231466336525
SEED: 1 ,FOLD: 0, EPOCH: 12, valid_loss: 0.017425501506243432
SEED: 1, FOLD: 0, EPOCH: 13, train_loss: 0.020118913802223793
SEED: 1 ,FOLD: 0, EPOCH: 13, valid_loss: 0.01705980950168201
SEED: 1, FOLD: 0, EPOCH: 14, train_loss: 0.019984045295395714
SEED: 1 ,FOLD: 0, EPOCH: 14, valid_loss: 0.01683983275932925
SEED: 1, FOLD: 0, EPOCH: 15, train_loss: 0.019924099840547726
SEED: 1 ,FOLD: 0, EPOCH: 15, valid_loss: 0.016853347713393824
SEED: 1, FOLD: 0, EPOCH: 16, train_loss: 0.019724249434859856
SEED: 1 ,FOLD: 0, EPOCH: 16, valid_loss: 0.016754164573337352
SEED: 1, FOLD:

SEED: 1, FOLD: 3, EPOCH: 1, train_loss: 0.02448790122255467
SEED: 1 ,FOLD: 3, EPOCH: 1, valid_loss: 0.02015777150435107
SEED: 1, FOLD: 3, EPOCH: 2, train_loss: 0.022633962306207504
SEED: 1 ,FOLD: 3, EPOCH: 2, valid_loss: 0.01858004629611969
SEED: 1, FOLD: 3, EPOCH: 3, train_loss: 0.021181615838861984
SEED: 1 ,FOLD: 3, EPOCH: 3, valid_loss: 0.017790326236614157
SEED: 1, FOLD: 3, EPOCH: 4, train_loss: 0.02045616824719785
SEED: 1 ,FOLD: 3, EPOCH: 4, valid_loss: 0.017576046607324054
SEED: 1, FOLD: 3, EPOCH: 5, train_loss: 0.020205302771342838
SEED: 1 ,FOLD: 3, EPOCH: 5, valid_loss: 0.017498965188860894
SEED: 1, FOLD: 3, EPOCH: 6, train_loss: 0.020242915299815544
SEED: 1 ,FOLD: 3, EPOCH: 6, valid_loss: 0.017574721202254295
SEED: 1, FOLD: 3, EPOCH: 7, train_loss: 0.020224107942287472
SEED: 1 ,FOLD: 3, EPOCH: 7, valid_loss: 0.017353903369179795
SEED: 1, FOLD: 3, EPOCH: 8, train_loss: 0.020233590236824493
SEED: 1 ,FOLD: 3, EPOCH: 8, valid_loss: 0.017335346740271363
SEED: 1, FOLD: 3, EPOCH: 9, 

SEED: 2, FOLD: 0, EPOCH: 18, train_loss: 0.01921979691563309
SEED: 2 ,FOLD: 0, EPOCH: 18, valid_loss: 0.016513354224818092
SEED: 2, FOLD: 0, EPOCH: 19, train_loss: 0.01896142419697582
SEED: 2 ,FOLD: 0, EPOCH: 19, valid_loss: 0.016381853951939515
SEED: 2, FOLD: 0, EPOCH: 20, train_loss: 0.01861335482934247
SEED: 2 ,FOLD: 0, EPOCH: 20, valid_loss: 0.01630061950002398
SEED: 2, FOLD: 0, EPOCH: 21, train_loss: 0.018184201292477657
SEED: 2 ,FOLD: 0, EPOCH: 21, valid_loss: 0.016232046937303884
SEED: 2, FOLD: 0, EPOCH: 22, train_loss: 0.017809289480573025
SEED: 2 ,FOLD: 0, EPOCH: 22, valid_loss: 0.016144783635224615
SEED: 2, FOLD: 0, EPOCH: 23, train_loss: 0.017547462074814932
SEED: 2 ,FOLD: 0, EPOCH: 23, valid_loss: 0.016142239794135094
SEED: 2, FOLD: 0, EPOCH: 24, train_loss: 0.017383888174874195
SEED: 2 ,FOLD: 0, EPOCH: 24, valid_loss: 0.016112037881144455
SEED: 2, FOLD: 1, EPOCH: 0, train_loss: 0.49166649827004777
SEED: 2 ,FOLD: 1, EPOCH: 0, valid_loss: 0.024349446115749223
SEED: 2, FOLD: 

SEED: 2 ,FOLD: 3, EPOCH: 10, valid_loss: 0.017377082098807606
SEED: 2, FOLD: 3, EPOCH: 11, train_loss: 0.020247139967978
SEED: 2 ,FOLD: 3, EPOCH: 11, valid_loss: 0.01724230721592903
SEED: 2, FOLD: 3, EPOCH: 12, train_loss: 0.02017679134302813
SEED: 2 ,FOLD: 3, EPOCH: 12, valid_loss: 0.01727732220398528
SEED: 2, FOLD: 3, EPOCH: 13, train_loss: 0.020077619229213917
SEED: 2 ,FOLD: 3, EPOCH: 13, valid_loss: 0.017284802507076945
SEED: 2, FOLD: 3, EPOCH: 14, train_loss: 0.020010813530804455
SEED: 2 ,FOLD: 3, EPOCH: 14, valid_loss: 0.01716196888259479
SEED: 2, FOLD: 3, EPOCH: 15, train_loss: 0.019771243236365095
SEED: 2 ,FOLD: 3, EPOCH: 15, valid_loss: 0.01703906399863107
SEED: 2, FOLD: 3, EPOCH: 16, train_loss: 0.019710850422743006
SEED: 2 ,FOLD: 3, EPOCH: 16, valid_loss: 0.01686716784856149
SEED: 2, FOLD: 3, EPOCH: 17, train_loss: 0.019422728595310364
SEED: 2 ,FOLD: 3, EPOCH: 17, valid_loss: 0.01675425570990358
SEED: 2, FOLD: 3, EPOCH: 18, train_loss: 0.019186835965492588
SEED: 2 ,FOLD: 3, 

KeyError: "None of [Index(['5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor',\n       'acat_inhibitor', 'acetylcholine_receptor_agonist',\n       'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor',\n       'adenosine_receptor_agonist', 'adenosine_receptor_antagonist',\n       'adenylyl_cyclase_activator', 'adrenergic_receptor_agonist',\n       ...\n       'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',\n       'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',\n       'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',\n       'vitamin_d_receptor_agonist', 'wnt_inhibitor'],\n      dtype='object', length=206)] are in the [columns]"

In [43]:


pred1 = np.load('moa_01844.npy')
pred2 = np.load('model_oof_01840.npy')
pred3 = np.load('moa_oof_01858.npy')
pred4 = np.load('moa_oof_xgb.npy')
pred5 = np.load('model_newcv_oof_01835.npy')
pred6 = np.load('moa_oof_newcv_01844.npy')
pred7= np.load('moa_oof_newcv_01858_opt.npy')

pred8 = np.load('model_newcv_oof_01835.npy')
pred9 = np.load('moa_oof_newcv_01858_7folds.npy')
pred10 = np.load('moa_oof_newcv_01838_7folds.npy')
pred11 = np.load('moa_oof_newcv_01858.npy')



In [34]:
(pred11 * 5 + pred8 *2 + pred1)/8

array([[0.00137969, 0.00123399, 0.00066726, ..., 0.00167902, 0.00038655,
        0.00175185],
       [0.00068334, 0.00062712, 0.00224532, ..., 0.00274619, 0.00163212,
        0.00300532],
       [0.00182621, 0.00229216, 0.00078583, ..., 0.00068411, 0.00042305,
        0.00122084],
       ...,
       [0.00067229, 0.00092357, 0.00287745, ..., 0.00195627, 0.0142041 ,
        0.00260228],
       [0.00045427, 0.00036849, 0.00042206, ..., 0.00035887, 0.00059093,
        0.00068581],
       [0.00051417, 0.00082633, 0.00092055, ..., 0.00063163, 0.00032853,
        0.00045329]])

In [45]:
predictions

array([[0.00151763, 0.00200997, 0.00352883, ..., 0.00239399, 0.00749808,
        0.00215664],
       [0.00089854, 0.0012979 , 0.00258505, ..., 0.00152582, 0.00215663,
        0.00353468],
       [0.00114159, 0.00129429, 0.00250067, ..., 0.00261794, 0.00053776,
        0.00274661],
       ...,
       [0.00159203, 0.00134392, 0.00142612, ..., 0.00178747, 0.00068978,
        0.00149942],
       [0.00207427, 0.00163424, 0.00174905, ..., 0.00253873, 0.00038995,
        0.00387187],
       [0.00083419, 0.00120032, 0.00151174, ..., 0.00226848, 0.0004628 ,
        0.00159326]])

In [123]:
train[target_cols] = (pred2 +pred1+pred3)/3

In [36]:
from sklearn.metrics import log_loss

valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)


y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values
score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)
    

CV log_loss:  0.01574985566852989


In [37]:
for i in target_cols:
    test[i]=0.

In [98]:
pred1_ = np.load('moa_01844.npy')
pred2_ = np.load('model_01840.npy')
pred3_ = np.load('moa_01858.npy')
pred4_ = np.load('moa_xgb.npy')

In [99]:
test[target_cols] = (pred1_*0.10 + pred2_ * 0.80+ pred3_ * 0.10)

In [100]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission_05_11_2020_v1.csv', index=False)


In [38]:
np.save('moa_oof_newcv_01844', oof)

In [39]:
np.save('moa_newcv_01844', predictions)

In [28]:
train

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,24,D1,1.134849,0.907687,-0.416385,-0.966814,-0.254723,-1.017473,-1.364787,...,0.000838,0.000540,0.002809,0.001458,0.001181,0.000739,0.000547,0.002032,0.000300,0.001982
1,id_000779bfc,72,D1,0.119282,0.681738,0.272399,0.080113,1.205169,0.686517,0.313396,...,0.001504,0.003216,0.004645,0.005326,0.004050,0.000679,0.001483,0.002844,0.001748,0.002967
2,id_000a6266a,48,D1,0.779973,0.946463,1.425350,-0.132928,-0.006122,1.492493,0.235577,...,0.000631,0.003011,0.003850,0.001351,0.020551,0.001575,0.307875,0.000941,0.000467,0.001430
3,id_0015fd391,48,D1,-0.734910,-0.274641,-0.438509,0.759097,2.346330,-0.858153,-2.288417,...,0.001118,0.002908,0.002429,0.104720,0.003758,0.001489,0.002399,0.001373,0.000207,0.000712
4,id_001626bd3,72,D2,-0.452718,-0.477513,0.972316,0.970731,1.463427,-0.869555,-0.375501,...,0.001834,0.002123,0.004314,0.001826,0.003485,0.001336,0.001609,0.002830,0.000616,0.003128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.237856,-1.228203,0.218376,-0.365976,-0.330177,0.569243,-0.150978,...,0.000873,0.000658,0.003546,0.000345,0.001008,0.001680,0.006100,0.001703,0.001223,0.001372
21944,id_fffb1ceed,24,D2,0.209361,-0.022389,-0.235888,-0.796989,-0.674009,0.919312,0.735603,...,0.001077,0.000669,0.003029,0.001465,0.001537,0.001051,0.001604,0.001873,0.000750,0.001789
21945,id_fffb70c0c,24,D2,-1.911021,0.587228,-0.588417,1.296405,-1.002640,0.850589,-0.304313,...,0.000714,0.003098,0.004251,0.000294,0.019630,0.000835,0.017936,0.001747,0.027558,0.002846
21946,id_fffcb9e7c,24,D1,0.816407,0.417618,0.431631,0.300617,1.070346,-0.024189,0.048942,...,0.000378,0.000240,0.000879,0.001592,0.001604,0.000386,0.000796,0.000390,0.000196,0.000564


In [None]:
pred1 = np.load('moa_oof_01844.npy')
