In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_targets_scored.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# set seed
def seed_everything(seed = 42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    
seed_everything(seed = 42)

In [4]:
# reading in all the data files
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [5]:
ind_te = test_features[test_features['cp_type']=='ctl_vehicle'].index

In [6]:
def preprocess(df):
    d = df.copy()
    # change cp_dose: D1 -> 0, D2 -> 1
    d['cp_dose'] = d['cp_dose'].map({'D1':0, 'D2':1})
    # change cp_time: 24 -> 0, 48 -> 1, 72 -> 2
    d['cp_time'] = d['cp_time']//24-1
    # change cp_type: trt_cp -> 1, ctl_vehicle -> 0
    d['cp_type'] = d['cp_type'].map({'ctl_vehicle':0, 'trt_cp':1})
    # drop the sig_id column
    d.drop(columns = ['sig_id'], inplace = True)
    return d

In [7]:
X_train = preprocess(train_features)
X_test = preprocess(test_features)

In [8]:
print(X_train.shape, X_test.shape)

(23814, 875) (3982, 875)


In [9]:
y_train = train_targets_scored.copy()
# drop the sig_id column
y_train.drop(columns = ['sig_id'], inplace = True)
y_train.shape

(23814, 206)

In [10]:
# Stop training when the validation log loss metric has stopped decreasing for 3 epochs.
def callbacks():
    early_stopping = EarlyStopping(monitor = 'val_logloss',
                                   patience = 3,
                                   mode = 'min',
                                   restore_best_weights = True)
    return early_stopping

In [11]:
# create the neural network model
def create_model(num_columns):
    model = Sequential()
    model.add(Input(num_columns))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(WeightNormalization(Dense(2048, activation="relu")))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(WeightNormalization(Dense(1024, activation="relu")))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(WeightNormalization(Dense(512, activation="relu")))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(WeightNormalization(Dense(206, activation="sigmoid")))
    model.compile(optimizer = Adam(), loss = 'binary_crossentropy')
    return model

In [12]:
# feats = [0,1,2,.......,873]
feats = np.arange(0,X_train.shape[1],1)
inp_size = 874

In [13]:
# creating a sample df to store predicted values
sample = sample_submission.copy()

In [14]:
res = y_train.copy()
sample.loc[:, y_train.columns] = 0
res.loc[:, y_train.columns] = 0
print(sample.shape, res.shape)

(3982, 207) (23814, 206)


In [15]:
from time import time
import datetime

In [16]:
def log_loss_metric(y_true, y_pred):
    metrics = []
    for _target in y_train.columns:
        metrics.append(log_loss(y_true.loc[:, _target], y_pred.loc[:, _target].astype(float), labels = [0,1]))
    return np.mean(metrics)

In [17]:
test_preds = []
res_preds = []
np.random.seed(seed=42)
n_splits = 5
n_top = 10
n_round = 1

for seed in range(n_round):
    split_cols = np.random.choice(feats, inp_size, replace = False)
    res.loc[:, y_train.columns] = 0
    sample.loc[:, y_train.columns] = 0
    for n, (tr, te) in enumerate(KFold(n_splits = n_splits, random_state = seed, shuffle = True).split(X_train, y_train)):
        start_time = time()
        x_tr = X_train.values[tr][:, split_cols]
        x_val = X_train.values[te][:, split_cols]
        y_tr, y_val = y_train.values[tr], y_train.values[te]
        x_tt = X_test.values[:, split_cols]   
        for num in range(n_top):
            model = create_model(inp_size)
            model.fit(x_tr, y_tr,validation_data=(x_val, y_val), epochs = 25, batch_size = 128,
                      callbacks = callbacks(), verbose = 0)
            sample.loc[:, y_train.columns] += model.predict(x_tt, batch_size = 128)/(n_splits*n_top)
            res.loc[te, y_train.columns] += model.predict(x_val, batch_size = 128)/(n_top)
        loss = log_loss_metric(y_train.loc[te,y_train.columns], res.loc[te, y_train.columns])
        print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}], Seed {seed}, Fold {n}:', loss)
        K.clear_session()
        del model
    # set prediction 0 for ctl_vehicle observations
    sample.loc[ind_te, y_train.columns] = 0
    test_preds.append(sample.copy())    
    res_preds.append(res.copy())


[04:36], Seed 0, Fold 0: 0.015224653315950454
[04:24], Seed 0, Fold 1: 0.0154563089384379
[04:25], Seed 0, Fold 2: 0.015135387388482887
[04:22], Seed 0, Fold 3: 0.01553196754539777
[04:22], Seed 0, Fold 4: 0.015469322363329159


In [18]:
print(len(res_preds), len(test_preds))

1 1


In [19]:
aa = [1.0]
sample.loc[:, y_train.columns] = 0
for i in range(n_round):
    sample.loc[:, y_train.columns] += aa[i] * test_preds[i].loc[:, y_train.columns]
sample.loc[ind_te, y_train.columns] = 0

In [20]:
display(sample.head())
# write sample to submission.csv file
sample.to_csv('submission.csv', index=False)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000472,0.000781,0.001385,0.020379,0.02361,0.007416,0.003131,0.005955,0.000405,...,0.000448,0.000695,0.002757,0.00095,0.000364,0.000362,0.000464,0.001262,0.002725,0.00139
1,id_001897cda,0.000434,0.001252,0.002048,0.003315,0.003673,0.002269,0.002371,0.010873,0.005098,...,0.00065,0.001822,0.003031,0.000363,0.009506,0.000717,0.009564,0.001187,0.006719,0.001892
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000444,0.000745,0.001907,0.010084,0.010115,0.003678,0.001994,0.005109,0.000559,...,0.000403,0.002226,0.002809,0.011532,0.006735,0.000387,0.002841,0.001717,0.002285,0.002133
4,id_0027f1083,0.001314,0.001214,0.001441,0.014681,0.025693,0.005811,0.005971,0.003219,0.000427,...,0.000398,0.000726,0.003967,0.001313,0.000862,0.000433,0.00081,0.001615,0.00083,0.001271
