# Mechanism of Action (MoA) participation kernel

In [1]:
from time import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Deep learning libraries
import tensorflow as tf
from keras import Model, models
from keras.models import Sequential, load_model
from keras.layers import Dense, BatchNormalization, Dropout

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import tensorflow_addons as tfa
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses

# Utility functions
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler

from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

np.random.seed(7)
%matplotlib inline

 The versions of TensorFlow you are currently using is 2.3.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/test_features.csv


In [2]:
train_feat_df = pd.read_csv('./train_features.csv')
test_feat_df = pd.read_csv('./test_features.csv')

scored_train_targets_df = pd.read_csv('./train_targets_scored.csv')
nscored_train_targets_df = pd.read_csv('./train_targets_nonscored.csv')

submission_sample_df = pd.read_csv('./sample_submission.csv')

In [3]:
gene_cols = train_feat_df.columns[4:-100]
gene_data = train_feat_df[gene_cols]

In [4]:
cell_via_cols = train_feat_df.columns[-100:]
cell_via_data = train_feat_df[cell_via_cols]

## PCA gene features

In [5]:
gene_data

Unnamed: 0,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,g-8,g-9,...,g-762,g-763,g-764,g-765,g-766,g-767,g-768,g-769,g-770,g-771
0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,-0.0326,0.5548,-0.0921,...,-0.5055,-0.3167,1.0930,0.0084,0.8611,-0.5582,0.3008,1.6490,0.2968,-0.0224
1,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,0.3372,-0.4047,0.8507,...,-0.5338,0.0224,-0.4831,0.2128,-0.6999,-0.1214,-0.1626,-0.3340,-0.3289,-0.2718
2,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,0.2155,0.0065,1.2300,...,2.5770,0.2356,1.3230,-1.3730,-0.2682,0.8427,0.5797,0.3143,0.8133,0.7923
3,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,0.1792,-0.1321,-1.0600,...,-0.1292,3.4310,1.2720,-0.4733,-2.0560,0.5699,0.1996,0.4374,0.1588,-0.0343
4,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,-0.1498,-0.8789,0.8630,...,-0.6904,2.0540,-0.3131,-0.0809,0.3910,1.7660,-1.0020,-0.7534,0.5000,-0.6269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,0.5773,0.3055,-0.4726,0.1269,...,0.7790,0.5393,0.4112,-0.5059,0.0240,-0.2297,0.7221,0.5099,-0.1423,0.3806
23810,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,-0.2252,-0.5565,0.5112,0.6727,...,-0.0858,0.3606,-0.0248,0.0672,-0.5901,-0.1022,0.5247,0.5438,-0.1875,-0.4751
23811,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,-0.2541,0.1745,-0.0340,0.4865,...,0.1796,0.3488,0.0927,0.5166,-0.3099,-0.5946,0.9778,0.2326,-0.6191,0.3603
23812,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,0.0323,0.0463,0.4299,-0.7985,...,-0.1410,1.9590,0.8224,1.2500,-3.1930,-2.8720,0.1794,0.3109,-0.3491,-0.4741


In [6]:
scaled_gene_data = scale(gene_data)

In [7]:
pca1 = PCA(0.98)
pca1.fit(scaled_gene_data)

PCA(n_components=0.98)

In [8]:
pca_gene = pca1.transform(scaled_gene_data)
per_var = np.round(pca1.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
transformed_gene_feats = pd.DataFrame(pca_gene, columns=labels)

## PCA cell features

In [9]:
cell_via_data

Unnamed: 0,c-0,c-1,c-2,c-3,c-4,c-5,c-6,c-7,c-8,c-9,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,-0.0600,0.1083,0.6864,0.4043,0.4213,-0.6797,0.2888,0.4323,-0.3381,0.3407,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,0.0927,0.2723,0.2772,0.7776,0.3679,0.5696,0.2835,1.4080,0.3745,0.6775,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,-0.1312,-1.4640,0.3394,-1.7790,0.2188,0.5826,-0.7513,0.0543,0.7182,-0.4159,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,-0.3998,-4.5520,-2.7350,-1.9630,-2.8610,-1.2670,-2.5830,-0.5036,-3.1590,-1.8510,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,-0.3774,0.7364,-0.1659,0.2341,1.0060,0.3204,-0.0852,-0.2284,-0.2533,-0.3174,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,0.4224,0.1871,-0.4822,0.3713,0.4754,0.9512,0.4650,0.3005,0.0338,-0.7734,...,0.1969,0.0262,-0.8121,0.3434,0.5372,-0.3246,0.0631,0.9171,0.5258,0.4680
23810,0.2144,0.4350,0.1174,1.3960,-0.6772,0.2316,-0.5396,0.0581,0.6067,-0.4622,...,0.4286,0.4426,0.0423,-0.3195,-0.8086,-0.9798,-0.2084,-0.1224,-0.2715,0.3689
23811,1.0650,0.6329,0.3742,0.1237,0.6147,0.4589,-0.2372,1.1160,0.4623,0.2830,...,0.5409,0.3755,0.7343,0.2807,0.4116,0.6422,0.2256,0.7592,0.6656,0.3808
23812,0.5377,1.3240,0.9679,0.1419,1.2230,0.3404,-0.1589,0.8667,0.7749,-0.1458,...,-0.1105,0.4258,-0.2012,0.1506,1.5230,0.7101,0.1732,0.7015,-0.6290,0.0740


In [10]:
scaled_cell_data = scale(cell_via_data)

In [11]:
pca2 = PCA(.98)
pca2.fit(scaled_cell_data)

PCA(n_components=0.98)

In [12]:
pca_cell = pca2.transform(scaled_cell_data)
per_var = np.round(pca2.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
transformed_cell_feats = pd.DataFrame(pca_cell, columns=labels)

# NN Model

In [13]:
Train_data = train_feat_df.drop(columns=['sig_id']+list(gene_cols)+list(cell_via_cols) )

In [14]:
Train_data = Train_data.merge(transformed_gene_feats, how='left', left_index=True, right_index=True)
Train_data = Train_data.merge(transformed_cell_feats, how='left', left_index=True, right_index=True)
Train_data[['ctl_vehicle','trt_cp']] = pd.get_dummies(Train_data.cp_type)
Train_data[['D1','D2']] = pd.get_dummies(Train_data.cp_dose)
Train_data.drop(columns=['cp_type','cp_dose'], inplace=True)

In [15]:
Train_data

Unnamed: 0,cp_time,PC1_x,PC2_x,PC3_x,PC4_x,PC5_x,PC6_x,PC7_x,PC8_x,PC9_x,...,PC67_y,PC68_y,PC69_y,PC70_y,PC71_y,PC72_y,ctl_vehicle,trt_cp,D1,D2
0,24,-5.979264,-1.550012,-0.502657,7.610341,2.050595,-5.608992,0.569159,3.113275,-2.178552,...,-0.273477,-0.005386,-0.378165,-0.023851,0.041090,-0.266087,0,1,1,0
1,72,-4.980401,3.846588,7.405576,-4.088724,-1.163923,4.233157,1.971495,1.358631,0.273302,...,0.056971,-0.254081,0.077159,0.015269,-0.216188,-0.199728,0,1,1,0
2,48,-1.277123,2.626640,-3.098642,-5.160985,-0.862044,-0.332660,2.230737,-2.632508,-0.090752,...,-0.526111,-0.361337,0.094417,0.230581,0.173666,-0.158800,0,1,1,0
3,48,6.386498,-2.626036,-5.265179,-8.992739,0.707729,-6.287851,2.813483,-6.799060,-0.356779,...,0.107201,0.105797,-0.267601,0.024292,-1.032409,0.639363,0,1,1,0
4,72,-5.556092,0.921688,4.123502,-8.539803,1.471287,-3.358239,-1.307632,-0.794091,7.106465,...,0.137671,-0.114519,-0.137438,-0.093885,-0.324818,0.213504,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,24,-4.050846,2.455376,-1.952724,-2.745738,-0.230533,-0.046272,0.977865,-0.534667,-2.717050,...,-0.010419,0.111000,0.300112,-0.368521,-0.015981,-0.147645,0,1,0,1
23810,24,-3.587688,-0.719366,-2.095710,1.556405,3.480427,0.490399,-0.826361,-1.985018,1.169461,...,0.133469,-0.363638,0.452713,-0.023380,0.108713,0.323834,0,1,0,1
23811,48,-4.575507,-1.040907,-0.129310,0.465662,-1.097595,-1.519675,2.197309,0.772037,0.941187,...,0.120363,0.164758,-0.257116,0.317431,-0.116415,0.005997,1,0,0,1
23812,24,3.805914,-16.156608,5.855234,7.802877,-6.736117,-1.319509,0.969848,6.297461,-6.666178,...,-0.017931,-0.134203,0.271563,0.345889,0.121288,0.336403,0,1,1,0


In [16]:
scaler = StandardScaler()
scaler.fit(Train_data)

StandardScaler()

In [17]:
Train_data_scaled = pd.DataFrame(scaler.transform(Train_data))

In [18]:
Train_data_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,709,710,711,712,713,714,715,716,717,718
0,-1.237999,-0.454727,-0.266090,-0.095055,1.512799,0.482955,-1.342296,0.147553,0.854344,-0.660847,...,-0.875537,-0.017400,-1.228588,-0.077754,0.135612,-0.879256,-0.291580,0.291580,0.980043,-0.980043
1,1.235922,-0.378763,0.660343,1.400438,-0.812765,-0.274127,1.013043,0.511104,0.372835,0.082904,...,0.182392,-0.820878,0.250674,0.049777,-0.713505,-0.659981,-0.291580,0.291580,0.980043,-0.980043
2,-0.001039,-0.097126,0.450915,-0.585971,-1.025911,-0.203028,-0.079609,0.578312,-0.722412,-0.027529,...,-1.684345,-1.167399,0.306744,0.751680,0.573166,-0.524737,-0.291580,0.291580,0.980043,-0.980043
3,-0.001039,0.485697,-0.450811,-0.995676,-1.787595,0.166684,-1.504755,0.729387,-1.865796,-0.108226,...,0.343203,0.341806,-0.869387,0.079191,-3.407355,2.112710,-0.291580,0.291580,0.980043,-0.980043
4,1.235922,-0.422544,0.158226,0.779778,-1.697559,0.346517,-0.803665,-0.339000,-0.217914,2.155691,...,0.440753,-0.369986,-0.446511,-0.306059,-1.072027,0.705502,-0.291580,0.291580,-1.020364,1.020364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,-1.237999,-0.308069,0.421514,-0.369271,-0.545803,-0.054295,-0.011073,0.253509,-0.146723,-0.824196,...,-0.033357,0.358617,0.975008,-1.201354,-0.052743,-0.487877,-0.291580,0.291580,-1.020364,1.020364
23810,-1.237999,-0.272846,-0.123493,-0.396311,0.309385,0.819709,0.117358,-0.214232,-0.544728,0.354747,...,0.427303,-1.174830,1.470781,-0.076216,0.358794,1.070075,-0.291580,0.291580,-1.020364,1.020364
23811,-0.001039,-0.347970,-0.178692,-0.024453,0.092565,-0.258505,-0.363676,0.569646,0.211862,0.285502,...,0.385342,0.532294,-0.835322,1.034804,-0.384216,0.019817,3.429586,-3.429586,-1.020364,1.020364
23812,-1.237999,0.289442,-2.773600,1.107259,1.551072,-1.586487,-0.315774,0.251430,1.728147,-2.022133,...,-0.057407,-0.433580,0.882258,1.127576,0.400299,1.111609,-0.291580,0.291580,0.980043,-0.980043


In [19]:
Train_targets = scored_train_targets_df.drop(columns=['sig_id'])
Train_targets

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23811,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
ns_Train_targets = nscored_train_targets_df.drop(columns=['sig_id'])
ns_Train_targets

Unnamed: 0,abc_transporter_expression_enhancer,abl_inhibitor,ace_inhibitor,acetylcholine_release_enhancer,adenosine_deaminase_inhibitor,adenosine_kinase_inhibitor,adenylyl_cyclase_inhibitor,age_inhibitor,alcohol_dehydrogenase_inhibitor,aldehyde_dehydrogenase_activator,...,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_calcium_channel_ligand,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,wnt_agonist,xanthine_oxidase_inhibitor,xiap_inhibitor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23811,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
val_idx = np.random.choice(range(Train_data_scaled.shape[0]), Train_data_scaled.shape[0]//10, replace=False)

In [22]:
Tuner_train_data = np.delete(np.array(Train_data_scaled), val_idx, 0)
Tuner_train_target = np.delete(np.array(Train_targets), val_idx, 0)

Tuner_val_data = np.array(Train_data_scaled)[val_idx,:]
Tuner_val_target = np.array(Train_targets)[val_idx,:]

In [23]:
p_min = 0.001
p_max = 0.999
def logloss(y_true, y_pred):
    y_pred = tf.clip_by_value(y_pred,p_min,p_max)
    return -backend.mean(y_true*backend.log(y_pred) + (1-y_true)*backend.log(1-y_pred))

In [24]:
def build_model(hp):
    
    Model = Sequential()
    Model.add(Dense(hp.Choice('input_units', values=[512,1024,2048]), activation='relu', kernel_initializer='he_normal', input_shape=(Train_data_scaled.shape[1],)))
    Model.add(Dropout(hp.Float('input_drop', min_value=0.3, max_value=0.9,step=0.1)))
    Model.add(BatchNormalization())
              
    for i in range(hp.Int('nbr_lay', min_value=3, max_value=15, step=1)):
        Model.add(Dense(hp.Choice(f'dense_{i}_units', values=[256,512,1024,2048]), activation=hp.Choice(f'dense_{i}_act', values=['relu','elu']), kernel_initializer='he_normal'))
        Model.add(Dropout(hp.Float(f'lay_{i}_drop', min_value=0.2, max_value=0.9,step=0.1)))
        Model.add(BatchNormalization())
    
    Model.add(Dense(206, activation='sigmoid', kernel_initializer='he_normal'))
    
    Model.compile(optimizer = hp.Choice(f'optimizer', values=['adam']), loss=losses.BinaryCrossentropy(label_smoothing=0.001), metrics=logloss)
    
    return Model

In [25]:
Tuner = RandomSearch(build_model,
                    objective='val_loss',
                    max_trials=40,
                    executions_per_trial=1,
                    seed=7,
                    directory='./')

In [26]:
Tuner.search(Tuner_train_data, Tuner_train_target,
             epochs=10, verbose=0,
             validation_data=(Tuner_val_data, Tuner_val_target))

In [27]:
Model_tuned = Tuner.get_best_models(num_models=1)
Model_tuned[0].summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              737280    
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1

In [28]:
trials_dir=[]
for s in os.listdir('./untitled_project'):
    if 'trial_' in s:
        trials_dir.append(s)

In [29]:
Trials = []
for i in trials_dir:
    with open(f'./untitled_project/{i}/trial.json', 'r') as handle:
        parsed = json.load(handle)
        Trials.append((parsed['hyperparameters']['values'],parsed['score']))

In [30]:
max_score=1
for trial in Trials:
    if trial[1]<max_score:
        best_trial = trial
        max_score = trial[1]

In [31]:
best_trial

({'input_units': 1024,
  'input_drop': 0.4,
  'nbr_lay': 5,
  'dense_0_units': 1024,
  'dense_0_act': 'elu',
  'lay_0_drop': 0.30000000000000004,
  'dense_1_units': 1024,
  'dense_1_act': 'relu',
  'lay_1_drop': 0.2,
  'dense_2_units': 1024,
  'dense_2_act': 'elu',
  'lay_2_drop': 0.6000000000000001,
  'optimizer': 'adam',
  'dense_3_units': 256,
  'dense_3_act': 'elu',
  'lay_3_drop': 0.5000000000000001,
  'dense_4_units': 2048,
  'dense_4_act': 'elu',
  'lay_4_drop': 0.5000000000000001,
  'dense_5_units': 256,
  'dense_5_act': 'relu',
  'lay_5_drop': 0.30000000000000004,
  'dense_6_units': 512,
  'dense_6_act': 'elu',
  'lay_6_drop': 0.4000000000000001,
  'dense_7_units': 512,
  'dense_7_act': 'relu',
  'lay_7_drop': 0.6000000000000001,
  'dense_8_units': 512,
  'dense_8_act': 'elu',
  'lay_8_drop': 0.30000000000000004,
  'dense_9_units': 1024,
  'dense_9_act': 'relu',
  'lay_9_drop': 0.8000000000000003,
  'dense_10_units': 1024,
  'dense_10_act': 'elu',
  'lay_10_drop': 0.7000000000

In [32]:
splits_nbr = 10
kf = KFold(n_splits = splits_nbr)
skf = StratifiedKFold(n_splits = splits_nbr, random_state = 7, shuffle = True)

In [33]:
def get_model_name(k):
    return 'Model_'+str(k)+'.h5'

In [34]:
VAL_LOSS = []

fold_var = 1

for train_index, val_index in kf.split(Train_data_scaled,Train_targets):
    
    train_data = Train_data_scaled.iloc[train_index]
    train_target = Train_targets.iloc[train_index]
    
    val_data = Train_data_scaled.iloc[val_index]
    val_target = Train_targets.iloc[val_index]
    
    Model = Sequential(name='FFN')
    Model.add(Dense(best_trial[0]['input_units'], activation='relu', kernel_initializer='he_normal', input_shape=(Train_data_scaled.shape[1],)))
    Model.add(Dropout(best_trial[0]['input_drop']))
    Model.add(BatchNormalization())
    for l in range(best_trial[0]['nbr_lay']):
        Model.add(Dense(best_trial[0][f'dense_{l}_units'], activation=best_trial[0][f'dense_{l}_act'], kernel_initializer='he_normal'))
        Model.add(Dropout(best_trial[0][f'lay_{l}_drop']))
        Model.add(BatchNormalization())
    
    Model.add(Dense(206, activation='sigmoid', kernel_initializer='he_normal'))

    Model.compile(optimizer = best_trial[0]['optimizer'], loss=losses.BinaryCrossentropy(label_smoothing=0.001), metrics='accuracy')
    
    Checkpoint = tf.keras.callbacks.ModelCheckpoint(get_model_name(fold_var), 
                                                    monitor='val_loss', verbose=1, 
                                                    save_best_only=True, mode='min')

    LR_OnPlat = ReduceLROnPlateau(monitor='val_loss',
                                  patience = 2,
                                  cooldown=1,
                                  verbose=1,
                                  factor=0.8,
                                  epsilon=1e-4,
                                  min_lr=0.000001)
    
    results = Model.fit(x=train_data,
                        y=train_target,
                        batch_size=128,
                        validation_data=(val_data,val_target),
                        epochs=50,
                        verbose=0,
                        callbacks=[LR_OnPlat, Checkpoint])
    
    Model.load_weights("./"+get_model_name(fold_var))
    
    results = Model.evaluate(val_data, val_target, batch_size=128)
    results = dict(zip(Model.metrics_names,results))

    VAL_LOSS.append(results['loss'])

    tf.keras.backend.clear_session()

    fold_var += 1


Epoch 00001: val_loss improved from inf to 0.02403, saving model to Model_1.h5

Epoch 00002: val_loss improved from 0.02403 to 0.02264, saving model to Model_1.h5

Epoch 00003: val_loss improved from 0.02264 to 0.02171, saving model to Model_1.h5

Epoch 00004: val_loss improved from 0.02171 to 0.02113, saving model to Model_1.h5

Epoch 00005: val_loss improved from 0.02113 to 0.02105, saving model to Model_1.h5

Epoch 00006: val_loss improved from 0.02105 to 0.02056, saving model to Model_1.h5

Epoch 00007: val_loss did not improve from 0.02056

Epoch 00008: val_loss improved from 0.02056 to 0.02029, saving model to Model_1.h5

Epoch 00009: val_loss improved from 0.02029 to 0.02000, saving model to Model_1.h5

Epoch 00010: val_loss did not improve from 0.02000

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.

Epoch 00011: val_loss improved from 0.02000 to 0.01994, saving model to Model_1.h5

Epoch 00012: val_loss improved from 0.01994 to 0.01973, saving 

In [35]:
VAL_LOSS

[0.01963799260556698,
 0.019351113587617874,
 0.019223349168896675,
 0.01987731270492077,
 0.019296512007713318,
 0.019371122121810913,
 0.019643083214759827,
 0.019311584532260895,
 0.019847281277179718,
 0.01960749365389347]

In [36]:
Model_1 = load_model('./Model_1.h5')
Model_2 = load_model('./Model_2.h5')
Model_3 = load_model('./Model_3.h5')
Model_4 = load_model('./Model_4.h5')
Model_5 = load_model('./Model_5.h5')
Model_6 = load_model('./Model_6.h5')
Model_7 = load_model('./Model_7.h5')
Model_8 = load_model('./Model_8.h5')
Model_9 = load_model('./Model_9.h5')
Model_10 = load_model('./Model_10.h5')

In [37]:
def logloss_np(y_true, y_pred):
    y_pred = np.clip(y_pred,p_min,p_max)
    return -np.mean(np.array(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred)))

In [38]:
models=[Model_1,Model_2,Model_3,Model_4,Model_5,Model_6,Model_7,Model_8,Model_9,Model_10]
val_losses=[]
for i in range(1,splits_nbr+1):
    temp = logloss_np(Train_targets,models[i-1].predict(Train_data_scaled))
    val_losses.append(temp)

AVG_Val_Loss= np.mean(val_losses)
print(f'The average validation log loss: {AVG_Val_Loss}')

The average validation log loss: 0.013996901921927929


In [39]:
best = val_losses.index(min(val_losses))+1
print(f'The model "Model_{best}" has the lowest log loss on the training set: {val_losses[best-1]}')

The model "Model_9" has the lowest log loss on the training set: 0.013473361730575562


## Submission

In [40]:
Test_data = test_feat_df.drop(columns=['sig_id']+list(gene_cols)+list(cell_via_cols))

In [41]:
test_gene_data = test_feat_df[gene_cols]
test_cell_via_data = test_feat_df[cell_via_cols]
scaled_test_gene_data = scale(test_gene_data)
scaled_test_cell_via_data = scale(test_cell_via_data)

In [42]:
pca_test_gene = pca1.transform(scaled_test_gene_data)
pca_test_cell = pca2.transform(scaled_test_cell_via_data)
labels_gene_pca = ['PC' + str(x) for x in range(1, pca_test_gene.shape[1]+1)]
labels_cell_pca = ['PC' + str(x) for x in range(1, pca_test_cell.shape[1]+1)]
transformed_test_gene_feats = pd.DataFrame(pca_test_gene, columns=labels_gene_pca)
transformed_test_cell_feats = pd.DataFrame(pca_test_cell, columns=labels_cell_pca)

In [43]:
Test_data = Test_data.merge(transformed_test_gene_feats, how='left', left_index=True, right_index=True)
Test_data = Test_data.merge(transformed_test_cell_feats, how='left', left_index=True, right_index=True)
Test_data[['ctl_vehicle','trt_cp']] = pd.get_dummies(Test_data.cp_type)
Test_data[['D1','D2']] = pd.get_dummies(Test_data.cp_dose)
Test_data.drop(columns=['cp_type','cp_dose'], inplace=True)

In [44]:
scaler2 = StandardScaler()
scaler2.fit(Test_data)

StandardScaler()

In [45]:
Test_data_scaled = pd.DataFrame(scaler.transform(Test_data))
Test_data_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,709,710,711,712,713,714,715,716,717,718
0,-1.237999,-0.229969,0.323212,-0.037284,-0.377140,0.222948,-0.249757,0.164230,-0.384407,0.667899,...,-1.119901,-1.036224,0.156475,-0.860849,0.433238,0.289897,-0.291580,0.291580,0.980043,-0.980043
1,1.235922,-0.210706,-0.641031,0.089094,1.262713,-0.249147,-1.750632,-1.054394,0.913309,0.111429,...,0.177485,-0.184542,0.866395,0.146604,-1.344133,-0.312676,-0.291580,0.291580,0.980043,-0.980043
2,-1.237999,-0.442051,0.051775,-0.106496,1.142170,0.828469,1.645987,-0.445482,-0.417394,-0.679147,...,-0.154768,-1.204758,-1.280099,-0.605786,-0.471472,-0.700342,3.429586,-3.429586,0.980043,-0.980043
3,-1.237999,-0.171720,-0.125064,-0.503656,1.319325,-0.021596,0.406237,-0.145388,-0.060704,-1.159531,...,-0.168354,-1.070266,0.888876,0.943882,1.442905,-1.127152,-0.291580,0.291580,-1.020364,1.020364
4,-0.001039,-0.482340,0.293780,0.632893,-0.173099,0.148159,1.199457,-0.096995,-0.362447,-0.367599,...,-1.979159,-0.266876,-0.242624,2.074457,0.361099,0.109405,-0.291580,0.291580,0.980043,-0.980043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,-1.237999,-0.004681,0.377149,-0.295972,-0.151271,-0.187288,0.066247,-0.333588,-0.105169,-0.859925,...,0.685337,-0.027597,-0.490823,2.182661,0.073771,-0.904446,-0.291580,0.291580,0.980043,-0.980043
3978,-1.237999,-0.514479,0.264488,0.512203,-0.453765,0.037410,-0.093514,-0.133853,-0.715097,0.866548,...,-0.743456,0.861268,-1.127224,-0.048644,0.438528,0.424288,-0.291580,0.291580,0.980043,-0.980043
3979,1.235922,-0.469470,-0.260557,0.663786,0.809701,-0.240629,-1.102172,-0.123972,0.723112,-0.588847,...,-0.040094,0.040372,0.129363,-0.048089,0.590008,0.732237,-0.291580,0.291580,0.980043,-0.980043
3980,-0.001039,-0.163115,0.177823,-2.220507,-0.035005,0.916844,0.953605,-0.567995,0.100758,-0.315725,...,0.007947,0.781632,2.459864,-0.571899,0.204540,-0.142888,-0.291580,0.291580,-1.020364,1.020364


In [46]:
models_pred = np.zeros((Test_data_scaled.shape[0],206))
for i in range(1,splits_nbr+1):
    temp = models[i-1].predict(Test_data_scaled)
    models_pred += temp
AVG_test_pred = models_pred/splits_nbr

In [47]:
best_test_pred = models[best-1].predict(Test_data_scaled)

In [48]:
best_test_pred

array([[1.3961112e-03, 1.2963581e-03, 1.6187932e-03, ..., 1.6913320e-03,
        1.8042171e-03, 2.4987285e-03],
       [2.1835391e-03, 2.7506172e-03, 3.8534431e-03, ..., 1.0807218e-03,
        1.6919902e-02, 4.0031043e-03],
       [7.5122949e-07, 1.0507176e-06, 5.2206333e-06, ..., 1.4042786e-06,
        7.1861407e-08, 3.3123453e-07],
       ...,
       [2.8382300e-03, 1.3597911e-03, 1.4313359e-03, ..., 2.3213672e-03,
        9.0174522e-04, 2.5347171e-03],
       [8.9324598e-04, 1.6239956e-03, 2.0389899e-03, ..., 1.9555469e-03,
        9.4491564e-04, 3.9414773e-03],
       [2.7604103e-03, 1.7801319e-03, 2.0400498e-03, ..., 2.4691902e-03,
        1.3984465e-03, 3.0585134e-03]], dtype=float32)

In [49]:
Prediction = pd.DataFrame(AVG_test_pred, columns=Train_targets.columns)
sig_id_df = pd.DataFrame(test_feat_df.sig_id)
Prediction = sig_id_df.merge(Prediction, how='left', left_index=True, right_index=True)

In [50]:
Prediction.to_csv('submission.csv', index=False)