<a href="https://colab.research.google.com/github/Hackman-git/Mechanisms_of_action/blob/master/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from keras.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Single place to point the notebook at the directory holding the MOA CSV files.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably
# has to be mounted before this cell runs — confirm.
DATA_DIR = '/content/drive/My Drive/MOA/Data'

train_x = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_y = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')
test = pd.read_csv(f'{DATA_DIR}/test_features.csv')
submit = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
def preprocessing(train_features, train_y, test_features, cell_cols=None, gene_cols=None):
    """Turn the raw feature/target frames into model-ready frames.

    Steps: recode ``cp_type`` to 0/1, drop the control rows (``cp_type == 0``)
    from the training data, one-hot encode ``cp_dose``, standardise the numeric
    columns with statistics fitted on the training rows only, and drop
    ``sig_id`` from the targets.

    Parameters
    ----------
    train_features, test_features : pd.DataFrame
        Raw feature frames. Their ``cp_type`` column is recoded to 0/1 **in
        place**, so the caller's frames change as well (other cells read the
        recoded column afterwards).
    train_y : pd.DataFrame
        Targets, row-aligned with ``train_features``. The caller's frame is
        not modified.
    cell_cols, gene_cols : list of str, optional
        Names of the numeric feature columns. When omitted they are collected
        from the training columns by prefix (``'c-'`` and ``'g-'``).
        NOTE(review): the prefix convention is assumed — confirm against the
        CSV header.

    Returns
    -------
    tuple
        ``(train_x_trans, train_y, test_trans)`` with columns ordered as
        ``cp_type, dose_1, dose_2, cp_time, <cell cols>, <gene cols>``.
    """
    # map cp_type to binary
    cp_type_codes = {'trt_cp': 1, 'ctl_vehicle': 0}
    for frame in (train_features, test_features):
        # Recode only while raw string labels are present: mapping an already
        # recoded column a second time (cell re-run) would turn it into NaN.
        if frame['cp_type'].isin(list(cp_type_codes)).any():
            frame['cp_type'] = frame['cp_type'].map(cp_type_codes)

    # we don't need ctl_vehicle
    is_treated = train_features['cp_type'] == 1
    train_y = train_y.loc[is_treated].reset_index(drop=True)
    train_features = train_features.loc[is_treated].reset_index(drop=True)

    # numeric cols (derived here so the function does not depend on globals)
    if cell_cols is None:
        cell_cols = [c for c in train_features.columns if str(c).startswith('c-')]
    if gene_cols is None:
        gene_cols = [c for c in train_features.columns if str(c).startswith('g-')]
    num = ['cp_time'] + list(cell_cols) + list(gene_cols)

    # categorical cols
    cat = ['cp_dose']
    dose_names = ['dose_1', 'dose_2']

    # one-hot encoding; the encoder's default sparse output is densified with
    # .toarray(), which avoids the version-dependent `sparse` keyword.
    oneHotEnc = OneHotEncoder(handle_unknown='ignore')
    train_x_oneH = pd.DataFrame(oneHotEnc.fit_transform(train_features[cat]).toarray(),
                                columns=dose_names)
    test_oneH = pd.DataFrame(oneHotEnc.transform(test_features[cat]).toarray(),
                             columns=dose_names)

    # scaling numeric features (fit on train only, then applied to test)
    scaler = StandardScaler()
    train_x_num_tr = pd.DataFrame(scaler.fit_transform(train_features[num]), columns=num)
    test_num_tr = pd.DataFrame(scaler.transform(test_features[num]), columns=num)

    # merging all transformed columns; every piece carries a fresh 0..n-1 index
    train_x_trans = pd.concat([train_features[['cp_type']].reset_index(drop=True),
                               train_x_oneH,
                               train_x_num_tr], axis=1)
    test_trans = pd.concat([test_features[['cp_type']].reset_index(drop=True),
                            test_oneH,
                            test_num_tr], axis=1)

    train_y = train_y.drop(columns=['sig_id'])

    return (train_x_trans, train_y, test_trans)


In [None]:
# Build the model inputs. `train_y` is rebound to the processed targets here and
# preprocessing() recodes `cp_type` on `train_x` / `test` in place, so re-run the
# data-loading cell before re-running this one.
train_x_trans, train_y, test_trans = preprocessing(train_x, train_y, test)

In [None]:
# Sanity check: model_init() builds its first layer for 876 input features,
# so the second dimension shown here has to be 876.
train_x_trans.shape

(21948, 876)

In [None]:
# Column labels of the transformed training frame.
all_features = train_x_trans.columns

In [None]:
def model_init(n_features=876, n_targets=206):
    """Build and compile the dense multi-label classifier.

    Architecture: a weight-normalised Dense(2000) input layer followed by
    Dense layers of 2000, 1600 and 1200 units, each with ELU activation,
    He-normal initialisation, batch normalisation and 30% dropout, and a
    sigmoid output layer with one unit per target.

    Parameters
    ----------
    n_features : int, default 876
        Width of the input feature vector.
    n_targets : int, default 206
        Number of output units.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss and the Adam optimiser.
    """
    hidden_kwargs = dict(activation='elu', kernel_initializer='he_normal')

    model = keras.models.Sequential([
        tfa.layers.WeightNormalization(Dense(2000, input_shape=(n_features,), **hidden_kwargs)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(2000, **hidden_kwargs),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1600, **hidden_kwargs),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1200, **hidden_kwargs),
        BatchNormalization(),
        Dropout(0.3),
        Dense(n_targets, activation='sigmoid')
    ])

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer honour.
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999),
                  metrics=['accuracy'])

    return model

In [None]:
def training(train_x_trans, train_y, k=5, epochs=20, batch_size=256, random_state=None):
    """Train one model per cross-validation fold and collect out-of-fold predictions.

    Parameters
    ----------
    train_x_trans : pd.DataFrame
        Transformed training features.
    train_y : pd.DataFrame
        Training targets, row-aligned with ``train_x_trans``.
    k : int, default 5
        Number of shuffled KFold splits.
    epochs : int, default 20
        Epochs each fold model is trained for.
    batch_size : int, default 256
        Mini-batch size passed to ``model.fit``.
    random_state : int or None, default None
        Seed for the fold shuffling; ``None`` keeps the splits unseeded.

    Returns
    -------
    tuple
        ``(models, oob_pred)``: the list of fitted fold models and a float
        DataFrame shaped and labelled like ``train_y`` in which every row is
        the prediction of the model that did not train on that row.
    """
    features = train_x_trans.values
    targets = train_y.values

    models = []
    # Float buffer filled by *position*: KFold yields positional indices, so
    # they must not be used as index labels, and predictions should not be
    # written into a copy of the integer target frame.
    oob = np.zeros(targets.shape, dtype=float)
    cross_val = KFold(k, shuffle=True, random_state=random_state)

    for fold, (train_index, val_index) in enumerate(cross_val.split(features)):
        print('\ncross-val fold '+ str(fold+1))
        model = model_init()

        # Multiply the learning rate by 0.8 after 3 epochs without val_loss improvement.
        lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=0.8, patience=3,
                                                         monitor='val_loss', mode='auto')
        model.fit(features[train_index],
                  targets[train_index],
                  validation_data=(features[val_index], targets[val_index]),
                  batch_size=batch_size,
                  callbacks=[lr_scheduler],
                  epochs=epochs)

        oob[val_index] = model.predict(features[val_index])
        models.append(model)

    oob_pred = pd.DataFrame(oob, index=train_y.index, columns=train_y.columns)
    return (models, oob_pred)

In [None]:
def multi_log_loss(y_true, y_pred):
    """Mean of the per-column binary log losses.

    Parameters
    ----------
    y_true : pd.DataFrame
        0/1 targets, one column per label.
    y_pred : pd.DataFrame
        Predicted probabilities with the same column labels as ``y_true``.

    Returns
    -------
    float
        Average of ``log_loss`` over the columns of ``y_true``.
    """
    # `labels` is given explicitly so a column whose true values are all one
    # class is still scored instead of making log_loss raise.
    losses = [
        log_loss(y_true.loc[:, col], y_pred.loc[:, col], labels=[0, 1])
        for col in y_true.columns
    ]
    return np.mean(losses)

In [None]:
def get_models(train_x_trans, train_y, submission, test_x=None):
    """Train the cross-validated ensemble and build the averaged test predictions.

    Parameters
    ----------
    train_x_trans : pd.DataFrame
        Transformed training features.
    train_y : pd.DataFrame
        Training targets.
    submission : pd.DataFrame
        Submission template; its first column is kept and every remaining
        column is treated as a target to fill.
    test_x : pd.DataFrame, optional
        Transformed test features, row-aligned with ``submission`` and holding
        a 0/1 ``cp_type`` column. Defaults to the notebook-level ``test_trans``.

    Returns
    -------
    pd.DataFrame
        Copy of ``submission`` with the target columns set to the mean
        prediction of the fold models, and to 0 for rows whose ``cp_type`` is 0.
    """
    if test_x is None:
        # Fall back to the notebook-level frame this function used to read implicitly.
        test_x = test_trans

    models, oob_preds = training(train_x_trans, train_y, k=5)
    print('\n\nmultilogloss: ', multi_log_loss(train_y, oob_preds))

    # Target columns come from the template that was passed in, not from a global.
    y_cols = submission.columns[1:]

    pred_sum = np.zeros((len(test_x), len(y_cols)))
    for m in models:
        pred_sum += m.predict(test_x)

    test_pred = submission.copy()
    test_pred[y_cols] = pred_sum / len(models)

    # Control rows get all-zero predictions. The mask is taken from the
    # transformed frame itself and applied by position, so it does not rely on
    # another frame having been recoded in place.
    is_control = (test_x['cp_type'] == 0).values
    test_pred.loc[is_control, y_cols] = 0

    return test_pred

In [None]:
# Train the 5-fold ensemble and build the averaged test predictions; this prints
# per-fold training progress and the out-of-fold log loss.
test_pred = get_models(train_x_trans, train_y, submit)


cross-val fold 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

cross-val fold 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

cross-val fold 3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

cross-val fold 4
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

cro

In [None]:
# Preview the first three rows of the prediction frame.
test_pred.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,adrenergic_receptor_antagonist,akt_inhibitor,aldehyde_dehydrogenase_inhibitor,alk_inhibitor,ampk_activator,analgesic,androgen_receptor_agonist,androgen_receptor_antagonist,anesthetic_-_local,angiogenesis_inhibitor,angiotensin_receptor_antagonist,anti-inflammatory,antiarrhythmic,antibiotic,anticonvulsant,antifungal,antihistamine,antimalarial,antioxidant,antiprotozoal,antiviral,apoptosis_stimulant,aromatase_inhibitor,atm_kinase_inhibitor,atp-sensitive_potassium_channel_antagonist,atp_synthase_inhibitor,atpase_inhibitor,atr_kinase_inhibitor,aurora_kinase_inhibitor,...,protein_synthesis_inhibitor,protein_tyrosine_kinase_inhibitor,radiopaque_medium,raf_inhibitor,ras_gtpase_inhibitor,retinoid_receptor_agonist,retinoid_receptor_antagonist,rho_associated_kinase_inhibitor,ribonucleoside_reductase_inhibitor,rna_polymerase_inhibitor,serotonin_receptor_agonist,serotonin_receptor_antagonist,serotonin_reuptake_inhibitor,sigma_receptor_agonist,sigma_receptor_antagonist,smoothened_receptor_antagonist,sodium_channel_inhibitor,sphingosine_receptor_agonist,src_inhibitor,steroid,syk_inhibitor,tachykinin_antagonist,tgf-beta_receptor_inhibitor,thrombin_inhibitor,thymidylate_synthase_inhibitor,tlr_agonist,tlr_antagonist,tnf_inhibitor,topoisomerase_inhibitor,transient_receptor_potential_channel_antagonist,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000758,0.000933,0.00121,0.022532,0.019889,0.002882,0.001347,0.004564,0.000392,0.007845,0.013477,0.000617,0.000563,0.000555,0.000762,0.00068,0.001999,0.003767,0.009254,0.001607,0.002305,0.002915,0.000543,0.00085,0.000564,0.000598,0.00079,0.000691,0.004,0.001416,0.000955,0.002016,0.002835,0.000458,0.000494,0.000518,0.002844,0.000422,0.000761,...,0.002124,0.000527,0.004216,0.000416,0.000863,0.005453,0.000603,0.000552,0.00061,0.00129,0.011127,0.009064,0.002883,0.00261,0.001064,0.001155,0.029916,0.002303,0.00032,0.000462,0.000339,0.0017,0.000319,0.001017,0.001353,0.00102,0.000705,0.001546,0.000673,0.000844,0.000799,0.001155,0.001555,0.00134,0.000288,0.000741,0.000367,0.001232,0.003821,0.001506
1,id_001897cda,0.000508,0.000931,0.001123,0.001547,0.001357,0.001584,0.002592,0.00713,0.005242,0.009432,0.007298,0.001314,0.001022,0.011143,0.000836,0.000785,0.001396,0.002752,0.002542,0.002325,0.001876,0.000915,0.000775,0.001335,0.001005,0.001633,0.00077,0.002132,0.00113,0.000873,0.000755,0.001851,0.001167,0.001443,0.000761,0.001043,0.003004,0.003478,0.013888,...,0.001552,0.001506,0.000792,0.000587,0.001902,0.011788,0.001058,0.004916,0.001232,0.001586,0.005975,0.003441,0.000782,0.000904,0.002873,0.001375,0.003352,0.001294,0.015766,0.000845,0.002422,0.00289,0.004986,0.001264,0.000539,0.001069,0.000796,0.002544,0.005476,0.00221,0.000905,0.001608,0.001441,0.000455,0.007832,0.000691,0.00292,0.000812,0.005556,0.00219
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the predictions to submission.csv without the DataFrame index.
test_pred.to_csv('submission.csv', index=False)