<a href="https://colab.research.google.com/github/Hackman-git/Mechanisms_of_action/blob/master/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import random
import tensorflow_addons as tfa
from tensorflow import keras
from keras.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors

In [3]:
# Every input CSV is read from the same folder; keep that folder in one place so
# the notebook can be re-pointed without editing each read.
DATA_DIR = '/content/drive/My Drive/MOA/Data'

train_x = pd.read_csv(f'{DATA_DIR}/train_features.csv')        # training features
train_y = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')  # scored training targets
test = pd.read_csv(f'{DATA_DIR}/test_features.csv')            # test features
submit = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')      # submission template

In [4]:
def _as_dense(matrix):
    """Return `matrix` as a dense array, converting a sparse result when necessary."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


def preprocessing(train_features, train_y, test_features):
    """
    Turn the raw feature / target frames into model-ready frames.

    Steps, in order:
      1. encode ``cp_type`` as 1 (``trt_cp``) / 0 (``ctl_vehicle``);
      2. drop the ``ctl_vehicle`` rows from the training features and targets
         (test rows are all kept);
      3. one-hot encode ``cp_dose`` (encoder fitted on the training rows);
      4. standardise ``cp_time`` and every ``c-*`` / ``g-*`` column
         (scaler fitted on the training rows, then applied to the test rows);
      5. drop ``sig_id`` from the targets.

    Side effect: ``cp_type`` is overwritten in place on BOTH ``train_features``
    and ``test_features``. This is kept deliberately because code outside this
    function filters the raw test frame on ``cp_type == 0`` after the call.

    args
    train_features: pandas.DataFrame, raw training features
    train_y: pandas.DataFrame, raw training targets, row-aligned with train_features
    test_features: pandas.DataFrame, raw test features

    return
    train_x_trans: pandas.DataFrame, columns cp_type, dose_1, dose_2, then the scaled numeric columns
    train_y: pandas.DataFrame, targets of the treated rows without ``sig_id``
    test_trans: pandas.DataFrame, test features with the same column layout as train_x_trans
    """
    # map cp_type to binary. The identity entries (1 -> 1, 0 -> 0) make the
    # mapping idempotent: calling this again on already-encoded frames keeps the
    # codes instead of turning every value into NaN.
    cp_type_codes = {'trt_cp': 1, 'ctl_vehicle': 0, 1: 1, 0: 0}
    train_features.cp_type = train_features.cp_type.map(cp_type_codes)
    test_features.cp_type = test_features.cp_type.map(cp_type_codes)

    # we don't need ctl_vehicle
    is_treated = train_features['cp_type'] == 1
    train_y = train_y.loc[is_treated].reset_index(drop=True)
    train_features = train_features.loc[is_treated].reset_index(drop=True)

    cols = train_features.columns
    cell_cols = [col for col in cols if col.startswith("c-")]
    gene_cols = [col for col in cols if col.startswith('g-')]

    # numeric cols (cp_time is scaled together with the cell / gene columns)
    num = ['cp_time'] + cell_cols + gene_cols

    # categorical cols
    cat = ['cp_dose']
    # NOTE: the fixed names assume the training data holds exactly two cp_dose
    # categories; the DataFrame constructor below fails otherwise.
    dose_cols = ['dose_1', 'dose_2']

    # one-hot encoding. The encoder is left at its default output type and the
    # result is densified here, because the keyword that requests dense output
    # was renamed between scikit-learn releases.
    oneHotEnc = OneHotEncoder(handle_unknown='ignore')
    train_x_oneH = pd.DataFrame(_as_dense(oneHotEnc.fit_transform(train_features[cat])),
                                columns=dose_cols)
    test_oneH = pd.DataFrame(_as_dense(oneHotEnc.transform(test_features[cat])),
                             columns=dose_cols)

    # scaling numeric features
    scaler = StandardScaler()
    train_x_num_tr = pd.DataFrame(scaler.fit_transform(train_features[num]), columns=num)
    test_num_tr = pd.DataFrame(scaler.transform(test_features[num]), columns=num)

    # merging all transformed columns; every piece gets a fresh 0..n-1 index so
    # the column-wise concat lines rows up by position
    train_x_trans = pd.concat([train_features[['cp_type']].reset_index(drop=True),
                               train_x_oneH,
                               train_x_num_tr], axis=1)
    test_trans = pd.concat([test_features[['cp_type']].reset_index(drop=True),
                            test_oneH,
                            test_num_tr], axis=1)

    # `train_y` is already a private copy here (filtered + re-indexed above)
    train_y = train_y.drop(columns=['sig_id'])

    return (train_x_trans, train_y, test_trans)

In [5]:
# Run the preprocessing step. `train_y` is rebound here to the filtered,
# `sig_id`-free targets returned by the function, so re-running this cell
# requires reloading the raw CSVs first.
train_x_trans, train_y, test_trans = preprocessing(train_x, train_y, test)

In [6]:
# (rows, columns) of the processed training features.
train_x_trans.shape

(21948, 876)

In [7]:
# Column labels of the processed training features (not referenced again in the
# cells visible here).
all_features = train_x_trans.columns

In [10]:
def get_tail_label(df, ql=[0.05, 1.]) -> list:
    """
    Return the names of the under-represented ("tail") target columns.

    Column totals that are not strictly between the ``ql[0]`` and ``ql[1]``
    quantiles of all totals are discarded first. For the remaining columns an
    imbalance ratio ``largest_total / total`` is computed, and a column counts as
    a tail label when its ratio is strictly greater than the median ratio.

    args
    df: pandas.DataFrame, one column per target
    ql: pair of quantile limits (lower, upper)

    return
    list of column names
    """
    totals = df.sum(axis=0)

    # Both cut-offs are taken from the unfiltered totals.
    lower_cut = totals.quantile(ql[0])
    upper_cut = totals.quantile(ql[1])
    in_range = totals[(totals > lower_cut) & (totals < upper_cut)]  # Filtering

    # Rarer columns get a larger ratio.
    imbalance = in_range.max() / in_range
    is_tail = imbalance > imbalance.median()
    return imbalance[is_tail].index.tolist()

In [11]:
def get_minority_samples(X, y, ql=[0.05, 1.]):
    """
    Select the rows that carry at least one tail label.

    args
    X: pandas.DataFrame, feature vectors (expected to share its index with y)
    y: pandas.DataFrame, target vectors
    ql: quantile limits forwarded to get_tail_label

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    Both are re-indexed from 0. They are empty when no tail label is found.
    """
    tail_labels = get_tail_label(y, ql=ql)

    # Vectorised "any tail label equals 1" test per row. Unlike a row-wise
    # apply, this also yields an all-False mask when `tail_labels` is empty.
    has_tail_label = (y[tail_labels] == 1).any(axis=1)
    index = y.index[has_tail_label.to_numpy()]

    X_sub = X[X.index.isin(index)].reset_index(drop=True)
    y_sub = y[y.index.isin(index)].reset_index(drop=True)
    return X_sub, y_sub

In [14]:
def nearest_neighbour(X, neigh) -> list:
    """
    Give the indices of the `neigh` nearest neighbours of every instance in X.

    The neighbours are searched within X itself (Euclidean distance, KD-tree),
    so each row's list normally starts with the row's own index.

    args
    X: np.array, array whose nearest neighbours have to be found
    neigh: int, number of neighbours returned per instance

    return
    indices: 2-D integer array, one row of `neigh` neighbour indices per element of X
    """
    knn = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree')
    knn.fit(X)
    # Only the indices are needed, so the distances are not requested.
    return knn.kneighbors(X, return_distance=False)

In [15]:
def MLSMOTE(X, y, n_sample, neigh=5, random_state=None):
    """
    Give the augmented data using MLSMOTE algorithm

    For every synthetic sample a reference row is drawn at random, one of its
    nearest neighbours is drawn at random, and the new feature vector is placed
    at a random position on the segment between the two. A label is set on the
    new sample when the reference row or any of its neighbours carries it.

    Rows are addressed by position, so X and y only need to be row-aligned.

    args
    X: pandas.DataFrame, feature vector DataFrame
    y: pandas.DataFrame, target vector DataFrame (row-aligned with X)
    n_sample: int, number of newly generated sample
    neigh: int, number of nearest neighbours per reference row (the first one is
           treated as the reference itself, so at least 2 are needed)
    random_state: optional seed; when given, sampling uses a private generator
                  and is repeatable. When None, the module-level `random` state
                  is used.

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data

    raises
    ValueError: if neigh is smaller than 2
    """
    if neigh < 2:
        raise ValueError("neigh must be at least 2 (the first neighbour is the point itself)")

    rng = random.Random(random_state) if random_state is not None else random

    # Honour the caller's `neigh` instead of a fixed neighbour count.
    indices2 = nearest_neighbour(X, neigh=neigh)
    n = len(indices2)

    # Plain arrays: the neighbour indices are positions, and per-row DataFrame
    # lookups inside the loop are needlessly slow.
    X_values = X.to_numpy(dtype=float)
    y_values = y.to_numpy(dtype=float)

    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = rng.randint(0, n - 1)
        neighbor = rng.choice(indices2[reference, 1:])  # skip column 0 (the reference itself)
        all_point = indices2[reference]

        # Label is 1 when it occurs at least once among reference + neighbours.
        target[i] = np.nansum(y_values[all_point], axis=0) > 0

        # Interpolate from the reference TOWARDS the neighbour. The gap must be
        # neighbour - reference; the opposite sign pushes the sample away from it.
        ratio = rng.random()
        gap = X_values[neighbor] - X_values[reference]
        new_X[i] = X_values[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)

    return new_X, target

In [16]:
# Rows of the processed training data that carry at least one tail label
# (default quantile limits).
train_x_sub, train_y_sub = get_minority_samples(train_x_trans, train_y)

In [22]:
# (rows, columns) of the minority subset.
train_x_sub.shape

(2323, 876)

In [53]:
# Generate 2000 synthetic samples from the minority subset (neigh=5).
# The sampling is random, so the result differs between runs.
train_x_aug, train_y_aug = MLSMOTE(train_x_sub, train_y_sub, 2000, 5)

In [54]:
# (rows, columns) of the synthetic feature frame.
train_x_aug.shape

(2000, 876)

In [74]:
# Peek at the first two synthetic rows.
train_x_aug.head(2)

Unnamed: 0,cp_type,dose_1,dose_2,cp_time,c-0,c-1,c-2,c-3,c-4,c-5,c-6,c-7,c-8,c-9,c-10,c-11,c-12,c-13,c-14,c-15,c-16,c-17,c-18,c-19,c-20,c-21,c-22,c-23,c-24,c-25,c-26,c-27,c-28,c-29,c-30,c-31,c-32,c-33,c-34,c-35,...,g-732,g-733,g-734,g-735,g-736,g-737,g-738,g-739,g-740,g-741,g-742,g-743,g-744,g-745,g-746,g-747,g-748,g-749,g-750,g-751,g-752,g-753,g-754,g-755,g-756,g-757,g-758,g-759,g-760,g-761,g-762,g-763,g-764,g-765,g-766,g-767,g-768,g-769,g-770,g-771
0,1.0,1.0,0.0,-1.480245,-0.206128,0.500137,0.596209,0.188105,0.297162,0.847406,0.23543,0.464572,0.266803,0.310735,0.169581,0.531772,-0.082169,-0.120179,-0.024724,0.015446,0.79174,-0.169846,0.357006,0.029532,0.677038,0.370112,-0.035463,0.317763,0.024164,-0.138172,0.075279,0.606375,0.299332,0.709541,0.521777,0.28799,0.580161,-0.080343,0.159025,-0.144806,...,-0.894932,0.263972,0.217264,-1.276027,-0.102443,0.543642,-0.633115,-0.059481,-0.510678,0.711792,-0.148894,0.015827,-0.933569,-0.412368,0.636818,1.077071,0.918238,-0.958743,-0.279686,0.934393,1.151163,-0.877966,-0.604666,0.242613,0.841144,-0.018328,-0.473349,-0.104127,0.63632,0.786759,-0.206019,-1.964859,-0.103296,0.387079,0.297109,-0.113365,0.026841,-0.575642,0.438784,-0.1732
1,1.0,1.0,0.0,1.236104,0.784169,0.751545,1.075508,0.956368,1.192168,-0.008897,0.616517,0.537432,0.677161,0.706362,0.469312,0.359884,0.689311,0.677962,0.734157,0.419189,0.172435,0.058898,1.090115,-0.096953,0.312376,1.063864,1.17427,0.445455,-0.747749,0.548543,0.5089,0.270577,0.950186,0.472769,0.764224,0.940108,-0.509877,1.139927,1.162411,0.541738,...,0.722793,1.042834,1.714964,0.902505,0.804159,-2.325241,1.660899,-0.41132,0.856124,0.363294,0.047887,-0.883418,-0.956529,-1.259618,1.493985,0.209654,1.574727,0.478609,-0.354773,0.671696,-0.969068,0.047046,0.025997,-0.363444,0.039257,-0.17452,1.212468,-0.011597,-0.097057,0.755461,-0.370676,-0.890522,0.241432,-0.57499,1.87695,-1.178177,-0.418421,-0.629886,-0.391023,-1.022588


In [55]:
# Stack the synthetic rows under the original training rows with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append is no longer
# available in current pandas releases.
train_x_ups = pd.concat([train_x_trans, train_x_aug], ignore_index=True)
train_y_ups = pd.concat([train_y, train_y_aug], ignore_index=True)

In [56]:
# Shapes of the upsampled feature and target frames; the row counts must match.
train_x_ups.shape, train_y_ups.shape

((23948, 876), (23948, 206))

In [69]:
def model_init(n_features=876, n_targets=206):
    """
    Build and compile the feed-forward multi-label classifier.

    Architecture: four ELU dense layers (2000, 2000, 1600, 1200 units, He-normal
    initialisation), each followed by batch normalisation and 30% dropout, then a
    sigmoid output layer. Only the first dense layer is wrapped in weight
    normalisation. The model is compiled with binary cross-entropy and Adam.

    args
    n_features: int, width of the input feature vector
    n_targets: int, number of output units (one per target column)

    return
    model: compiled keras Sequential model
    """
    model = keras.models.Sequential([
        tfa.layers.WeightNormalization(Dense(2000, activation='elu', input_shape=(n_features,), kernel_initializer='he_normal')),
        BatchNormalization(),
        Dropout(0.3),
        Dense(2000, activation='elu', kernel_initializer='he_normal'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1600, activation='elu', kernel_initializer='he_normal'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1200, activation='elu', kernel_initializer='he_normal'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(n_targets, activation='sigmoid')
    ])

    # `learning_rate` is the supported keyword; the short `lr` alias is deprecated.
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999),
                  metrics=['accuracy'])

    return model

In [70]:
def training(train_x_trans, train_y, k=5, batch_size=128, epochs=25, random_state=None):
    """
    Train one model per cross-validation fold and collect out-of-fold predictions.

    args
    train_x_trans: pandas.DataFrame, processed training features
    train_y: pandas.DataFrame, training targets (row-aligned with train_x_trans)
    k: int, number of folds
    batch_size: int, batch size passed to model.fit
    epochs: int, number of epochs passed to model.fit
    random_state: optional seed for the shuffled fold split; None keeps the
                  split different on every call

    return
    models: list with the fitted model of every fold
    oob_pred: pandas.DataFrame shaped and labelled like train_y; each row holds
              the prediction of the model that did not see that row in training
    """
    models = []
    x_values = train_x_trans.values
    y_values = train_y.values

    # Float frame from the start: predictions are probabilities, and writing
    # them into a copy of the integer targets relies on implicit upcasting.
    oob_pred = pd.DataFrame(np.zeros(train_y.shape, dtype=float),
                            index=train_y.index, columns=train_y.columns)
    cross_val = KFold(k, shuffle=True, random_state=random_state)

    for fold, (train_index, val_index) in enumerate(cross_val.split(x_values)):
        print('\ncross-val fold '+ str(fold+1))
        model = model_init()

        # Multiply the learning rate by 0.8 after 3 epochs without val_loss improvement.
        lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=0.8, patience=3,
                                                         monitor='val_loss', mode='auto')
        model.fit(x_values[train_index],
                  y_values[train_index],
                  validation_data=(x_values[val_index], y_values[val_index]),
                  batch_size=batch_size,
                  callbacks=[lr_scheduler],
                  epochs=epochs)

        # KFold yields positions, so write by position rather than by label.
        oob_pred.iloc[val_index, :] = model.predict(x_values[val_index])
        models.append(model)

    return (models, oob_pred)

In [71]:
def multi_log_loss(y_true, y_pred):
    """
    Mean of the per-column binary log loss.

    args
    y_true: pandas.DataFrame, 0/1 targets
    y_pred: pandas.DataFrame, predicted probabilities with the same columns

    return
    float, average of the column-wise log losses
    """
    # labels=[0, 1] keeps log_loss working for a column whose true values are
    # all the same class; without it such a column raises an error.
    losses = [
        log_loss(y_true.loc[:, col], y_pred.loc[:, col], labels=[0, 1])
        for col in y_true.columns
    ]
    return np.mean(losses)

In [72]:
def get_models(train_x_trans, train_y, submission, test_features=None):
    """
    Train the cross-validated models and build the averaged test prediction.

    args
    train_x_trans: pandas.DataFrame, processed training features
    train_y: pandas.DataFrame, training targets
    submission: pandas.DataFrame, submission template; the first column is kept
                as is, every other column is filled with predictions
    test_features: pandas.DataFrame, processed test features including the
                   encoded ``cp_type`` column. Defaults to the notebook-level
                   ``test_trans``. Rows are assumed to be in the same order as
                   `submission`.

    return
    test_pred: pandas.DataFrame, copy of `submission` with the mean prediction
               of all fold models; rows with ``cp_type == 0`` are set to 0
    """
    if test_features is None:
        test_features = test_trans  # notebook-level processed test frame

    models, oob_preds = training(train_x_trans, train_y, k=5)
    print('\n\nmultilogloss: ', multi_log_loss(train_y, oob_preds))

    # Take the target columns from the argument, not from a notebook global.
    y_cols = submission.columns[1:]

    # Average the fold predictions as one array, then assign once.
    test_values = test_features.values
    mean_pred = np.mean([m.predict(test_values) for m in models], axis=0, dtype=np.float64)

    test_pred = submission.copy()
    test_pred[y_cols] = mean_pred

    # Control rows get 0 for every target. The mask is read from the processed
    # frame and applied by position.
    is_control = (test_features['cp_type'] == 0).to_numpy()
    test_pred.loc[is_control, y_cols] = 0

    return test_pred

In [73]:
# Train the fold models and predict the test set.
# NOTE(review): this trains on train_x_trans / train_y; the MLSMOTE-augmented
# train_x_ups / train_y_ups built above are not used here — confirm this is intended.
test_pred = get_models(train_x_trans, train_y, submit)


cross-val fold 1
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25

KeyboardInterrupt: ignored

In [None]:
# Peek at the first three prediction rows.
# NOTE(review): the recorded output of the training cell ends in KeyboardInterrupt,
# so the values shown here presumably come from an earlier run — re-run to confirm.
test_pred.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,adrenergic_receptor_antagonist,akt_inhibitor,aldehyde_dehydrogenase_inhibitor,alk_inhibitor,ampk_activator,analgesic,androgen_receptor_agonist,androgen_receptor_antagonist,anesthetic_-_local,angiogenesis_inhibitor,angiotensin_receptor_antagonist,anti-inflammatory,antiarrhythmic,antibiotic,anticonvulsant,antifungal,antihistamine,antimalarial,antioxidant,antiprotozoal,antiviral,apoptosis_stimulant,aromatase_inhibitor,atm_kinase_inhibitor,atp-sensitive_potassium_channel_antagonist,atp_synthase_inhibitor,atpase_inhibitor,atr_kinase_inhibitor,aurora_kinase_inhibitor,...,protein_synthesis_inhibitor,protein_tyrosine_kinase_inhibitor,radiopaque_medium,raf_inhibitor,ras_gtpase_inhibitor,retinoid_receptor_agonist,retinoid_receptor_antagonist,rho_associated_kinase_inhibitor,ribonucleoside_reductase_inhibitor,rna_polymerase_inhibitor,serotonin_receptor_agonist,serotonin_receptor_antagonist,serotonin_reuptake_inhibitor,sigma_receptor_agonist,sigma_receptor_antagonist,smoothened_receptor_antagonist,sodium_channel_inhibitor,sphingosine_receptor_agonist,src_inhibitor,steroid,syk_inhibitor,tachykinin_antagonist,tgf-beta_receptor_inhibitor,thrombin_inhibitor,thymidylate_synthase_inhibitor,tlr_agonist,tlr_antagonist,tnf_inhibitor,topoisomerase_inhibitor,transient_receptor_potential_channel_antagonist,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000758,0.000933,0.00121,0.022532,0.019889,0.002882,0.001347,0.004564,0.000392,0.007845,0.013477,0.000617,0.000563,0.000555,0.000762,0.00068,0.001999,0.003767,0.009254,0.001607,0.002305,0.002915,0.000543,0.00085,0.000564,0.000598,0.00079,0.000691,0.004,0.001416,0.000955,0.002016,0.002835,0.000458,0.000494,0.000518,0.002844,0.000422,0.000761,...,0.002124,0.000527,0.004216,0.000416,0.000863,0.005453,0.000603,0.000552,0.00061,0.00129,0.011127,0.009064,0.002883,0.00261,0.001064,0.001155,0.029916,0.002303,0.00032,0.000462,0.000339,0.0017,0.000319,0.001017,0.001353,0.00102,0.000705,0.001546,0.000673,0.000844,0.000799,0.001155,0.001555,0.00134,0.000288,0.000741,0.000367,0.001232,0.003821,0.001506
1,id_001897cda,0.000508,0.000931,0.001123,0.001547,0.001357,0.001584,0.002592,0.00713,0.005242,0.009432,0.007298,0.001314,0.001022,0.011143,0.000836,0.000785,0.001396,0.002752,0.002542,0.002325,0.001876,0.000915,0.000775,0.001335,0.001005,0.001633,0.00077,0.002132,0.00113,0.000873,0.000755,0.001851,0.001167,0.001443,0.000761,0.001043,0.003004,0.003478,0.013888,...,0.001552,0.001506,0.000792,0.000587,0.001902,0.011788,0.001058,0.004916,0.001232,0.001586,0.005975,0.003441,0.000782,0.000904,0.002873,0.001375,0.003352,0.001294,0.015766,0.000845,0.002422,0.00289,0.004986,0.001264,0.000539,0.001069,0.000796,0.002544,0.005476,0.00221,0.000905,0.001608,0.001441,0.000455,0.007832,0.000691,0.00292,0.000812,0.005556,0.00219
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the predictions to submission.csv (relative path) without the index column.
test_pred.to_csv('submission.csv', index=False)