<a href="https://colab.research.google.com/github/Hackman-git/Mechanisms_of_action/blob/master/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import random
import tensorflow_addons as tfa
from tensorflow import keras
from keras import backend
from keras.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import class_weight
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [15]:
# Folder holding the competition CSV files; change it here to run outside this Drive layout.
# NOTE(review): the path implies Google Drive is mounted at /content/drive — no mount
# cell is visible in this notebook, confirm it runs beforehand.
DATA_DIR = '/content/drive/My Drive/MOA/Data'

train_x = pd.read_csv(DATA_DIR + '/train_features.csv')
train_y = pd.read_csv(DATA_DIR + '/train_targets_scored.csv')
test = pd.read_csv(DATA_DIR + '/test_features.csv')
submit = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [4]:
def preprocessing(train_features, train_y, test_features):
    """
    Turn the raw feature/target tables into model-ready frames.

    Steps: map cp_type to 0/1, drop the control rows (cp_type == 0) from the
    training features and targets, one-hot encode cp_dose, standardise the
    numeric columns with a scaler fitted on the training rows only, and drop
    sig_id from the targets. Test rows are never filtered.

    The input frames are left untouched, so the function can be called again
    on the same raw inputs and give the same result.

    args
    train_features: pandas.DataFrame, raw training features
    train_y: pandas.DataFrame, raw training targets (including sig_id),
             sharing its index with train_features
    test_features: pandas.DataFrame, raw test features

    return
    train_x_trans: pandas.DataFrame, transformed training features
    train_y: pandas.DataFrame, targets of the kept rows without sig_id
    test_trans: pandas.DataFrame, transformed test features
    Feature columns are ordered cp_type, dose_1, dose_2, cp_time, c-*, g-*.
    """
    # map cp_type to binary; .assign returns new frames so the caller's data is not mutated
    cp_type_codes = {'trt_cp': 1, 'ctl_vehicle': 0}
    train_features = train_features.assign(cp_type=train_features['cp_type'].map(cp_type_codes))
    test_features = test_features.assign(cp_type=test_features['cp_type'].map(cp_type_codes))

    # we don't need ctl_vehicle
    treated = train_features['cp_type'] == 1
    train_y = train_y.loc[treated].reset_index(drop=True)
    train_features = train_features.loc[treated].reset_index(drop=True)

    cols = train_features.columns
    cell_cols = [col for col in cols if col.startswith("c-")]
    gene_cols = [col for col in cols if col.startswith('g-')]

    # numeric and categorical cols
    num = ['cp_time'] + cell_cols + gene_cols
    cat = ['cp_dose']
    dose_cols = ['dose_1', 'dose_2']

    # one-hot encoding; the encoder's default sparse output is densified explicitly
    # because the keyword that requested dense output was renamed between
    # scikit-learn releases (sparse -> sparse_output)
    oneHotEnc = OneHotEncoder(handle_unknown='ignore')
    train_x_oneH = pd.DataFrame(oneHotEnc.fit_transform(train_features[cat]).toarray(),
                                columns=dose_cols)
    test_oneH = pd.DataFrame(oneHotEnc.transform(test_features[cat]).toarray(),
                             columns=dose_cols)

    # scaling numeric features (statistics come from the training rows only)
    scaler = StandardScaler()
    train_x_num_tr = pd.DataFrame(scaler.fit_transform(train_features[num]), columns=num)
    test_num_tr = pd.DataFrame(scaler.transform(test_features[num]), columns=num)

    # merging all transformed columns; indexes are reset so the concat is purely positional
    train_x_trans = pd.concat([train_features[['cp_type']].reset_index(drop=True),
                               train_x_oneH,
                               train_x_num_tr], axis=1)
    test_trans = pd.concat([test_features[['cp_type']].reset_index(drop=True),
                            test_oneH,
                            test_num_tr], axis=1)

    train_y = train_y.drop(columns=['sig_id'])

    return (train_x_trans, train_y, test_trans)

In [16]:
# Build the model-ready frames. Note that this rebinds train_y to the processed
# targets (control rows removed, sig_id dropped), so re-running this cell without
# first re-running the data-loading cell feeds already-processed targets back in.
train_x_trans, train_y, test_trans = preprocessing(train_x, train_y, test)

In [7]:
# Sanity check: (rows, columns) of the transformed training features.
train_x_trans.shape

(21948, 876)

In [8]:
# Column labels of the transformed training features.
all_features = train_x_trans.columns

In [None]:
def get_tail_label(df, ql=[0.05, 1.]) -> list:
    """
    Return the names of the underrepresented (tail) target columns of df.

    Column totals that fall outside the open quantile window (ql[0], ql[1])
    are discarded first. Each remaining total is converted to an imbalance
    ratio (largest remaining total divided by the column's own total), and the
    columns whose ratio is above the median ratio are returned.
    """
    label_counts = df.sum(axis=0)

    # Filtering: keep only totals strictly inside the quantile window
    lower_cut = label_counts.quantile(ql[0])
    upper_cut = label_counts.quantile(ql[1])
    inside_window = (label_counts > lower_cut) & (label_counts < upper_cut)
    kept_counts = label_counts[inside_window]

    imbalance_ratio = kept_counts.max() / kept_counts
    is_tail = imbalance_ratio > imbalance_ratio.median()
    return imbalance_ratio[is_tail].index.tolist()

In [None]:
def get_minority_samples(X, y, ql=[0.05, 1.]):
    """
    Select the rows that carry at least one tail (underrepresented) label.

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe, sharing its index with X
    ql: list, quantile limits forwarded to get_tail_label

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    Both are re-indexed from 0.
    """
    tail_labels = get_tail_label(y, ql=ql)

    # Vectorised row test instead of a per-row Python lambda; when there are no
    # tail labels this is all False and both results are simply empty.
    has_tail_label = y[tail_labels].eq(1).any(axis=1)
    minority_index = y.index[has_tail_label.to_numpy()]

    X_sub = X[X.index.isin(minority_index)].reset_index(drop=True)
    y_sub = y[y.index.isin(minority_index)].reset_index(drop=True)
    return X_sub, y_sub

In [None]:
def nearest_neighbour(X, neigh) -> list:
    """
    Find, for every row of X, its `neigh` nearest rows within X itself
    (Euclidean distance, kd-tree search).

    args
    X: np.array, array whose nearest neighbours have to be found
    neigh: int, number of neighbours to return per row

    return
    indices: neighbour indices for each element of X, as returned by
             NearestNeighbors.kneighbors (the distances are discarded)
    """
    knn = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree')
    knn.fit(X)
    _, indices = knn.kneighbors(X)
    return indices

In [None]:
def MLSMOTE(X, y, n_sample, neigh=5):
    """
    Give the augmented data using MLSMOTE algorithm

    Each synthetic sample is built from a randomly chosen reference row and one
    of its nearest neighbours: the features are a random point on the segment
    from the reference towards the neighbour, and a label is set to 1 when any
    row in the reference's neighbourhood (reference included) has it.

    args
    X: pandas.DataFrame, input (feature) vector DataFrame
    y: pandas.DataFrame, target vector DataFrame, row-aligned with X
    n_sample: int, number of newly generated sample
    neigh: int, neighbourhood size passed to nearest_neighbour; must be at
           least 2 so that a neighbour other than the first entry exists

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data

    raises
    ValueError: if neigh is smaller than 2
    """
    if neigh < 2:
        raise ValueError("neigh must be at least 2")

    # honour the caller's neighbourhood size (previously fixed at 5)
    neighbour_idx = nearest_neighbour(X, neigh=neigh)
    n = len(neighbour_idx)

    # kneighbors returns row positions, so rows are addressed positionally on
    # plain arrays; this does not depend on X / y carrying a 0..n-1 index
    X_values = X.to_numpy(dtype=float)
    y_values = y.to_numpy(dtype=float)

    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n - 1)
        # first entry of each neighbour row is skipped when picking the partner
        neighbor = random.choice(neighbour_idx[reference, 1:])
        all_point = neighbour_idx[reference]
        # label union over the neighbourhood, ignoring missing values
        target[i] = np.nansum(y_values[all_point], axis=0) > 0
        ratio = random.random()
        # step from the reference towards the neighbour (interpolation)
        gap = X_values[neighbor] - X_values[reference]
        new_X[i] = X_values[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)

    return new_X, target

In [None]:
# Optional MLSMOTE up-sampling of the minority targets (currently disabled).
# train_x_sub, train_y_sub = get_minority_samples(train_x_trans, train_y)
# train_x_aug, train_y_aug = MLSMOTE(train_x_sub, train_y_sub, 3000, 5)
# train_x_ups = pd.concat([train_x_trans, train_x_aug], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
# train_y_ups = pd.concat([train_y, train_y_aug], ignore_index=True)
# train_x_ups.shape, train_y_ups.shape

In [20]:
def model_init(num_features=876, num_labels=206):
    """
    Build and compile the feed-forward multi-label classifier.

    Architecture: a weight-normalised Dense(2100) input layer followed by
    Dense(1024), Dense(1000) and Dense(500) hidden layers, all ELU-activated
    with He-normal initialisation and each followed by batch normalisation and
    30% dropout, then a sigmoid output layer with one unit per label.

    args
    num_features: int, width of the input vector (defaults to the 876 columns
                  of the transformed training features)
    num_labels: int, number of output units (defaults to 206)

    return
    model: compiled keras Sequential model using label-smoothed binary
           cross-entropy, Adam (learning rate 0.0005) and the notebook's
           logloss function as metric
    """
    layers = [
        tfa.layers.WeightNormalization(
            Dense(2100, activation='elu', input_shape=(num_features,),
                  kernel_initializer='he_normal')),
        BatchNormalization(),
        Dropout(0.3),
    ]
    for units in (1024, 1000, 500):
        layers += [
            Dense(units, activation='elu', kernel_initializer='he_normal'),
            BatchNormalization(),
            Dropout(0.3),
        ]
    layers.append(Dense(num_labels, activation='sigmoid'))

    model = keras.models.Sequential(layers)

    # the metric is reported as 'logloss' / 'val_logloss', which the training
    # callbacks monitor
    model.compile(loss=keras.losses.BinaryCrossentropy(label_smoothing=0.001),
                  optimizer=keras.optimizers.Adam(learning_rate=0.0005),
                  metrics=[logloss])

    return (model)

In [23]:
# Lower / upper clipping bounds for predicted probabilities; logloss reads these globals.
c_min, c_max = 0.001, 0.999

In [24]:
def training(train_x_trans, train_y, k=5, batch_size=128, epochs=50, num_seeds=2,
             test_features=None, submission=None,
             model_dir='/content/drive/My Drive/MOA/Models'):
    """
    Train one model per (seed, fold) with k-fold cross-validation and return
    the averaged, clipped test predictions in submission format.

    For every seed the data is split with a shuffled KFold; a fresh model from
    model_init() is fitted on each training split and validated on the held-out
    split. The out-of-fold logloss and the test predictions are averaged over
    all k * num_seeds models.

    args
    train_x_trans: pandas.DataFrame, transformed training features
    train_y: pandas.DataFrame, training targets
    k: int, number of folds
    batch_size: int, batch size passed to model.fit
    epochs: int, maximum number of epochs per model
    num_seeds: int, number of distinct random seeds (k models are trained per seed)
    test_features: pandas.DataFrame, transformed test features with a cp_type
                   column; defaults to the notebook-level test_trans
    submission: pandas.DataFrame, submission template whose first column is the
                id and remaining columns are the labels; defaults to the
                notebook-level submit
    model_dir: str, folder where the best model of every fold is checkpointed

    return
    test_pred: pandas.DataFrame, copy of the submission template filled with the
               averaged predictions clipped to [0.001, 0.999]; rows whose
               cp_type is 0 are set to 0
    """
    # fall back to the notebook globals so existing calls keep working
    if test_features is None:
        test_features = test_trans
    if submission is None:
        submission = submit

    # clipping thresholds
    c_min = 0.001
    c_max = 0.999

    x_all = train_x_trans.values
    y_all = train_y.values
    x_submit = test_features.values
    num_labels = y_all.shape[1]

    # draw distinct seeds: a repeated seed would repeat the fold split and
    # overwrite that seed's checkpoint files
    seeds = np.random.choice(max(100, num_seeds), size=num_seeds, replace=False)
    y_pred = np.zeros((x_submit.shape[0], num_labels))
    oof_loss = tf.constant(0.0)

    for seed in map(int, seeds):
        # make the seed govern TensorFlow's randomness too, not only the split
        tf.random.set_seed(seed)
#         strat = MultilabelStratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        strat = KFold(k, shuffle=True, random_state=seed)
        for fold, (train_idx, val_idx) in enumerate(strat.split(x_all, y_all)):
            x_train, y_train = x_all[train_idx], y_all[train_idx]
            x_val, y_val = x_all[val_idx], y_all[val_idx]

            print('\nSeed '+str(seed)+', fold '+str(fold+1))
            model = model_init()
            lr_scheduler = keras.callbacks.ReduceLROnPlateau(
                factor=0.3, patience=5, monitor='val_logloss', mode='min', min_lr=1E-5)

            early_stopping = keras.callbacks.EarlyStopping(
                monitor='val_logloss', min_delta=1E-5, patience=6,
                mode='min', restore_best_weights=True)

            # "best" is judged on the same metric as the other two callbacks
            checkpoint = tf.keras.callbacks.ModelCheckpoint(
                f'{model_dir}/Model_Seed_{seed}_fold_{fold}.h5',
                monitor='val_logloss', mode='min', save_best_only=True)

            model.fit(x_train, y_train, validation_data=(x_val, y_val),
                      callbacks=[lr_scheduler, early_stopping, checkpoint],
                      batch_size=batch_size, epochs=epochs)

            oof_pred = model.predict(x_val)
            current_oof_loss = logloss(tf.constant(y_val, dtype=tf.float32),
                                       tf.constant(oof_pred, dtype=tf.float32))
            tf.print(current_oof_loss)
            oof_loss += current_oof_loss

            y_pred += model.predict(x_submit)

    oof_loss = oof_loss/(k*num_seeds)
    y_pred = y_pred / (k*num_seeds)
    tf.print('\noof loss after '+str(len(seeds))+' seeds of '+str(k)+'-fold CV is: ',oof_loss)

    test_pred = submission.copy()
    y_cols = submission.columns[1:]
    test_pred.iloc[:,1:] = np.clip(y_pred, c_min, c_max)
    # positional mask, so the zeroing does not depend on the two frames sharing an index
    is_control = (test_features['cp_type'] == 0).values
    test_pred.loc[is_control, y_cols] = 0

    return(test_pred)


In [9]:
def logloss(y_true, y_pred):
    """
    Mean binary cross-entropy over all elements, computed after clipping the
    predictions to the global [c_min, c_max] range.
    """
    clipped = tf.clip_by_value(y_pred, c_min, c_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return (-backend.mean(positive_term + negative_term))


In [None]:
def multi_log_loss(y_true, y_pred):
    """
    Average the per-column binary log loss over all target columns.

    args
    y_true: pandas.DataFrame, binary ground-truth targets
    y_pred: pandas.DataFrame, predicted probabilities with the same columns as y_true

    return
    mean of the column-wise sklearn log_loss values
    """
    # labels is given explicitly because log_loss raises when a column of
    # y_true contains a single class and the label set cannot be inferred
    losses = [
        log_loss(y_true.loc[:, col], y_pred.loc[:, col], labels=[0, 1])
        for col in y_true.columns
    ]
    return np.mean(losses)

In [None]:
# 5-fold CV repeated for 3 seeds, batches of 128, at most 50 epochs per model.
test_pred = training(
    train_x_trans,
    train_y,
    k=5,
    batch_size=128,
    epochs=50,
    num_seeds=3,
)

In [42]:
# Preview the first three rows of the submission frame.
test_pred.head(3)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,adrenergic_receptor_antagonist,akt_inhibitor,aldehyde_dehydrogenase_inhibitor,alk_inhibitor,ampk_activator,analgesic,androgen_receptor_agonist,androgen_receptor_antagonist,anesthetic_-_local,angiogenesis_inhibitor,angiotensin_receptor_antagonist,anti-inflammatory,antiarrhythmic,antibiotic,anticonvulsant,antifungal,antihistamine,antimalarial,antioxidant,antiprotozoal,antiviral,apoptosis_stimulant,aromatase_inhibitor,atm_kinase_inhibitor,atp-sensitive_potassium_channel_antagonist,atp_synthase_inhibitor,atpase_inhibitor,atr_kinase_inhibitor,aurora_kinase_inhibitor,...,protein_synthesis_inhibitor,protein_tyrosine_kinase_inhibitor,radiopaque_medium,raf_inhibitor,ras_gtpase_inhibitor,retinoid_receptor_agonist,retinoid_receptor_antagonist,rho_associated_kinase_inhibitor,ribonucleoside_reductase_inhibitor,rna_polymerase_inhibitor,serotonin_receptor_agonist,serotonin_receptor_antagonist,serotonin_reuptake_inhibitor,sigma_receptor_agonist,sigma_receptor_antagonist,smoothened_receptor_antagonist,sodium_channel_inhibitor,sphingosine_receptor_agonist,src_inhibitor,steroid,syk_inhibitor,tachykinin_antagonist,tgf-beta_receptor_inhibitor,thrombin_inhibitor,thymidylate_synthase_inhibitor,tlr_agonist,tlr_antagonist,tnf_inhibitor,topoisomerase_inhibitor,transient_receptor_potential_channel_antagonist,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001,0.001283,0.001471,0.015843,0.021177,0.004546,0.001922,0.005809,0.001,0.010109,0.013212,0.001,0.001,0.001,0.001,0.001,0.001672,0.003756,0.014708,0.002057,0.002329,0.005282,0.001,0.001486,0.001,0.001,0.001,0.001,0.005871,0.001522,0.001449,0.001987,0.003147,0.001,0.001,0.001,0.003008,0.001,0.001,...,0.002478,0.001,0.004962,0.001,0.001,0.002962,0.001,0.001,0.001,0.00167,0.008585,0.004394,0.002876,0.002453,0.001122,0.001418,0.030459,0.002458,0.001,0.001,0.001,0.001014,0.001,0.001161,0.001325,0.001795,0.001,0.001873,0.001,0.001,0.001,0.001476,0.002897,0.001,0.001,0.001,0.001,0.001448,0.004879,0.001261
1,id_001897cda,0.001,0.00179,0.00276,0.001172,0.0015,0.002499,0.003343,0.006979,0.008151,0.005436,0.007001,0.003546,0.001098,0.018881,0.001,0.001153,0.001128,0.002862,0.003253,0.002652,0.003069,0.001231,0.001,0.002294,0.001,0.002548,0.001,0.001261,0.001,0.001029,0.001,0.002448,0.001097,0.002093,0.001,0.001283,0.003118,0.009048,0.006704,...,0.001282,0.001417,0.001,0.001,0.001636,0.003017,0.001,0.014874,0.001,0.003191,0.003623,0.001,0.001,0.001,0.001696,0.003171,0.004231,0.001029,0.032227,0.001,0.004597,0.004553,0.002988,0.00116,0.001,0.001,0.001,0.001974,0.005384,0.003371,0.001472,0.002118,0.003272,0.001,0.007953,0.001,0.002481,0.001,0.005028,0.003336
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
test_pred.to_csv('submission.csv', index=False)