This notebook is based on the [DeepLift tutorial](https://github.com/kundajelab/deeplift)

In [None]:
import os
import numpy as np
from keras.models import load_model

Load the model trained on GRIDSS results

In [None]:
# Directory that holds the model and data files ("" resolves against the working directory).
data_dir = ""

# Restore the saved Keras model from its HDF5 file.
trained_model_path = os.path.join(data_dir, 'gridss_model.h5')
model = load_model(trained_model_path)

Generate predictions

In [None]:
# Class probabilities from the loaded model, evaluated in batches of 1000 with progress output.
# NOTE(review): `X` is not defined anywhere in the visible notebook, so this cell only runs
# when `X` is already present in the kernel. Confirm which array is meant and load it
# before this cell so the notebook survives "Restart & Run All".
probs = model.predict_proba(X, batch_size=1000, verbose=True)

Checking the entropy of the model predictions (see the [Integrated-Gradients howto, Sanity Checking Baselines](https://github.com/ankurtaly/Integrated-Gradients/blob/master/howto.md)).

In [None]:
def get_entropy(b):
    """Return the Shannon entropy (natural logarithm) taken along axis 1 of `b`.

    Parameters
    ----------
    b : array-like with at least two dimensions
        Probabilities. For a 2-D input each row is treated as one distribution.

    Returns
    -------
    numpy.ndarray
        ``-sum(p * log(p))`` along axis 1, i.e. one value per row for a 2-D input.

    Notes
    -----
    Entries that are exactly 0 contribute 0 to the entropy (the limit of
    ``p * log(p)`` as ``p -> 0``). Evaluated literally, ``0 * log(0)`` is NaN,
    which would turn the entropy of any row containing a zero into NaN.
    """
    p = np.asarray(b)
    # Replace zeros by 1 inside the logarithm only: the term becomes 0 * log(1) == 0.
    # All non-zero entries are passed through unchanged.
    log_p = np.log(np.where(p == 0, 1, p))
    return -np.sum(p * log_p, axis=1)

In [None]:
# Entropy of the predicted class distribution, one value per row of `probs`.
probs_entropy = get_entropy(probs)

In [None]:
# !conda install matplotlib -y
# if matplotlib does not work, you might need to install nomkl
# !conda install nomkl -y

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Distribution of the prediction entropies: 100 bins, raw counts (not normalised).
n, bins, patches = plt.hist(
    probs_entropy,
    bins=100,
    density=False,
    facecolor='g',
    alpha=0.75,
)

Model definition

In [None]:
import keras
print("keras version", keras.__version__)
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution1D, Lambda, \
    Convolution2D, Flatten, \
    Reshape, LSTM, Dropout, TimeDistributed, BatchNormalization
from keras.regularizers import l2
from keras.optimizers import Adam

# Rebuild the network layer by layer with the Keras 2 API and then copy the trained
# weights into it from the HDF5 file (done at the end of this cell). Without the weight
# loading step the model would keep its random initialisation, and everything exported
# from it afterwards (JSON + weights, DeepLIFT scores) would describe an untrained network.

class_number = 3
dim_length = 200
dim_channels = 46
layers = 4 # 2
filters = [32] * layers  # 4
fc_hidden_nodes = 8
learning_rate = 10 ** (-4)
regularization_rate = 10 ** (-1)
kernel_size = 7
drp_out1 = 0
drp_out2 = 0

outputdim = class_number  # number of classes

weightinit = 'lecun_uniform'  # weight initialization

model = Sequential()
# Input of shape (dim_length, dim_channels), normalised before the first convolution
model.add(
    BatchNormalization(
        input_shape=(
            dim_length,
            dim_channels)))

# One Conv1D -> BatchNormalization -> ReLU stage per entry in `filters`
for filter_number in filters:
    model.add(Convolution1D(filter_number, kernel_size=kernel_size, padding='same',
                            kernel_regularizer=l2(regularization_rate),
                            kernel_initializer=weightinit))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

model.add(Flatten())
# model.add(Dropout(drp_out1))
model.add(Dense(units=fc_hidden_nodes,
                kernel_regularizer=l2(regularization_rate),
                kernel_initializer=weightinit))  # Fully connected layer
model.add(BatchNormalization())
model.add(Activation('relu'))  # Relu activation
# model.add(Dropout(drp_out2))
model.add(Dense(units=outputdim, kernel_initializer=weightinit))
model.add(BatchNormalization())
model.add(Activation("softmax"))  # Final classification layer

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=learning_rate),
              metrics=['accuracy'])

# Copy the trained parameters into the freshly built layers. `load_weights` also accepts
# an HDF5 file written by `model.save()`. It raises if the architecture defined here does
# not match the stored one, which is preferable to silently continuing with random weights.
# `os` and `data_dir` come from the first cells of the notebook.
model.load_weights(os.path.join(data_dir, 'gridss_model.h5'))

Save the model in JSON format and the model weights in HDF5 format

In [None]:
# Output files: the architecture goes to JSON, the layer weights to HDF5.
keras_model_weights = os.path.join(data_dir, "060220_gridss_model.h5")
keras_model_json = os.path.join(data_dir, "060220_gridss_model.json")

# Architecture description
model_json = model.to_json()
with open(keras_model_json, "w") as architecture_out:
    architecture_out.write(model_json)

# Layer weights
model.save_weights(keras_model_weights)
print("Saved model to disk")

# DeepLift

In [None]:
from __future__ import print_function

import tensorflow
import keras
import numpy

# Record the library versions this notebook was run with.
print("Tensorflow version:", tensorflow.__version__)
print("Keras version:", keras.__version__)
print("Numpy version:", numpy.__version__)

In [None]:
#!conda install -y -c bioconda deeplift

In [None]:
import deeplift
from keras.models import model_from_json

# Rebuild the Keras model from its two files: the architecture stored as JSON
# (keras_model_json) and the weights stored as HDF5 (keras_model_weights).
# The JSON file is read inside a `with` block so the file handle is closed right away
# instead of being left open until garbage collection.
with open(keras_model_json) as json_file:
    keras_model = model_from_json(json_file.read())
keras_model.load_weights(keras_model_weights)

Convert the Keras model for DeepLift

In [None]:
from deeplift.layers import NonlinearMxtsMode
import deeplift.conversion.kerasapi_conversion as kc
from collections import OrderedDict

# (label, nonlinearity mode) pairs; the saved model is converted once per mode.
attribution_modes = [
    #The genomics default = rescale on conv layers, revealcancel on fully-connected
    ('rescale_conv_revealcancel_fc', NonlinearMxtsMode.DeepLIFT_GenomicsDefault),
    ('rescale_all_layers', NonlinearMxtsMode.Rescale),
    ('revealcancel_all_layers', NonlinearMxtsMode.RevealCancel),
    ('grad_times_inp', NonlinearMxtsMode.Gradient),
    ('guided_backprop', NonlinearMxtsMode.GuidedBackprop),
]

# label -> converted DeepLIFT model, in the order listed above
method_to_model = OrderedDict()
for method_name, nonlinear_mxts_mode in attribution_modes:
    converted = kc.convert_model_from_saved_files(
        h5_file=keras_model_weights,
        json_file=keras_model_json,
        nonlinear_mxts_mode=nonlinear_mxts_mode)
    method_to_model[method_name] = converted

Load the input data (windows and labels)

In [None]:
import numpy as np
truth_set = 'gridss'

# Positive examples: the 'start' array of <truth_set>_windows.npz, labelled 'DEL_start'.
# The 'end' array (label 'DEL_end') is currently not included.
positive_path = os.path.join(data_dir, truth_set+'_windows.npz')
with np.load(positive_path) as data:
    positive_windows = data['start']

# Negative examples: the 'neg' array of negative_windows.npz, labelled 'noSV'.
negative_path = os.path.join(data_dir, 'negative_windows.npz')
with np.load(negative_path) as data:
    negative_windows = data['neg']

# Stack positives first, then negatives, with one label per window in the same order.
input_data = np.concatenate([positive_windows, negative_windows], axis=0)
input_data_labels = ['DEL_start']*positive_windows.shape[0]
input_data_labels.extend(['noSV']*negative_windows.shape[0])

input_data.shape

In [None]:
#make sure predictions are the same as the original model
from deeplift.util import compile_func
model_to_test = method_to_model['rescale_conv_revealcancel_fc']
# Function mapping the activations of the first layer to those of the last layer
deeplift_prediction_func = compile_func([model_to_test.get_layers()[0].get_activation_vars()],
                                         model_to_test.get_layers()[-1].get_activation_vars())
original_model_predictions = keras_model.predict(input_data, batch_size=200)
converted_model_predictions = deeplift.util.run_function_in_batches(
                                input_data_list=[input_data],
                                func=deeplift_prediction_func,
                                batch_size=200,
                                progress_update=None)
# Compare ABSOLUTE differences: with the signed difference, a converted model that
# predicts far too low everywhere would produce a negative maximum and pass the check.
max_abs_difference = np.max(np.abs(
    np.array(converted_model_predictions) - np.array(original_model_predictions)))
print("maximum difference in predictions:", max_abs_difference)
assert max_abs_difference < 10**-5, "converted DeepLIFT model disagrees with the Keras model"
predictions = converted_model_predictions

In [None]:
print("Compiling scoring functions")
method_to_scoring_func = OrderedDict()
# The loop variable is deliberately not named `model`: at notebook level that would
# overwrite the Keras model of the same name with a DeepLIFT model object, and any
# earlier cell re-run afterwards (e.g. the one calling model.to_json()) would break.
# NOTE(review): target_layer_idx=-3 counts back from the final softmax, which here is
# preceded by a BatchNormalization layer. Confirm that -3 is the intended target layer.
for method, deeplift_model in method_to_model.items():
    print("Compiling scoring function for: "+method)
    method_to_scoring_func[method] = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0,
        target_layer_idx=-3)

#To get a function that just gives the gradients, we use the multipliers of the Gradient model
gradient_func = method_to_model['grad_times_inp'].get_target_multipliers_func(find_scores_layer_idx=0,
                                                                              target_layer_idx=-3)
print("Compiling integrated gradients scoring functions")
# Integrated gradients built on the gradient function, using 10 intervals
integrated_gradients10_func = deeplift.util.get_integrated_gradients_function(
    gradient_computation_function = gradient_func,
    num_intervals=10)
method_to_scoring_func['integrated_gradients10'] = integrated_gradients10_func

In [None]:
# Plot channels
# print(input_data.shape)

# for i, w in enumerate(input_data):
#     print(i)
#     plot_channels(w)

In [None]:
#Use mean as reference
# Averaging over axis 0 collapses the window axis, so `bg` has the shape of a single window.
bg = np.mean(input_data, axis=0)

In [None]:
# Shape of one window (first entry along axis 0); the three-index form requires a 3-D array.
input_data[0,:,:].shape

In [None]:
from collections import OrderedDict

# Attribution scores for every scoring method and every output index:
#   method_to_task_to_scores[method_name][task_idx] -> array shaped like input_data
method_to_task_to_scores = OrderedDict()
for method_name, score_func in method_to_scoring_func.items():
    print("on method",method_name)
    method_to_task_to_scores[method_name] = OrderedDict()
    # presumably one task index per output class (3 classes) - verify against the model
    for task_idx in [0,1,2]:
        # Score the windows loaded into `input_data`; the reference `bg` is the mean of
        # that same array. (`X`, which was used here before, is never defined in this
        # notebook, so the cell failed on a fresh kernel.)
        scores = np.array(score_func(
                    task_idx=task_idx,
                    input_data_list=[input_data],
                    input_references_list=[bg],
                    batch_size=32,
                    progress_update=None))
        # One score per input value
        assert scores.shape == input_data.shape
        # The scores are kept per position AND per channel. They are intentionally not
        # summed over the channel axis (as the one-hot DNA example in the DeepLIFT tutorial
        # does with np.sum(scores, axis=2)), because plot_window colours every channel
        # with its own scores.
        method_to_task_to_scores[method_name][task_idx] = scores

In [None]:
#Save scores
import numpy as np
scores_file = os.path.join(data_dir, 'method_to_task_to_scores.npy')
# method_to_task_to_scores is a nested OrderedDict rather than an array, so np.save stores
# it as a pickled object; reading it back therefore requires allow_pickle=True.
np.save(scores_file, method_to_task_to_scores)

In [None]:
#Load scores
import numpy as np

# The file contains the pickled nested dict wrapped in a 0-d object array; .item() unwraps it.
# NOTE: allow_pickle=True runs the unpickler - only load files you created yourself.
method_to_task_to_scores = np.load(scores_file, allow_pickle=True).item()
print(method_to_task_to_scores.keys())
# Shape of the score array for every method and each of the three task indices
for k, task_to_scores in method_to_task_to_scores.items():
    for i in (0, 1, 2):
        print(task_to_scores[i].shape)

Function to visualize windows and scores

In [None]:
%matplotlib

def plot_window(W, idx, method_name, task):
    """Plot each channel of one window on top of a heat map of its attribution scores.

    One subplot is drawn per channel: the channel signal (min-max scaled to [0, 1])
    as a black line, with the scores of that channel as a coloured strip behind it.

    Parameters
    ----------
    W : array indexed as ``W[idx, :, :]``
        Windows; the selected slice must have one column per entry of ``ch_names`` (46).
    idx : int
        Index of the window to plot, used for both ``W`` and the score array.
    method_name : str
        Key of the scoring method in ``method_to_task_to_scores``.
    task : int
        Key of the task in ``method_to_task_to_scores[method_name]``.

    Notes
    -----
    Reads the notebook-level variable ``method_to_task_to_scores``. Returns nothing;
    the figure is created as a side effect.
    """
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import minmax_scale
    from matplotlib import colors

    ch_names = ['COV','CDF','CDR','MRQ','MBQ','SNV','LCF','LCR','RCF','RCR',
           'DRF','DRR','DLF','DLR','LDF','LDR','RDF','RDR','IRF','IRR',
           'LSF','LSR', 'RSF','RSR','INB','INA','DUB','DUA','TRO','TRS','FLD','FRD','FAD',
           'RLD','RRD','RAD','SLF','SLR','SRF','SRR','MAP','_A_','_T_',
            '_C_','_G_','_N_'
           ]

    scores = method_to_task_to_scores[method_name][task]
    df_mask = pd.DataFrame(scores[idx], columns=ch_names)

    n_ch = len(ch_names)
    # Scale every channel (column) of the window independently to [0, 1]
    W_i = minmax_scale(W[idx,:,:], feature_range=(0, 1), axis=0, copy=True )
    df = pd.DataFrame(W_i, columns=ch_names)

    # One colour scale shared by all channels, computed once instead of per subplot.
    # NOTE(review): the scale runs from the smallest to the largest score, so a score of 0
    # is not necessarily the white midpoint of the diverging 'bwr' colormap.
    vmin = df_mask.min().min()
    vmax = df_mask.max().max()
    color_norm = colors.Normalize(vmin=vmin, vmax=vmax)
    n_positions = df_mask.shape[0]

    ax = df.plot(subplots=True, figsize=(15, 10), kind='line',
                 legend=False, color = 'black')

    for i, a in enumerate(ax):
        # Keep only the outer frame of the stack of subplots visible
        if i != n_ch-1:
            a.spines['bottom'].set_color('white')
        if i != 0:
            a.spines['top'].set_color('white')
        a.set_yticks([])
        a.set_ylabel(ch_names[i], rotation=0, va='center', ha='right')
        # Convert the Series to a NumPy array before adding the row axis: indexing a
        # pandas Series with [np.newaxis, :] is deprecated / unsupported in newer pandas.
        score_strip = df_mask[ch_names[i]].values[np.newaxis, :]
        # The explicit extent keeps the default x range but stretches the strip over
        # y in [0, 1]; the default extent of a one-row image is y in [-0.5, 0.5], so
        # with set_ylim(0, 1) only the lower half of the subplot would be coloured.
        a.imshow(score_strip, cmap="bwr", aspect="auto", alpha=1,
                 norm=color_norm,
                 extent=(-0.5, n_positions - 0.5, 0, 1))
        a.set_ylim(0, 1)

In [None]:
method_name = 'rescale_all_layers'
task = 2
# Plot a window from `input_data`, the array loaded by the data cell of this notebook
# (`X`, which was passed here before, is never defined in the notebook).
# NOTE(review): index 4000 assumes at least 4001 windows were loaded - confirm.
plot_window(input_data, 4000, method_name, task)