# EUGENe DeepSTARR model interpretation
Adam Klie (last updated: *09/20/2023*)
***
Notebook for interpreting a trained DeepSTARR model with EUGENe

We recommend cloning the entire tutorials repository so that you have all the necessary intermediate files, but where applicable, we also provide links to download the files directly.

In [None]:
import os
import numpy as np
import seqdatasets
import seqexplainer as se
from eugene import models
from eugene import preprocess as pp

# Load the model

In [None]:
# Location of the DeepSTARR checkpoint; edit this path to point at your own copy
ckpt_path = "/cellar/users/aklie/projects/ML4GLand/models/DeepSTARR/eugene/DeepSTARR.ckpt"
model = models.DeepSTARR.load_from_checkpoint(ckpt_path)

# Load the test data

In [None]:
# Load the deAlmeida22 dataset through seqdatasets, requesting the "test" portion
sdata_test = seqdatasets.deAlmeida22("test")
# NOTE(review): presumably one-hot encodes the sequences in place and stores them
# under "ohe_seq" (later cells read sdata_test["ohe_seq"]) — confirm against the
# EUGENe preprocess documentation
pp.ohe_seqs_sdata(sdata_test)

# Attribution

In [None]:
# Start with naive in silico mutagenesis (ISM): mutate every nucleotide to every
# other nucleotide and measure the effect on the model's prediction
ism_attrs = se.attribute(
    model,
    method="NaiveISM",
    inputs=sdata_test["ohe_seq"].values,
    batch_size=32,
    target=0,
)

In [None]:
# Next, Input x Gradient: the gradient of the model output with respect to the
# input sequence, multiplied element-wise by that input sequence
ixg_attrs = se.attribute(
    model,
    method="InputXGradient",
    inputs=sdata_test["ohe_seq"].values,
    batch_size=32,
    target=0,
)

In [None]:
# Generate a baseline distribution of N random one-hot encoded reference sequences
N = 1000  # number of reference sequences
A = 4  # alphabet size (one-hot channels)
L = 249  # sequence length
# Draw one random token in [0, A) for every position of every reference
ref_tokens = np.random.randint(A, size=(N, L))
# Row i of the identity matrix is the one-hot vector for token i; indexing gives
# (N, L, A), and swapping the last two axes yields (N, A, L)
refs = np.swapaxes(np.eye(A)[ref_tokens], 1, 2)

In [None]:
# GradientShap, with the random baseline distribution passed in directly as the
# references
shap_attrs = se.attribute(
    model,
    method="GradientShap",
    inputs=sdata_test["ohe_seq"].values,
    references=refs,
    n_samples=100,
    stdevs=0.1,
    batch_size=32,
    target=0,
)

## Plotting the results

In [None]:
# Pick 5 distinct indices at random from the first 100 sequences
rand_inds = np.random.choice(100, size=5, replace=False)

In [None]:
from seqexplainer.attributions._plot import plot_attribution_logos

In [None]:
# Attribution logos for Input x Gradient on the randomly chosen sequences
plot_attribution_logos(
    vocab="DNA",
    attrs=ixg_attrs[rand_inds],
)

In [None]:
# Plot the attribution logos for GradientShap on the randomly chosen sequences.
# The one-hot inputs are passed as well so that the one-hots are multiplied by
# the attributions; they are indexed with the same rand_inds as the attributions
# so each logo is paired with its own sequence.
plot_attribution_logos(
    attrs=shap_attrs[rand_inds],
    inputs=sdata_test["ohe_seq"].values[rand_inds],
    vocab="DNA",
    height_scaler=1.2
)

SeqExplainer offers a special function that can help us visualize the matrix of deltas from ISM:

In [None]:
from seqexplainer.attributions._plot import plot_attribution_logo_heatmap

In [None]:
# Logo + heatmap of the ISM deltas for a single test sequence; one index selects
# both the attributions and the matching one-hot input
seq_idx = 1
plot_attribution_logo_heatmap(
    attrs=ism_attrs[seq_idx],
    inputs=sdata_test["ohe_seq"].values[seq_idx],
    figsize=(12, 3),
    flip_sign=True,
)

## From local to global: TF-MoDISco

In [None]:
# EUGENe has a wrapper for this function called `modisco` that can also be used here
from modiscolite.tfmodisco import TFMoDISco

In [None]:
# GradientShap attributions for all test sequences, this time using
# "dinuc_shuffle" references instead of the random baseline.
# Note: this overwrites the shap_attrs computed earlier in the notebook.
shap_attrs = se.attribute(
    model,
    inputs=sdata_test["ohe_seq"].values,
    method="GradientShap",
    target=0,
    batch_size=128,
    references="dinuc_shuffle",
    stdevs=0.1,
    n_samples=100
)

In [None]:
# Run TF-MoDISco on the GradientShap attributions and the matching one-hot inputs.
# Both arrays get the same (0, 2, 1) axis swap.
# NOTE(review): presumably this converts (N, A, L) into the (N, L, A) layout that
# modisco-lite expects — confirm against the modisco-lite documentation.
pos_patterns, neg_patterns = TFMoDISco(
    hypothetical_contribs=shap_attrs.transpose(0, 2, 1),
    one_hot=sdata_test["ohe_seq"].values.transpose(0, 2, 1),
)

In [None]:
from modiscolite.io import save_hdf5

In [None]:
# Directory that collects the TF-MoDISco results
output_dir = "output"
# Create the directory first so that writing the HDF5 file does not fail on a
# fresh checkout where "output" does not exist yet
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): some modisco-lite releases require an extra window_size argument
# to save_hdf5 — confirm against the installed version
save_hdf5(os.path.join(output_dir, "modisco.h5"), pos_patterns, neg_patterns)

These seqlets can then be treated a lot like the maximally activating seqlets from the filter interpretation tutorial. For the purposes of this tutorial, we will just plot the sequence logos for the positive and negative clusters.

In [None]:
from seqexplainer.attributions._modisco import modisco_logos

In [None]:
# Draw logos from the saved TF-MoDISco results file into a "modisco_logos"
# subdirectory of the output directory
modisco_h5 = os.path.join(output_dir, "modisco.h5")
logo_dir = os.path.join(output_dir, "modisco_logos")
modisco_logos(modisco_h5_file=modisco_h5, output_dir=logo_dir)

# DONE!

---