In [246]:
import h5py
import numpy as np
import torch
import seqexplainer as se

import seqpro as sp

from eugene import models

# Test data

In [198]:
dataset = h5py.File("/cellar/users/aklie/projects/ML4GLand/use_cases/deAlmeida22/data/evo_aug/DeepSTARR_data.h5", 'r')
x_test = np.array(dataset['X_test']).astype(np.float32)
y_test = np.array(dataset['Y_test']).astype(np.float32)
dataset.close()

In [201]:
x_test.shape, y_test.shape

((41186, 4, 249), (41186, 2))

# References for attribution

In [54]:
import importlib
from seqexplainer import _references as ref
importlib.reload(ref)

<module 'seqexplainer._references' from '/cellar/users/aklie/projects/ML4GLand/SeqExplainer/seqexplainer/_references.py'>

In [55]:
test_np = x_test[:100]

In [56]:
ref.zero_ref_inputs(test_np).shape
ref.random_ref_inputs(test_np).shape
ref.shuffle_ref_inputs(test_np).shape
ref.dinuc_shuffle_ref_inputs(test_np).shape
ref.gc_ref_inputs(test_np).shape
ref.gc_ref_inputs(test_np, bg="uniform", uniform_dist=[0.3, 0.2, 0.3, 0.2]).shape
ref.gc_ref_inputs(test_np, bg="batch").shape
ref.gc_ref_inputs(test_np, bg="seq").shape
ref.profile_ref_inputs(test_np).shape

(100, 4, 249)

In [57]:
ref.get_reference(test_np, "zero").shape
ref.get_reference(test_np, "random").shape
ref.get_reference(test_np, "shuffle").shape
ref.get_reference(test_np, "dinuc_shuffle").shape
ref.get_reference(test_np, "gc").shape
ref.get_reference(test_np, "gc", bg="uniform", uniform_dist=[0.3, 0.2, 0.3, 0.2]).shape
ref.get_reference(test_np, "gc", bg="batch").shape
ref.get_reference(test_np, "gc", bg="seq").shape
ref.get_reference(test_np, "profile").shape

(100, 4, 249)

In [58]:
test_np = (x_test[:100], x_test[100:150])

In [59]:
ref.get_reference(test_np, "zero")
ref.get_reference(test_np, "random")
ref.get_reference(test_np, "shuffle")
ref.get_reference(test_np, "dinuc_shuffle")
ref.get_reference(test_np, "gc")
ref.get_reference(test_np, "gc", bg="uniform", uniform_dist=[0.3, 0.2, 0.3, 0.2])
ref.get_reference(test_np, "gc", bg="batch")
ref.get_reference(test_np, "gc", bg="seq")
ref.get_reference(test_np, "profile");

In [100]:
decoded_seq = sp.decode_seq(test_np[0])
decoded_ref = sp.decode_seq(refs[0])

In [102]:
sp.nucleotide_content_seq(decoded_seq), sp.nucleotide_content_seq(decoded_ref)

(array([0.26907631, 0.22891566, 0.15662651, 0.34538153]),
 array([0.26907631, 0.22891566, 0.15662651, 0.34538153]))

In [95]:
check_nucleotide_balance(test_np, refs)

True

# Attributions

In [202]:
model = models.DeepSTARR.load_from_checkpoint("/cellar/users/aklie/projects/ML4GLand/use_cases/deAlmeida22/models/eugene/DeepSTARR.ckpt")

In [203]:
model

DeepSTARR(
  (train_metric): R2Score()
  (val_metric): R2Score()
  (test_metric): R2Score()
  (conv1d_tower): Conv1DTower(
    (layers): Sequential(
      (0): Conv1d(4, 246, kernel_size=(7,), stride=(1,), padding=same)
      (1): BatchNorm1d(246, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
      (4): Conv1d(246, 60, kernel_size=(3,), stride=(1,), padding=same)
      (5): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
      (8): Conv1d(60, 60, kernel_size=(5,), stride=(1,), padding=same)
      (9): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ReLU()
      (11): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
      (12): Conv1d(60, 120, kernel_size=(3,), str

In [204]:
test_np = x_test[:10]
test_y = torch.from_numpy(y_test[:10])
test_torch = torch.from_numpy(test_np)
test_torch.shape, test_y.shape

(torch.Size([10, 4, 249]), torch.Size([10, 2]))

In [273]:
model = models.Basset(
    input_len=249,
    output_dim=2
)



In [282]:
import importlib
from seqexplainer import _attributions as attribute 
importlib.reload(attribute)

<module 'seqexplainer._attributions' from '/cellar/users/aklie/projects/ML4GLand/SeqExplainer/seqexplainer/_attributions.py'>

In [194]:
#attribute.delta(model.predict(test_torch), test_y).shape
#attribute.delta(model.predict(test_torch), test_y).shape
#attribute.l2(model.predict(test_torch), test_y).shape

In [275]:
model.summary()

Model: Basset
Sequence length: 249
Output dimension: 2
Strand: ss
Task: multilabel_classification
Aggregation of strands: None
Loss function: binary_cross_entropy_with_logits
Optimizer: Adam
	Optimizer parameters: {}
	Optimizer starting learning rate: 0.001
Scheduler: None
	Scheduler parameters: {}
Metric: auroc
	Metric parameters: {'task': 'multilabel'}
Seed: None
Parameters summary:


  | Name         | Type        | Params
---------------------------------------------
0 | train_metric | AUROC       | 0     
1 | val_metric   | AUROC       | 0     
2 | test_metric  | AUROC       | 0     
3 | conv1d_tower | Conv1DTower | 964 K 
4 | dense_block  | DenseBlock  | 48.4 M
---------------------------------------------
49.3 M    Trainable params
0         Non-trainable params
49.3 M    Total params
197.331   Total estimated model params size (MB)

In [286]:
import gc
def report_gpu():
   print(f"Allocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB") 
   gc.collect()
   torch.cuda.empty_cache()
   torch.cuda.synchronize()
   print(f"Allocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
report_gpu()

Allocated: 13.9 GB
Allocated: 2.1 GB


In [284]:
model.eval()

Basset(
  (train_metric): AUROC()
  (val_metric): AUROC()
  (test_metric): AUROC()
  (conv1d_tower): Conv1DTower(
    (layers): Sequential(
      (0): Conv1d(4, 300, kernel_size=(19,), stride=(1,), padding=(9,))
      (1): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
      (4): Conv1d(300, 200, kernel_size=(11,), stride=(1,), padding=(5,))
      (5): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): MaxPool1d(kernel_size=4, stride=1, padding=0, dilation=1, ceil_mode=False)
      (8): Conv1d(200, 200, kernel_size=(7,), stride=(1,), padding=(3,))
      (9): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ReLU()
      (11): MaxPool1d(kernel_size=4, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (dense_block): DenseBlock(
    (layers)

In [287]:
naive_ism_output = attribute._naive_ism(model.to("cuda"), test_torch.to("cuda"), device="cuda", batch_size=128)

inputs: torch.Size([10, 4, 249]) cuda:0
reference: torch.Size([10, 1, 2]) cpu
inputs: torch.Size([10, 4, 249]) cpu
batch_starts: [  0 128 256 384 512]
Allocated: 2.1 GB
input: torch.Size([4, 249]) cpu
X: torch.Size([747, 4, 249]) cpu
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
y: torch.Size([747, 1, 2]) cpu
ism: torch.Size([747, 1]) cpu
inputs: torch.Size([10, 4, 249]) cpu
input: torch.Size([4, 249]) cpu
X: torch.Size([747, 4, 249]) cpu
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated: 2.1 GB
y: torch.Size([747, 1, 2]) cpu
ism: torch.Size([747, 1]) cpu
inputs: torch.Size([10, 4, 249]) cpu
input: torch.Size([4, 249]) cpu
X: torch.Size([747, 4, 249]) cpu
model: cuda:0
Allocated: 2.1 GB
model: cuda:0
Allocated:

In [289]:
naive_ism_output

tensor([[[ 0.0000e+00,  0.0000e+00, -8.5846e-02,  ..., -2.3594e-02,
          -2.8929e-02,  2.3758e-02],
         [-9.1438e-02, -1.2441e-01,  0.0000e+00,  ..., -1.1276e-02,
           1.1793e-01,  3.2274e-03],
         [-5.5144e-03, -4.5820e-02,  9.2206e-02,  ..., -2.4472e-02,
           0.0000e+00,  0.0000e+00],
         [-3.4599e-02,  2.1148e-03, -1.5401e-02,  ...,  0.0000e+00,
           2.8023e-02, -7.5651e-02]],

        [[ 0.0000e+00, -2.7673e-02, -1.5014e-02,  ...,  3.1489e-02,
          -1.2122e-01,  0.0000e+00],
         [-1.0497e-02, -3.4869e-02, -1.1718e-01,  ...,  0.0000e+00,
           0.0000e+00,  1.8534e-01],
         [ 7.6455e-02,  2.0381e-02,  0.0000e+00,  ...,  7.2406e-02,
          -5.4008e-02,  2.3837e-02],
         [-6.6376e-02,  0.0000e+00, -1.7447e-01,  ..., -6.3656e-02,
          -1.6397e-01,  8.0734e-02]],

        [[-3.7031e-02,  1.2567e-01, -1.0226e-01,  ..., -5.6308e-02,
           4.8430e-02, -1.0986e-01],
         [-5.6239e-02,  1.1346e-02, -3.2659e-02,  .

In [103]:
naive_ism_output = attribute._naive_ism(model, test_torch, target=0)
naive_ism_output.shape

inputs: torch.Size([10, 4, 249]) cpu
10 4 249 747


RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [None]:
naive_ism_output = attribute._naive_ism(model, test_torch, target=0

torch.Size([100])

In [None]:
attribute.delta


In [None]:
se.attribute(model, test_np, method="DeepLift", reference_type="dinuc_shuffle", target=0, batch_size=128, device="cuda")


In [None]:
ref.zero_ref_inputs(test_np).shape

(100, 4, 249)

In [None]:
ref.zero_ref_inputs(test_np).shape
ref.zero_ref_inputs(test_torch).shape

ref.random_ref_inputs(test_np).shape
ref.random_ref_inputs(test_torch).shape

ref.shuffle_ref_inputs(test_np).shape
ref.shuffle_ref_inputs(test_torch).shape

ref.dinuc_shuffle_ref_inputs(test_np).shape
# ref.dinuc_shuffle_ref_inputs(test_torch).shape TODO: fix this

ref.gc_ref_inputs(test_np, bg="uniform", uniform_dist=[0.3, 0.2, 0.2, 0.3]).shape
ref.gc_ref_inputs(test_torch, bg="uniform", uniform_dist=[0.3, 0.2, 0.2, 0.3]).shape

ref.gc_ref_inputs(test_np, bg="batch").shape
ref.gc_ref_inputs(test_torch, bg="batch").shape

# ref.gc_ref_inputs(test_np, bg="seq").shape TODO: fix this
ref.gc_ref_inputs(test_torch, bg="seq").shape

ref.profile_ref_inputs(test_torch).shape
# ref.profile_ref_inputs(test_np).shape TODO: fix this

TypeError: list indices must be integers or slices, not tuple

In [None]:
ref.get_reference(test_torch, "zero", "cpu").shape
ref.get_reference(test_np, "zero", "cpu").shape
ref.get_reference(test_torch, "zero", "cuda").shape
ref.get_reference(test_np, "zero", "cuda").shape

ref.get_reference(test_torch, "random", "cpu").shape
ref.get_reference(test_np, "random", "cpu").shape
ref.get_reference(test_torch, "random", "cuda").shape
ref.get_reference(test_np, "random", "cuda").shape

ref.get_reference(test_torch, "shuffle", "cpu").shape
ref.get_reference(test_np, "shuffle", "cpu").shape
ref.get_reference(test_torch, "shuffle", "cuda").shape
ref.get_reference(test_np, "shuffle", "cuda").shape

ref.get_reference(test_np, "dinuc_shuffle", "cpu").shape 
# ref.get_reference(test_torch, "dinuc_shuffle", "cpu").shape
ref.get_reference(test_np, "dinuc_shuffle", "cuda").shape
# ref.get_reference(test_torch, "dinuc_shuffle", "cuda").shape

ref.get_reference(test_torch, "gc", "cpu").shape
ref.get_reference(test_np, "gc", "cpu").shape
ref.get_reference(test_torch, "gc", "cuda").shape
ref.get_reference(test_np, "gc", "cuda").shape

ref.get_reference(test_torch, "profile", "cpu").shape
# ref.get_reference(test_np, "profile", "cpu").shape
# ref.get_reference(test_torch, "profile", "cuda").shape
# ref.get_reference(test_np, "profile", "cuda").shape

torch.Size([100, 4, 249])

HBox(children=(FloatProgress(value=0.0, description='Computing attributions on batches of size 128', max=1.0, …

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.