# Ray et al. 2013 Interpretation
**Authorship:**
Adam Klie, *09/03/2022*
***
**Description:**
Notebook to interpret the models trained on the Ray et al. (2013) dataset.
***

In [1]:
# Load the autoreload extension only if this kernel has not loaded it yet,
# so re-running the cell does not call %load_ext a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload imported modules before each cell is executed.
%autoreload 2

import os
import glob
import logging
import torch
import numpy as np
import pandas as pd
import eugene as eu
import matplotlib.pyplot as plt
import matplotlib

# Use font type 42 (TrueType) for both the PDF and PostScript backends so
# that text in saved vector figures is embedded as fonts.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: Quadro RTX 5000


  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)


In [2]:
# Directories EUGENe should use for the ray13 analysis, keyed by the
# `eu.settings` attribute they are assigned to.
ray13_dirs = {
    "dataset_dir": "/cellar/users/aklie/data/eugene/ray13",
    "output_dir": "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/output/ray13",
    "logging_dir": "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/logs/ray13",
    "config_dir": "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/configs/ray13",
}
for setting_name, setting_path in ray13_dirs.items():
    setattr(eu.settings, setting_name, setting_path)

# Figures are saved under this directory by the plotting cells below.
figure_dir = "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/figures/ray13"

# Set EUGENe's verbosity to the ERROR logging level.
eu.settings.verbosity = logging.ERROR

# Load the test `SeqData`

In [3]:
# Read the test-set SeqData that already holds the model predictions.
test_h5sd_path = os.path.join(eu.settings.output_dir, "norm_test_predictions.h5sd")
sdata_test = eu.dl.read_h5sd(test_h5sd_path)

# Target columns are the annotation columns whose name contains "RNCMPT"
# but none of "ST", "MT" or "kipoi" (e.g. not "RNCMPT00001_predictions_MT").
annot_columns = sdata_test.seqs_annot.columns
target_mask = annot_columns.str.contains("RNCMPT") & ~annot_columns.str.contains("ST|MT|kipoi")
target_cols = annot_columns[target_mask]

# Peek at the first few annotation columns.
annot_columns[:5]

Index(['Probe_Set', 'RNCMPT00001', 'RNCMPT00001_predictions_MT',
       'RNCMPT00001_predictions_ST', 'RNCMPT00002'],
      dtype='object')

# Feature Attribution

## Single task models

In [4]:
# Compute InputXGradient attributions on the test set for the first three
# targets, each with its own single-task DeepBind model.
for target_col in target_cols[:3]:
    print(f"Testing DeepBind SingleTask model on {target_col}")

    # Use the first file found in this target's checkpoint directory.
    checkpoint_pattern = os.path.join(eu.settings.logging_dir, "DeepBind_ST", target_col, "checkpoints", "*")
    model_file = glob.glob(checkpoint_pattern)[0]
    model = eu.models.DeepBind.load_from_checkpoint(model_file)

    # The suffix tags the result per target and model type; the plotting cell
    # later looks it up as f"InputXGradient_imps_{target_col}_ST".
    attribution_kwargs = dict(saliency_method="InputXGradient", suffix=f"_{target_col}_ST")
    eu.interpret.feature_attribution(model, sdata_test, **attribution_kwargs)

Testing DeepBind SingleTask model on RNCMPT00001
No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/945 [00:00<?, ?it/s]

Testing DeepBind SingleTask model on RNCMPT00002
No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/945 [00:00<?, ?it/s]

Testing DeepBind SingleTask model on RNCMPT00003
No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/945 [00:00<?, ?it/s]

## Multi-task

In [5]:
# Load the multi-task DeepBind checkpoint once (first file found in the
# versioned checkpoint directory); the same model is reused for every target.
version = 0
model_file = glob.glob(os.path.join(eu.settings.logging_dir, "DeepBind_MT", f"v{version}", "checkpoints", "*"))[0]
model = eu.models.DeepBind.load_from_checkpoint(model_file)

# Compute InputXGradient attributions for the first three targets.
for i, target_col in enumerate(target_cols[:3]):
    # Report the target that is actually being processed in this iteration,
    # mirroring the per-target message of the single-task cell.
    print(f"Testing DeepBind MultiTask model on {target_col}")
    eu.interpret.feature_attribution(
        model,
        sdata_test,
        saliency_method="InputXGradient",
        # NOTE(review): assumes output index i of the multi-task model
        # corresponds to target_cols[i] — confirm against the training setup.
        target=i,
        suffix=f"_{target_col}_MT"
    )

Testing DeepBind MultiTask model on RNCMPT00001
No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/945 [00:00<?, ?it/s]

No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/945 [00:00<?, ?it/s]

No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/945 [00:00<?, ?it/s]

# Plot feature attributions 

In [6]:
# For each of the first three targets, plot single-task vs. multi-task
# attribution tracks for the three highest- and three lowest-valued test
# sequences and save one PDF per (target, group).
feature_attr_dir = os.path.join(figure_dir, "feature_attr")
os.makedirs(feature_attr_dir, exist_ok=True)  # saving fails if the folder is missing

for target_col in target_cols[:3]:
    print(f"Generating feature attribution scores for DeepBind models on {target_col}")

    # Rank sequences once, highest value first. Missing values are dropped
    # before ranking: sort_values() places NaN last, so without this the
    # "bottom 3" could be sequences that have no measured value at all.
    ranked_index = sdata_test[target_col].dropna().sort_values(ascending=False).index

    # Attributions stored by the single-task and multi-task attribution cells.
    attribution_keys = [f"InputXGradient_imps_{target_col}_ST", f"InputXGradient_imps_{target_col}_MT"]

    for group_name, group_index in (("top3", ranked_index[:3]), ("bottom3", ranked_index[-3:])):
        eu.pl.lm_multiseq_track(
            sdata_test,
            seq_ids=group_index,
            uns_keys=attribution_keys,
            alphabet="RNA",
            width=18,
            height=6,
            ylabels=["DeepBind SingleTask", "DeepBind MultiTask"],
            save=os.path.join(feature_attr_dir, f"model_{group_name}_feature_attr_{target_col}_STandMT.pdf")
        )
        # Close the current figure so figures do not accumulate across iterations.
        plt.close()

Generating feature attribution scores for DeepBind models on RNCMPT00001


Importance values: 0it [00:00, ?it/s]

Importance values: 0it [00:00, ?it/s]

Generating feature attribution scores for DeepBind models on RNCMPT00002


Importance values: 0it [00:00, ?it/s]

Importance values: 0it [00:00, ?it/s]

Generating feature attribution scores for DeepBind models on RNCMPT00003


Importance values: 0it [00:00, ?it/s]

Importance values: 0it [00:00, ?it/s]

# Filter viz

## Single task 

In [4]:
# Build position frequency matrices (PFMs) for the filters of every
# single-task model (16 filters each); results are stored per target under
# the key f"pfms_{target_col}_ST".
for target_col in target_cols[:3]:
    print(f"Generating pfms for DeepBind models on {target_col}")

    # Use the first file found in this target's checkpoint directory.
    checkpoint_pattern = os.path.join(eu.settings.logging_dir, "DeepBind_ST", target_col, "checkpoints", "*")
    model_file = glob.glob(checkpoint_pattern)[0]
    model = eu.models.DeepBind.load_from_checkpoint(model_file)

    pfm_key = f"pfms_{target_col}_ST"
    eu.interpret.generate_pfms(model, sdata_test, key_name=pfm_key, alphabet="RNA")

Generating pfms for DeepBind models on RNCMPT00001
No transforms given, assuming just need to tensorize).


Getting maximial activating seqlets:   0%|          | 0/945 [00:00<?, ?it/s]

Getting PFMs from filters:   0%|          | 0/16 [00:00<?, ?it/s]

Generating pfms for DeepBind models on RNCMPT00002
No transforms given, assuming just need to tensorize).


Getting maximial activating seqlets:   0%|          | 0/945 [00:00<?, ?it/s]

Getting PFMs from filters:   0%|          | 0/16 [00:00<?, ?it/s]

Generating pfms for DeepBind models on RNCMPT00003
No transforms given, assuming just need to tensorize).


Getting maximial activating seqlets:   0%|          | 0/945 [00:00<?, ?it/s]

Getting PFMs from filters:   0%|          | 0/16 [00:00<?, ?it/s]

## Multitask

In [8]:
# Build PFMs for every filter of the multi-task model and store them under
# the key "pfms_MT".
version = 0
mt_checkpoint_pattern = os.path.join(eu.settings.logging_dir, "DeepBind_MT", f"v{version}", "checkpoints", "*")
# Use the first file found in the versioned checkpoint directory.
model_file = glob.glob(mt_checkpoint_pattern)[0]
model = eu.models.DeepBind.load_from_checkpoint(model_file)
eu.interpret.generate_pfms(model, sdata_test, key_name="pfms_MT", alphabet="RNA")

No transforms given, assuming just need to tensorize).


Getting maximial activating seqlets:   0%|          | 0/945 [00:00<?, ?it/s]

Getting PFMs from filters:   0%|          | 0/512 [00:00<?, ?it/s]

# Plot filter viz

In [5]:
# Draw the 16 filters of each single-task DeepBind model on a 4x4 grid and
# save one PDF per target.
for target_col in target_cols[:3]:
    print(f"Plotting filter visualizations for DeepBind models on {target_col}")
    st_viz_path = os.path.join(figure_dir, "filter_viz", f"model_filters_viz_{target_col}_ST.pdf")
    # Panel titles use the same 0-based numbering as the filter ids.
    st_titles = [f"filter {filter_idx}" for filter_idx in range(16)]
    eu.pl.lm_multifilter_viz(
        sdata_test,
        filter_ids=range(16),
        uns_key=f"pfms_{target_col}_ST",
        titles=st_titles,
        num_rows=4,
        num_cols=4,
        save=st_viz_path
    )
    # Close the current figure before moving on to the next target.
    plt.close()

Plotting filter visualizations for DeepBind models on RNCMPT00001
Plotting filter visualizations for DeepBind models on RNCMPT00002
Plotting filter visualizations for DeepBind models on RNCMPT00003


In [13]:
# Visualizations for all filters of the multitask model, saved as one PDF
# per page of 32 filters (8 rows x 4 columns).
filters_per_page = 32
mt_filter_ids = list(sdata_test.uns["pfms_MT"].keys())

filter_viz_dir = os.path.join(figure_dir, "filter_viz")
os.makedirs(filter_viz_dir, exist_ok=True)  # saving fails if the folder is missing

# Step through the stored filters page by page; the number of pages follows
# from how many PFMs are present rather than from a hard-coded count.
for start_filter in range(0, len(mt_filter_ids), filters_per_page):
    page_filter_ids = mt_filter_ids[start_filter:start_filter + filters_per_page]
    end_filter = start_filter + len(page_filter_ids)
    print(f"Plotting and saving filters {start_filter+1}-{end_filter}")
    eu.pl.lm_multifilter_viz(
        sdata_test,
        filter_ids=page_filter_ids,
        num_rows=8,
        num_cols=4,
        uns_key="pfms_MT",
        # Title every panel with the id of the filter it actually shows, so
        # each page is not labelled "filter 0" .. "filter 31".
        titles=[f"filter {filter_id}" for filter_id in page_filter_ids],
        save=os.path.join(filter_viz_dir, f"model_filters{start_filter+1}-{end_filter}_viz_MT.pdf")
    )
    # Close the current figure so figures do not accumulate across pages.
    plt.close()

Plotting and saving filters 1-32
Plotting and saving filters 33-64
Plotting and saving filters 65-96
Plotting and saving filters 97-128
Plotting and saving filters 129-160
Plotting and saving filters 161-192
Plotting and saving filters 193-224
Plotting and saving filters 225-256
Plotting and saving filters 257-288
Plotting and saving filters 289-320
Plotting and saving filters 321-352
Plotting and saving filters 353-384
Plotting and saving filters 385-416
Plotting and saving filters 417-448
Plotting and saving filters 449-480
Plotting and saving filters 481-512


# Save

In [9]:
# Write the test SeqData, which now also carries the interpretation results
# stored by the cells above, to the EUGENe output directory.
interpreted_h5sd_path = os.path.join(eu.settings.output_dir, "norm_test_predictions_and_intepretations.h5sd")
sdata_test.write_h5sd(interpreted_h5sd_path)

---

# Scratch