# Kipoi DeepSTARR model evaluation
Adam Klie (last updated: *09/20/2023*)
***
Notebook for evaluating the DeepSTARR model on the test set.

In [None]:
# Load the required packages
import os
import sys
import pyfaidx
import kipoi
import kipoiseq
import kipoi_interpret

# Numerical and tabular data handling
import numpy as np
import pandas as pd

# Plotting (density-coloured scatter plots of the predictions)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde

# Add metrics to the plots
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr

In [None]:
# Add the bin directory of the current python environment to PATH.
# The lookup tolerates an unset PATH, and the membership check keeps this cell
# idempotent: re-running it does not append the same directory again.
bin_dir = os.path.dirname(sys.executable)
current_path = os.environ.get("PATH", "")
if bin_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = f"{current_path}{os.pathsep}{bin_dir}" if current_path else bin_dir

# Load the Kipoi model

In [None]:
# Fetch the model registered under this name via kipoi.get_model
model_name = "DeepSTARR"
deepstarr = kipoi.get_model(model_name)

# Load the test set

In [None]:
# Open the test-set FASTA file with pyfaidx
test_fasta_path = "/cellar/users/aklie/projects/ML4GLand/use_cases/deAlmeida22/data/Sequences_Test.fa"
test_fasta = pyfaidx.Fasta(test_fasta_path)

In [None]:
# Convert every record in the FASTA file to a plain Python string
record_names = test_fasta.keys()
seqs = [str(test_fasta[name]) for name in record_names]

In [None]:
# One-hot encode each sequence and gather the encodings into a single array
one_hot = kipoiseq.transforms.functional.one_hot
ohe_seqs = np.array([one_hot(sequence) for sequence in seqs])

In [None]:
# Read the tab-separated activity table and keep only the two target columns
targets_path = "/cellar/users/aklie/projects/ML4GLand/use_cases/deAlmeida22/data/Sequences_activity_Test.txt"
target_columns = ['Dev_log2_enrichment_scaled', 'Hk_log2_enrichment_scaled']
targets = pd.read_csv(targets_path, sep="\t")[target_columns]

# Evaluate test set predictions

In [None]:
# Run the underlying model on the one-hot encoded sequences, in batches
batch_size = 32
preds = deepstarr.model.predict(ohe_seqs, batch_size=batch_size)

In [None]:
# Store the predictions next to the measured values in the targets dataframe.
# NOTE(review): assumes preds[0] holds the Dev predictions and preds[1] the Hk
# predictions -- confirm against the model's output order.
dev_preds, hk_preds = preds[0], preds[1]
targets['Dev_log2_enrichment_scaled_pred'] = dev_preds
targets['Hk_log2_enrichment_scaled_pred'] = hk_preds

# Plot results

In [None]:
def _density_scatter(panel, observed, predicted):
    """Scatter ``predicted`` against ``observed`` on ``panel``, coloured by point density.

    The inputs are converted to NumPy arrays first so that the density-based
    reordering is purely positional and does not depend on pandas index labels.
    """
    observed = np.asarray(observed)
    predicted = np.asarray(predicted)

    # Get point densities
    xy = np.vstack([observed, predicted])
    density = gaussian_kde(xy)(xy)

    # Sort the points by density, so that the densest points are plotted last
    order = density.argsort()
    panel.scatter(observed[order], predicted[order], c=density[order])


def _add_identity_line(panel):
    """Draw the y = x line across ``panel`` and give both axes the same range.

    Joining (xmin, ymin) to (xmax, ymax) only yields y = x when the two axes
    happen to share limits, so a common range is computed explicitly.
    """
    x_low, x_high = panel.get_xlim()
    y_low, y_high = panel.get_ylim()
    low, high = min(x_low, y_low), max(x_high, y_high)
    panel.plot([low, high], [low, high], ls="--", c=".3")
    # Pin both axes to the shared range so the line runs corner to corner
    panel.set_xlim(low, high)
    panel.set_ylim(low, high)


dev_col = 'Dev_log2_enrichment_scaled'
hk_col = 'Hk_log2_enrichment_scaled'
dev_pred_col = f"{dev_col}_pred"
hk_pred_col = f"{hk_col}_pred"

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# One panel per task: measured values on x, predictions on y
_density_scatter(ax[0], targets[dev_col], targets[dev_pred_col])
_density_scatter(ax[1], targets[hk_col], targets[hk_pred_col])

# Label the axes with the columns they show
ax[0].set_xlabel(dev_col)
ax[0].set_ylabel(dev_pred_col)
ax[1].set_xlabel(hk_col)
ax[1].set_ylabel(hk_pred_col)

# Compute the metrics (measured values first, predictions second)
r2_dev = r2_score(targets[dev_col], targets[dev_pred_col])
r2_hk = r2_score(targets[hk_col], targets[hk_pred_col])

pearson_dev = pearsonr(targets[dev_col], targets[dev_pred_col])
pearson_hk = pearsonr(targets[hk_col], targets[hk_pred_col])

spearman_dev = spearmanr(targets[dev_col], targets[dev_pred_col])
spearman_hk = spearmanr(targets[hk_col], targets[hk_pred_col])

# Index [0] of each correlation result is the coefficient itself
ax[0].set_title(f"Dev R2: {r2_dev:.2f}\nPearson: {pearson_dev[0]:.2f}\nSpearman: {spearman_dev[0]:.2f}")
ax[1].set_title(f"Hk R2: {r2_hk:.2f}\nPearson: {pearson_hk[0]:.2f}\nSpearman: {spearman_hk[0]:.2f}")

# Add the y = x reference line to both plots
_add_identity_line(ax[0])
_add_identity_line(ax[1])

plt.tight_layout()
plt.show()

# Interpretation
Kipoi's attribution functionality currently fails with this model and dataset. This seems to be due to an incompatibility with Keras:

```python
from kipoi_interpret.importance_scores.gradient import GradientXInput
explainer = GradientXInput(deepstarr)
val = explainer.score(ohe_seqs[:10])
# -> AttributeError: module 'keras.engine' has no attribute 'training_utils'
```

However, the following code, which uses a different Kipoi model (DeepBind), seems to work:
    
```python
from kipoi_interpret.importance_scores.gradient import GradientXInput
model = kipoi.get_model("DeepBind/Homo_sapiens/TF/D00765.001_ChIP-seq_GATA1")
seq = "ATGGGCCAGCACACAGACCAGCACGTTGCCCAGGAGCTGTGGGAGGAAGATAAGAGGTATGAACATGATTAGCAAAAGGGCCTAGCTTGGACTCAGAATAA"
seqa = kipoiseq.transforms.functional.one_hot(seq) # one-hot-encode the sequence
grxinp = GradientXInput(model)
val = grxinp.score(seqa)[0]
```

This merits further investigation.

# DONE!

---