# Evaluate DeepMEL model using EUGENe on `pbmc-granulocyte-sorted-3k_10x-Multiome`
Adam Klie (last updated: *09/20/2023*)
***
This notebook shows how to evaluate a DeepMEL model using EUGENe on the `pbmc-granulocyte-sorted-3k_10x-Multiome` dataset.

# Set-up

In [None]:
# Load necessary packages
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seqdata as sd
from eugene import models
from eugene.models.zoo import DeepMEL
from eugene import plot as pl
sys.path.append("/Users/adamklie/Desktop/research/projects/ML4GLand/use_cases/DeepMEL/scripts")
from eval import shuffle_label, calculate_roc_pr, evaluate_model

%matplotlib inline

In [None]:
# Set-up the paths to data and trained models (TODO: change to your own paths)
dataset_name = "pbmc-granulocyte-sorted-3k_10x-Multiome"
input_dir = '/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/pbmc-granulocyte-sorted-3k_10x-Multiome/processed'
# Root directory of trained models. The checkpoint-loading cell joins it with
# `dataset_name`, but it was never defined, so a fresh-kernel run hit a NameError.
# TODO: placeholder derived from the training-log path used in the loss-curve
# cell -- confirm it points at the directory that contains your checkpoints.
models_dir = '/cellar/users/aklie/projects/ML4GLand/collabs/er_stress_regulation/models'

# Loss curve

In [None]:
# Directory with the logs of the training run to summarise
# (TODO: change to your own path).
training_log_dir = "/cellar/users/aklie/projects/ML4GLand/collabs/er_stress_regulation/models/multiome_cells_all_peaks/multiome_cells_all_peaks.DeepMEL.revision/v0"
pl.training_summary(training_log_dir, metric="auroc")

# Load some data

In [None]:
# Locate the train/validation store and the held-out test store
train_zarr_path = os.path.join(input_dir, f"{dataset_name}.train.zarr")
test_zarr_path = os.path.join(input_dir, f"{dataset_name}.test.zarr")

# The training store holds both training and validation sequences;
# the `train_val` flag is used to separate them.
sdata = sd.open_zarr(train_zarr_path)
is_train = (sdata["train_val"] == True).compute()  # noqa: E712
is_val = (sdata["train_val"] == False).compute()  # noqa: E712
train_sdata = sdata.sel(_sequence=is_train)
val_sdata = sdata.sel(_sequence=is_val)

test_sdata = sd.open_zarr(test_zarr_path)

In [None]:
# Fraction of training sequences labelled 1, per topic column
train_topics = train_sdata["topics"].values
train_topics.sum(axis=0) / train_topics.shape[0]

In [None]:
# Fraction of validation sequences labelled 1, per topic column
val_topics = val_sdata["topics"].values
val_topics.sum(axis=0) / val_topics.shape[0]

In [None]:
# Fraction of test sequences labelled 1, per topic column
test_topics = test_sdata["topics"].values
test_topics.sum(axis=0) / test_topics.shape[0]

# Load a trained model

In [None]:
# Architecture hyperparameters for the DeepMEL network.
# NOTE(review): presumably these must match the settings the checkpoint was
# trained with -- confirm against the training configuration.
arch_params = {
    "input_len": 500,
    "output_dim": 37,
    "conv_kwargs": {"conv_channels": [1024]},
}
arch = DeepMEL(**arch_params)

In [None]:
# Path to the trained checkpoint.
# NOTE: `models_dir` must be defined in an earlier cell before this one runs.
checkpoint_path = os.path.join(
    models_dir,
    dataset_name,
    "multiome_cells_all_peaks.DeepMEL.revision/v0/checkpoints/epoch=15-step=16080.ckpt",
)
# Restore the weights into a SequenceModule wrapping the architecture built above
model = models.SequenceModule.load_from_checkpoint(checkpoint_path, arch=arch)

In [None]:
# TODO: change to your own path
# NOTE(review): this cell only displays a path string rooted at `input_dir`;
# the checkpoint loaded in the previous cell is rooted at `models_dir` instead.
# Confirm which root is intended, or drop this cell if it is a leftover.
os.path.join(input_dir, dataset_name, "multiome_cells_all_peaks.DeepMEL.revision/v0/checkpoints/epoch=15-step=16080.ckpt")

# Evaluate per topic performance

In [None]:
# Predict on the training split; the one-hot array is passed with its
# dimensions ordered as (_sequence, _ohe, length).
train_preds = model.predict(
    train_sdata["ohe_seqs"].transpose("_sequence", "_ohe", "length").values,
    batch_size=512,
)

In [None]:
# Predict on the validation split; the one-hot array is passed with its
# dimensions ordered as (_sequence, _ohe, length).
val_preds = model.predict(
    val_sdata["ohe_seqs"].transpose("_sequence", "_ohe", "length").values,
    batch_size=512,
)

In [None]:
# Predict on the held-out test split; the one-hot array is passed with its
# dimensions ordered as (_sequence, _ohe, length).
test_preds = model.predict(
    test_sdata["ohe_seqs"].transpose("_sequence", "_ohe", "length").values,
    batch_size=512,
)

In [None]:
# Sanity check: display the shape of the prediction output for each split
train_preds.shape, val_preds.shape, test_preds.shape

In [None]:
# Convert the predictions to NumPy arrays on the host.
# The cell overwrites its own inputs, so it is guarded: objects exposing
# `.cpu()` are converted, while anything else (e.g. arrays produced by a
# previous run of this cell) passes through `np.asarray` instead of raising
# AttributeError on re-execution.
train_preds, val_preds, test_preds = (
    preds.cpu().numpy() if hasattr(preds, "cpu") else np.asarray(preds)
    for preds in (train_preds, val_preds, test_preds)
)

In [None]:
# Ground-truth topic labels for each split, as in-memory arrays
train_trues, val_trues, test_trues = (
    split_sdata["topics"].values
    for split_sdata in (train_sdata, val_sdata, test_sdata)
)

In [None]:
# Sanity check: display prediction and label shapes side by side for each split
train_preds.shape, train_trues.shape, val_preds.shape, val_trues.shape, test_preds.shape, test_trues.shape

In [None]:
print('calculate roc and pr...')
# Scores and labels for every split, plus a "shuffle" baseline that pairs a
# copy of the training scores with labels passed through `shuffle_label`.
# NOTE(review): `shuffle_label` is presumably random -- confirm whether a seed
# should be fixed for reproducibility.
roc_pr_dict = {
    "train": {"score": train_preds, "label": train_trues},
    "val": {"score": val_preds, "label": val_trues},
    "test": {"score": test_preds, "label": test_trues},
    "shuffle": {
        "score": np.array(train_preds, copy=True),
        "label": shuffle_label(np.array(train_trues, copy=True)),
    },
}

In [None]:
# Attach the ROC / PR metrics to each split's entry
for split_name in ("train", "val", "test", "shuffle"):
    split_entry = roc_pr_dict[split_name]
    split_entry["roc_pr"] = calculate_roc_pr(split_entry["score"], split_entry["label"])

In [None]:
# Training split: first row of the transposed metrics (plotted as auROC in the figure below)
roc_pr_dict["train"]["roc_pr"].T[0]

In [None]:
# Validation split: first row of the transposed metrics (plotted as auROC in the figure below)
roc_pr_dict["val"]["roc_pr"].T[0]

In [None]:
# Test split: first row of the transposed metrics (plotted as auROC in the figure below)
roc_pr_dict["test"]["roc_pr"].T[0]

In [None]:
# Per-topic scatter of auROC (top panel) and auPR (bottom panel) for every
# split, with the shuffled-label baseline in gray.
# (split key, marker colour, legend label) -- drawn in the same order on both panels
split_styles = [
    ("train", "red", "TRAIN"),
    ("val", "orange", "VAL"),
    ("test", "blue", "TEST"),
    ("shuffle", "gray", "SHUFFLED"),
]

# Take the number of topics from the metrics instead of hard-coding 37, so the
# figure stays correct if the model's output dimension changes.
n_topics = len(roc_pr_dict["train"]["roc_pr"].T[0])
topic_positions = list(range(n_topics))
topic_labels = list(range(1, n_topics + 1))  # topics are displayed 1-based

fig, axes = plt.subplots(2, 1, figsize=(25, 10))
for ax, (metric_row, metric_name) in zip(axes, enumerate(["auROC", "auPR"])):
    for split_name, color, label in split_styles:
        ax.scatter(
            topic_positions,
            roc_pr_dict[split_name]["roc_pr"].T[metric_row],
            color=color,
            label=label,
        )
    ax.set_ylabel(metric_name)
    ax.set_ylim([0, 1])
    # Explicit Axes calls rather than plt.xticks, which acts on the "current" axes
    ax.set_xticks(topic_positions)
    ax.set_xticklabels(topic_labels)

# One legend is enough: colours are shared between the two panels
axes[0].legend()
axes[-1].set_xlabel("Topic");

In [None]:
# Run the project helper `evaluate_model` on the training split (labels first, predictions second)
evaluate_model(train_trues, train_preds)

In [None]:
# Run the project helper `evaluate_model` on the validation split (labels first, predictions second)
evaluate_model(val_trues, val_preds)

In [None]:
# Run the project helper `evaluate_model` on the held-out test split (labels first, predictions second)
evaluate_model(test_trues, test_preds)

# DONE!

---