# Kopp et al. 2021 Evaluation
**Authorship:**
Adam Klie, *08/12/2022*
***
**Description:**
This notebook performs a brief evaluation of the trained models on the Kopp et al. (2021) dataset.
***

In [9]:
# Standard library and numerical stack
import os
import sys
import glob
import torch
import numpy as np

# EUGENe and the submodules used in this notebook
import eugene as eu
from eugene import dataload as dl
from eugene import models
from eugene import evaluate
from eugene import settings

# Companion package providing the SeqData containers
import seqdata as sd

# Plotting backend (only configured here, no figures are drawn in this cell)
import matplotlib

# Point EUGENe at the dataset, output, logging and config directories for this analysis
settings.dataset_dir = "/cellar/users/aklie/data/eugene/revision/kopp21"
settings.output_dir = "/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/output/revision/kopp21"
settings.logging_dir = "/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/logs/revision/kopp21"
settings.config_dir = "/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/configs/kopp21"

# Font type 42 for PDF/PS export so saved figures can be edited in Illustrator
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Report the versions of the key packages, one per line
print(
    f"Python version: {sys.version}",
    f"NumPy version: {np.__version__}",
    f"Eugene version: {eu.__version__}",
    f"SeqData version: {sd.__version__}",
    f"PyTorch version: {torch.__version__}",
    sep="\n",
)

Python version: 3.9.16 | packaged by conda-forge | (main, Feb  1 2023, 21:39:03) 
[GCC 11.3.0]
NumPy version: 1.23.5
Eugene version: 0.0.8
SeqData version: 0.0.1
PyTorch version: 2.0.0


# Load in the test `SeqData`(s)

In [10]:
# Open the test-set SeqData from the zarr store in the configured dataset directory
test_zarr_path = os.path.join(settings.dataset_dir, 'kopp21_test.zarr')
sdata = sd.open_zarr(test_zarr_path)

# Bare expression so the notebook renders the dataset summary
sdata

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 502.41 kiB 251.21 kiB Shape (64309,) (32155,) Dask graph 2 chunks in 2 graph layers Data type object numpy.ndarray",64309  1,

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 502.41 kiB 251.21 kiB Shape (64309,) (32155,) Dask graph 2 chunks in 2 graph layers Data type int64 numpy.ndarray",64309  1,

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 502.41 kiB 251.21 kiB Shape (64309,) (32155,) Dask graph 2 chunks in 2 graph layers Data type int64 numpy.ndarray",64309  1,

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,61.33 MiB,0.97 MiB
Shape,"(64309, 1, 500)","(8039, 1, 63)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray
"Array Chunk Bytes 61.33 MiB 0.97 MiB Shape (64309, 1, 500) (8039, 1, 63) Dask graph 64 chunks in 2 graph layers Data type uint16 numpy.ndarray",500  1  64309,

Unnamed: 0,Array,Chunk
Bytes,61.33 MiB,0.97 MiB
Shape,"(64309, 1, 500)","(8039, 1, 63)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,122.66 MiB,0.96 MiB
Shape,"(64309, 500, 4)","(8039, 125, 1)"
Dask graph,128 chunks in 2 graph layers,128 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 122.66 MiB 0.96 MiB Shape (64309, 500, 4) (8039, 125, 1) Dask graph 128 chunks in 2 graph layers Data type uint8 numpy.ndarray",4  500  64309,

Unnamed: 0,Array,Chunk
Bytes,122.66 MiB,0.96 MiB
Shape,"(64309, 500, 4)","(8039, 125, 1)"
Dask graph,128 chunks in 2 graph layers,128 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.66 MiB,0.96 MiB
Shape,"(64309, 500)","(8039, 125)"
Dask graph,32 chunks in 2 graph layers,32 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 30.66 MiB 0.96 MiB Shape (64309, 500) (8039, 125) Dask graph 32 chunks in 2 graph layers Data type |S1 numpy.ndarray",500  64309,

Unnamed: 0,Array,Chunk
Bytes,30.66 MiB,0.96 MiB
Shape,"(64309, 500)","(8039, 125)"
Dask graph,32 chunks in 2 graph layers,32 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 502.41 kiB 251.21 kiB Shape (64309,) (32155,) Dask graph 2 chunks in 2 graph layers Data type object numpy.ndarray",64309  1,

Unnamed: 0,Array,Chunk
Bytes,502.41 kiB,251.21 kiB
Shape,"(64309,)","(32155,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,62.80 kiB,62.80 kiB
Shape,"(64309,)","(64309,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 62.80 kiB 62.80 kiB Shape (64309,) (64309,) Dask graph 1 chunks in 2 graph layers Data type uint8 numpy.ndarray",64309  1,

Unnamed: 0,Array,Chunk
Bytes,62.80 kiB,62.80 kiB
Shape,"(64309,)","(64309,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,62.80 kiB,62.80 kiB
Shape,"(64309,)","(64309,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 62.80 kiB 62.80 kiB Shape (64309,) (64309,) Dask graph 1 chunks in 2 graph layers Data type bool numpy.ndarray",64309  1,

Unnamed: 0,Array,Chunk
Bytes,62.80 kiB,62.80 kiB
Shape,"(64309,)","(64309,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray


# Get test set predictions for each model

In [13]:
from itertools import product

# Model configuration files to evaluate and the number of trained trials per model
configs = ["fcn.yaml", "cnn.yaml", "hybrid.yaml", "kopp21_cnn.yaml"]
trials = 5

# The transforms are the same for every model/trial, so build them once
# instead of re-creating the dict on each loop iteration.
transforms = {
    "target": lambda x: torch.tensor(x, dtype=torch.float32),
    # Cast to float32 and swap axes 1 and 2 of the one-hot batch
    # (presumably (batch, length, alphabet) -> (batch, alphabet, length) -- TODO confirm)
    "ohe_seq": lambda x: torch.tensor(x, dtype=torch.float32).swapaxes(1, 2),
}

for config, trial in product(configs, range(1, trials + 1)):
    model_name = config.split('.')[0]
    print(model_name)

    # Locate the checkpoint for this model/trial. Sorting makes the choice
    # deterministic when several files are present (glob order is unspecified),
    # and an explicit error replaces the opaque IndexError raised by `[0]`
    # when the directory is empty or missing.
    checkpoint_dir = os.path.join(settings.logging_dir, model_name, f"trial_{trial}", "checkpoints")
    checkpoint_files = sorted(glob.glob(os.path.join(checkpoint_dir, "*")))
    if not checkpoint_files:
        raise FileNotFoundError(f"No checkpoint files found in {checkpoint_dir}")
    model_file = checkpoint_files[0]

    # Build the model from its config, then load the checkpoint using that architecture
    model = models.load_config(config_path=config)
    best_model = models.SequenceModule.load_from_checkpoint(model_file, arch=model.arch)

    # Run prediction on the test SeqData; results are keyed by the per-model/trial prefix
    evaluate.predictions_sequence_module(
        model=best_model,
        sdata=sdata,
        seq_key="ohe_seq",
        target_keys="target",
        gpus=1,
        batch_size=2048,
        num_workers=4,
        prefetch_factor=2,
        in_memory=True,
        transforms=transforms,
        file_label="test",
        name=model_name,
        version=f"trial_{trial}",
        prefix=f"{model_name}_trial_{trial}_"
    )

# Save the predictions
# Make sure the destination exists so the writes below cannot fail on a
# missing directory after all predictions have already been computed.
os.makedirs(settings.output_dir, exist_ok=True)

# Every data variable whose name contains "predictions" is exported
pred_keys = [k for k in sdata.data_vars if "predictions" in k]
(
    sdata[['chrom', 'chromStart', 'chromEnd', 'target', *pred_keys]]
    .to_dataframe()
    .to_csv(os.path.join(settings.output_dir, "test_predictions_all.tsv"), sep="\t", index=False)
)
sd.to_zarr(sdata, os.path.join(settings.output_dir, "test_predictions_all.zarr"), mode="w", load_first=True)

fcn


[rank: 0] Global seed set to 1


Loading ohe_seq and ['target'] into memory


  pkg_resources.require(self.requirement)
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 2


fcn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 3


fcn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 4


fcn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 5


fcn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

cnn


[rank: 0] Global seed set to 1
  rank_zero_warn(


Loading ohe_seq and ['target'] into memory


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 2


cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

cnn


[rank: 0] Global seed set to 3
  rank_zero_warn(


Loading ohe_seq and ['target'] into memory


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 4


cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 5


cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 1


hybrid
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 2


hybrid
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 3


hybrid
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

hybrid


[rank: 0] Global seed set to 4
  rank_zero_warn(


Loading ohe_seq and ['target'] into memory


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 5


hybrid
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 1


kopp21_cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 2


kopp21_cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 3


kopp21_cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 4


kopp21_cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[rank: 0] Global seed set to 5


kopp21_cnn
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

# DONE!

---

# Scratch