# Testing EUGENE prediction

**Authorship:**
Adam Klie, *03/19/2022*
***
**Description:**
Notebook for testing predictions with EUGENE architectures

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Config for EUGENES</li></b>
    </ul>
</div>

In [1]:
import os
import numpy as np
import pandas as pd
import torch

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Predictions with a EUGENE model
<div class="alert alert-info" role="alert">
  <b>Predict with dsEUGENE models</b>
</div>

In [2]:
BATCH_SIZE = 512
DATA_TYPE = "tsv"
NUM_WORKERS = 0
NUM_SEQS = 100000
SEQ_LEN = 66

## Load data

In [3]:
from torchvision import transforms
from eugene.utils.seq_transforms import ReverseComplement, Augment, OneHotEncode, ToTensor
from eugene.dataloading.SeqDataModule import SeqDataModule

In [4]:
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), 
    ReverseComplement(ohe_encoded=False), 
    OneHotEncode(), 
    ToTensor(transpose=True)
])

In [5]:
DATA_DIR = "test_{}seqs_{}/".format(NUM_SEQS, SEQ_LEN)
mod = SeqDataModule(seq_file=DATA_DIR + "test_seqs.tsv",
                    transform=data_transform,
                    num_workers=4,
                    batch_size=BATCH_SIZE,
                    test= True,
                    load_kwargs=dict(seq_col="SEQ", name_col="NAME", target_col="LABEL"))

## Instantiate EUGENE architecture: dsEUGENE

In [6]:
from eugene.models.cnn import CNN

In [9]:
CKT_PTH  = "/cellar/users/aklie/projects/EUGENE/results/simple/classification/ssCNN/2022_04_23_NPY_Baseline/checkpoints/epoch=13-step=2435.ckpt"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
eugene = CNN.load_from_checkpoint(
    checkpoint_path=CKT_PTH,
    map_location=device,
)



## Testing and predictions with PyTorch Lightning

In [10]:
import pytorch_lightning as pl
from eugene.utils.custom_callbacks import PredictionWriter

In [11]:
trainer = pl.Trainer(gpus=1, callbacks=PredictionWriter(DATA_DIR, write_interval="epoch", prefix="test_10000seqs_66_"))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [12]:
#trainer.test(model=eugene, datamodule=mod)
preds = trainer.predict(model=eugene, datamodule=mod)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]



ValueError: zero-dimensional arrays cannot be concatenated

In [13]:
pd.read_csv(os.path.join(DATA_DIR, "test_10000seqs_66_predictions.tsv"), sep="\t")

Unnamed: 0,NAME,PREDICTION,TARGET
0,seq001,1.508530,1.0
1,seq002,1.162973,0.0
2,seq003,1.486226,0.0
3,seq004,0.534772,0.0
4,seq005,-0.000528,0.0
...,...,...,...
99995,seq99996,2.098904,1.0
99996,seq99997,0.900767,1.0
99997,seq99998,1.445276,0.0
99998,seq99999,0.495785,0.0


# References