# Testing EUGENE prediction

**Authorship:**
Adam Klie, *03/19/2022*
***
**Description:**
Notebook for testing predictions with EUGENE architectures

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Config for EUGENES</li></b>
    </ul>
</div>

In [2]:
import os
import numpy as np
import pandas as pd
import torch

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Predictions with a EUGENE model
<div class="alert alert-info" role="alert">
  <b>Predict with dsEUGENE models</b>
</div>

In [4]:
BATCH_SIZE = 512
DATA_TYPE = "tsv"
NUM_WORKERS = 0
NUM_SEQS = 1000
SEQ_LEN = 66

## Load data

In [5]:
from torchvision import transforms
from eugene.utils.seq_transforms import ReverseComplement, Augment, OneHotEncode, ToTensor
from eugene.dataloading.SeqDataModule import SeqDataModule

In [6]:
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), 
    ReverseComplement(ohe_encoded=False), 
    OneHotEncode(), 
    ToTensor(transpose=True)
])

In [7]:
DATA_DIR = "test_{}seqs_{}/".format(NUM_SEQS, SEQ_LEN)
mod = SeqDataModule(seq_file=DATA_DIR + "test_seqs.tsv",
                    transform=data_transform,
                    num_workers=4,
                    batch_size=BATCH_SIZE,
                    test= True,
                    load_kwargs=dict(seq_col="SEQ", name_col="NAME", target_col="LABEL"))

## Instantiate EUGENE architecture: dsEUGENE

In [8]:
from eugene.models.cnn import CNN

In [11]:
CKT_PTH  = "/cellar/users/aklie/projects/EUGENE/results/simple/binary_classification/sscnn/2022_04_23_NPY_Baseline/checkpoints/epoch=13-step=2435.ckpt"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
eugene = CNN.load_from_checkpoint(
    checkpoint_path=CKT_PTH,
    map_location=device,
)



## Testing and predictions with PyTorch Lightning

In [12]:
import pytorch_lightning as pl
from eugene.utils.custom_callbacks import PredictionWriter

In [18]:
trainer = pl.Trainer(gpus=1, logger=False, callbacks=PredictionWriter(DATA_DIR + "test_1000seqs_66_", write_interval="epoch"))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [19]:
preds = trainer.predict(model=eugene, datamodule=mod)

  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

  f"DataModule.{name} has already been called, so it will not be called again. "


# Predictions on the split

In [35]:
mod.test

False

In [34]:
pl.utilities.seed.seed_everything(13)

Global seed set to 13


13

In [20]:
DATA_DIR = "test_{}seqs_{}/".format(NUM_SEQS, SEQ_LEN)
mod = SeqDataModule(seq_file=DATA_DIR + "test_seqs.tsv",
                    transform=data_transform,
                    num_workers=4,
                    batch_size=BATCH_SIZE,
                    load_kwargs=dict(seq_col="SEQ", name_col="NAME", target_col="LABEL"))

In [21]:
mod.setup()

In [23]:
train_dataloader = mod.train_dataloader()

In [24]:
val_dataloader = mod.val_dataloader()

In [26]:
trainer = pl.Trainer(gpus=1, logger=False)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [45]:
preds = trainer.predict(model=eugene, dataloaders=train_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [50]:
preds = np.concatenate(preds)

In [52]:
preds.shape

(900, 3)

In [54]:
pred_df = pd.DataFrame(data=preds, columns=["NAME", "PREDICTION", "TARGET"])

In [55]:
pred_df

Unnamed: 0,NAME,PREDICTION,TARGET
0,seq247,-0.7930637,1.0
1,seq434,-1.6768111,0.0
2,seq260,-0.6487483,0.0
3,seq191,-0.90770334,0.0
4,seq519,-1.9701676,1.0
...,...,...,...
895,seq362,-0.65784943,1.0
896,seq328,-0.69132656,1.0
897,seq838,1.0923125,0.0
898,seq558,-1.0055817,0.0


In [25]:
trainer.predict?

[0;31mSignature:[0m
[0mtrainer[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'pl.LightningModule'[0m[0;34m)[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdataloaders[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mdataloader[0m[0;34m.[0m[0mDataLoader[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mdataloader[0m[0;34m.[0m[0mDataLoader[0m[0;34m][0m[0;34m,[0m [0mpytorch_lightning[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdatamodule[0m[0;34m.[0m[0mLightningDataModule[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdatamodule[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpytorch_li

In [16]:
pd.read_csv(os.path.join(DATA_DIR, "test_1000seqs_66_predictions.tsv"), sep="\t")

FileNotFoundError: [Errno 2] No such file or directory: 'test_1000seqs_66/test_1000seqs_66_predictions.tsv'

# References