In [1]:
# you have to make the module discoverable to load the classes below:

import sys
import os
from pathlib import Path

# Work from the project root so that the relative paths used further down
# (e.g. "data/processed/...") resolve.
# The guard keeps the cell safe to re-run: once the working directory is already
# `nucleotran`, the relative `chdir` must not be applied a second time.
if Path.cwd().name != 'nucleotran':
    os.chdir('../../../nucleotran/')

# Register the source directory as an absolute path, and only once, so that
# imports neither depend on later working-directory changes nor pile up
# duplicate `sys.path` entries when the cell is executed again.
src_dir = str(Path('src').resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import torch
import torch.utils.data as torch_data
import pytorch_lightning as pl

from models.models import IEAquaticDilated
from dataloading.dataloaders import LitCoverageDatasetHDF5
from util.load_config import config

First, load your model from a checkpoint path

In [3]:
checkpoint_dir = Path('/dhc/groups/fglippert/dna_transformers_disentanglement/model_checkpoints/alex/IEAquaticDilated/sweep_b3036x2m/58zi0nkt')

# Collect the checkpoints in sorted order so the choice is deterministic when the
# directory holds more than one file, and fail with a descriptive error (instead
# of a bare StopIteration) when it holds none.
checkpoint_paths = sorted(checkpoint_dir.glob('*.ckpt'))
if not checkpoint_paths:
    raise FileNotFoundError(f'No *.ckpt file found in {checkpoint_dir}')

model = IEAquaticDilated.load_from_checkpoint(checkpoint_paths[0])



#### You can check the model's hyperparameters via the `model.hparams` field

In [5]:
# Bare last expression: the notebook renders the value of `model.hparams` as the cell output.
model.hparams

"C":                                   30
"D":                                   1.5
"L1":                                  6
"L2":                                  2
"activation":                          gelu
"basepath":                            data/processed/GRCh38/221111_128bp_minoverlap64_mincov2_nc10_tissues
"batch_size":                          256
"biological_subspace_ratio":           0.5
"class_freq":                          None
"coeff_cov":                           0.01
"cov_and_adv":                         True
"cov_norm":                            True
"crop":                                8
"dilated_residual_dropout":            0.0
"dilated_residual_kernel_size":        3
"dim_hidden_discriminator":            1024
"dim_hidden_embedding":                256
"dim_random_projections":              10
"init_bias":                           None
"input_kernel_size":                   15
"labels_encoder":                      False
"linearly_embed_direct_features": 

### Basic "manual" way of obtaining predictions

Let's generate some dummy data

In [31]:
# Standard-normal dummy input of shape (100, 16, 2176).
# Presumably (samples, input channels, sequence length) — TODO confirm axis meaning.
random_data = torch.randn((100, 16, 2176))

By passing *return_features=True* to the *forward()* method the model will return all the features in addition to the experiment-level predictions.

In [32]:
# Put the model into inference mode first (disables dropout and similar
# train-only behaviour); a freshly loaded model may still be in training mode.
model.eval()

# Prediction needs no gradients, so skip building the autograd graph.
# `model(...)` dispatches to `forward()` and also runs any registered module hooks,
# which a direct `model.forward(...)` call would bypass.
with torch.no_grad():
    predictions, features_all, features_biological, features_technical = model(
        x=random_data,
        return_features=True,
    )

# Shapes of the experiment-level predictions and of the three feature outputs
print(predictions.shape)
print(features_all.shape)
print(features_biological.shape)
print(features_technical.shape)

torch.Size([100, 2106])
torch.Size([100, 4])
torch.Size([100, 2])
torch.Size([100, 2])


### The "sophisticated" way

We will use *pytorch_lightning.Trainer* to do the job for us - this is useful when handling large amounts of data, taking care of proper tensor-to-device placement etc.

Let's define a dummy dataset:

In [33]:
class DummyDataset(torch_data.Dataset):
    """Map-style dataset that yields freshly drawn standard-normal tensors.

    Intended only for exercising the prediction pipeline with dummy inputs.

    Args:
        n_samples: number of items the dataset reports via ``len()``.
        sample_shape: shape of every generated sample.

    The defaults reproduce the previous hard-coded behaviour
    (1000 samples of shape ``(16, 2176)``).
    """

    def __init__(self, n_samples=1000, sample_shape=(16, 2176)):
        super().__init__()
        self.n_samples = n_samples
        self.sample_shape = tuple(sample_shape)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        # Signal the end of the dataset explicitly: without an IndexError, plain
        # iteration (`for x in ds`, `list(ds)`) falls back to calling
        # `__getitem__` with 0, 1, 2, ... and would never terminate.
        if not -self.n_samples <= idx < self.n_samples:
            raise IndexError(f'index {idx} is out of range for dataset of size {self.n_samples}')
        return torch.randn(*self.sample_shape)
    
    

In [34]:
# Serve the dummy dataset in batches of 128 samples
dummy_loader = torch_data.DataLoader(dataset=DummyDataset(), batch_size=128)

# A Trainer with default settings is enough for running predictions
trainer = pl.Trainer()

# `predict` alternatively accepts a datamodule instead of dataloaders:
#     trainer.predict(model=model, datamodule=my_datamodule)
ret = trainer.predict(model=model, dataloaders=dummy_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [35]:
# `ret` holds one entry per predicted batch ...
batch_count = len(ret)
print(f'Number of batches: {batch_count}')

# ... and every entry bundles several outputs (inspected on the first batch)
outputs_per_batch = len(ret[0])
print(f'Outputs per batch: {outputs_per_batch}')

Number of batches: 8
Outputs per batch: 4


This returns a list of outputs for each batch - we still need to concatenate them into single arrays

In [36]:
# Regroup the per-batch results by output type: for each of the 4 output
# positions (predictions followed by the 3 feature types), pick that entry from
# every batch and concatenate the pieces into a single tensor.
predictions, features_all, features_biological, features_technical = tuple(
    torch.cat([batch_outputs[position] for batch_outputs in ret])
    for position in range(4)
)

# One shape per line, in the same order as the unpacking above
print(
    *(t.shape for t in (predictions, features_all, features_biological, features_technical)),
    sep='\n',
)

torch.Size([1000, 2106])
torch.Size([1000, 4])
torch.Size([1000, 2])
torch.Size([1000, 2])


### Use the ENCODE data

Human "toy" data:

In [10]:
# Datamodule for the small human "toy" dataset (GRCh38 reference).
# NOTE: the human and mouse cells below assign the same `datamodule` name,
# so only the cell executed last takes effect.
datamodule = LitCoverageDatasetHDF5(
    basepath="data/processed/GRCh38/toydata",
    ref_path=config['reference']['GRCh38'],
    seq_len=2176,
    seq_order=2,
    batch_size=256,
)

BED-file contains 10000 regions.
93.250% of regions have at least 1 label.


Human data

In [None]:
# Datamodule for the full human dataset (GRCh38 reference).
# NOTE(review): seq_len is 1152 here, whereas the toy-data and mouse cells as
# well as the dummy inputs use 2176 — confirm this is intentional.
datamodule = LitCoverageDatasetHDF5(
    basepath="data/processed/GRCh38/221111_128bp_minoverlap64_mincov2_nc10_tissues",
    ref_path=config['reference']['GRCh38'],
    seq_len=1152,
    seq_order=2,
    batch_size=256,
)

Mouse data

In [None]:
# Datamodule for the mouse dataset (mm10 reference).
# NOTE: overwrites any `datamodule` created by the human-data cells above.
datamodule = LitCoverageDatasetHDF5(
    basepath="data/processed/mm10/221111_128bp_minoverlap64_mincov2_nc10_tissues",
    ref_path=config['reference']['mm10'],
    seq_len=2176,
    seq_order=2,
    batch_size=256,
)

In [11]:
def predict_val_collate_fn(batch):
    """Collate a batch of two-element items for prediction.

    The second element of every item is discarded; the first elements are
    concatenated along dimension 0 and returned as a single tensor.
    """
    inputs = []
    for x, _ in batch:
        inputs.append(x)
    return torch.cat(inputs)


# Prepare the datasets of the 'fit' stage
datamodule.setup(stage='fit')

# In the 'fit' stage either train_dataloader() or val_dataloader() can be used.
# Swap in the prediction collate function so that only the inputs are batched.
dloader = datamodule.val_dataloader()
dloader.collate_fn = predict_val_collate_fn

# Inference mode for the model, default Trainer for the prediction loop
model.eval()
trainer = pl.Trainer()
ret = trainer.predict(model=model, dataloaders=dloader)


--------------- Subsetting Info ------------------
Subsetting Method: no subsetting 

Number of samples:
available = 10000
	after subsetting = 10000 (100% of available)
	training = 8590 (86% of subset)
	validation = 506 (5% of subset)
	test = 904 (9.04% of subset)
for check: missed data in split = 0
----------------------------------------------------


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 2/2 [00:05<00:00,  2.58s/it]


In [13]:
# Regroup the per-batch results by output type: for each of the 4 output
# positions (predictions followed by the 3 feature types), pick that entry from
# every batch and concatenate the pieces into a single tensor.
predictions, features_all, features_biological, features_technical = tuple(
    torch.cat([batch_outputs[position] for batch_outputs in ret])
    for position in range(4)
)

# One shape per line, in the same order as the unpacking above
print(
    *(t.shape for t in (predictions, features_all, features_biological, features_technical)),
    sep='\n',
)

torch.Size([506, 2106])
torch.Size([506, 60])
torch.Size([506, 30])
torch.Size([506, 30])
