# Testing MPRA Dataloading

**Authorship:**
Adam Klie, *03/02/2022*
***
**Description:**
Notebook for testing out PyTorch DataLoading for MPRADataSet and MPRADataModule classes

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Add test cases for each step</li></b>
    <b><li>Feel like initial loading will break given certain data</li></b>
    </ul>
</div>

In [1]:
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Add the scripts from this project
import sys
sys.path.append("../eugene")

# Dataloading packages
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

# Logging info
import logging
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Define directories
OLS_TSV = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv"
NUMPY_OHE = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/ohe_seq/0.09-0.4_X-train-0.9_ohe-seq.npy"
FASTA_SEQS = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/fasta/0.09-0.4_X-test-0.1_fasta.fa"
BINARY_TARGET = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/binary/0.09-0.4_y-train-0.9_binary.txt"

# MPRADataset Class
PyTorch Dataset class for loading MPRA data. Here tare the steps for loading:
 1. Load the dataset from files of different supported types using functions from `load_data.py`
 2. Generate an MPRADataset object from sequences and targets
     - Pass in the seqs and targets
     - Compose torchvision transforms
 3. Pass the dataset to DataLoader

In [25]:
from MPRADataset import MPRADataset
MPRADataset?

[0;31mInit signature:[0m [0mMPRADataset[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      MPRA Dataset definition
[0;31mInit docstring:[0m
Args:
    names (iterable):
    seqs (iterable): list of sequences to serve as input into models
    targets (iterable): aligned list of targets for each sequence
    rev_seqs (iterable, optional): Optional reverse complements of seqs
    transform (callable, optional): Optional transform to be applied
        on a sample.
[0;31mFile:[0m           ~/Desktop/research/lab/dev/EUGENE/eugene/MPRADataset.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     


## Load from TSV

In [2]:
from eugene.load_data import load_csv

In [265]:
# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_csv("test_seqs.tsv", seq_col="SEQ", rev_comp=True)
names, len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], targets

(None, 100, 'GTAGG', 100, 'CCTAC', None)

In [266]:
# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=False)
])

In [273]:
# Instantiate a Dataset
test_dataset = MPRADataset(seqs, transform=data_transform)
test_dataset[0]

(tensor([-1.]),
 tensor([[0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 1., 0.]]),
 tensor([[0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         ...,
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.]]),
 tensor([-1.]))

In [275]:
# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

0 torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])
1 torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])
2 torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])
3 torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])


## Load from Numpy arrays

In [276]:
from load_data import load_numpy

In [278]:
# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_numpy("test_ohe_seqs.npy", names_file="test_ids.npy", rev_seq_file="test_rev_ohe_seqs.npy")
len(names), names[0], len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], targets

(100,
 'seq001',
 100,
 array([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 0]], dtype=int8),
 100,
 array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 1, 0, 0]], dtype=int8),
 None)

In [279]:
# Compose different data transforms for this particular load
data_transform = transforms.Compose([ToTensor(transpose=False)])

In [280]:
# Instantiate a Dataset
test_dataset = MPRADataset(seqs, names=names, rev_seqs=rev_seqs, transform=data_transform)

In [282]:
# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

0 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])
1 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])
2 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])
3 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor([-1.])


## Load from Fasta

In [283]:
from load_data import load_fasta

In [284]:
# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_fasta("test_seqs.fa", "test_labels.npy", rev_comp=True)

In [285]:
# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), OneHotEncode(), ToTensor(transpose=False)
])

In [289]:
# Instantiate a Dataset
test_dataset = MPRADataset(seqs, names=names, targets=targets, rev_seqs=rev_seqs, transform=data_transform)

In [290]:
# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

0 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(0.)
1 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
2 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
3 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)


## Build DataLoader

In [26]:
from load_data import load

In [27]:
# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load("test_ohe_seqs.npy", names_file="test_ids.npy", target_file="test_labels.npy", rev_seq_file="test_rev_ohe_seqs.npy")
len(names), names[0], len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], len(targets), targets[0]

(100,
 'seq001',
 100,
 array([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 0]], dtype=int8),
 100,
 array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 1, 0, 0]], dtype=int8),
 100,
 0)

In [28]:
data_transform = transforms.Compose([
    ToTensor(transpose=False)
])

In [29]:
test_dataset = MPRADataset(seqs, names=names, targets=targets, rev_seqs=rev_seqs, transform=data_transform)

In [30]:
# Instantiate a DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

In [33]:
# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    #if i == 3:
    #    break

0 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(0.)
1 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
2 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
3 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
4 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
5 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
6 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(0.)
7 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(0.)
8 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(0.)
9 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
10 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(0.)
11 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
12 torch.Size([6]) torch.Size([1000, 4]) torch.Size([1000, 4]) tensor(1.)
13 torch.Size([6]) torch.Size([1000, 4]) torch.S

In [22]:
# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

NameError: name 'test_dataloader' is not defined

# MPRADataModule
PyTorch Lightning DataModule class for MPRA data that allows for asbtracting most of the dataloading process. These DataModules can be passed straight to trainers for model training.

In [77]:
from MPRADataModule import MPRADataModule

In [78]:
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), 
    OneHotEncode(), 
    ToTensor(transpose=True)
])

In [79]:
test_datamodule = MPRADataModule(
    seq_file="test_seqs.tsv",
    transform=data_transform,
    num_workers=0,
    batch_size=16,
    load_kwargs=dict(seq_col="SEQ"))

In [80]:
test_datamodule.setup()
test_dataset = test_datamodule.train_dataloader().dataset

In [81]:
# Check the Dataset Class
for i in range(len(test_datamodule.train_dataloader().dataset)):
    sample = test_datamodule.train_dataloader().dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

0 torch.Size([1]) torch.Size([4, 1000]) torch.Size([1]) tensor([-1.])
1 torch.Size([1]) torch.Size([4, 1000]) torch.Size([1]) tensor([-1.])
2 torch.Size([1]) torch.Size([4, 1000]) torch.Size([1]) tensor([-1.])
3 torch.Size([1]) torch.Size([4, 1000]) torch.Size([1]) tensor([-1.])


In [82]:
# Check the DataLoader
for i_batch, sample_batched in enumerate(test_datamodule.train_dataloader()):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

0 torch.Size([16, 4, 1000]) torch.Size([16, 1]) torch.Size([16, 1])
1 torch.Size([16, 4, 1000]) torch.Size([16, 1]) torch.Size([16, 1])
2 torch.Size([16, 4, 1000]) torch.Size([16, 1]) torch.Size([16, 1])
3 torch.Size([16, 4, 1000]) torch.Size([16, 1]) torch.Size([16, 1])


In [21]:
test_datamodule.

(tensor([-1.]),
 tensor([[0., 0., 0.,  ..., 1., 0., 1.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [1., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.]]),
 tensor([-1.]),
 tensor([-1.]))