# Testing MPRA Dataloading

**Authorship:**
Adam Klie, *03/02/2022*
***
**Description:**
Notebook for testing PyTorch data loading with the `MPRADataset` and `MPRADataModule` classes

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <li><b>Add test cases for each step</b></li>
    <li><b>Initial loading will likely break on certain input data; identify and handle those cases</b></li>
    </ul>
</div>

In [2]:
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Add the scripts from this project
import sys
sys.path.append("../eugene")

# Dataloading packages
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

# Logging info
import logging
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Define input file paths. Every file lives under the same 2021 OLS library
# directory, so the shared prefix is kept in one place.
DATA_DIR = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library"

# Full library table (sequences plus activity columns)
OLS_TSV = DATA_DIR + "/2021_OLS_Library.tsv"
# One-hot encoded sequences, X-train-0.9 split
NUMPY_OHE = DATA_DIR + "/ohe_seq/0.09-0.4_X-train-0.9_ohe-seq.npy"
# NOTE(review): this is the X-test-0.1 split, while BINARY_TARGET below is the
# y-train-0.9 split. The two are passed together to load_fasta later in the
# notebook -- confirm the sequences and targets are actually aligned.
FASTA_SEQS = DATA_DIR + "/fasta/0.09-0.4_X-test-0.1_fasta.fa"
# Binary targets, y-train-0.9 split
BINARY_TARGET = DATA_DIR + "/binary/0.09-0.4_y-train-0.9_binary.txt"

# MPRADataset Class
PyTorch Dataset class for loading MPRA data. Here are the steps for loading:
 1. Load the dataset from files of different supported types using functions from `load_data.py`
 2. Generate an MPRADataset object from sequences and targets
     - Pass in the seqs and targets
     - Compose torchvision transforms
 3. Pass the dataset to DataLoader

In [4]:
from MPRADataset import MPRADataset
MPRADataset?

Init signature: MPRADataset(*args, **kwds)
Docstring:      MPRA Dataset definition
Init docstring:
Args:
    seqs (iterable): list of sequences to serve as input into models
    targets (iterable): aligned list of targets for each sequence
    rev_comps (iterable, optional): Optional reverse complements of seqs
    transform (callable, optional): Optional transform to be applied
        on a sample.
File:           /mnt/beegfs/users/aklie/projects/EUGENE/eugene/MPRADataset.py
Type:           type
Subclasses:     


## Load from TSV

In [5]:
from load_data import load_csv

In [7]:
# Load the sequences (as plain strings) from the TSV file, with targets taken
# from the ACTIVITY_SUMRNA_NUMDNA column
seqs, targets = load_csv(OLS_TSV, target_col="ACTIVITY_SUMRNA_NUMDNA")
# Display the number of sequences/targets and the first entry of each
len(seqs), seqs[0], len(targets), targets[0]

(460800,
 'CATCTGAAGCTCGTTATCTCTAACGGAAGTTTTCGAAAAGGAAATTGCTCAATATCTAAGATAGGA',
 460800,
 0.6117667250932141)

In [8]:
# Transform pipeline for the string sequences loaded from the TSV.
# Compose runs the transforms in the order they are listed here.
tsv_transforms = [
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"),
    ReverseComplement(ohe_encoded=False),
    OneHotEncode(),
    ToTensor(transpose=False),
]
data_transform = transforms.Compose(tsv_transforms)

## Load from Numpy arrays

In [9]:
from load_data import load_numpy

In [10]:
# Load the sequences as numpy arrays (already one-hot encoded on disk) together
# with the targets from the BINARY_TARGET file
seqs, targets = load_numpy(NUMPY_OHE, BINARY_TARGET)
# Display the first five positions of the first sequence and its target
seqs[0][:5], targets[0]

(array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]]),
 1.0)

In [11]:
# Compose different data transforms for this particular load.
# The arrays loaded above are already one-hot encoded, so the pipeline here
# consists of ToTensor only.
data_transform = transforms.Compose([ToTensor(transpose=False)])

## Load from Fasta

In [12]:
from load_data import load_fasta

In [13]:
# Define vars as file names
# NOTE(review): FASTA_SEQS points at the "X-test-0.1" split whereas
# BINARY_TARGET points at the "y-train-0.9" split -- confirm that these two
# files are meant to be paired, otherwise sequences and targets are misaligned.
seq_file = FASTA_SEQS
target_file = BINARY_TARGET

In [14]:
# Load the sequences (as plain strings) from the FASTA file and the targets
# from the target file
seqs, targets = load_fasta(seq_file, target_file)
# Display the first sequence and its target
seqs[0], targets[0]

('CATCTGAAGCTCAATATCTACTTCCGTTGCTCATTTCCTTTTTCGAACCTATCTTTAGAGATAACA', 1.0)

In [15]:
# Transform pipeline for the string sequences loaded from FASTA.
# Compose runs the transforms in the order they are listed here.
fasta_transforms = [
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"),
    OneHotEncode(),
    ToTensor(transpose=False),
]
data_transform = transforms.Compose(fasta_transforms)

## Build MPRADataset

In [16]:
# Instantiate a Dataset
# NOTE: seqs, targets and data_transform hold whatever was assigned last; when
# the cells are run top to bottom, that is the output of the "Load from Fasta"
# section.
ols_dataset = MPRADataset(seqs, targets, transform=data_transform)

In [17]:
# Check the Dataset class by inspecting the first four samples
# (fewer if the dataset holds fewer than four items).
n_preview = min(4, len(ols_dataset))
for i in range(n_preview):
    sample = ols_dataset[i]
    seq_size = sample['sequence'].size()
    print(i, seq_size, sample['target'])
    # The reverse-complement entry is optional; report it only when present
    if "reverse_complement" in sample:
        print(sample["reverse_complement"].size())

0 torch.Size([66, 4]) tensor(1.)
1 torch.Size([66, 4]) tensor(1.)
2 torch.Size([66, 4]) tensor(0.)
3 torch.Size([66, 4]) tensor(1.)


## Build DataLoader

In [18]:
# Instantiate a DataLoader: shuffled batches of 512 samples; num_workers=0
# means batches are loaded in the main process (no worker subprocesses)
ols_dataloader = DataLoader(ols_dataset, batch_size=512, shuffle=True, num_workers=0)

In [19]:
# Check the DataLoader by reporting the shapes of at most the first four batches.
# zip() stops as soon as range(4) is exhausted, so a fifth batch is never requested.
for i_batch, sample_batched in zip(range(4), ols_dataloader):
    seq_shape = sample_batched['sequence'].size()
    target_shape = sample_batched['target'].size()
    print(i_batch, seq_shape, target_shape)
    # The reverse-complement entry is optional; report it only when present
    if "reverse_complement" in sample_batched:
        print(sample_batched["reverse_complement"].size())

0 torch.Size([512, 66, 4]) torch.Size([512])
1 torch.Size([512, 66, 4]) torch.Size([512])
2 torch.Size([512, 66, 4]) torch.Size([512])
3 torch.Size([512, 66, 4]) torch.Size([512])


# MPRADataModule
PyTorch Lightning DataModule class for MPRA data that abstracts away most of the data-loading process. These DataModules can be passed straight to trainers for model training.

In [20]:
from MPRADataModule import MPRADataModule

In [21]:
# Transform pipeline handed to the MPRADataModule below.
# NOTE(review): unlike the Dataset examples earlier in this notebook, ToTensor
# is given transpose=True here -- presumably this swaps the (length, 4) axes
# of each encoded sequence; confirm against the ToTensor implementation.
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), 
    ReverseComplement(ohe_encoded=False), 
    OneHotEncode(), 
    ToTensor(transpose=True)
])

In [23]:
# Options collected for the load_kwargs argument.
# NOTE(review): presumably forwarded to the underlying loading function --
# confirm in MPRADataModule.
ols_load_kwargs = {
    "target_col": "ACTIVITY_SUMRNA_NUMDNA",
    "low_thresh": 0.18,
    "high_thresh": 0.4,
}

# Build the DataModule directly from the TSV file path
ols_datamodule = MPRADataModule(
    seq_file=OLS_TSV,
    transform=data_transform,
    num_workers=4,
    batch_size=128,
    load_kwargs=ols_load_kwargs,
)