# Testing Sequence Dataloading

**Authorship:**
Adam Klie, *03/02/2022*
***
**Description:**
Notebook for testing out PyTorch DataLoading for SeqDataSet and SeqDataModule classes

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Need to turn this into a unit testing script</li></b>
    <b><li>Can we go from SeqDataset to SeqDataModule</li></b>
    </ul>
</div>

In [2]:
# Classics
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import eugene

# SeqDataset, DataLoader, SeqDataModule Classes
SeqDataset - PyTorch Dataset class for loading sequence data. 
SeqDataModule
Here are the general steps for loading:
 1. Load the dataset from files of different supported types using functions from `datasets._io.py`
 2. Generate an `SeqDataset` object from sequences and targets
     - Pass in the seqs and targets
     - Compose `torchvision.transforms`
 3. Pass the dataset to DataLoader
 4. PyTorch Lightning DataModule class for MPRA data that allows for asbtracting most of the dataloading process. These DataModules can be passed straight to trainers for model training.

# Load a random dataset from a CSV file using `load_csv`

In [7]:
from torchvision import transforms
from torch.utils.data import DataLoader
from eugene.datasets import load_csv
from eugene.dataloading.dataloaders import SeqDataset
from eugene.dataloading.dataloaders import SeqDataModule
from eugene.dataloading import Augment, ReverseComplement, OneHotEncode, ToTensor

# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_csv("test_100seqs_66/test_seqs.tsv", seq_col="SEQ", name_col="NAME", target_col="LABEL", rev_comp=True)
print(names[0], len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], targets[0])

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=False)
])

# Instantiate a Dataset
test_dataset = SeqDataset(seqs, names=names, targets=targets, transform=data_transform)
print(test_dataset[0][0], test_dataset[0][1][:5], test_dataset[0][2][:5], test_dataset[0][3])

# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

# Instantiate a DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=0)

# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

# SeqDataModule
test_datamod = SeqDataModule("test_100seqs_66/test_seqs.tsv")

seq001 100 GTAGG 100 CCTAC 0.0
tensor([115., 101., 113.,  48.,  48.,  49.]) tensor([[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]]) tensor([[0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]]) tensor(0.)
0 torch.Size([66, 4]) torch.Size([66, 4]) tensor(0.)
1 torch.Size([66, 4]) torch.Size([66, 4]) tensor(1.)
2 torch.Size([66, 4]) torch.Size([66, 4]) tensor(1.)
3 torch.Size([66, 4]) torch.Size([66, 4]) tensor(0.)
0 torch.Size([32, 66, 4]) torch.Size([32, 66, 4]) torch.Size([32])
1 torch.Size([32, 66, 4]) torch.Size([32, 66, 4]) torch.Size([32])
2 torch.Size([32, 66, 4]) torch.Size([32, 66, 4]) torch.Size([32])
3 torch.Size([4, 66, 4]) torch.Size([4, 66, 4]) torch.Size([4])


# Load real `ols` dataset from `datasets` module

In [9]:
from torchvision import transforms
from torch.utils.data import DataLoader
from eugene.dataloading.dataloaders import SeqDataset
from eugene.dataloading.dataloaders import SeqDataModule
from eugene.dataloading import Augment, ReverseComplement, OneHotEncode, ToTensor

# Load from `datasets` module
names, seqs, rev_seqs, targets = eugene.datasets.ols()
print(names[0], len(seqs), seqs[0][:5], rev_seqs, targets[0])

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=False)
])

# Instantiate a Dataset
test_dataset = SeqDataset(seqs, names=names, targets=targets, transform=data_transform)
print(test_dataset[0][0], test_dataset[0][1][:5], test_dataset[0][2][:5], test_dataset[0][3])

# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

# Instantiate a DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=0)

# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

# SeqDataModule
# Can we go from SeqDataset to SeqDataModule?

S1-G1R-S2-E1F-S3-E2F-S4-G2R-S5-G3F-S6 460800 CATCT None 0.6117667250932141
tensor([83., 49., 45., 71., 49., 82., 45., 83., 50., 45., 69., 49., 70., 45.,
        83., 51., 45., 69., 50., 70., 45., 83., 52., 45., 71., 50., 82., 45.,
        83., 53., 45., 71., 51., 70., 45., 83., 54.]) tensor([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]]) tensor([[0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.]]) tensor(0.6118)
0 torch.Size([66, 4]) torch.Size([66, 4]) tensor(0.6118)
1 torch.Size([66, 4]) torch.Size([66, 4]) tensor(0.3130)
2 torch.Size([66, 4]) torch.Size([66, 4]) tensor(0.3871)
3 torch.Size([66, 4]) torch.Size([66, 4]) tensor(0.)
0 torch.Size([32, 66, 4]) torch.Size([32, 66, 4]) torch.Size([32])
1 torch.Size([32, 66, 4]) torch.Size([32, 66, 4]) torch.Size([32])
2 torch.Size([32, 66, 4]) torch.Size([32, 66, 4]) torch.Size([32])
3 torch.Size(

# Load a random dataset from Numpy arrays using `load_numpy`

In [10]:
from torchvision import transforms
from torch.utils.data import DataLoader
from eugene.datasets import load_numpy
from eugene.dataloading.dataloaders import SeqDataset
from eugene.dataloading.dataloaders import SeqDataModule
from eugene.dataloading import Augment, ReverseComplement, OneHotEncode, ToTensor

# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_numpy("../data/2021_OLS_Library/ohe_seq/0.09-0.4_X-all_ohe-seq.npy",
                                            target_file="../data/2021_OLS_Library/binary/0.09-0.4_y-all_binary.txt",
                                            is_target_text=True)
print(names, seqs[0][:5], rev_seqs, targets[0])

# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_numpy("test_100seqs_1000/test_ohe_seqs.npy", 
                                            target_file="test_100seqs_66/test_labels.npy",
                                            names_file="test_100seqs_66/test_ids.npy", 
                                            rev_seq_file="test_100seqs_66/test_rev_ohe_seqs.npy")
print(len(names), names[0], len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], targets[0])

# Compose different data transforms for this particular load
data_transform = transforms.Compose([ToTensor(transpose=False)])

# Instantiate a Dataset
test_dataset = SeqDataset(seqs, names=names, rev_seqs=rev_seqs, transform=data_transform)

# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

# Instantiate a DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=0)

# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

# SeqDataModule
test_datamod = SeqDataModule("test_100seqs_66/test_seqs.fa", )

0 torch.Size([6]) torch.Size([1000, 4]) torch.Size([66, 4]) tensor([-1.])
1 torch.Size([6]) torch.Size([1000, 4]) torch.Size([66, 4]) tensor([-1.])
2 torch.Size([6]) torch.Size([1000, 4]) torch.Size([66, 4]) tensor([-1.])
3 torch.Size([6]) torch.Size([1000, 4]) torch.Size([66, 4]) tensor([-1.])


# Load from a fasta file using `load_fasta`

In [41]:
from eugene.datasets import load_fasta 

# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load_fasta("test_100seqs_66/test_seqs.fa", "test_100seqs_66/test_labels.npy", rev_comp=True)
names[0], seqs[0], rev_seqs[0], targets[0]

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), OneHotEncode(), ToTensor(transpose=False)
])

# Instantiate a Dataset
test_dataset = SeqDataset(seqs, names=names, targets=targets, rev_seqs=rev_seqs, transform=data_transform)

# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

# SeqDataModule
test_datamod = SeqDataModule("test_100seqs_66/test_seqs.tsv")

## Load dataset

In [72]:
# Import the laod function
from eugene.datasets import load
from torch.utils.data import DataLoader


# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load("test_100seqs_66/test_ohe_seqs.npy", names_file="test_100seqs_66/test_ids.npy", target_file="test_100seqs_66/test_labels.npy", rev_seq_file="test_100seqs_66/test_rev_ohe_seqs.npy")
len(names), names[0], len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], len(targets), targets[0]

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    ToTensor(transpose=False)
])

# Instantiate a Dataset
test_dataset = SeqDataset(seqs, names=names, targets=targets, rev_seqs=rev_seqs, transform=data_transform)

# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0].size(), sample[1].size(), sample[2].size(), sample[3])
    if i == 3:
        break

# Instantiate a DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

# Build SeqDataModule 
test_datamod = SeqDataModule(NUMPY_OHE)

(100,
 'seq001',
 100,
 array([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 0]], dtype=int8),
 100,
 array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 1, 0, 0]], dtype=int8),
 100,
 0)

# Loading variable length genomic sequences (`Khoeuiery10`)

In [6]:
from torchvision import transforms
from torch.utils.data import DataLoader
from eugene.datasets import load
from eugene.dataloading.dataloaders import SeqDataset
from eugene.dataloading.dataloaders import SeqDataModule
from eugene.dataloading import Augment, ReverseComplement, OneHotEncode, ToTensor

# File definition
seq_file="/cellar/users/aklie/projects/EUGENE/data/2010_Khoueiry_CellPress/2010_Khoueiry_CellPress.tsv"

# Load the sequences as numpy arrays
names, seqs, rev_seqs, targets = load(seq_file, seq_col="SEQ", name_col="NAME", target_col="FXN_LABEL", rev_comp=True)
print(len(names), names[0], len(seqs), seqs[0][:5], len(rev_seqs), rev_seqs[0][-5:], len(targets), targets[0])

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=False)
])

# Instantiate a Dataset
test_dataset = SeqDataset(seqs, names=names, targets=targets, transform=data_transform)
print(test_dataset[0][0], test_dataset[0][1][:5], test_dataset[0][2][:5], test_dataset[0][3])

# Check the Dataset Class
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(i, sample[0], sample[1], sample[2], sample[3])
    if i == 3:
        break

# Instantiate a DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)

# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

# Build SeqDataModule 
#test_datamod = SeqDataModule(NUMPY_OHE)

20 scaffold_1:462149:462232 20 AAAGT 20 ACTTT 20 1.0
AAAGTAGGCTATATGCTACAGCCCAGAGCTCATGGATTTTAATGGGATCGGCTATCTAGGCCGACCCTCGCTCTCCCAAGGAAATGTCCACCTTCCAGCCGGGAAAAGATAACCGCTCGCCAGAGCGACGCTTTCCGGCTGACAAATTGTGTCGGACCTTGATAGCATTCCTGTTCCCTATCGGACCCAACTTT 
AAAGTAGGCTATATGCTACAGCCCAGAGCTCATGGATTTTAATGGGATCGGCTATCTAGGCCGACCCTCGCTCTCCCAAGGAAATGTCCACCTTCCAGCCGGGAAAAGATAACCGCTCGCCAGAGCGACGCTTTCCGGCTGACAAATTGTGTCGGACCTTGATAGCATTCCTGTTCCCTATCGGACCCAACTTT 
True
 AAAGTTGGGTCCGATAGGGAACAGGAATGCTATCAAGGTCCGACACAATTTGTCAGCCGGAAAGCGTCGCTCTGGCGAGCGGTTATCTTTTCCCGGCTGGAAGGTGGACATTTCCTTGGGAGAGCGAGGGTCGGCCTAGATAGCCGATCCCATTAAAATCCATGAGCTCTGGGCTGTAGCATATAGCCTACTTT
True
AAAGTAGGCTATATGCTACAGCCCAGAGCTCATGGATTTTAATGGGATCGGCTATCTAGGCCGACCCTCGCTCTCCCAAGGAAATGTCCACCTTCCAGCCGGGAAAAGATAACCGCTCGCCAGAGCGACGCTTTCCGGCTGACAAATTGTGTCGGACCTTGATAGCATTCCTGTTCCCTATCGGACCCAACTTT 
AAAGTAGGCTATATGCTACAGCCCAGAGCTCATGGATTTTAATGGGATCGGCTATCTAGGCCGACCCTCGCTCTCCCAAGGAAATGTCCACCTTCCAGCCGGGAAAAGATAACCGCTCGCCAGAGCGACGCTTTCCGGCTGACAAATTGTGTC

---