# Testing `SeqData`

**Authorship:**
Adam Klie, *04/06/2022*
***
**Description:**
Notebook to test the EUGENE data structure `SeqData` 
***
<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li></li></b>
    <b><li></li></b>
    <b><li></li></b>
    </ul>
</div>

# Setup

In [7]:
# Classics
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import eugene as eu
import logging
eu.settings.verbosity = logging.INFO

Global seed set to 13


## Test `SeqData` on random dataset

In [8]:
random1000 = eu.datasets.random1000()
random1000

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
    seqs_annot: 'TARGETS'

In [9]:
eu.pp.reverse_complement_data(random1000, copy=False)

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added


In [10]:
eu.pp.one_hot_encode_data(random1000, copy=False)

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added


In [11]:
random1000

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
    seqs_annot: 'TARGETS'

In [27]:
eu.pp.train_test_split_data(random1000, copy=False, kwargs={})

800


In [29]:
random1000.seqs_annot["TRAIN"]

0       True
1       True
2       True
3      False
4      False
       ...  
995     True
996     True
997     True
998     True
999     True
Name: TRAIN, Length: 1000, dtype: bool

# Reading and writing

In [6]:
sdata = eu.dl.read_csv(file="../_data/test_1000seqs_66/test_seqs.tsv")
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = None
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
    seqs_annot: 'TARGETS'

In [7]:
sdata = eu.dl.read_fasta(seq_file="../_data/test_1000seqs_66/test_seqs.fa", target_file="../_data/test_1000seqs_66/test_activities.npy")
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
    seqs_annot: 'TARGETS'

In [8]:
sdata = eu.dl.read_numpy(seq_file="../_data/test_1000seqs_66/test_ohe_seqs.npy", names_file="../_data/test_1000seqs_66/test_ids.npy", target_file="../_data/test_1000seqs_66/test_activities.npy", ohe_encoded=True)
sdata.seqs = np.load("../_data/test_1000seqs_66/test_seqs.npy")
sdata.write_h5sd("../_data/test_1000seqs_66/test_seqs.h5sd")
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = None
    seqs_annot: 'TARGETS'

In [9]:
sdata = eu.dl.read("../_data/test_1000seqs_66/test_seqs.h5sd")
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = None
    seqs_annot: 'TARGETS'

# Creating a dataloader

In [40]:
sdataset = sdata.to_dataset(label="TARGETS", seq_transforms = ["augment", "one_hot_encode"], transform_kwargs={"enhancer": "Core-otx-a"})

In [41]:
sdataset.seqs[0], sdataset.targets[0], sdataset.rev_seqs, sdataset.transform

('GTAGGTAAGCGGGGTATTTGCACTTCCCTTAATCCATAAGGGCTTTTGCCGCGTGTTAGAGGAAGC',
 0.7708982213861482,
 None,
 Compose(
     <eugene.dataloading._transforms.Augment object at 0x1554a535f950>
     <eugene.dataloading._transforms.OneHotEncode object at 0x1554a535fdd0>
     <eugene.dataloading._transforms.ToTensor object at 0x1554a535fa10>
 ))

In [42]:
sdataset[0]

(tensor([115., 101., 113.,  48.,  48.,  49.,  36.]),
 tensor([[0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
  

In [43]:
# Instantiate a DataLoader
from torch.utils.data import DataLoader
test_dataloader = DataLoader(sdataset, batch_size=32, shuffle=True, num_workers=0)

In [45]:
# Check the DataLoader
for i_batch, sample_batched in enumerate(test_dataloader):
    print(i_batch, sample_batched[0].size(), sample_batched[1].size(), sample_batched[2].size(), sample_batched[3].size())
    # observe 4th batch and stop.
    if i_batch == 3:
        break

0 torch.Size([32, 7]) torch.Size([32, 66, 4]) torch.Size([32, 1]) torch.Size([32])
1 torch.Size([32, 7]) torch.Size([32, 66, 4]) torch.Size([32, 1]) torch.Size([32])
2 torch.Size([32, 7]) torch.Size([32, 66, 4]) torch.Size([32, 1]) torch.Size([32])
3 torch.Size([32, 7]) torch.Size([32, 66, 4]) torch.Size([32, 1]) torch.Size([32])


# SeqDataModule

In [49]:
from torchvision import transforms
data_transform = transforms.Compose([eu.dl.ToTensor(transpose=True)])
datamodule = eu.dl.SeqDataModule(seq_file=f"../_data/test_1000seqs_66/test_ohe_seqs.npy", batch_size=32, transform=data_transform, read_kwargs={"return_numpy": True})
datamodule.setup()
dataset = datamodule.train_dataloader().dataset
assert(len(dataset[0]) == 4)

In [50]:
dataset[0]

(tensor([-1.]),
 tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0.,
          0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
          0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
          1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
          0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.],
         [1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,

---

# Scratch

In [6]:
names, seqs, rev_seqs, targets = eu.dl.read_numpy(f"../_data/test_1000seqs_66/test_seqs.npy", names_file=f"../_data/test_1000seqs_66/test_ids.npy", target_file=f"../_data/test_1000seqs_66/test_labels.npy", rev_seq_file=f"../_data/test_1000seqs_66/test_rev_seqs.npy", return_numpy=True)
sdata = eu.dl.SeqData(names=names, seqs=seqs, seqs_annot=targets, rev_seqs=rev_seqs)
sdataset = sdata.to_dataset(label=0, seq_transforms = ["augment", "one_hot_encode"], transform_kwargs={"enhancer": "Core-otx-a"})
assert(sdataset[0])

In [5]:
    names, seqs, rev_seqs, targets = eu.dl.read_numpy("../_data/test_1000seqs_66/test_seqs.npy", names_file="../_data/test_1000seqs_66/test_ids.npy", target_file="../_data/test_1000seqs_66/test_labels.npy", rev_seq_file="../_data/test_1000seqs_66/test_rev_seqs.npy", return_numpy=True)
    assert(len(names) == len(seqs) == len(rev_seqs) == len(targets))
    sdata = eu.dl.SeqData(names=names, seqs=seqs, seqs_annot=targets, rev_seqs=rev_seqs)

  if index_name in anno:


In [6]:
sdata.seqs_annot

Unnamed: 0,0
0,1
1,0
2,1
3,0
4,0
...,...
995,0
996,0
997,0
998,0


In [8]:
sdata.write_h5sd("../_data/test_1000seqs_66/test_seqs.h5sd")

In [None]:
def write_elem(
    f: "Union[H5Group, ZarrGroup]",
    k: str,
    elem: Any,
    *args,
    modifiers=frozenset(),
    **kwargs,
):
    """
    Write an element to a disk store using it's anndata encoding.
    Params
    ------
    f
        The store to write to.
    k
        The key to write for this value.
    elem
        The element to write as k to f.
    """
    dest_type = type(f)
    if elem is None:
        return
    t = type(elem)
    if k == "/":
        f.clear()
    elif k in f:
        del f[k]
    if (
        hasattr(elem, "dtype")
        and (dest_type, (t, elem.dtype.kind), modifiers) in write
    ):
        get_writer(dest_type, (t, elem.dtype.kind), modifiers)(
            f, k, elem, *args, **kwargs
        )
    else:
        get_writer(dest_type, t, modifiers)(f, k, elem, *args, **kwargs)

In [None]:
    def get_writer(self, dest_type, typ, modifiers=frozenset()):
        import h5py

        if dest_type is h5py.File:
            dest_type = h5py.Group
        modifiers = frozenset(modifiers)

        if (dest_type, typ, modifiers) not in self.write:
            raise TypeError(
                f"No method has been defined for writing {typ} elements to {dest_type}"
            )

        return self.write[(dest_type, typ, modifiers)]

In [None]:
from typing import Any, NamedTuple, Tuple, Type, Callable, Union

In [None]:
write_elem(f, "seq_annot", sdata.seqs_annot)

AttributeError: type object 'Group' has no attribute 'write'

In [None]:
sdata.write_h5sd(filepath)

In [None]:
test_dict = eu.datasets.load_h5sd(filepath)

{'seqs': array(['GTAGGTAAGCGGGGTATTTGCACTTCCCTTAATCCATAAGGGCTTTTGCCGCGTGTTAGAGGAAGC',
       'TATCCCACACTTGTGTATGGCATCTTCCCCCTCAGCCTCCCTCGTGTCGTACTATACGATCATTTA',
       'AAGAAAGATATTTGGGATGGAGACGCATGATTCATGGCTAGTTCGGAGAGCGAACGGCGGAGGCCT',
       'AGGTGATATTCAGGAGGATATGGGCTCCACAACTTTTTCCGTCGTAGCAAAGCATAAGGCTGACAA',
       'GCTTGGCTTTATACACTTCGCGAAATAGACCTCGATAAGCCATCTCTGTGGTGAGCTATCCCGGTT',
       'AATGCTAGTTGTGCGGGTTGTAATTGCTAGTAACGGCCGGTTCTATTACATCTAATGGAAGGTTGT',
       'TCTATTGATTCTTCGTCAGAACTCCCCGTAATATACATTTTTGGATATTGGCGCCCCCAGCTGGCA',
       'CATGTAATATGTGTATATTCACACGTAATAACAGGTATGAATGATGTCACGCCGTCTCTGCGCGGC',
       'CCATAAGCTGACGCGCATATCGATATATTCTCTGGGTCCTGGCGACGCACCCCATCCGCGTAATAT',
       'TTAGTCATTCGGGTTTACTCCGATGGTCGCACACGGATAACCAGCTCCTATAAATAGTGACAGGTC',
       'TGACAACTAGACCCTATTCCTAGTACCAGCCCATCTGCCGCTATAATTTTGCATTTGTTTCGTAAA',
       'GGATGAATCGTAATGCCAGCGGACTACCCCCGAGTCACAGATTAAAATCAATTGAGTTCAGTTGCT',
       'ATAGAGAGACAACTTACAGGATTAAGTAGTCGTTGCGTAAGTATGATAGTAGAACCGCG

In [None]:
test_dict.ohe_seqs

array([[[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]],

       ...,

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.]],

       [[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [1., 0., 0., 0.],
        [0., 0.

In [None]:
with h5py.File(filepath, "r") as f:
    print(f.name)
    print(list(f.keys()))
    print(f["seqs"].shape)
    print(f["seqs"][0].decode('latin-1'))
    print(f["seqs_annot"].attrs["label"][0])

/
['names', 'ohe_seqs', 'seqs', 'seqs_annot']
(1000,)
GTAGGTAAGCGGGGTATTTGCACTTCCCTTAATCCATAAGGGCTTTTGCCGCGTGTTAGAGGAAGC


KeyError: "Can't open attribute (can't locate attribute: 'label')"

In [None]:
with h5py.File(filename, "r") as f:
        d = {}
        for k in f.keys():
            # Backwards compat for old raw
            if k == "raw" or k.startswith("raw."):
                continue
            if k == "X" and "X" in as_sparse:
                d[k] = rdasp(f[k])
            elif k == "raw":
                assert False, "unexpected raw format"
            elif k in {"obs", "var"}:
                # Backwards compat
                d[k] = read_dataframe(f[k])
            else:  # Base case
                d[k] = read_elem(f[k])

        d["raw"] = _read_raw(f, as_sparse, rdasp)

        X_dset = f.get("X", None)
        if X_dset is None:
            pass
        elif isinstance(X_dset, h5py.Group):
            d["dtype"] = X_dset["data"].dtype
        elif hasattr(X_dset, "dtype"):
            d["dtype"] = f["X"].dtype
        else:
            raise ValueError()

In [None]:
h5_read = h5py.File(filepath, "r")

In [None]:
h5_read.attrs["encoding-type"]

'Seqdata'

---