# Testing the `dataload` module

**Authorship:**
Adam Klie, *03/02/2022*
***
**Description:**
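Notebook for testing the `dataload` module: built-in and Janggu-wrapped IO readers, `SeqData`/`SeqDataset` dataloading, and the motif utilities (MEME and FIMO/JASPAR helpers).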

In [1]:
# Load the autoreload extension only if it is not already loaded, then
# enable mode 2 so edits to imported modules are picked up between cells.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import os
import torch
import numpy as np
import pandas as pd
import eugene as eu

Global seed set to 13
2022-10-10 22:24:27.311758: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-10 22:24:27.456651: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-10 22:24:27.456688: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-10 22:24:27.489102: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-10 22:24:28.143177: W tensorfl

# IO

## Built-in io

In [None]:
# Directory holding the random1000 fixture files read by the tests below.
# NOTE(review): this cell sets `eu.logging.dataset_dir`, whereas the Janggu
# section sets `eu.settings.dataset_dir` — confirm which attribute is intended.
eu.logging.dataset_dir = "../../../eugene/datasets/random1000"
from eugene.dataload import SeqData

In [None]:
def check_random1000_load(sdata, has_target=False):
    """Assert that `sdata` looks like the loaded random1000 dataset.

    Parameters
    ----------
    sdata : SeqData
        Object returned by one of the readers under test.
    has_target : bool, optional
        When True, additionally require that the first row of the last
        `seqs_annot` column holds a non-missing value.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    assert isinstance(sdata, SeqData)
    assert sdata.n_obs == 1000
    assert sdata.names[-1] == "seq999"
    if has_target:
        # `value is not np.nan` is an identity test: a NaN read back from a
        # DataFrame is generally a different float object, so that check
        # passes even when the target is missing. `pd.notna` tests the value.
        # `.iloc[0, -1]` is purely positional (first row, last column).
        assert pd.notna(sdata.seqs_annot.iloc[0, -1])

In [128]:
def test_read():
    """Load the random1000 TSV through the generic `eu.dl.read` and validate it."""
    tsv_path = os.path.join(eu.logging.dataset_dir, "random1000_seqs.tsv")
    loaded = eu.dl.read(tsv_path)
    check_random1000_load(loaded)
test_read()

NameError: name 'check_random1000_load' is not defined

In [None]:
def test_read_csv():
    """Load the random1000 TSV with `eu.dl.read_csv` and validate it, target included."""
    tsv_path = os.path.join(eu.logging.dataset_dir, "random1000_seqs.tsv")
    # Every reader option is spelled out explicitly, as keyword arguments.
    reader_options = dict(
        seq_col="seq",
        name_col="name",
        target_col="activity_0",
        rev_comp=False,
        sep="\t",
        low_memory=False,
        return_numpy=False,
        return_dataframe=False,
        col_names=None,
        auto_name=False,
        compression="infer",
    )
    loaded = eu.dl.read_csv(filename=tsv_path, **reader_options)
    check_random1000_load(loaded, has_target=True)
test_read_csv()

In [None]:
def test_read_fasta():
    """Load random1000 from FASTA plus a `.npy` target file and validate it."""
    data_dir = eu.logging.dataset_dir
    fasta_path = os.path.join(data_dir, "random1000_seqs.fa")
    targets_path = os.path.join(data_dir, "random1000_activities.npy")
    loaded = eu.dl.read_fasta(
        seq_file=fasta_path,
        target_file=targets_path,
        rev_comp=False,
        is_target_text=False,
        return_numpy=False,
    )
    check_random1000_load(loaded, has_target=True)
test_read_fasta()

In [None]:
def test_read_numpy():
    """Load random1000 from `.npy` files with `eu.dl.read_numpy` and validate it.

    Sequences, names, targets and reverse-complement sequences are each read
    from their own file in `eu.logging.dataset_dir`.

    Raises
    ------
    AssertionError
        If the loaded object fails `check_random1000_load`.
    """
    sdata = eu.dl.read_numpy(
        seq_file=os.path.join(eu.logging.dataset_dir, "random1000_seqs.npy"),
        names_file=os.path.join(eu.logging.dataset_dir, "random1000_ids.npy"),
        target_file=os.path.join(eu.logging.dataset_dir, "random1000_activities.npy"),
        rev_seq_file=os.path.join(eu.logging.dataset_dir, "random1000_rev_seqs.npy"),
        is_names_text=False,
        is_target_text=False,
        delim="\n",
        ohe=False,
        return_numpy=False
    )
    check_random1000_load(sdata, has_target=True)
test_read_numpy()

## Janggu wrapped io

In [2]:
# Directory and file names of the Janggu sample resources; the reader tests
# below join each file name onto `eu.settings.dataset_dir`.
eu.settings.dataset_dir = "../../../eugene/datasets/janggu_resources"
ref_file = "sample_genome.fa"  # passed as `ref_file=` to the readers
roi_file = "sample.bed"  # passed as `roi_file=` to the readers
bed_file = "scored_sample.bed"  # input of `read_bed`
bam_file = "sample2.bam"  # input of `read_bam`
bw_file = "sample.bw"  # input of `read_bigwig`
# Janggu container types checked by `test_read_bed_janggu`.
from eugene.external.janggu.data import Bioseq, Cover 
def check_janggu_load(sdata, has_target=False):
    """Assert that `sdata` looks like the loaded Janggu sample dataset.

    Parameters
    ----------
    sdata : SeqData
        Object returned by one of the Janggu-wrapped readers.
    has_target : bool, optional
        When True, additionally require that the first row of the last
        `seqs_annot` column holds a non-missing value.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    assert isinstance(sdata, SeqData)
    assert sdata.n_obs == 100
    assert "chr2" in sdata.names[-1]
    if has_target:
        # `value is not np.nan` compares object identity and passes even for
        # a NaN read back from a DataFrame; `pd.notna` tests the value.
        # `.iloc[0, -1]` is purely positional (first row, last column).
        assert pd.notna(sdata.seqs_annot.iloc[0, -1])

Janggu datasets for deep learning in genomics.


In [3]:
def test_read_bed():
    """Read the scored BED sample into a SeqData object and validate it."""
    resources = eu.settings.dataset_dir
    input_paths = {
        "bed_file": os.path.join(resources, bed_file),
        "roi_file": os.path.join(resources, roi_file),
        "ref_file": os.path.join(resources, ref_file),
    }
    loaded = eu.dl.read_bed(
        binsize=200,
        collapser="max",
        dnaflank=50,
        add_seqs=True,
        return_janggu=False,
        **input_paths,
    )
    check_janggu_load(loaded, has_target=True)
test_read_bed()

NotImplementedError: "intersectBed" does not appear to be installed or on the path, so this method is disabled.  Please install a more recent version of BEDTools and re-import to use this method.

In [5]:
# Debugging aid: display the kernel's PATH (the preceding cell's error
# reports that "intersectBed" is not installed or on the path).
os.environ["PATH"]

'/usr/local/bin:/vscode/bin/linux-x64/74b1f979648cc44d385a2286793c226e611f59e7/bin/remote-cli:/home/vscode/.local/bin:/usr/local/share/nvm/current/bin:/usr/local/python/current/bin:/usr/local/py-utils/bin:/usr/local/bin:/usr/local/share/nvm/versions/node/v16.17.1/bin:/usr/local/python/current/bin:/usr/local/py-utils/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/home/vscode/.local/bin:/usr/local/bin:/usr/local/bin'

In [15]:
import os as _os
import sys as _sys

# Directory of the running interpreter; it is appended to PATH and handed to
# pybedtools as the BEDTools location.
_bin_dir = _os.path.dirname(_sys.executable)
# Append only when missing so that re-running this cell does not keep
# growing PATH with duplicate entries.
if _bin_dir not in _os.environ.get("PATH", "").split(_os.pathsep):
    _os.environ["PATH"] = _os.environ.get("PATH", "") + _os.pathsep + _bin_dir
# Only the import can raise ImportError, so only the import is guarded; the
# original exception is chained to keep the root cause in the traceback.
try:
    from pybedtools import paths as _paths
except ImportError as err:
    raise ImportError(
        "Please install janggu dependencies `pip install eugene[janggu]`"
    ) from err
_paths._set_bedtools_path(_bin_dir)
from eugene.external.janggu.data import Bioseq, Cover

In [16]:
# Also expose `<interpreter dir>/bedtools` on PATH. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
_bedtools_dir = _bin_dir + "/bedtools"
if _bedtools_dir not in _os.environ.get("PATH", "").split(_os.pathsep):
    _os.environ["PATH"] = _os.environ.get("PATH", "") + _os.pathsep + _bedtools_dir

In [17]:
# Debugging aid: display PATH again after the modifications above.
_os.environ["PATH"] 

':/usr/local/bin/bedtools:/usr/local/bin:/usr/local/bin/bedtools'

In [18]:
def test_read_bed_janggu():
    """Read the scored BED sample and check that raw Janggu objects come back.

    With `return_janggu=True`, `eu.dl.read_bed` is unpacked into two values,
    which are checked to be a `Bioseq` and a `Cover` respectively.

    Raises
    ------
    AssertionError
        If either returned object has an unexpected type.
    """
    _bin_dir = _os.path.dirname(_sys.executable)
    # Append only when missing: this test may run many times per kernel and
    # must not keep adding duplicate PATH entries.
    if _bin_dir not in _os.environ.get("PATH", "").split(_os.pathsep):
        _os.environ["PATH"] = _os.environ.get("PATH", "") + _os.pathsep + _bin_dir
    from pybedtools import paths as _paths
    _paths._set_bedtools_path(_bin_dir)
    dna, cov = eu.dl.read_bed(
        bed_file=os.path.join(eu.settings.dataset_dir, bed_file),
        roi_file=os.path.join(eu.settings.dataset_dir, roi_file),
        ref_file=os.path.join(eu.settings.dataset_dir, ref_file),
        binsize=200,
        collapser="max",
        dnaflank=50,
        return_janggu=True
    )
    assert isinstance(dna, Bioseq)
    assert isinstance(cov, Cover)
test_read_bed_janggu()

NotImplementedError: "intersectBed" does not appear to be installed or on the path, so this method is disabled.  Please install a more recent version of BEDTools and re-import to use this method.

In [133]:
def test_read_bam():
    """Read the BAM sample into a SeqData object and validate it."""
    resources = eu.settings.dataset_dir
    bam_path = os.path.join(resources, bam_file)
    roi_path = os.path.join(resources, roi_file)
    ref_path = os.path.join(resources, ref_file)
    loaded = eu.dl.read_bam(
        bam_file=bam_path,
        roi_file=roi_path,
        ref_file=ref_path,
        binsize=200,
        resolution=25,
    )
    check_janggu_load(loaded, has_target=True)
test_read_bam()

In [134]:
def test_read_bigwig():
    """Read the bigWig sample into a SeqData object and validate it."""
    resources = eu.settings.dataset_dir
    bigwig_path = os.path.join(resources, bw_file)
    roi_path = os.path.join(resources, roi_file)
    ref_path = os.path.join(resources, ref_file)
    loaded = eu.dl.read_bigwig(
        bigwig_file=bigwig_path,
        roi_file=roi_path,
        ref_file=ref_path,
        dnaflank=50,
        binsize=200,
        resolution=None,
        collapser="max",
    )
    check_janggu_load(loaded, has_target=True)
test_read_bigwig()

# Dataloaders

## SeqData

In [135]:
from eugene.dataload import SeqDataset

In [136]:
# Shared fixture for the dataloader and motif tests below.
sdata = eu.datasets.random1000()
# Both preprocessing calls modify `sdata` in place (their return values are
# not used); the cell output reports `ohe_seqs` and `ohe_rev_seqs` being added.
eu.pp.ohe_seqs_sdata(sdata)
eu.pp.reverse_complement_seqs_sdata(sdata)

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
SeqData object modified:
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added


In [137]:
# NOTE(review): this test is defined but not invoked anywhere in the visible
# notebook — consider calling it with `sdata`.
def test_SeqData_to_dataset(sdata, target_keys="activity_0"):
    """Check that `SeqData.to_dataset` yields a `SeqDataset` carrying transforms.

    Parameters
    ----------
    sdata : SeqData
        Object to convert.
    target_keys : str, optional
        Forwarded to `sdata.to_dataset`; defaults to the key used for the
        notebook-level `sdataset`.

    Raises
    ------
    AssertionError
        If the result is not a `SeqDataset`, or its transform list is empty
        or ends in a falsy entry.
    """
    # Build the dataset from the argument so the test does not depend on a
    # notebook-global `sdataset` that is only created in a later cell.
    sdataset = sdata.to_dataset(target_keys=target_keys)
    assert isinstance(sdataset, SeqDataset)
    transforms = sdataset.transform
    # Inspect the last transform instead of `pop()`-ing it: popping would
    # remove the transform from the dataset as a side effect of the test.
    assert transforms.transforms
    assert transforms.transforms[-1]

In [140]:
# Notebook-level dataset consumed by `test_SeqDataset_get_item` below.
sdataset = sdata.to_dataset(target_keys="activity_0")

No transforms given, assuming just need to tensorize.


In [141]:
def test_SeqDataset_get_item(sdataset):
    dataset_item = sdataset[0]
    assert(np.all([isinstance(itm, torch.Tensor) for itm in dataset_item]))
    assert(dataset_item[1].shape == (4,100))
    assert(dataset_item[2].shape == (4,100))
test_SeqDataset_get_item(sdataset)

In [142]:
def test_SeqData_write_h5sd(sdata):
    sdata.write_h5sd("../../../eugene/datasets/random1000/random1000.h5sd")
test_SeqData_write_h5sd(sdata)

# Motif

## MinimalMEME

In [143]:
def test_minimalMEME():
    """Construct a `MinimalMEME` from the CPEs MEME file and check its type."""
    meme_cls = eu.dl.motif.MinimalMEME
    parsed = meme_cls("../../_data/CPEs.meme")
    assert isinstance(parsed, eu.dl.motif.MinimalMEME)
test_minimalMEME()

## Motif

In [145]:
# Parse the CPEs MEME file once; `motifs` is indexed by motif name below
# (e.g. `motifs["TATA"]`).
minimal_meme = eu.dl.motif.MinimalMEME("../../_data/CPEs.meme")
motifs = minimal_meme.motifs

In [146]:
def test_Motif(motifs):
    """Check that the "TATA" entry of `motifs` is a `Motif` instance."""
    tata = motifs["TATA"]
    expected_type = eu.dl.motif.Motif
    assert isinstance(tata, expected_type)
test_Motif(motifs)

In [147]:
# `example_motif` inside `test_Motif` is local to that function, so look the
# motif up here explicitly instead of relying on a leaked kernel variable.
example_motif = motifs["TATA"]
# Add a leading axis to the motif's PFM before passing it to `pwm_to_meme`.
pfm = np.expand_dims(example_motif.pfm, axis=0)

In [148]:
def test_pwm_to_meme(pfm):
    """Write `pfm` as a MEME file and check that the file exists."""
    meme_path = "../../_data/TATA.meme"
    eu.dl.motif.pwm_to_meme(pfm, output_file_path=meme_path)
    assert os.path.exists(meme_path)
test_pwm_to_meme(pfm)

Saved PWM File as : ../../_data/TATA.meme


In [149]:
# Model used only as input to `generate_pfms_sdata` in the next cell.
# NOTE(review): `input_len=66`, while `test_SeqDataset_get_item` asserts
# (4, 100)-shaped sequences for this dataset — confirm the intended length.
model = eu.models.DeepBind(input_len=66, output_dim=1)

In [150]:
# Presumably populates `sdata.uns["pfms"]`, which `test_filters_to_meme_sdata`
# reads below — TODO confirm against the eugene API.
eu.interpret.generate_pfms_sdata(model, sdata)

No transforms given, assuming just need to tensorize.


Getting maximial activating seqlets:   0%|          | 0/7 [00:00<?, ?it/s]

Getting filter activators for 16 filters:   0%|          | 0/16 [00:00<?, ?it/s]

Getting PFMs from filters:   0%|          | 0/16 [00:00<?, ?it/s]

In [151]:
def test_filters_to_meme_sdata(sdata):
    """Export the PFMs stored in `sdata.uns["pfms"]` to MEME and reload them.

    Parameters
    ----------
    sdata : SeqData
        Object whose `uns["pfms"]` mapping supplies the filter ids to export.

    Raises
    ------
    AssertionError
        If the MEME file is not written, or a reloaded entry is not a
        `pymemesuite.common.Motif`.
    """
    # Imported locally, like in `test_save_motifs_as_meme`: the notebook has
    # no module-level `Motif` name.
    from pymemesuite.common import Motif

    eu.dl.motif.filters_to_meme_sdata(
        sdata,
        filter_ids=list(sdata.uns["pfms"].keys()),
        output_dir="../../_data/",
        file_name="model_filters.meme"
    )
    assert os.path.exists("../../_data/model_filters.meme")
    meme = eu.dl.motif.load_meme("../../_data/model_filters.meme")
    # Check the file loaded in this function, not a notebook-global
    # `loaded_meme` that only exists after a later cell has run.
    assert np.all([isinstance(item, Motif) for item in meme[0]])
test_filters_to_meme_sdata(sdata)

Saved PWM File as : ../../_data/model_filters.meme


## FIMO

In [152]:
def test_get_jaspar_motifs():
    """Fetch JASPAR motifs by accession and by name and check the returned types."""
    from Bio.motifs.jaspar import Motif

    by_accession = eu.dl.motif.get_jaspar_motifs(motif_accs=['MA0095.2'])
    first_hit = by_accession[0]
    assert isinstance(first_hit, Motif)

    by_name = eu.dl.motif.get_jaspar_motifs(motif_names=['CTCF', 'GATA1'])
    type_checks = [isinstance(hit, Motif) for hit in by_name]
    assert np.all(type_checks)
test_get_jaspar_motifs()

In [153]:
def test_save_motifs_as_meme():
    """Save two JASPAR motifs as MEME, reload the file, and check the entry types."""
    from pymemesuite.common import Motif

    meme_path = "../../_data/jaspar.meme"
    jaspar_motifs = eu.dl.motif.get_jaspar_motifs(motif_names=['CTCF', "GATA1"])
    eu.dl.motif.save_motifs_as_meme(jaspar_motifs, meme_path)
    assert os.path.exists(meme_path)

    reloaded = eu.dl.motif.load_meme(meme_path)
    type_checks = [isinstance(entry, Motif) for entry in reloaded[0]]
    assert np.all(type_checks)
test_save_motifs_as_meme()

Saved PWM File as : ../../_data/jaspar.meme


In [154]:
# Reload the JASPAR MEME file; the result is unpacked into `fimo_motifs`
# as positional arguments in the next test.
loaded_meme = eu.dl.motif.load_meme("../../_data/jaspar.meme")

In [155]:
def test_fimo_motifs(sdata, loaded_meme):
    """Run `fimo_motifs` on `sdata` with the unpacked MEME contents; expect a list."""
    meme_parts = tuple(loaded_meme)
    fimo_result = eu.dl.motif.fimo_motifs(sdata, *meme_parts)
    assert isinstance(fimo_result, list)
test_fimo_motifs(sdata, loaded_meme)

In [156]:
def test_score_seqs(sdata):
    """Score `sdata` against motif MA0095.2; expect a PyRanges whose first unique name is "YY1"."""
    from pyranges import PyRanges

    scored = eu.dl.motif.score_seqs(
        sdata=sdata,
        motif_accs=['MA0095.2'],
        filename="../../_data/jaspar.meme",
    )
    assert isinstance(scored, PyRanges)
    hit_names = np.unique(scored.df["Name"])
    assert hit_names[0] == "YY1"
test_score_seqs(sdata)

Saved PWM File as : jaspar.meme


In [157]:
def test_jaspar_annots_sdata(sdata):
    """Annotate `sdata` with JASPAR motifs selected three different ways.

    After each call (by accession, by collection, by name) `sdata.pos_annot`
    must be a PyRanges object.
    """
    from pyranges import PyRanges

    # Same three selections, in the same order: accessions, collection, names.
    selections = (
        {"motif_accs": ['MA0048.1', 'MA0049.1']},
        {"collection": "CORE"},
        {"motif_names": ["GATA1"]},
    )
    for selection in selections:
        eu.dl.motif.jaspar_annots_sdata(sdata, **selection)
        assert isinstance(sdata.pos_annot, PyRanges)
test_jaspar_annots_sdata(sdata)

Saved PWM File as : motifs.meme
Saved PWM File as : motifs.meme


TypeError: sequence item 0: expected str instance, tuple found

---