# Testing the `dataload` module

**Authorship:**
Adam Klie, *03/02/2022*
***
**Description:**
Notebook for testing out the `dataload` module.

In [3]:
# Load IPython's autoreload extension only when it is not loaded yet, so that
# re-running this cell does not load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload all modules before executing each cell, so local edits to
# imported modules are picked up without restarting the kernel.
%autoreload 2

import os
import numpy as np
import pandas as pd
import eugene as eu

# IO

## Built-in IO

In [6]:
# Directory holding the random1000 files read by the cells below
# (relative to the notebook's working directory).
# NOTE(review): this stores `dataset_dir` on `eu.logging`, whereas the Janggu
# section further down sets `eu.settings.dataset_dir` — confirm which attribute
# is the intended one.
eu.logging.dataset_dir = "../../../eugene/datasets/random1000"
# SeqData is required by the isinstance check in check_random1000_load.
from eugene.dataload import SeqData

In [42]:
def check_random1000_load(sdata, has_target=False):
    """Sanity-check an object loaded from the random1000 dataset.

    Parameters
    ----------
    sdata : SeqData
        Object returned by one of the ``eu.dl.read*`` functions.
    has_target : bool, optional
        When True, additionally require that the first entry of the last
        column of ``sdata.seqs_annot`` is not missing.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    assert isinstance(sdata, SeqData), f"expected SeqData, got {type(sdata).__name__}"
    assert sdata.n_obs == 1000, f"expected 1000 sequences, got {sdata.n_obs}"
    assert sdata.names[-1] == "seq999", f"unexpected last name: {sdata.names[-1]!r}"
    if has_target:
        # `value is not np.nan` is an identity test: the NaN that pandas hands
        # back is a numpy.float64, never the `np.nan` singleton, so that check
        # could not fail. Use pd.isna, and index positionally so the lookup does
        # not depend on the index labels.
        first_target = sdata.seqs_annot.iloc[0, -1]
        assert not pd.isna(first_target), "first target value is missing (NaN)"

In [17]:
# Generic reader: only the path is supplied, every other option keeps its default.
sdata = eu.dl.read(
    os.path.join(eu.logging.dataset_dir, "random1000_seqs.tsv")
)
# No target was requested in the call above, so skip the target check explicitly.
check_random1000_load(sdata, has_target=False)

In [43]:
# Same TSV file as above, this time with every read_csv option spelled out.
sdata = eu.dl.read_csv(
    # input file and how to parse it
    filename=os.path.join(eu.logging.dataset_dir, "random1000_seqs.tsv"),
    sep="\t",
    compression="infer",
    low_memory=False,
    # column selection (going by the parameter names)
    col_names=None,
    seq_col="seq",
    name_col="name",
    auto_name=False,
    target_col="activity_0",
    # no reverse complement requested
    rev_comp=False,
    # neither numpy nor dataframe output: the result is checked as SeqData below
    return_numpy=False,
    return_dataframe=False,
)
# A target column was requested, so also verify the first target is present.
check_random1000_load(sdata, has_target=True)

In [44]:
# FASTA sequences plus a separate .npy file of targets.
sdata = eu.dl.read_fasta(
    seq_file=os.path.join(eu.logging.dataset_dir, "random1000_seqs.fa"),
    rev_comp=False,
    # targets come from a .npy file and are flagged as non-text
    target_file=os.path.join(eu.logging.dataset_dir, "random1000_activities.npy"),
    is_target_text=False,
    # no numpy output: the result is checked as SeqData below
    return_numpy=False,
)
# Targets were supplied, so also verify the first target is present.
check_random1000_load(sdata, has_target=True)

In [47]:
# Everything from .npy files: sequences, reverse sequences, names and targets.
sdata = eu.dl.read_numpy(
    # sequence inputs
    seq_file=os.path.join(eu.logging.dataset_dir, "random1000_seqs.npy"),
    rev_seq_file=os.path.join(eu.logging.dataset_dir, "random1000_rev_seqs.npy"),
    ohe=False,
    # names and targets, both flagged as non-text
    names_file=os.path.join(eu.logging.dataset_dir, "random1000_ids.npy"),
    is_names_text=False,
    target_file=os.path.join(eu.logging.dataset_dir, "random1000_activities.npy"),
    is_target_text=False,
    delim="\n",
    # no numpy output: the result is checked as SeqData below
    return_numpy=False,
)
# Targets were supplied, so also verify the first target is present.
check_random1000_load(sdata, has_target=True)

## Janggu-wrapped IO

In [49]:
# Switch the dataset directory to the Janggu sample resources.
# NOTE(review): the built-in IO cells above read `eu.logging.dataset_dir`, not
# `eu.settings.dataset_dir`, and the file names defined in the next cell are not
# joined with this directory — confirm the readers resolve them against it.
eu.settings.dataset_dir = "../../../eugene/datasets/janggu_resources"

In [None]:
# File names used by the Janggu-wrapped readers below.

# passed as `ref_file` / `roi_file` to the readers
refgenome = "sample_genome.fa"
roi_file = "sample.bed"

# one signal file per reader: read_bed, read_bam, read_bigwig
bed_file = "scored_sample.bed"
bam_file = "sample2.bam"
bw_file = "sample.bw"

# display all five names as the cell output
(refgenome, roi_file, bed_file, bam_file, bw_file)

In [None]:
# Set-up a simple function call to read to sdata
sdata = eu.dl.read_bed(
    bed_file=bed_file,
    roi_file=roi_file,  
    ref_file=refgenome,
    binsize=200, 
    collapser="max",
    dnaflank=50,
    add_seqs=True,
    return_janggu=False
)
check_random1000_load(sdata, has_target=True)

HERE
/usr/local/bin


NotImplementedError: "intersectBed" does not appear to be installed or on the path, so this method is disabled.  Please install a more recent version of BEDTools and re-import to use this method.

In [None]:
# Just a test of Janggu style return
dna, cov = eu.dl.read_bed(
    bed_file=bed_file,
    roi_file=roi_file,  
    ref_file=refgenome,
    binsize=200, 
    collapser="max",
    dnaflank=50,
    return_janggu=True
)

In [None]:
# BAM reader: the three file arguments are passed positionally, in this order.
sdata = eu.dl.read_bam(
    bam_file, roi_file, refgenome,
    binsize=200,
    resolution=25,
)

In [None]:
# bigWig reader with every option passed by keyword.
sdata = eu.dl.read_bigwig(
    # input files
    bigwig_file=bw_file,
    roi_file=roi_file,
    ref_file=refgenome,
    # binning / flanking options
    binsize=200,
    resolution=None,
    collapser="max",
    dnaflank=50,
)

# Dataloaders

# Motif

---