# Implementing the `dataloading` module's `_io.py` functionality

**Authorship:**
Adam Klie, *03/02/2022*
***
**Description:**
Notebook for implementing the `dataloading` module's I/O functions in `_io.py`

In [1]:
import os
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

from pkg_resources import resource_filename
import eugene as eu

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: Quadro RTX 5000


# READ

## `read_bed`

In [71]:
def read_bed(
    bed_file: str,
    roi_file: str,
    ref_file: str,
    dnaflank=None,
    resolution=None,
    collapser="max",
    add_seqs=False,
    return_janggu=False,
    **kwargs
):
    """
    Read peaks from a BED file into a SeqData object.

    The sequences are loaded with Janggu's ``Bioseq.create_from_refgenome``
    and the per-region targets with Janggu's ``Cover.create_from_bed``; both
    are restricted to the regions listed in ``roi_file``.

    Parameters
    ----------
    bed_file : str
        Path to the BED file where peaks are stored.
    roi_file : str
        Path to the file containing the regions of interest under consideration.
    ref_file : str
        Path to the genome reference file.
    dnaflank : optional
        Forwarded as ``flank`` to ``Bioseq.create_from_refgenome``.
    resolution : optional
        Forwarded as ``resolution`` to ``Cover.create_from_bed``.
    collapser : str, optional
        Forwarded as ``collapser`` to ``Cover.create_from_bed``. Default "max".
    add_seqs : bool, optional
        If True, decode the one-hot encoded sequences with
        ``eu.pp.decode_DNA_seqs`` and store them as ``seqs``; otherwise
        ``seqs`` is None. Default False.
    return_janggu : bool, optional
        If True, return the raw Janggu ``(dna, cover)`` objects instead of
        building a SeqData object. Default False.
    kwargs : dict
        Additional arguments passed unchanged to BOTH Janggu loaders (for
        example ``binsize``), so every key must be accepted by both of them.

    Returns
    -------
    sdata : SeqData
        SeqData object containing the peaks, with the targets stored in the
        "target" column of ``seqs_annot``. Returned when ``return_janggu`` is
        False.
    (dna, cover) : tuple
        The Janggu sequence and coverage datasets. Returned when
        ``return_janggu`` is True.
    """
    dna = eu.external.janggu.data.Bioseq.create_from_refgenome(
        name='dna',
        refgenome=ref_file,
        roi=roi_file,
        flank=dnaflank,
        **kwargs
    )
    cover = eu.external.janggu.data.Cover.create_from_bed(
        'cover',
        bedfiles=bed_file,
        roi=roi_file,
        resolution=resolution,
        collapser=collapser,
        **kwargs
    )
    if return_janggu:
        return dna, cover

    # Region identifiers, used as both sequence names and annotation index.
    ids = np.array(list(dna.garray.region2index.keys()))
    # Index 0 on the third axis drops it, leaving a 3-D one-hot array.
    ohe_seqs = dna[:][:, :, 0, :]
    # squeeze() would turn a single-region result into a 0-d array, which
    # cannot be paired with the `ids` index; keep it at least 1-D.
    targets = np.atleast_1d(cover[:].squeeze())
    seqs = np.array(eu.pp.decode_DNA_seqs(ohe_seqs)) if add_seqs else None
    return eu.dl.SeqData(
        names=ids,
        seqs=seqs,
        ohe_seqs=ohe_seqs,
        rev_seqs=None,
        seqs_annot=pd.DataFrame(data=targets, index=ids, columns=["target"]),
    )


In [20]:
# Resolve the BED, ROI and reference-genome sample files from the Janggu
# resources directory inside the `eugene` package.
bed_file = resource_filename('eugene', 'external/janggu/resources/scored_sample.bed')
roi_file = resource_filename('eugene', 'external/janggu/resources/sample.bed')
refgenome = resource_filename('eugene', 'external/janggu/resources/sample_genome.fa')

In [21]:
bed_file, roi_file, refgenome

('/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/scored_sample.bed',
 '/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/sample.bed',
 '/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/sample_genome.fa')

In [72]:
# Ask for the raw Janggu objects so their internals can be inspected.
dna, cov = read_bed(
    bed_file=bed_file, roi_file=roi_file, ref_file=refgenome,
    dnaflank=50, collapser="max", binsize=200,
    return_janggu=True,
)

In [74]:
cov.shape

(100, 1, 1, 1)

In [79]:
# Same inputs as the Janggu-returning call, but build a SeqData object and
# also keep the decoded sequences.
sdata = read_bed(
    bed_file=bed_file, roi_file=roi_file, ref_file=refgenome,
    dnaflank=50, collapser="max", binsize=200,
    add_seqs=True, return_janggu=False,
)

In [81]:
# Add reverse-complement sequences to `sdata`; the printed message reports
# that the object was modified (rev_seqs: None -> added).
eu.pp.reverse_complement_data(sdata)

SeqData object modified:
	rev_seqs: None -> 100 rev_seqs added


In [82]:
sdata

SeqData object with = 100 seqs
seqs = (100,)
names = (100,)
rev_seqs = (100,)
ohe_seqs = (100, 300, 4)
ohe_rev_seqs = None
seqs_annot: 'target'
pos_annot: None
seqsm: None
uns: None

In [70]:
cov[:].squeeze()[:5]

array([0., 0., 0., 1., 1.])

In [43]:
dna.gindexer.__dict__

{'_binsize': 200,
 '_stepsize': 200,
 '_flank': 50,
 'chrs': ['chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',


In [44]:
dna.shape

(100, 300, 1, 4)

In [46]:
dna.garray.__dict__.keys()

dict_keys(['stranded', '_condition', '_order', 'padding_value', '_resolution', 'typecode', '_full_genome_stored', 'collapser', 'region2index', 'handle'])

array(['chr1:14950-15250', 'chr1:15150-15450', 'chr1:15350-15650',
       'chr1:15550-15850', 'chr1:15750-16050', 'chr1:15950-16250',
       'chr1:16150-16450', 'chr1:16350-16650', 'chr1:16550-16850',
       'chr1:16750-17050', 'chr1:16950-17250', 'chr1:17150-17450',
       'chr1:17350-17650', 'chr1:17550-17850', 'chr1:17750-18050',
       'chr1:17950-18250', 'chr1:18150-18450', 'chr1:18350-18650',
       'chr1:18550-18850', 'chr1:18750-19050', 'chr1:18950-19250',
       'chr1:19150-19450', 'chr1:19350-19650', 'chr1:19550-19850',
       'chr1:19750-20050', 'chr1:19950-20250', 'chr1:20150-20450',
       'chr1:20350-20650', 'chr1:20550-20850', 'chr1:20750-21050',
       'chr1:20950-21250', 'chr1:21150-21450', 'chr1:21350-21650',
       'chr1:21550-21850', 'chr1:21750-22050', 'chr1:21950-22250',
       'chr1:22150-22450', 'chr1:22350-22650', 'chr1:22550-22850',
       'chr1:22750-23050', 'chr1:22950-23250', 'chr1:23150-23450',
       'chr1:23350-23650', 'chr1:23550-23850', 'chr1:23750-240

In [51]:
dna.garray.handle["data"].shape

(100, 300, 1, 1)

In [58]:
cov[:].squeeze()

array([0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [57]:
cov.garray.handle["data"].squeeze()

array([0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [56]:
cov.gindexer.__dict__

{'_binsize': 200,
 '_stepsize': 200,
 '_flank': 0,
 'chrs': ['chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr1',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
  'chr2',
 

In [9]:
sdata

SeqData object with = 100 seqs
seqs = (100,)
names = None
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'target'
pos_annot: None
seqsm: None
uns: None

## JunD Data

In [2]:
# Directory that the JunD input files are read from.
input_dir = '/cellar/users/aklie/data/eugene/junD'

In [3]:
# Build the JunD peak, ROI and reference-genome paths, then echo them.
bed_file = os.path.join(input_dir, "jund_raw_peaks.bed")
roi_file = os.path.join(input_dir, "trim_roi_jund_extended.bed")
refgenome = os.path.join(input_dir, "hg38.fa")
bed_file, roi_file, refgenome

('/cellar/users/aklie/data/eugene/junD/jund_raw_peaks.bed',
 '/cellar/users/aklie/data/eugene/junD/trim_roi_jund_extended.bed',
 '/cellar/users/aklie/data/eugene/junD/hg38.fa')

In [None]:
# Load the JunD data through read_bed. `binsize` is not a named parameter of
# read_bed, so it travels through **kwargs to both Janggu loaders.
sdata = read_bed(
    bed_file=bed_file,
    roi_file=roi_file,
    ref_file=refgenome,
    dnaflank=150,  # comma was missing here, making the cell a SyntaxError
    binsize=200,
    resolution=200,
)

KeyboardInterrupt: 

: 

In [8]:
# Build the JunD coverage labels directly with Janggu instead of read_bed.
LABELS = eu.external.janggu.data.Cover.create_from_bed(
    'jund_peaks',
    bedfiles=bed_file, roi=roi_file,
    conditions=['JunD'],
    binsize=200, resolution=200,
)

In [9]:
LABELS.shape

(1013080, 1, 1, 1)

In [12]:
# Tally how many regions carry each distinct label value.
np.unique(LABELS[:].squeeze(), return_counts=True)

(array([0., 1.]), array([985230,  27850]))

In [13]:
# Load the ROI sequences directly with Janggu, using the same binsize as the
# labels plus a flank of 150.
DNA = eu.external.janggu.data.Bioseq.create_from_refgenome(
    'dna',
    refgenome=refgenome, roi=roi_file,
    flank=150, binsize=200,
)

In [14]:
DNA.shape

(1013080, 500, 1, 4)

In [15]:
# Index 0 on the third axis removes it: the 4-D array becomes 3-D, the same
# slicing read_bed uses to build `ohe_seqs`.
DNA[:][:, :, 0, :].shape

(1013080, 500, 4)