In [1]:
from keras_synthetic_genome_sequence.multivariate_gap_sequence import MultivariateGapSequence
from ucsc_genomes_downloader import Genome
from keras_synthetic_genome_sequence.utils import get_gaps_statistics
import numpy as np
from typing import Tuple
from numba import njit, jit

In [2]:
# Load only chromosomes 1, 2 and 3 of the hg19 assembly.
hg19 = Genome("hg19", chromosomes=["chr1", "chr2", "chr3"])

# Compute the gap statistics on the loaded chromosomes. The first returned
# value is discarded; the mean and covariance are passed to
# MultivariateGapSequence in the next cell.
# NOTE(review): 100 and 200 are presumably the maximum gap size and the
# window size — confirm against the signature of get_gaps_statistics.
_, mean, covariance = get_gaps_statistics(
    hg19,
    100,
    200
)

HBox(children=(IntProgress(value=0, description='Loading chromosomes for genome hg19', layout=Layout(flex='2')…



HBox(children=(IntProgress(value=0, description='Rendering gaps in hg19', layout=Layout(flex='2'), max=3, styl…



HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=2,…



In [3]:
# Build the sequence object that is benchmarked below: regions come from the
# test bed file, gaps are parametrised by the mean and covariance computed
# in the previous cell, and batches contain 32 samples.
gap_sequence = MultivariateGapSequence(
    assembly=hg19,
    bed="tests/utils/test.bed",
    gaps_mean=mean,
    gaps_covariance=covariance,
    batch_size=32
)

HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=1,…



HBox(children=(IntProgress(value=0, description='Converting nucleotides to numeric classes', layout=Layout(fle…



HBox(children=(IntProgress(value=0, description='Generating synthetic gaps', layout=Layout(flex='2'), max=1, s…



In [6]:
def get1(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
    """Render the (X, Y) batch for the requested index using full boolean masks.

    Parameters
    ---------------
    idx: int,
        Index corresponding to batch to be rendered.

    Returns
    ---------------
    Tuple (X, Y): Y is the value returned by ``self.__getitem__(idx)`` and
    X is the copy of Y returned by ``add_gaps``.
    """
    # Batch as produced by the underlying generator.
    target = self.__getitem__(idx)
    # Gap indices associated with the requested batch.
    batch_indices = self._gaps_index[idx]
    # One boolean row per batch element, flagging the entries of
    # _original_indices that are equal to that element's index.
    membership = self._original_indices == batch_indices[:, None]
    # Columns flagged by at least one batch element; every other column
    # is irrelevant for this batch and gets dropped.
    used_columns = np.any(membership, axis=0)
    row_masks = membership[:, used_columns]
    used_coordinates = self._coordinates[used_columns]
    # add_gaps applies the per-row masks to a copy of the target.
    return add_gaps(row_masks, target, used_coordinates), target

In [7]:
@njit
def add_gaps(indices_masks: np.ndarray, y: np.ndarray, coordinates: np.ndarray):
    """Return a copy of y where the masked coordinates of each row are set to 0.25.

    For the i-th row of ``indices_masks``, every element j of
    ``coordinates[indices_masks[i]]`` is used to assign ``x[i][j] = 0.25``
    on the copy. The input ``y`` is not modified.
    """
    # Work on a copy so the caller's array is left untouched.
    gapped = np.copy(y)
    n_rows = indices_masks.shape[0]
    for row in range(n_rows):
        # Coordinates selected by the boolean mask of the current row.
        selected = coordinates[indices_masks[row]]
        for k in range(selected.shape[0]):
            gapped[row][selected[k]] = 0.25
    return gapped

In [8]:
@njit
def add_gaps2(gaps_coordinates: dict, indices: np.ndarray, y: np.ndarray):
    """Return a copy of y with, for each row, the looked-up coordinates set to 0.25.

    For the i-th element of ``indices``, ``gaps_coordinates[indices[i]]`` is
    used to index ``x[i]`` on the copy and the selected entries are set to
    0.25. The input ``y`` is not modified.
    """
    # Work on a copy so the caller's array is left untouched.
    gapped = np.copy(y)
    n_rows = indices.shape[0]
    for row in range(n_rows):
        # Coordinates registered under the key of the current row.
        positions = gaps_coordinates[indices[row]]
        gapped[row][positions] = 0.25
    return gapped

In [9]:
def get4(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
    """Render the (X, Y) batch for the requested index through ``add_gaps2``.

    Parameters
    ---------------
    idx: int,
        Index corresponding to batch to be rendered.

    Returns
    ---------------
    Tuple (X, Y): Y is the value returned by ``self.__getitem__(idx)`` and
    X is the copy of Y returned by ``add_gaps2``.
    """
    # Batch as produced by the underlying generator.
    target = self.__getitem__(idx)
    # Lookup of the gap coordinates and the keys belonging to this batch.
    coordinates_lookup = self._gaps_coordinates
    batch_keys = self._gaps_index[idx]
    return add_gaps2(coordinates_lookup, batch_keys, target), target

In [10]:
def get2(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
    """Return batch corresponding to given index, pre-filtering the gap rows.

    Unlike building one boolean mask per batch element over every entry of
    ``_original_indices``, this variant first selects the entries that
    belong to the batch and only then builds the per-element masks.

    Parameters
    ---------------
    idx: int,
        Index corresponding to batch to be rendered.

    Returns
    ---------------
    Tuple (x, y): y is the value returned by ``self.__getitem__(idx)`` and
    x is the copy of y returned by ``add_gaps``.
    """
    # Retrieves the sequence from the bed generator
    y = self.__getitem__(idx)
    # Retrieve the indices corresponding to the gaps for the current batch
    indices = self._gaps_index[idx]
    # Flag the entries whose original index belongs to this batch.
    # np.isin is used instead of np.in1d, which is deprecated since NumPy 2.0.
    # NOTE(review): np.isin keeps the shape of its first argument while
    # np.in1d flattened it; the results match as long as _original_indices
    # is one-dimensional, which the indexing below appears to assume.
    considered_rows = np.isin(self._original_indices, indices)
    # One boolean mask per batch element over the retained entries only
    indices_masks = self._original_indices[considered_rows] == indices[:, None]
    # Coordinates of the retained entries, aligned with the masks' columns
    coordinates = self._coordinates[considered_rows]
    # For the i-th row of current batch we apply the nucleotides mask
    x = add_gaps(indices_masks, y, coordinates)
    return x, y

In [11]:
def get3(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
    """Return batch corresponding to given index.

    Pure NumPy variant: the gaps of each batch element are applied with a
    Python loop instead of a compiled helper.

    Parameters
    ---------------
    idx: int,
        Index corresponding to batch to be rendered.

    Returns
    ---------------
    Return Tuple containing X and Y numpy arrays corresponding to given batch index.
    X is a copy of Y where the gap positions are set to 0.25; Y is returned
    as produced by ``self.__getitem__(idx)`` and is not modified.
    """
    # Retrieves the sequence from the bed generator
    y = self.__getitem__(idx)
    # Retrieve the indices corresponding to the gaps for the current batch
    indices = self._gaps_index[idx]
    coordinates = self._coordinates
    original_indices = self._original_indices
    # Making a deep copy of y, since we are going to edit the copy.
    x = np.copy(y)
    # For the index corresponding to the i-th row of current batch
    for i, index in enumerate(indices):
        # Select the coordinates whose original index matches this row.
        # The previous code filtered `coordinates == index` and used
        # `original_indices` as positions, which inverted the roles these
        # two arrays have in get1 and get2.
        gap_positions = coordinates[original_indices == index]
        # And we set the one-hot encoded nucleotides as
        # a uniform distribution.
        x[i, gap_positions] = 0.25
    # x holds the input with the artificial gaps, y the untouched output.
    return x, y

In [14]:
%%timeit -n 10000
# Baseline: indexing goes through the sequence's own __getitem__ for batch 0.
gap_sequence[0]

15.4 µs ± 186 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
%%timeit -n 10000
# NOTE(review): this cell times batch 1 while the other timing cells use
# batch 0 — confirm the difference is intentional before comparing timings.
get4(gap_sequence, 1)

35.9 µs ± 502 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
%%timeit -n 10000
# NOTE(review): get2 reads self._original_indices; the recorded run failed
# with AttributeError because gap_sequence has no such attribute.
get2(gap_sequence, 0)

AttributeError: 'MultivariateGapSequence' object has no attribute '_original_indices'

In [17]:
%%timeit -n 4000
# NOTE(review): get3 reads self._coordinates; the recorded run failed with
# AttributeError because gap_sequence has no such attribute.
get3(gap_sequence, 0)

AttributeError: 'MultivariateGapSequence' object has no attribute '_coordinates'