Skip to content

Commit

Permalink
Reduced time required to compute multivariate gaps
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaCappelletti94 committed Mar 1, 2020
1 parent d61e687 commit 49b97b7
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 32 deletions.
Expand Up @@ -2,11 +2,11 @@
from typing import Union, Dict, Tuple
import pandas as pd
import numpy as np
from .multivariate_gap_sequence import MultivariateGapSequence
from .multivariate_gap_windows_sequence import MultivariateGapWindowsSequence
from .utils import generate_synthetic_gaps


class MultivariateGapCenterSequence(MultivariateGapSequence):
class MultivariateGapCenterSequence(MultivariateGapWindowsSequence):
"""
Keras Sequence that returns tuples of nucleotide sequences,
one with multivariate synthetic gaps and the other with the
Expand All @@ -26,26 +26,5 @@ def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
Return Tuple containing X and Y numpy arrays corresponding to given batch index.
"""
# Retrieves the sequence from the bed generator
x = super().__getitem__(idx)
# Save the original chromosomes
y = x[:, self.window_length//2].copy()
# Retrieve the indices corresponding to the gaps for the current batchsize
indices = self._gaps_index[idx]
# Extract the gaps corresponding to the given indices
masks = self._gaps_coordinates[
np.in1d(self._gaps_coordinates[:, 0], indices)
]
# For every j-th index corresponding to the i-th row of the current batch
for i, index in enumerate(indices):
# We extract the mask corresponding to the gaps
# for the i-th row of the current batch
gap_indices = masks[masks[:, 0] == index][:, 1]
# And we set the one-hot encoded nucleotides as
# a uniform distribution.
x[i, gap_indices, :] = 0.25
# We return the tuple of the batch, containing
# the input with added artificial gaps represented
# as a uniform distribution and
# the output containing the original one-hot encoded
# sequence of nucleotides.
return x, y
x, y = super().__getitem__(idx)
return x, y[:, self.window_length//2]
4 changes: 2 additions & 2 deletions keras_synthetic_genome_sequence/multivariate_gap_sequence.py
Expand Up @@ -87,14 +87,14 @@ def __init__(
)
)
# Rendering the gaps coordinates
self._gaps_coordinates = generate_synthetic_gaps(
self._original_indices, self._coordinates = generate_synthetic_gaps(
gaps_mean,
gaps_covariance,
self.samples_number,
chunk_size=50000,
threshold=gaps_threshold,
seed=seed
)
).T
# Rendering the starting gaps index, which
# will be shuffled alongside the bed file.
self._gaps_index = NumpySequence(
Expand Down
Expand Up @@ -28,14 +28,18 @@ def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
y = super().__getitem__(idx)
# Retrieve the indices corresponding to the gaps for the current batchsize
indices = self._gaps_index[idx]
# Extract the gaps corresponding to the given indices
masks = self._gaps_coordinates[:, 0] == indices[:, None]
# Get the boolean masks for the original positions that contain the gaps
# for the given index
masks = self._original_indices == indices[:, None]
# Get the mask to drop the rows where none of the indices
# considered for this specific batch is present
considered_rows = masks.any(axis=0)
# Drop rows where none of the indices is present
indices_masks = masks[:, considered_rows]
positions = self._gaps_coordinates[:, 1][considered_rows]
coordinates = self._coordinates[considered_rows]
# Making a deep copy of y, since we are going to edit the copy.
x = np.copy(y)
# For every j-th index corresponding to the i-th row of the current batch
# For the i-th row of the current batch we apply the nucleotides mask
for i, indices_mask in enumerate(indices_masks):
x[i, positions[indices_mask]] = 0.25
x[i, coordinates[indices_mask]] = 0.25
return x, y

0 comments on commit 49b97b7

Please sign in to comment.