Reduced time required to compute multivariate gaps

LucaCappelletti94 · Mar 1, 2020 · 49b97b7 · 49b97b7
1 parent d61e687
commit 49b97b7
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 32 deletions.
diff --git a/keras_synthetic_genome_sequence/multivariate_gap_center_sequence.py b/keras_synthetic_genome_sequence/multivariate_gap_center_sequence.py
@@ -2,11 +2,11 @@
 from typing import Union, Dict, Tuple
 import pandas as pd
 import numpy as np
-from .multivariate_gap_sequence import MultivariateGapSequence
+from .multivariate_gap_windows_sequence import MultivariateGapWindowsSequence
 from .utils import generate_synthetic_gaps
 
 
-class MultivariateGapCenterSequence(MultivariateGapSequence):
+class MultivariateGapCenterSequence(MultivariateGapWindowsSequence):
     """
     Keras Sequence that returns tuples of nucleotide sequences,
     one with multivariate synthetic gaps and the other with the
@@ -26,26 +26,5 @@ def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
         Return Tuple containing X and Y numpy arrays corresponding to given batch index.
         """
         # Retrieves the sequence from the bed generator
-        x = super().__getitem__(idx)
-        # Save the original chromosomes
-        y = x[:, self.window_length//2].copy()
-        # Retrieve the indices corresponding to the gaps for the current batchsize
-        indices = self._gaps_index[idx]
-        # Extract the gaps curresponding to given indices
-        masks = self._gaps_coordinates[
-            np.in1d(self._gaps_coordinates[:, 0], indices)
-        ]
-        # For every j-th index curresponding to the i-th row of current batch
-        for i, index in enumerate(indices):
-            # We extract the mask curresponding to the gaps
-            # for the i-th row of current batch
-            gap_indices = masks[masks[:, 0] == index][:, 1]
-            # And we set the one-hot encoded nucleotides as
-            # a uniform distribution.
-            x[i, gap_indices, :] = 0.25
-        # We return the tuple of the batch, containing
-        # the input with added artificial gaps represented
-        # as a uniform distribution and
-        # the output containing the original one-hot encoded
-        # sequence of nucleotides.
-        return x, y
+        x, y = super().__getitem__(idx)
+        return x, y[:, self.window_length//2]
diff --git a/keras_synthetic_genome_sequence/multivariate_gap_sequence.py b/keras_synthetic_genome_sequence/multivariate_gap_sequence.py
@@ -87,14 +87,14 @@ def __init__(
                 )
             )
         # Rendering the gaps coordinates
-        self._gaps_coordinates = generate_synthetic_gaps(
+        self._original_indices, self._coordinates = generate_synthetic_gaps(
             gaps_mean,
             gaps_covariance,
             self.samples_number,
             chunk_size=50000,
             threshold=gaps_threshold,
             seed=seed
-        )
+        ).T
         # Rendering the starting gaps index, which
         # will be shuffled alongside the bed file.
         self._gaps_index = NumpySequence(

diff --git a/keras_synthetic_genome_sequence/multivariate_gap_windows_sequence.py b/keras_synthetic_genome_sequence/multivariate_gap_windows_sequence.py
@@ -28,14 +28,18 @@ def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
         y = super().__getitem__(idx)
         # Retrieve the indices corresponding to the gaps for the current batchsize
         indices = self._gaps_index[idx]
-        # Extract the gaps curresponding to given indices
-        masks = self._gaps_coordinates[:, 0] == indices[:, None]
+        # Build one boolean mask per batch row, marking which gap entries
+        # share that row's original sample index
+        masks = self._original_indices == indices[:, None]
+        # Get the mask to drop the rows where none of the indices
+        # considered for this specific batch is present
         considered_rows = masks.any(axis=0)
+        # Keep only the mask columns (gap entries) matched by this batch
         indices_masks = masks[:, considered_rows]
-        positions = self._gaps_coordinates[:, 1][considered_rows]
+        coordinates = self._coordinates[considered_rows]
         # Making a deep copy of y, since we are going to edit the copy.
         x = np.copy(y)
-        # For every j-th index curresponding to the i-th row of current batch
+        # For the i-th row of the current batch we apply the nucleotides mask
         for i, indices_mask in enumerate(indices_masks):
-            x[i, positions[indices_mask]] = 0.25
+            x[i, coordinates[indices_mask]] = 0.25
         return x, y