In [7]:
import sys
import time
import numpy as np
import xarray as xr
import seqpro as sp
import seqdata as sd

In [23]:
def random_seqs(
    shape,
    alphabet,
    rng = None,
    seed = None
):
    """Generate random nucleotide sequences.

    Parameters
    ----------
    shape : int, tuple[int]
        Shape of sequences to generate
    alphabet : NucleotideAlphabet
        Alphabet to sample nucleotides from.
    seed : int, optional
        Random seed.

    Returns
    -------
    ndarray
        Randomly generated sequences.
    """
    if rng is None:
        rng = np.random.default_rng(seed)
    return rng.choice(alphabet.array, size=shape)

def random_cov(
    shape,
    rate=1,
    rng = None,
    seed = None
):
    """Generate random coverage for nucleotide sequences.

    Parameters
    ----------
    shape : int, tuple[int]
        Shape of sequences to generate
    alphabet : NucleotideAlphabet
        Alphabet to sample nucleotides from.
    seed : int, optional
        Random seed.

    Returns
    -------
    ndarray
        Randomly generated sequences.
    """
    if rng is None:
        rng = np.random.default_rng(seed)
    return rng.poisson(rate, size=shape)

In [25]:

rng.poisson(10, size=(10, 2))

array([[11,  2],
       [11, 13],
       [14, 12],
       [14,  8],
       [12, 11],
       [12, 12],
       [ 6, 10],
       [10, 16],
       [11, 11],
       [ 9,  7]])

In [32]:
# Define data directory
data_dir = "/cellar/users/aklie/data/eugene/revision/memory_analysis"

# Define an rng for reproducibility
seed = 13
rng = np.random.default_rng(seed)

# Generate coverage data
cov = False
cov_dim = 2

# Define the grid of sizes
num_seqs = [100, 1000, 10000, 100000, 1000000, 10000000]#, 100000000]
seq_lengths = [100, 1000, 10000, 100000, 1000000, 10000000]

# Generate the datasets
for n in num_seqs:
    for l in seq_lengths:

        # Generate the sequences
        shape = (n, l)
        print(f"Generating {n} sequences of length {l}")
        start_time = time.time()
        seqs = random_seqs(shape, alphabet=sp.alphabets.DNA, seed=seed)
        mem_usage = sys.getsizeof(seqs)
        mem_usage_gb = mem_usage / 1e9
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Generated {n} sequences of length {l} in {elapsed_time:.2f} seconds, using {mem_usage_gb} GBs of memory")

        # Generate the coverage data
        if cov:
            print(f"Generating {cov_dim} tracks of coverage data for {n} sequences of length {l}")
            start_time = time.time()
            covs = random_cov((n, cov_dim, l), rng=rng)
            mem_usage = sys.getsizeof(covs)
            mem_usage_gb = mem_usage / 1e9
            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Generated {cov_dim} tracks of coverage data for {n} sequences of length {l} in {elapsed_time:.2f} seconds, using {mem_usage_gb} GBs of memory")
            
            # Save the dataset
            print(f"Saving {n} sequences of length {l}")
            start_time = time.time()
            sdata = xr.Dataset(
                {
                    "seqs": (["seq", "pos"], seqs),
                    "cov": (["seq", "track", "pos"], covs),
                },
            )
            sdata.to_zarr(f"{data_dir}/{n}_random_{l}bp_seqs_{cov_dim}_cov.zarr", mode="w")
            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Saved {n} sequences of length {l} in {elapsed_time:.2f} seconds")
        
        else:
            # Save the dataset
            print(f"Saving {n} sequences of length {l}")
            start_time = time.time()
            sdata = xr.Dataset({"seqs": (["seq", "pos"], seqs)})
            sdata.to_zarr(f"{data_dir}/{n}_random_{l}bp_seqs.zarr", mode="w")
            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Saved {n} sequences of length {l} in {elapsed_time:.2f} seconds")

Generating 100 sequences of length 100
Generated 100 sequences of length 100 in 0.00 seconds, using 1.0128e-05 GBs of memory
Generating 2 tracks of coverage data for 100 sequences of length 100
Generated 2 tracks of coverage data for 100 sequences of length 100 in 0.00 seconds, using 0.000160144 GBs of memory
Saving 100 sequences of length 100
Saved 100 sequences of length 100 in 0.22 seconds
Generating 1000 sequences of length 100
Generated 1000 sequences of length 100 in 0.00 seconds, using 0.000100128 GBs of memory
Generating 2 tracks of coverage data for 1000 sequences of length 100
Generated 2 tracks of coverage data for 1000 sequences of length 100 in 0.01 seconds, using 0.001600144 GBs of memory
Saving 1000 sequences of length 100
Saved 1000 sequences of length 100 in 0.13 seconds
Generating 10000 sequences of length 100
Generated 10000 sequences of length 100 in 0.01 seconds, using 0.001000128 GBs of memory
Generating 2 tracks of coverage data for 10000 sequences of length 100


In [33]:
sd.open_zarr(f"{data_dir}/{n}_random_{l}bp_seqs.zarr")

Unnamed: 0,Array,Chunk
Bytes,14.90 GiB,4.77 MiB
Shape,"(10000000, 2, 100)","(156250, 1, 4)"
Dask graph,3200 chunks in 2 graph layers,3200 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 14.90 GiB 4.77 MiB Shape (10000000, 2, 100) (156250, 1, 4) Dask graph 3200 chunks in 2 graph layers Data type int64 numpy.ndarray",100  2  10000000,

Unnamed: 0,Array,Chunk
Bytes,14.90 GiB,4.77 MiB
Shape,"(10000000, 2, 100)","(156250, 1, 4)"
Dask graph,3200 chunks in 2 graph layers,3200 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.93 GiB,2.09 MiB
Shape,"(10000000, 100)","(312500, 7)"
Dask graph,480 chunks in 2 graph layers,480 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 0.93 GiB 2.09 MiB Shape (10000000, 100) (312500, 7) Dask graph 480 chunks in 2 graph layers Data type |S1 numpy.ndarray",100  10000000,

Unnamed: 0,Array,Chunk
Bytes,0.93 GiB,2.09 MiB
Shape,"(10000000, 100)","(312500, 7)"
Dask graph,480 chunks in 2 graph layers,480 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
