In [175]:
import os
import random
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import pysam
import xarray as xr
import zarr

import seqdata as sd
from seqdata import Table, FlatFASTA, GenomeFASTA, BigWig, BAM

In [176]:
# Ensure the scratch directory for test outputs exists (tests/data/tmp, two levels above
# seqdata's module file); nothing happens if it is already there
(Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'tmp').mkdir(parents=True, exist_ok=True)

In [177]:
# Input fixtures and the scratch output directory, all resolved relative to tests/data
# (located two levels above seqdata's module file)
variable_tsv_in, fixed_tsv_in, variable_fasta_in, fixed_fasta_in, out = (
    Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / leaf
    for leaf in ('variable.tsv', 'fixed.tsv', 'variable.fa', 'fixed.fa', 'tmp')
)

# `Table`

In [178]:
def test_Table(
    input_tsv,
    temp_dir,
    seq_col="seq",
    target_col="target",
    batch_size=50,
    fixed_length=False,
    length_dim="_length"
):
    """
    Round-trip test of the Table reader: TSV file(s) -> Zarr store -> SeqData.

    The TSV file(s) are loaded with pandas as the reference, written to
    ``temp_dir / 'output.tsv.zarr'`` through ``Table._write``, re-opened with
    ``sd.open_zarr`` and compared column by column against the reference.

    Parameters:
        input_tsv (str, os.PathLike or list): Path to a single TSV file or a list of TSV file paths.
        temp_dir (str or Path): Directory where temporary files will be stored.
            NOTE: every entry inside this directory is deleted when the test finishes.
        seq_col (str): Column name for sequences in the input TSV.
        target_col (str): Column name for targets in the input TSV.
        batch_size (int): Batch size for the Table reader.
        fixed_length (bool or int): Forwarded to ``Table._write``. ``False`` keeps
            variable-length sequences; a truthy value (this notebook passes an integer
            such as 20) makes the stored sequence a per-character array that is joined
            back into strings before comparison.
        length_dim (str): Name of the length dimension in the Zarr output.

    Raises:
        AssertionError: If any test fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    tsv_out = temp_dir / 'output.tsv.zarr'

    # Normalise a single path (str or any path-like object) to a one-element list
    if isinstance(input_tsv, (str, os.PathLike)):
        input_tsv = [input_tsv]

    try:
        # Read in data with pandas as true representation
        true = pd.concat(
            [pd.read_csv(tsv_file, sep="\t") for tsv_file in input_tsv],
            ignore_index=True,
        )

        # Test instantiation of the Table reader class
        table_reader = Table(
            name="seq",
            tables=input_tsv,
            seq_col=seq_col,
            batch_size=batch_size,
        )
        assert isinstance(table_reader, Table), "Table reader instantiation failed."

        # Test writing to Zarr
        table_reader._write(
            tsv_out,
            fixed_length=fixed_length,
            sequence_dim="_sequence",
            overwrite=True,
            length_dim=length_dim
        )
        zarr.consolidate_metadata(tsv_out)

        # Test round-trip reading
        data = sd.open_zarr(tsv_out)

        # Verify that the data is the same
        if fixed_length:
            # Fixed-length output is one character per cell; rebuild each string
            seqs = [''.join(row.astype(str)) for row in data["seq"].values]
        else:
            seqs = data["seq"].values.astype(str)
        targets = data[target_col].values

        print(seqs)
        print(targets)
        assert np.array_equal(true[seq_col], seqs), "Sequences do not match."
        assert np.array_equal(true[target_col], targets), "Targets do not match."
    finally:
        # Clean up temporary files even when an assertion above failed, so a failed
        # run does not leave a stale store behind for the next test.
        for f in temp_dir.iterdir():
            if f.is_dir() and not f.is_symlink():
                # shutil.rmtree avoids spawning a shell with an unquoted path
                shutil.rmtree(f)
            else:
                f.unlink()

    print("All tests passed successfully!")


In [179]:
# Round-trip the variable-length TSV through the Table reader
test_Table(
    input_tsv=variable_tsv_in,
    temp_dir=out,
    seq_col="seq",
    target_col="target",
    batch_size=50,
    fixed_length=False,
)

7it [00:00, 219.41it/s]

['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGACGTATAAATGCAGCCTATAGCATCTTCCTCGAACCTCATGG'
 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGTGGGATGACGCCTGTagAAGATTATGCTtAtGGaGGaGACTGGCGAGGCAATGgAGCGAAGCGtCTaGGTGTGAGAGCACAtCTGCTTTtAGCTCATCTCtTCACAAGTcGaTAAAgaGGGGACAGGaaTTTTCGGTACAGATAAAGACGtCGTTGcTCtAAGTGCATTGGTAAGGACTACAcaTGCCtTTtGCgGGcAGCaAaCGTTTAaAAGAtCaTAGAAAaCGCAGCATTCTAAACAaaTGGCTgTCATCGGATcGATGCGACTGGAAAGaAACTACTTCGCGGTCtACGGCGGTCCCACtTGACACCACATCGtGC'
 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATTAAGCCACGCACTCCCATAGTATGGTTGCGC'
 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACATACTCATAGCTAAGACGTCTGCTAAAACACATGGTCTACACTCCCGTAGTCGGGATGGGAGTTTTGGAGCC'
 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGATCCTGTAAGATCAATAGAGTCGTCTGGATTCTATCCTTCGCAAGCCTGAAGGACTGCCAACATTCGCAGACGTGTCTTATTCGAGCTGCCAGTCTCCGCATCTTCAGATGTTAGCGCTTTTGTAGCAACGAGC




In [180]:
# Round-trip the fixed-length TSV through the Table reader, writing with fixed_length=20
test_Table(
    input_tsv=fixed_tsv_in,
    temp_dir=out,
    seq_col="seq",
    target_col="target",
    batch_size=50,
    fixed_length=20,
)

7it [00:00, 343.42it/s]

['GGACTTTGGGAGGACAGATT', 'AGCaCgCAGTGAAAATAgGT', 'CGCGAAGAGAGTCCATGCGG', 'TAAAATACAAACGCGACTCA', 'CTTAGATATCGTTGATAAAG', 'TAAATCCGCAGCGAGGCTAG', 'ATTGAATCCTTTCTCAGACT']
[-0.52928696 -1.75898058 -1.66558981 -0.38916949 -0.73085542  1.095118
 -1.21035606]
All tests passed successfully!





In [181]:
# Test Table class with multiple input files (both TSVs are concatenated in the order given)
test_Table(
    input_tsv=[variable_tsv_in, fixed_tsv_in],
    temp_dir=out,
    seq_col="seq",
    target_col="target",
    batch_size=50,
    fixed_length=False,
)

14it [00:00, 375.85it/s]

['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGACGTATAAATGCAGCCTATAGCATCTTCCTCGAACCTCATGG'
 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGTGGGATGACGCCTGTagAAGATTATGCTtAtGGaGGaGACTGGCGAGGCAATGgAGCGAAGCGtCTaGGTGTGAGAGCACAtCTGCTTTtAGCTCATCTCtTCACAAGTcGaTAAAgaGGGGACAGGaaTTTTCGGTACAGATAAAGACGtCGTTGcTCtAAGTGCATTGGTAAGGACTACAcaTGCCtTTtGCgGGcAGCaAaCGTTTAaAAGAtCaTAGAAAaCGCAGCATTCTAAACAaaTGGCTgTCATCGGATcGATGCGACTGGAAAGaAACTACTTCGCGGTCtACGGCGGTCCCACtTGACACCACATCGtGC'
 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATTAAGCCACGCACTCCCATAGTATGGTTGCGC'
 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACATACTCATAGCTAAGACGTCTGCTAAAACACATGGTCTACACTCCCGTAGTCGGGATGGGAGTTTTGGAGCC'
 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGATCCTGTAAGATCAATAGAGTCGTCTGGATTCTATCCTTCGCAAGCCTGAAGGACTGCCAACATTCGCAGACGTGTCTTATTCGAGCTGCCAGTCTCCGCATCTTCAGATGTTAGCGCTTTTGTAGCAACGAGC




# `FlatFASTA`

In [182]:
def read_fasta(file_path):
    """
    Reads a FASTA file and returns a dictionary of sequences.

    Each header line (starting with ">") opens a new record; every following
    non-header line is stripped of surrounding whitespace and concatenated to
    form that record's sequence. Blank lines are ignored, and any lines that
    appear before the first header are discarded.

    Parameters:
        file_path (str or Path): Path to the FASTA file.

    Returns:
        dict: A dictionary where keys are sequence IDs (the full header line
        without the leading ">", including any description text) and values are
        sequences. Records are kept in file order; if a header is repeated, the
        later record replaces the earlier one.
    """
    sequences = {}
    sequence_id = None
    sequence_lines = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                # Save the previous record before starting a new one. Compare with
                # None rather than truthiness so a record whose header is a bare ">"
                # (empty ID) is not silently dropped.
                if sequence_id is not None:
                    sequences[sequence_id] = ''.join(sequence_lines)
                sequence_id = line[1:]  # Remove the '>'
                sequence_lines = []
            elif line:
                sequence_lines.append(line)
    # Save the last record (nothing to save if the file had no header at all)
    if sequence_id is not None:
        sequences[sequence_id] = ''.join(sequence_lines)
    return sequences


def test_FlatFASTA(input_fasta, temp_dir, batch_size=50, fixed_length=False, length_dim="_length"):
    """
    Tests the FlatFASTA class for a single input FASTA file.

    The reference sequences come from ``read_fasta``. The test checks the
    reader's sequence count, the values yielded by ``FlatFASTA._reader``, and a
    full round trip through ``temp_dir / 'output.fasta.zarr'``.

    Parameters:
        input_fasta (str or Path): Path to the input FASTA file.
        temp_dir (str or Path): Directory where temporary files will be stored.
            NOTE: every entry inside this directory is deleted when the test finishes.
        batch_size (int): Batch size for the FlatFASTA reader.
        fixed_length (bool or int): Forwarded to ``FlatFASTA._write``. ``False`` keeps
            variable-length sequences; a truthy value (this notebook passes an integer
            such as 20) makes the stored sequence a per-character array that is joined
            back into strings before comparison.
        length_dim (str): Name of the length dimension in the Zarr output.

    Raises:
        AssertionError: If any test fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    fasta_out = temp_dir / 'output.fasta.zarr'

    try:
        true = list(read_fasta(input_fasta).values())

        # Test instantiation of the FlatFASTA reader class
        flatfasta_reader = FlatFASTA(
            name="seq",
            fasta=input_fasta,
            batch_size=batch_size,
        )
        assert isinstance(flatfasta_reader, FlatFASTA), "FlatFASTA reader instantiation failed."

        # Verify number of sequences
        assert flatfasta_reader.n_seqs == len(true), "Number of sequences is incorrect."

        # Read in the data using the reader. The FASTA handle is opened in a context
        # manager so it is closed once the reader has been fully consumed.
        with pysam.FastaFile(input_fasta) as fasta_handle:
            _read = [seq.decode('utf-8') for seq in flatfasta_reader._reader(fasta_handle)]
        assert np.array_equal(_read, true), "FlatFASTA reader failed to read in the correct values."

        # Test writing to Zarr
        flatfasta_reader._write(
            fasta_out,
            fixed_length=fixed_length,
            sequence_dim="_sequence",
            overwrite=True,
            length_dim=length_dim
        )
        zarr.consolidate_metadata(fasta_out)

        # Test round-trip reading
        data = sd.open_zarr(fasta_out)

        # Verify that the data is the same
        if fixed_length:
            # Fixed-length output is one character per cell; rebuild each string
            seqs = [''.join(row.astype(str)) for row in data["seq"].values]
        else:
            seqs = data["seq"].values.astype(str)
        assert np.array_equal(true, seqs), "Sequences do not match."
    finally:
        # Clean up temporary files even when an assertion above failed, so a failed
        # run does not leave a stale store behind for the next test.
        for f in temp_dir.iterdir():
            if f.is_dir() and not f.is_symlink():
                # shutil.rmtree avoids spawning a shell with an unquoted path
                shutil.rmtree(f)
            else:
                f.unlink()

    print("All tests passed successfully!")

In [183]:
# Test FlatFASTA class for variable-length sequences
test_FlatFASTA(
    input_fasta=variable_fasta_in,
    temp_dir=out,
    batch_size=50,
    fixed_length=False,
)

100%|██████████| 7/7 [00:00<00:00, 2427.86it/s]

All tests passed successfully!





In [184]:
# Test FlatFASTA class for fixed-length sequences (written with fixed_length=20)
test_FlatFASTA(
    input_fasta=fixed_fasta_in,
    temp_dir=out,
    batch_size=50,
    fixed_length=20,
)

100%|██████████| 7/7 [00:00<00:00, 2264.92it/s]

All tests passed successfully!





# `from_flat_files`

In [185]:
from seqdata import from_flat_files

In [193]:
def test_from_flat_files(
    *readers,
    names,
    true,
    temp_dir,
    fixed_length=False,
    length_dim=None
):
    """
    Tests `from_flat_files` with one or more flat-file readers.

    All readers are written to ``temp_dir / 'output.zarr'`` in a single
    `from_flat_files` call, and each named variable of the returned dataset is
    compared against its expected sequences.

    Parameters:
        *readers: Reader objects (this notebook passes `Table` and `FlatFASTA`
            instances) forwarded positionally to `from_flat_files`.
        names (list of str): Variable names to check in the returned dataset.
        true (list): Expected sequences, one array-like per entry in `names`,
            in the same order.
        temp_dir (str or Path): Directory where temporary files will be stored.
            NOTE: every entry inside this directory is deleted when the test finishes.
        fixed_length (bool or int): Forwarded to `from_flat_files`. When truthy, each
            variable is treated as a per-character array and joined back into strings
            before comparison.
        length_dim (str or None): Name of the length dimension, forwarded to
            `from_flat_files`.

    Raises:
        AssertionError: If `names` and `true` differ in length or any comparison fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    # Named zarr_out so it cannot be confused with the notebook-level `out` directory
    zarr_out = temp_dir / 'output.zarr'

    # Without this check a short `names` would silently skip trailing expectations
    assert len(names) == len(true), "`names` and `true` must have the same length."

    try:
        data = from_flat_files(
            *readers,
            path=zarr_out,
            sequence_dim="_sequence",
            fixed_length=fixed_length,
            overwrite=True,
            length_dim=length_dim
        )
        print(data)
        # Verify that the data is the same
        for name, expected in zip(names, true):
            if fixed_length:
                # Fixed-length output is one character per cell; rebuild each string
                seqs = [''.join(row.astype(str)) for row in data[name].values]
            else:
                seqs = data[name].values.astype(str)
            print(name)
            print(seqs)
            print(expected)
            assert np.array_equal(expected, seqs), "Sequences do not match."
    finally:
        # Clean up temporary files even when an assertion above failed, so a failed
        # run does not leave a stale store behind for the next test.
        for f in temp_dir.iterdir():
            print(f)
            if f.is_dir() and not f.is_symlink():
                # shutil.rmtree avoids spawning a shell with an unquoted path
                shutil.rmtree(f)
            else:
                f.unlink()

    print("All tests passed successfully!")

In [187]:
# Single variable-length FASTA file
trues = [*read_fasta(variable_fasta_in).values()]
flat_fasta = FlatFASTA(name="seq", fasta=variable_fasta_in, batch_size=50)
test_from_flat_files(
    flat_fasta,
    names=["seq"],
    true=[trues],
    temp_dir=out,
    fixed_length=False,
)

100%|██████████| 7/7 [00:00<00:00, 2818.75it/s]

seq
['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGACGTATAAATGCAGCCTATAGCATCTTCCTCGAACCTCATGG'
 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGTGGGATGACGCCTGTagAAGATTATGCTtAtGGaGGaGACTGGCGAGGCAATGgAGCGAAGCGtCTaGGTGTGAGAGCACAtCTGCTTTtAGCTCATCTCtTCACAAGTcGaTAAAgaGGGGACAGGaaTTTTCGGTACAGATAAAGACGtCGTTGcTCtAAGTGCATTGGTAAGGACTACAcaTGCCtTTtGCgGGcAGCaAaCGTTTAaAAGAtCaTAGAAAaCGCAGCATTCTAAACAaaTGGCTgTCATCGGATcGATGCGACTGGAAAGaAACTACTTCGCGGTCtACGGCGGTCCCACtTGACACCACATCGtGC'
 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATTAAGCCACGCACTCCCATAGTATGGTTGCGC'
 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACATACTCATAGCTAAGACGTCTGCTAAAACACATGGTCTACACTCCCGTAGTCGGGATGGGAGTTTTGGAGCC'
 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGATCCTGTAAGATCAATAGAGTCGTCTGGATTCTATCCTTCGCAAGCCTGAAGGACTGCCAACATTCGCAGACGTGTCTTATTCGAGCTGCCAGTCTCCGCATCTTCAGATGTTAGCGCTTTTGTAGCAAC




In [188]:
# Single fixed-length FASTA file (written with fixed_length=80)
trues = [*read_fasta(fixed_fasta_in).values()]
flat_fasta = FlatFASTA(name="seq", fasta=fixed_fasta_in, batch_size=50)
test_from_flat_files(
    flat_fasta,
    names=["seq"],
    true=[trues],
    temp_dir=out,
    fixed_length=80,
)

100%|██████████| 7/7 [00:00<00:00, 2049.14it/s]

seq
['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGAC', 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGT', 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATT', 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACAT', 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGATCCTGTAAGAT', 'TAAATCCGCAGCGAGGCTAGACTCCAGCCGACTACTCTAGTGGTGGAGCGGTAATGATCTCCACAGCCGCGCCCTATTCC', 'ATTGAATCCTTTCTCAGACTGAAGTTTTGTCCCTCTAGTCTAGCTTTGCGTGTTATGTTACCCGATGTATTTTCTAATTG']
['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGAC', 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGT', 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATT', 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACAT', 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGA




In [189]:
# Combo of variable and fixed-length FASTA files, stored as separate variables seq1 / seq2
true1 = [*read_fasta(variable_fasta_in).values()]
true2 = [*read_fasta(fixed_fasta_in).values()]
flat_fasta1 = FlatFASTA(name="seq1", fasta=variable_fasta_in, batch_size=50)
flat_fasta2 = FlatFASTA(name="seq2", fasta=fixed_fasta_in, batch_size=50)
test_from_flat_files(
    flat_fasta1,
    flat_fasta2,
    names=["seq1", "seq2"],
    true=[true1, true2],
    temp_dir=out,
    fixed_length=False,
)

100%|██████████| 7/7 [00:00<00:00, 1794.41it/s]
100%|██████████| 7/7 [00:00<00:00, 2126.47it/s]

seq1
['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGACGTATAAATGCAGCCTATAGCATCTTCCTCGAACCTCATGG'
 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGTGGGATGACGCCTGTagAAGATTATGCTtAtGGaGGaGACTGGCGAGGCAATGgAGCGAAGCGtCTaGGTGTGAGAGCACAtCTGCTTTtAGCTCATCTCtTCACAAGTcGaTAAAgaGGGGACAGGaaTTTTCGGTACAGATAAAGACGtCGTTGcTCtAAGTGCATTGGTAAGGACTACAcaTGCCtTTtGCgGGcAGCaAaCGTTTAaAAGAtCaTAGAAAaCGCAGCATTCTAAACAaaTGGCTgTCATCGGATcGATGCGACTGGAAAGaAACTACTTCGCGGTCtACGGCGGTCCCACtTGACACCACATCGtGC'
 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATTAAGCCACGCACTCCCATAGTATGGTTGCGC'
 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACATACTCATAGCTAAGACGTCTGCTAAAACACATGGTCTACACTCCCGTAGTCGGGATGGGAGTTTTGGAGCC'
 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGATCCTGTAAGATCAATAGAGTCGTCTGGATTCTATCCTTCGCAAGCCTGAAGGACTGCCAACATTCGCAGACGTGTCTTATTCGAGCTGCCAGTCTCCGCATCTTCAGATGTTAGCGCTTTTGTAGCAA




In [190]:
# Single variable-length TSV file
true = pd.read_csv(variable_tsv_in, sep="\t")["seq"].values
tsv = Table(name="seq", tables=variable_tsv_in, seq_col="seq", batch_size=50)
test_from_flat_files(
    tsv,
    names=["seq"],
    true=[true],
    temp_dir=out,
    fixed_length=False,
)

7it [00:00, 342.05it/s]

seq
['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGACGTATAAATGCAGCCTATAGCATCTTCCTCGAACCTCATGG'
 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGTGGGATGACGCCTGTagAAGATTATGCTtAtGGaGGaGACTGGCGAGGCAATGgAGCGAAGCGtCTaGGTGTGAGAGCACAtCTGCTTTtAGCTCATCTCtTCACAAGTcGaTAAAgaGGGGACAGGaaTTTTCGGTACAGATAAAGACGtCGTTGcTCtAAGTGCATTGGTAAGGACTACAcaTGCCtTTtGCgGGcAGCaAaCGTTTAaAAGAtCaTAGAAAaCGCAGCATTCTAAACAaaTGGCTgTCATCGGATcGATGCGACTGGAAAGaAACTACTTCGCGGTCtACGGCGGTCCCACtTGACACCACATCGtGC'
 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATTAAGCCACGCACTCCCATAGTATGGTTGCGC'
 'TAAAATACAAACGCGACTCATACAACAGTACTTATGACGCGAGTGGTACTTGCAGTTCCACACATCGAATTCCTGTACATACTCATAGCTAAGACGTCTGCTAAAACACATGGTCTACACTCCCGTAGTCGGGATGGGAGTTTTGGAGCC'
 'CTTAGATATCGTTGATAAAGGCGTAGCGGTAGGGATTCGCATACGGCATGCTTATCGAAGGAGCGGAGATCCTGTAAGATCAATAGAGTCGTCTGGATTCTATCCTTCGCAAGCCTGAAGGACTGCCAACATTCGCAGACGTGTCTTATTCGAGCTGCCAGTCTCCGCATCTTCAGATGTTAGCGCTTTTGTAGCAAC




In [194]:
# Single fixed-length TSV file (written with fixed_length=20)
true = pd.read_csv(fixed_tsv_in, sep="\t")["seq"].values
tsv = Table(name="seq", tables=fixed_tsv_in, seq_col="seq", batch_size=50)
test_from_flat_files(
    tsv,
    names=["seq"],
    true=[true],
    temp_dir=out,
    fixed_length=20,
)

0it [00:00, ?it/s]

7it [00:00, 356.07it/s]

<xarray.Dataset> Size: 196B
Dimensions:  (_sequence: 7, _length: 20)
Dimensions without coordinates: _sequence, _length
Data variables:
    seq      (_sequence, _length) |S1 140B dask.array<chunksize=(7, 20), meta=np.ndarray>
    target   (_sequence) float64 56B dask.array<chunksize=(7,), meta=np.ndarray>
seq
['GGACTTTGGGAGGACAGATT', 'AGCaCgCAGTGAAAATAgGT', 'CGCGAAGAGAGTCCATGCGG', 'TAAAATACAAACGCGACTCA', 'CTTAGATATCGTTGATAAAG', 'TAAATCCGCAGCGAGGCTAG', 'ATTGAATCCTTTCTCAGACT']
['GGACTTTGGGAGGACAGATT' 'AGCaCgCAGTGAAAATAgGT' 'CGCGAAGAGAGTCCATGCGG'
 'TAAAATACAAACGCGACTCA' 'CTTAGATATCGTTGATAAAG' 'TAAATCCGCAGCGAGGCTAG'
 'ATTGAATCCTTTCTCAGACT']
/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/tmp/output.zarr
All tests passed successfully!





In [195]:
# Combo of variable and fixed-length TSV files, stored as separate variables seq3 / seq4
true1 = pd.read_csv(variable_tsv_in, sep="\t")["seq"].values
true2 = pd.read_csv(fixed_tsv_in, sep="\t")["seq"].values
tsv1 = Table(name="seq3", tables=variable_tsv_in, seq_col="seq", batch_size=50)
tsv2 = Table(name="seq4", tables=fixed_tsv_in, seq_col="seq", batch_size=50)
test_from_flat_files(
    tsv1,
    tsv2,
    names=["seq3", "seq4"],
    true=[true1, true2],
    temp_dir=out,
    fixed_length=False,
)

0it [00:00, ?it/s]

7it [00:00, 211.04it/s]
7it [00:00, 261.55it/s]

<xarray.Dataset> Size: 168B
Dimensions:  (_sequence: 7)
Dimensions without coordinates: _sequence
Data variables:
    seq3     (_sequence) object 56B dask.array<chunksize=(7,), meta=np.ndarray>
    seq4     (_sequence) object 56B dask.array<chunksize=(7,), meta=np.ndarray>
    target   (_sequence) float64 56B dask.array<chunksize=(7,), meta=np.ndarray>
seq3
['GGACTTTGGGAGGACAGATTGTTCGCAAGAACCCATGATCTATCACAGTTCGTCGTATAAAGCCCGCAACCAGTTCAGACGTATAAATGCAGCCTATAGCATCTTCCTCGAACCTCATGG'
 'AGCaCgCAGTGAAAATAgGTATACacATTCCCGAGGgGCaACGCATAGCAAAATAACATGGGGCcCATGTTAGTTTAGGTGGGATGACGCCTGTagAAGATTATGCTtAtGGaGGaGACTGGCGAGGCAATGgAGCGAAGCGtCTaGGTGTGAGAGCACAtCTGCTTTtAGCTCATCTCtTCACAAGTcGaTAAAgaGGGGACAGGaaTTTTCGGTACAGATAAAGACGtCGTTGcTCtAAGTGCATTGGTAAGGACTACAcaTGCCtTTtGCgGGcAGCaAaCGTTTAaAAGAtCaTAGAAAaCGCAGCATTCTAAACAaaTGGCTgTCATCGGATcGATGCGACTGGAAAGaAACTACTTCGCGGTCtACGGCGGTCCCACtTGACACCACATCGtGC'
 'CGCGAAGAGAGTCCATGCGGAAACAAGTCTATGGCTAGTGATCGCATCGGGCTGTCAGACTTCTTGTGAGACGCCAGATTAAGCCACGCACTCCCATAGTATGGTTGCGC




In [177]:
# All four readers in one call: the two Table readers followed by the two FlatFASTA readers.
# Relies on tsv1, tsv2, flat_fasta1 and flat_fasta2 having been created by earlier cells.
true = [
    pd.read_csv(variable_tsv_in, sep="\t")["seq"].values,
    pd.read_csv(fixed_tsv_in, sep="\t")["seq"].values,
    list(read_fasta(variable_fasta_in).values()),
    list(read_fasta(fixed_fasta_in).values()),
]
test_from_flat_files(
    tsv1,
    tsv2,
    flat_fasta1,
    flat_fasta2,
    names=["seq3", "seq4", "seq1", "seq2"],
    true=true,
    temp_dir=out,
    fixed_length=False,
)

7it [00:00, 346.20it/s]
7it [00:00, 344.25it/s]
100%|██████████| 7/7 [00:00<00:00, 3032.76it/s]
100%|██████████| 7/7 [00:00<00:00, 2346.37it/s]


seq3
['CCCCCACTAACAGCAAGGTAGTTCTTATTTCCTTTAGGAACCCCCCGCCAAAAGAGTGCTTAGATCCTTAGGTGCGGGCAGAATACAATCATCCTGTGCCCTGCCCATTCCCCCACTGAC'
 'ACTCGGgGCCaTGTAGcGTCCGTCGTTGTATATTCTGTCGGGGTCATTaAAGCCTCATCGTGAACGaATCACCATTCCCCGCCCGGTTCTTCAcGCTGCtACACACCGCCaTGGGAGCGtTGTGTTgTGGAgTATGtAGAGaGGAcAGTCcTTGAGTGAcAACCCACTGTCGaAGGGTCCGCcCCAGtaAAAGATACTAAGCCcTGCTACGCTCGCCAgCGTCtTTtGGCAcGCTcCcAAAAACGCtGCgCTgTGaaCgTGTATGCAAACCCACGGGCTAACCGGACGTTAtCcTCAACgaTTTGTCACtgGTCcAAaGAAGACCGACACGTCCTGtGATGTGCcTAGTCcAATATcTaTtGCGAGTAGGATCGTTGATGTGGGACAtGAACTaCACGCC'
 'CCCATGAGAGGAAGGTCACAGGCGGAATAGGTATTTTAGTATAACTAAATTCAGATACGACGGTCATACAGATAATGGTCCAGGGTGCGTTAGATCCTACGATCTCCACG'
 'GTGCGAAGGAGATTCAGGCCAAACGATAACCCTTTCTCGAGATGGACTGCAAGTCCATTTGCGGTTGTGGTCTTGTGGTCGTATAGAAGCCTTCCTCCTCAGCCAACTTCTTATACCAGGATTCCTTTGTAGGGATCGTTTCTATATAAG'
 'TTTAGCCTGCTTGCCGTGGTTATCGGTAGGCGGATCCGAATCTCGGCGATCTTAAGACTACGCTCGACGGTTTGACATTC'
 'NNNNNCCTNCTTGNNNTGNTNANCNGTAGGNNNANNNGAATCTCNNCGNNNTNNNGACTNCGCNNGACGGNTNNACNNTC'
 'TTATGACCTTTGTATTCTTACCCTCTTCT

# `read_table`

In [196]:
from seqdata import read_table

In [197]:
# Round-trip both TSV files through `read_table` and compare against pandas.
temp_dir = Path(out)
temp_dir.mkdir(parents=True, exist_ok=True)
# Use a dedicated name for the Zarr store path: rebinding the notebook-level `out`
# (the scratch directory) here would make this cell non-idempotent and would change
# the directory seen by any later cell that reads `out`.
read_table_out = temp_dir / 'output.zarr'

try:
    sdata = read_table(
        tables=[variable_tsv_in, fixed_tsv_in],
        out=read_table_out,
        seq_col="seq",
        name="seq",
        fixed_length=False,
        batch_size=50,
        overwrite=True,
    )

    # Reference: both tables concatenated in the same order they were passed above
    true = pd.concat([pd.read_csv(tsv_file, sep="\t") for tsv_file in [variable_tsv_in, fixed_tsv_in]], ignore_index=True)["seq"].values
    # The values are read here, before the store is deleted in the cleanup below
    seqs = sdata["seq"].values.astype(str)
    assert np.array_equal(true, seqs), "Sequences do not match."
finally:
    # Clean up temporary files even if the comparison above failed
    for f in temp_dir.iterdir():
        if f.is_dir() and not f.is_symlink():
            # shutil.rmtree avoids spawning a shell with an unquoted path
            shutil.rmtree(f)
        else:
            f.unlink()

14it [00:00, 425.69it/s]


In [198]:
# Display the dataset currently bound to `sdata` (rich repr of its variables)
sdata

Unnamed: 0,Array,Chunk
Bytes,112 B,112 B
Shape,"(14,)","(14,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 112 B 112 B Shape (14,) (14,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",14  1,

Unnamed: 0,Array,Chunk
Bytes,112 B,112 B
Shape,"(14,)","(14,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,112 B,112 B
Shape,"(14,)","(14,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 112 B 112 B Shape (14,) (14,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",14  1,

Unnamed: 0,Array,Chunk
Bytes,112 B,112 B
Shape,"(14,)","(14,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


# `read_flat_fasta`

In [187]:
from seqdata import read_flat_fasta

In [189]:
# Round-trip the variable-length FASTA through `read_flat_fasta` and compare against `read_fasta`.
# Resolve the scratch directory from the package location so this cell does not depend on
# the current value of the notebook-level `out` variable.
temp_dir = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'tmp'
temp_dir.mkdir(parents=True, exist_ok=True)
# Dedicated name for the Zarr store path, so `out` itself is never rebound by this cell
read_flat_fasta_out = temp_dir / 'output.zarr'

try:
    sdata = read_flat_fasta(
        fasta=variable_fasta_in,
        out=read_flat_fasta_out,
        name="seq",
        fixed_length=False,
        batch_size=50,
        overwrite=True,
    )

    true = list(read_fasta(variable_fasta_in).values())
    # The values are read here, before the store is deleted in the cleanup below
    seqs = sdata["seq"].values.astype(str)
    assert np.array_equal(true, seqs), "Sequences do not match."
finally:
    # Clean up temporary files even if the comparison above failed
    for f in temp_dir.iterdir():
        if f.is_dir() and not f.is_symlink():
            # shutil.rmtree avoids spawning a shell with an unquoted path
            shutil.rmtree(f)
        else:
            f.unlink()

100%|██████████| 7/7 [00:00<00:00, 1763.37it/s]


# DONE!

---