In [8]:
import os
import pandas as pd
import numpy as np
import zarr
from pathlib import Path
from seqdata import Table, FlatFASTA, GenomeFASTA, BigWig, BAM
import seqdata as sd
import xarray as xr
import pysam
import random
import pickle
import pyBigWig

In [9]:
# Make a temporary directory for the output
os.makedirs(Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'tmp', exist_ok=True)

In [10]:
# Get infiles
variable_fasta_in = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'variable.fa'
variable_bed_in = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'variable.bed'
fixed_bed_in = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'fixed.bed'
out = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'tmp'

# `GenomeFasta`

In [11]:
# Read in data with Python as true representation
def read_fasta(file_path):
    sequences = {}
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if sequence_id:
                    sequences[sequence_id] = ''.join(sequence_lines)
                sequence_id = line[1:]  # Remove the '>'
                sequence_lines = []
            else:
                sequence_lines.append(line)
        if sequence_id:
            sequences[sequence_id] = ''.join(sequence_lines)
    return sequences


def test_GenomeFASTA(input_fasta, input_bed, temp_dir, batch_size=50, fixed_length=20, length_dim="_length"):
    """
    Tests the GenomeFASTA class for a single input FASTA file and BED coordinates.

    Parameters:
        input_fasta (str or Path): Path to the input FASTA file.
        bed (list): List of tuples (chromosome, start, end) representing BED coordinates.
        temp_dir (str or Path): Directory where temporary files will be stored.
        batch_size (int): Batch size for the GenomeFASTA reader.
        fixed_length (int): Fixed length for the sequences when writing to Zarr.
        length_dim (str): Name of the length dimension in the Zarr output.

    Raises:
        AssertionError: If any test fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    fasta_out = temp_dir / 'output.fasta.zarr'

    bed = pd.read_csv(input_bed, sep="\t", header=None)
    fasta_sequences = read_fasta(input_fasta)
    if fixed_length:
        midpoint = (bed[1] + bed[2]) // 2
        bed[1] = midpoint - fixed_length // 2
        bed[2] = midpoint + fixed_length // 2
    true = [fasta_sequences[chrom][start:end] for chrom, start, end in bed.values]
    bed["strand"] = "+"

    # Test instantiation of the GenomeFASTA reader class
    genomefasta_reader = GenomeFASTA(
        name="seq",
        fasta=input_fasta,
        batch_size=batch_size,
    )
    assert isinstance(genomefasta_reader, GenomeFASTA), "GenomeFASTA reader instantiation failed."

    # Read in the data using the reader
    iterator = genomefasta_reader._reader(
        bed=bed,
        f=pysam.FastaFile(input_fasta)
    )
    _read = [seq.decode('utf-8') for seq in iterator]
    assert np.array_equal(_read, true), "GenomeFASTA reader failed to read in the correct values."

    # Test writing to Zarr
    genomefasta_reader._write(
        fasta_out,
        bed=bed,
        fixed_length=fixed_length,
        sequence_dim="_sequence",
        length_dim=length_dim,
        overwrite=True
    )
    zarr.consolidate_metadata(fasta_out)

    # Test round-trip reading
    data = sd.open_zarr(fasta_out)

    # Verify length and sequence dimensions
    if fixed_length:
        assert data["seq"].shape[1] == fixed_length, "Length dimension is incorrect."
    assert data["seq"].shape[0] == len(bed), "Sequence dimension is incorrect."

    # Verify that the data is the same
    if fixed_length:
        seqs = [''.join(row.astype(str)) for row in data["seq"].values]
    else:
        seqs = data["seq"].values.astype(str)
    print(seqs)
    print(true)
    assert np.array_equal(true, seqs), "Sequences do not match."

    # Clean up temporary files
    for f in temp_dir.iterdir():
        if f.is_dir():
            os.system(f'rm -r {f}')
        else:
            f.unlink()

    print("All tests passed successfully!")

In [12]:
# Test the GenomeFASTA class on variable length sequences
test_GenomeFASTA(variable_fasta_in, variable_bed_in, out, batch_size=50, fixed_length=False, length_dim="_length")

100%|██████████| 14/14 [00:00<00:00, 74518.09it/s]
100%|██████████| 14/14 [00:00<00:00, 76858.97it/s]

['CGCAAGAACCCATGATCTATCACAG' 'GCAACCAGTTCAGACGTATAAA'
 'AACATGGGGCcCATGTTAGTTTAGGTG' 'CAATGgAGCGAAGCGtCTaGG'
 'GCTAGTGATCGCATCGGGCTGTC' 'AAGCCACGCACTCCCATAGTATGGTTGCG'
 'CGACTCATACAACAGTACTT' 'TTCCACACATCGAATTCCTGTACATACTC'
 'CGTGTCTTATTCGAGCTGCCAGTCT' 'ATAGTGGGAGTGAATCAATCCTGAAC'
 'ACTCCAGCCGACTACTCTAGT' 'GCGCCCTATTCCTGTCAAGTACTA'
 'TCTAATTGTTATCCGAAACGAGTAATG' 'TTACGTCACCATGCTGTTTCGG']
['CGCAAGAACCCATGATCTATCACAG', 'GCAACCAGTTCAGACGTATAAA', 'AACATGGGGCcCATGTTAGTTTAGGTG', 'CAATGgAGCGAAGCGtCTaGG', 'GCTAGTGATCGCATCGGGCTGTC', 'AAGCCACGCACTCCCATAGTATGGTTGCG', 'CGACTCATACAACAGTACTT', 'TTCCACACATCGAATTCCTGTACATACTC', 'CGTGTCTTATTCGAGCTGCCAGTCT', 'ATAGTGGGAGTGAATCAATCCTGAAC', 'ACTCCAGCCGACTACTCTAGT', 'GCGCCCTATTCCTGTCAAGTACTA', 'TCTAATTGTTATCCGAAACGAGTAATG', 'TTACGTCACCATGCTGTTTCGG']
All tests passed successfully!





In [13]:
# Test the GenomeFASTA class on variable length sequences
test_GenomeFASTA(variable_fasta_in, variable_bed_in, out, batch_size=50, fixed_length=20, length_dim="_length")

100%|██████████| 14/14 [00:00<00:00, 95635.60it/s]
100%|██████████| 14/14 [00:00<00:00, 77060.70it/s]

['CAAGAACCCATGATCTATCA', 'CAACCAGTTCAGACGTATAA', 'ATGGGGCcCATGTTAGTTTA', 'CAATGgAGCGAAGCGtCTaG', 'CTAGTGATCGCATCGGGCTG', 'CACGCACTCCCATAGTATGG', 'CGACTCATACAACAGTACTT', 'ACACATCGAATTCCTGTACA', 'TGTCTTATTCGAGCTGCCAG', 'GTGGGAGTGAATCAATCCTG', 'ACTCCAGCCGACTACTCTAG', 'GCCCTATTCCTGTCAAGTAC', 'AATTGTTATCCGAAACGAGT', 'TACGTCACCATGCTGTTTCG']
['CAAGAACCCATGATCTATCA', 'CAACCAGTTCAGACGTATAA', 'ATGGGGCcCATGTTAGTTTA', 'CAATGgAGCGAAGCGtCTaG', 'CTAGTGATCGCATCGGGCTG', 'CACGCACTCCCATAGTATGG', 'CGACTCATACAACAGTACTT', 'ACACATCGAATTCCTGTACA', 'TGTCTTATTCGAGCTGCCAG', 'GTGGGAGTGAATCAATCCTG', 'ACTCCAGCCGACTACTCTAG', 'GCCCTATTCCTGTCAAGTAC', 'AATTGTTATCCGAAACGAGT', 'TACGTCACCATGCTGTTTCG']
All tests passed successfully!





In [14]:
# Test the GenomeFASTA class on fixed length sequences
test_GenomeFASTA(variable_fasta_in, fixed_bed_in, out, batch_size=50, fixed_length=20, length_dim="_length")

100%|██████████| 14/14 [00:00<00:00, 74518.09it/s]
100%|██████████| 14/14 [00:00<00:00, 55240.13it/s]

['CGCAAGAACCCATGATCTAT', 'GCAACCAGTTCAGACGTATA', 'AACATGGGGCcCATGTTAGT', 'CAATGgAGCGAAGCGtCTaG', 'GCTAGTGATCGCATCGGGCT', 'AAGCCACGCACTCCCATAGT', 'CGACTCATACAACAGTACTT', 'TTCCACACATCGAATTCCTG', 'CGTGTCTTATTCGAGCTGCC', 'ATAGTGGGAGTGAATCAATC', 'ACTCCAGCCGACTACTCTAG', 'GCGCCCTATTCCTGTCAAGT', 'TCTAATTGTTATCCGAAACG', 'TTACGTCACCATGCTGTTTC']
['CGCAAGAACCCATGATCTAT', 'GCAACCAGTTCAGACGTATA', 'AACATGGGGCcCATGTTAGT', 'CAATGgAGCGAAGCGtCTaG', 'GCTAGTGATCGCATCGGGCT', 'AAGCCACGCACTCCCATAGT', 'CGACTCATACAACAGTACTT', 'TTCCACACATCGAATTCCTG', 'CGTGTCTTATTCGAGCTGCC', 'ATAGTGGGAGTGAATCAATC', 'ACTCCAGCCGACTACTCTAG', 'GCGCCCTATTCCTGTCAAGT', 'TCTAATTGTTATCCGAAACG', 'TTACGTCACCATGCTGTTTC']
All tests passed successfully!





# `BAM`

In [117]:
# list of bams with simulated{1-5}.bam
bams = [Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / f'simulated{i}.bam' for i in range(1, 6)]
bams

[PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bam'),
 PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated2.bam'),
 PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated3.bam'),
 PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated4.bam'),
 PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated5.bam')]

In [118]:
# Read in coverage
path_coverage = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'variable.bedcov.pkl'
coverages = pickle.load(open(path_coverage, 'rb'))

In [119]:
# read in the bed file
bed = pd.read_csv(fixed_bed_in, sep="\t", header=None)
bed

Unnamed: 0,0,1,2
0,chr1,23,43
1,chr1,65,85
2,chr2,54,74
3,chr2,127,147
4,chr3,33,53
5,chr3,80,100
6,chr4,13,33
7,chr4,55,75
8,chr5,139,159
9,chr5,248,268


In [120]:
# Subset the coverage to the bed file coordinates
new_cov = {}
# for each bam file
for bam in bams:
    bam_name = bam.name
    new_cov[bam_name] = {}
    for i, (region, coverage) in enumerate(coverages[bam_name].items()):
        coverage_interval = region.split(":")[1]
        coverage_start, coverage_end = map(int, coverage_interval.split("-"))
        start_offset = coverage_start - bed[1].values[i]
        end_offset = bed[2].values[i] - coverage_end
        new_region = f"{bed[0].values[i]}:{bed[1].values[i]}-{bed[2].values[i]}"
        print(f"Region: {region} -> {new_region}")
        print(coverage)
        print(f"Offsets: {start_offset}, {end_offset}")
        if end_offset == 0:
            new_cov[bam_name][new_region] = coverage[start_offset:]
        else:
            new_cov[bam_name][new_region] = coverage[start_offset:end_offset]

Region: chr1:23-48 -> chr1:23-43
[2 2 2 2 1 1 2 1 1 2 2 2 3 3 3 3 2 2 2 1 1 2 1 1 1]
Offsets: 0, -5
Region: chr1:65-87 -> chr1:65-85
[2 1 1 2 2 2 2 2 2 2 2 3 3 2 4 4 3 3 5 5 6 6]
Offsets: 0, -2
Region: chr2:54-81 -> chr2:54-74
[1 1 1 1 2 3 2 3 3 3 4 5 5 5 4 3 3 2 3 4 3 2 3 3 3 4 4]
Offsets: 0, -7
Region: chr2:127-148 -> chr2:127-147
[1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Offsets: 0, -1
Region: chr3:33-56 -> chr3:33-53
[0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
Offsets: 0, -3
Region: chr3:80-109 -> chr3:80-100
[2 2 2 3 3 4 3 2 2 2 2 3 4 3 3 2 2 2 3 3 3 2 1 1 1 1 1 1 0]
Offsets: 0, -9
Region: chr4:13-33 -> chr4:13-33
[3 4 4 4 4 5 5 3 3 3 3 3 6 6 6 5 6 5 5 5]
Offsets: 0, 0
Region: chr4:55-84 -> chr4:55-75
[3 3 3 3 3 3 3 2 4 4 4 4 5 5 5 5 6 6 4 4 4 3 2 4 4 5 3 3 3]
Offsets: 0, -9
Region: chr5:139-164 -> chr5:139-159
[0 0 1 1 1 1 1 1 2 3 3 3 2 2 2 2 2 4 3 2 2 2 4 5 6]
Offsets: 0, -5
Region: chr5:248-274 -> chr5:248-268
[1 2 1 1 1 2 2 3 3 3 3 2 2 3 3 4 6 5 5 5 7 8 9 8 8 6]
Offsets:

In [121]:
def test_BAM(bam_files, bed_file, coverages, temp_dir, batch_size=50, sequence_dim="_sequence"):
    """
    Tests the BAM class for one or more input BAM files and their coverage.

    Parameters:
        bam_files (list): List of paths to BAM files.
        bed_file (str or Path): Path to the BED file.
        coverage_file (str or Path): Path to the pickled coverage file.
        temp_dir (str or Path): Directory where temporary files will be stored.
        batch_size (int): Batch size for the BAM reader.
        sequence_dim (str): Name of the sequence dimension in the Zarr output.

    Raises:
        AssertionError: If any test fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    bam_out = temp_dir / 'output.bam.zarr'

    # Test instantiation of the BAM reader class
    bam_reader = BAM(
        name="cov",
        bams=bam_files,
        samples=[bam.name for bam in bam_files],
        batch_size=batch_size,
    )
    assert isinstance(bam_reader, BAM), "BAM reader instantiation failed."

    # Read in the BED file
    bed = pd.read_csv(bed_file, sep="\t", header=None)

    # Verify each BAM file individually
    for bam_file in bam_files:
        print(f"Testing {bam_file}")
        bam_name = bam_file.name
        iterator = bam_reader._reader(bed=bed, f=pysam.AlignmentFile(bam_file))
        _read = [list(seq) for seq in iterator]

        # Verify that the data matches the expected coverage
        for i, (region, coverage) in enumerate(coverages[bam_name].items()):
            print(f"Region {region}: {list(_read[i])}, {coverage}")
            assert np.array_equal(coverage, _read[i]), f"Region {region} in {bam_name} does not match."

    # Test writing to Zarr
    bed["strand"] = "+"
    bam_reader._write(
        bam_out,
        bed=bed,
        fixed_length=False,
        sequence_dim=sequence_dim,
        overwrite=True
    )
    zarr.consolidate_metadata(bam_out)

    # Test round-trip reading
    data = sd.open_zarr(bam_out)

    # Verify that the data matches the expected coverage for all regions
    for file_index, bam_file in enumerate(bam_files):
        print(f"Testing {bam_file}")
        bam_name = bam_file.name
        for region_index, (region, coverage) in enumerate(coverages[bam_name].items()):
            print(data['cov'][region_index].shape)
            #print(f"Region {region}: {list(data['cov'].values[:, i])}, {coverage}")
            assert np.array_equal(coverage, data["cov"].values[region_index, file_index]), f"Region {region} in {bam_name} does not match."

    # Clean up temporary files
    for f in temp_dir.iterdir():
        if f.is_dir():
            os.system(f'rm -r {f}')
        else:
            f.unlink()

    print("All tests passed successfully!")


In [122]:
test_BAM(bams, fixed_bed_in, new_cov, out, batch_size=50, sequence_dim="_sequence")

Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bam


100%|██████████| 14/14 [00:00<00:00, 12868.78it/s]


Region chr1:23-43: [2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 1], [2 2 2 2 1 1 2 1 1 2 2 2 3 3 3 3 2 2 2 1]
Region chr1:65-85: [2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 4, 4, 3, 3, 5, 5], [2 1 1 2 2 2 2 2 2 2 2 3 3 2 4 4 3 3 5 5]
Region chr2:54-74: [1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 4, 5, 5, 5, 4, 3, 3, 2, 3, 4], [1 1 1 1 2 3 2 3 3 3 4 5 5 5 4 3 3 2 3 4]
Region chr2:127-147: [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0]
Region chr3:33-53: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Region chr3:80-100: [2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 2, 3, 4, 3, 3, 2, 2, 2, 3, 3], [2 2 2 3 3 4 3 2 2 2 2 3 4 3 3 2 2 2 3 3]
Region chr4:13-33: [3, 4, 4, 4, 4, 5, 5, 3, 3, 3, 3, 3, 6, 6, 6, 5, 6, 5, 5, 5], [3 4 4 4 4 5 5 3 3 3 3 3 6 6 6 5 6 5 5 5]
Region chr4:55-75: [3, 3, 3, 3, 3, 3, 3, 2, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4, 4], [3 3 3 3 3 3 3 2 4 4 4 4 5 5 5 5 6 6 4 4]
Region chr5:1

100%|██████████| 14/14 [00:00<00:00, 8764.22it/s]


Region chr1:23-43: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Region chr1:65-85: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2], [1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 2 2 2 2 2]
Region chr2:54-74: [2, 2, 1, 2, 2, 2, 3, 4, 4, 4, 3, 3, 3, 2, 2, 3, 2, 1, 2, 2], [2 2 1 2 2 2 3 4 4 4 3 3 3 2 2 3 2 1 2 2]
Region chr2:127-147: [2, 1, 0, 0, 0, 0, 1, 3, 3, 3, 3, 3, 3, 5, 5, 6, 5, 3, 4, 4], [2 1 0 0 0 0 1 3 3 3 3 3 3 5 5 6 5 3 4 4]
Region chr3:33-53: [1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 5, 6], [1 2 2 2 3 4 4 4 4 4 4 3 3 3 3 3 4 5 5 6]
Region chr3:80-100: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Region chr4:13-33: [2, 3, 3, 3, 2, 1, 4, 4, 4, 5, 6, 5, 5, 5, 5, 5, 3, 3, 3, 2], [2 3 3 3 2 1 4 4 4 5 6 5 5 5 5 5 3 3 3 2]
Region chr4:55-75: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 3, 3, 3, 5, 5, 7, 7], [0 1 1 1 1 1 1 1 1 1 1 0 1 3 3 3 5 5 7 7]
Region chr5:1

100%|██████████| 14/14 [00:00<00:00, 8589.86it/s]


Region chr1:23-43: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2], [1 1 1 1 1 1 1 1 0 0 0 0 0 2 2 2 2 2 2 2]
Region chr1:65-85: [4, 4, 5, 4, 4, 5, 6, 5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 4, 4, 4], [4 4 5 4 4 5 6 5 5 5 5 5 4 4 4 4 3 4 4 4]
Region chr2:54-74: [1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 3, 2, 2, 2, 2, 2], [1 1 1 1 1 2 2 2 2 1 1 1 1 1 3 2 2 2 2 2]
Region chr2:127-147: [2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1], [2 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 2 1 1 1]
Region chr3:33-53: [1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2], [1 0 0 0 0 0 1 2 2 2 2 2 2 2 3 3 2 2 2 2]
Region chr3:80-100: [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1], [1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1]
Region chr4:13-33: [5, 4, 4, 3, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 5, 4, 4, 3, 2], [5 4 4 3 2 3 3 3 3 4 4 5 5 6 6 5 4 4 3 2]
Region chr4:55-75: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 3], [0 0 0 0 0 0 0 0 0 1 1 1 2 2 3 3 4 4 4 3]
Region chr5:1

100%|██████████| 14/14 [00:00<00:00, 8262.31it/s]


Region chr1:23-43: [1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2], [1 1 1 1 1 1 0 0 0 1 1 2 2 2 2 3 3 3 3 2]
Region chr1:65-85: [4, 3, 3, 2, 1, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 1], [4 3 3 2 1 0 1 1 1 2 3 3 3 3 3 3 2 2 2 1]
Region chr2:54-74: [1, 1, 1, 0, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 1, 2], [1 1 1 0 1 1 1 1 3 3 3 3 3 4 3 3 3 3 1 2]
Region chr2:127-147: [2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 3, 3], [2 2 2 3 3 3 4 4 4 4 4 4 4 3 3 3 2 2 3 3]
Region chr3:33-53: [1, 1, 2, 2, 2, 2, 2, 3, 4, 5, 4, 4, 3, 3, 3, 4, 4, 4, 3, 2], [1 1 2 2 2 2 2 3 4 5 4 4 3 3 3 4 4 4 3 2]
Region chr3:80-100: [2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2], [2 2 2 2 2 2 2 3 3 3 2 2 1 1 1 2 2 2 2 2]
Region chr4:13-33: [5, 5, 4, 4, 4, 4, 4, 3, 2, 2, 3, 3, 5, 6, 6, 5, 6, 5, 5, 6], [5 5 4 4 4 4 4 3 2 2 3 3 5 6 6 5 6 5 5 6]
Region chr4:55-75: [1, 1, 1, 1, 1, 2, 2, 2, 3, 5, 5, 5, 4, 4, 4, 3, 4, 6, 5, 3], [1 1 1 1 1 2 2 2 3 5 5 5 4 4 4 3 4 6 5 3]
Region chr5:1

100%|██████████| 14/14 [00:00<00:00, 13787.33it/s]


Region chr1:23-43: [4, 3, 3, 2, 2, 3, 5, 4, 5, 5, 4, 4, 6, 7, 7, 7, 5, 5, 4, 5], [4 3 3 2 2 3 5 4 5 5 4 4 6 7 7 7 5 5 4 5]
Region chr1:65-85: [3, 3, 2, 3, 3, 6, 4, 5, 5, 5, 5, 6, 6, 5, 6, 3, 4, 3, 4, 4], [3 3 2 3 3 6 4 5 5 5 5 6 6 5 6 3 4 3 4 4]
Region chr2:54-74: [2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 3, 3, 3, 2], [2 2 2 2 1 1 1 1 1 2 1 1 1 1 2 3 3 3 3 2]
Region chr2:127-147: [2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3], [2 2 2 2 2 2 2 2 0 0 0 0 0 2 2 2 2 2 3 3]
Region chr3:33-53: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Region chr3:80-100: [2, 2, 2, 2, 2, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2 2 2 2 2 1 0 0 0 0 1 2 2 2 2 2 2 2 2 2]
Region chr4:13-33: [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2], [2 2 2 2 2 2 2 2 2 1 0 0 0 0 1 2 2 2 2 2]
Region chr4:55-75: [0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 0, 0, 0, 1, 2, 3, 3], [0 1 2 3 3 3 3 3 3 3 3 2 1 0 0 0 1 2 3 3]
Region chr5:1

100%|██████████| 14/14 [00:00<00:00, 13605.25it/s]
100%|██████████| 14/14 [00:00<00:00, 14614.30it/s]
100%|██████████| 14/14 [00:00<00:00, 10276.56it/s]
100%|██████████| 14/14 [00:00<00:00, 13659.05it/s]
100%|██████████| 14/14 [00:00<00:00, 10292.77it/s]

Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bam
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated2.bam
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated3.bam
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)





(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated4.bam
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated5.bam
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
(5,)
All tests passed successfully!


In [123]:
test_BAM([bams[0]], fixed_bed_in, {"simulated1.bam": new_cov["simulated1.bam"]}, out, batch_size=50, sequence_dim="_sequence")

Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bam


100%|██████████| 14/14 [00:00<00:00, 12925.44it/s]


Region chr1:23-43: [2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 1], [2 2 2 2 1 1 2 1 1 2 2 2 3 3 3 3 2 2 2 1]
Region chr1:65-85: [2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 4, 4, 3, 3, 5, 5], [2 1 1 2 2 2 2 2 2 2 2 3 3 2 4 4 3 3 5 5]
Region chr2:54-74: [1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 4, 5, 5, 5, 4, 3, 3, 2, 3, 4], [1 1 1 1 2 3 2 3 3 3 4 5 5 5 4 3 3 2 3 4]
Region chr2:127-147: [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0]
Region chr3:33-53: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Region chr3:80-100: [2, 2, 2, 3, 3, 4, 3, 2, 2, 2, 2, 3, 4, 3, 3, 2, 2, 2, 3, 3], [2 2 2 3 3 4 3 2 2 2 2 3 4 3 3 2 2 2 3 3]
Region chr4:13-33: [3, 4, 4, 4, 4, 5, 5, 3, 3, 3, 3, 3, 6, 6, 6, 5, 6, 5, 5, 5], [3 4 4 4 4 5 5 3 3 3 3 3 6 6 6 5 6 5 5 5]
Region chr4:55-75: [3, 3, 3, 3, 3, 3, 3, 2, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4, 4], [3 3 3 3 3 3 3 2 4 4 4 4 5 5 5 5 6 6 4 4]
Region chr5:1

100%|██████████| 14/14 [00:00<00:00, 12638.88it/s]

Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bam
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)
All tests passed successfully!





# `BigWig`

In [65]:
def test_BigWig(bigwig_files, bed_file, chromsizes_file, coverages, temp_dir, batch_size=50, sequence_dim="_sequence", fixed_length=False, length_dim=None):
    """
    Tests the BigWig class for one or more input BigWig files.

    Parameters:
        bigwig_files (list): List of paths to BigWig files.
        bed_file (str or Path): Path to the BED file.
        chromsizes_file (str or Path): Path to the chromsizes file.
        coverage_file (str or Path): Path to the pickled coverage file.
        temp_dir (str or Path): Directory where temporary files will be stored.
        batch_size (int): Batch size for the BigWig reader.
        sequence_dim (str): Name of the sequence dimension in the Zarr output.

    Raises:
        AssertionError: If any test fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    bigwig_out = temp_dir / 'output.bigwig.zarr'

    # Load chromsizes
    chromsizes = {}
    with open(chromsizes_file) as f:
        for line in f:
            chrom, size = line.strip().split()
            chromsizes[chrom] = int(size)

    # Test instantiation of the BigWig reader class
    bigwig_reader = BigWig(
        name="cov",
        bigwigs=bigwig_files,
        samples=[bigwig.name for bigwig in bigwig_files],
        batch_size=batch_size,
    )
    assert isinstance(bigwig_reader, BigWig), "BigWig reader instantiation failed."

    # Read in the BED file
    bed = pd.read_csv(bed_file, sep="\t", header=None)

    # Verify each BigWig file individually
    for bigwig_file in bigwig_files:
        bigwig_name = bigwig_file.name
        iterator = bigwig_reader._reader(
            bed=bed,
            f=pyBigWig.open(str(bigwig_file)),
            contig_lengths=chromsizes
        )
        _read = [list(seq) for seq in iterator]

        # Verify that the data matches the expected coverage
        for i, (region, coverage) in enumerate(coverages[bigwig_name].items()):
            print(f"Region {region}: {list(_read[i])}, {coverage}")
            assert np.array_equal(coverage, _read[i]), f"Region {region} in {bigwig_name} does not match."

        # Close the BigWig file
        iterator.close()

    # Test writing to Zarr
    bed["strand"] = "+"
    bigwig_reader._write(
        bigwig_out,
        bed=bed,
        fixed_length=fixed_length,
        sequence_dim=sequence_dim,
        length_dim=length_dim,
        overwrite=True
    )
    zarr.consolidate_metadata(bigwig_out)

    # Test round-trip reading
    data = sd.open_zarr(bigwig_out)
    print(data)
    
    # Verify that the data matches the expected coverage for all regions
    for file_index, bigwig_file in enumerate(bigwig_files):
        print(f"Testing {bigwig_file}")
        bigwig_name = bigwig_file.name
        for region_index, (region, coverage) in enumerate(coverages[bigwig_name].items()):
            print(data['cov'][region_index].shape)
            assert np.array_equal(coverage, data["cov"].values[region_index, file_index]), f"Region {region} in {bigwig_name} does not match."

    # Clean up temporary files
    for f in temp_dir.iterdir():
        if f.is_dir():
            os.system(f'rm -r {f}')
        else:
            f.unlink()

    print("All tests passed successfully!")


In [66]:
# list of bigwigs with simulated{1-5}.bw
bigwigs = [Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / f'simulated{i}.bw' for i in range(1, 6)]
chromsizes = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'variable.chrom.sizes'

In [67]:
# modify new cov to have .bw instead of .bam in keys
new_cov_bw = {}
for key, value in new_cov.items():
    new_cov_bw[key.replace(".bam", ".bw")] = value

In [68]:
# Test the BigWig class on variable length sequences
data = test_BigWig(bigwigs, fixed_bed_in, chromsizes, new_cov_bw, out, batch_size=50, fixed_length=20, sequence_dim="_sequence", length_dim="_length")

100%|██████████| 14/14 [00:00<00:00, 30856.68it/s]


Region chr1:23-43: [2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 1.0], [2 2 2 2 1 1 2 1 1 2 2 2 3 3 3 3 2 2 2 1]
Region chr1:65-85: [2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 4.0, 4.0, 3.0, 3.0, 5.0, 5.0], [2 1 1 2 2 2 2 2 2 2 2 3 3 2 4 4 3 3 5 5]
Region chr2:54-74: [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 4.0, 5.0, 5.0, 5.0, 4.0, 3.0, 3.0, 2.0, 3.0, 4.0], [1 1 1 1 2 3 2 3 3 3 4 5 5 5 4 3 3 2 3 4]
Region chr2:127-147: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0]
Region chr3:33-53: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Region chr3:80-100: [2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0], [2 2 2 3 3 4 3 2 2 2 2 3 4 3 3 2 2 2 3 3]
Region chr4:13-33: 

100%|██████████| 14/14 [00:00<00:00, 19189.63it/s]


Region chr1:23-43: [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Region chr1:65-85: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0], [1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 2 2 2 2 2]
Region chr2:54-74: [2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 3.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 1.0, 2.0, 2.0], [2 2 1 2 2 2 3 4 4 4 3 3 3 2 2 3 2 1 2 2]
Region chr2:127-147: [2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 6.0, 5.0, 3.0, 4.0, 4.0], [2 1 0 0 0 0 1 3 3 3 3 3 3 5 5 6 5 3 4 4]
Region chr3:33-53: [1.0, 2.0, 2.0, 2.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 5.0, 6.0], [1 2 2 2 3 4 4 4 4 4 4 3 3 3 3 3 4 5 5 6]
Region chr3:80-100: [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Region chr4:13-33: 

100%|██████████| 14/14 [00:00<00:00, 17681.50it/s]


Region chr1:23-43: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], [1 1 1 1 1 1 1 1 0 0 0 0 0 2 2 2 2 2 2 2]
Region chr1:65-85: [4.0, 4.0, 5.0, 4.0, 4.0, 5.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 4.0, 4.0], [4 4 5 4 4 5 6 5 5 5 5 5 4 4 4 4 3 4 4 4]
Region chr2:54-74: [1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0], [1 1 1 1 1 2 2 2 2 1 1 1 1 1 3 2 2 2 2 2]
Region chr2:127-147: [2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0], [2 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 2 1 1 1]
Region chr3:33-53: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0], [1 0 0 0 0 0 1 2 2 2 2 2 2 2 3 3 2 2 2 2]
Region chr3:80-100: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1]
Region chr4:13-33: 

100%|██████████| 14/14 [00:00<00:00, 18292.91it/s]


Region chr1:23-43: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0], [1 1 1 1 1 1 0 0 0 1 1 2 2 2 2 3 3 3 3 2]
Region chr1:65-85: [4.0, 3.0, 3.0, 2.0, 1.0, 0.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 1.0], [4 3 3 2 1 0 1 1 1 2 3 3 3 3 3 3 2 2 2 1]
Region chr2:54-74: [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 3.0, 3.0, 1.0, 2.0], [1 1 1 0 1 1 1 1 3 3 3 3 3 4 3 3 3 3 1 2]
Region chr2:127-147: [2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 2.0, 2.0, 3.0, 3.0], [2 2 2 3 3 3 4 4 4 4 4 4 4 3 3 3 2 2 3 3]
Region chr3:33-53: [1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 5.0, 4.0, 4.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 3.0, 2.0], [1 1 2 2 2 2 2 3 4 5 4 4 3 3 3 4 4 4 3 2]
Region chr3:80-100: [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0], [2 2 2 2 2 2 2 3 3 3 2 2 1 1 1 2 2 2 2 2]
Region chr4:13-33: 

100%|██████████| 14/14 [00:00<00:00, 18230.44it/s]


Region chr1:23-43: [4.0, 3.0, 3.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 4.0, 4.0, 6.0, 7.0, 7.0, 7.0, 5.0, 5.0, 4.0, 5.0], [4 3 3 2 2 3 5 4 5 5 4 4 6 7 7 7 5 5 4 5]
Region chr1:65-85: [3.0, 3.0, 2.0, 3.0, 3.0, 6.0, 4.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 5.0, 6.0, 3.0, 4.0, 3.0, 4.0, 4.0], [3 3 2 3 3 6 4 5 5 5 5 6 6 5 6 3 4 3 4 4]
Region chr2:54-74: [2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0], [2 2 2 2 1 1 1 1 1 2 1 1 1 1 2 3 3 3 3 2]
Region chr2:127-147: [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0], [2 2 2 2 2 2 2 2 0 0 0 0 0 2 2 2 2 2 3 3]
Region chr3:33-53: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Region chr3:80-100: [2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], [2 2 2 2 2 1 0 0 0 0 1 2 2 2 2 2 2 2 2 2]
Region chr4:13-33: 

100%|██████████| 14/14 [00:00<00:00, 18850.80it/s]
100%|██████████| 14/14 [00:00<00:00, 28855.16it/s]
100%|██████████| 14/14 [00:00<00:00, 38682.65it/s]
100%|██████████| 14/14 [00:00<00:00, 16748.50it/s]
100%|██████████| 14/14 [00:00<00:00, 16782.01it/s]


<xarray.Dataset> Size: 6kB
Dimensions:     (_sequence: 14, cov_sample: 5, _length: 20)
Coordinates:
  * cov_sample  (cov_sample) object 40B 'simulated1.bw' ... 'simulated5.bw'
Dimensions without coordinates: _sequence, _length
Data variables:
    cov         (_sequence, cov_sample, _length) float32 6kB dask.array<chunksize=(14, 1, 20), meta=np.ndarray>
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bw
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated2.bw
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated3.bw
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
(5, 20)
Testing /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulat

In [27]:


# Make chromsizes a list of tuples
chromsizes = {}
with open(Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'variable.chrom.sizes') as f:
    for line in f:
        chrom, size = line.strip().split()
        chromsizes[chrom] = int(size)
chromsizes

bigwig_reader = BigWig(
    name="cov",
    bigwigs=bigwigs,
    samples=[bigwig.name for bigwig in bigwigs],
    batch_size=50
)
assert isinstance(bigwig_reader, BigWig), "bigwig reader instantiation failed."

iterator = bigwig_reader._reader(
    bed=bed,
    f=pyBigWig.open(str(bigwigs[0])),
    contig_lengths=chromsizes
)

_read = [list(seq) for seq in iterator]

# Verify that the data matches the expected coverage
for i, (region, coverage) in enumerate(new_cov[bigwigs[0].name.replace(".bw", ".bam")].items()):
    print(f"Region {region}: {list(_read[i])}, {coverage}")
    assert np.array_equal(coverage, _read[i]), f"Region {region} in {bigwigs[0].name} does not match."

100%|██████████| 14/14 [00:00<00:00, 29214.06it/s]

Region chr1:23-43: [2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 1.0], [2 2 2 2 1 1 2 1 1 2 2 2 3 3 3 3 2 2 2 1]
Region chr1:65-85: [2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 4.0, 4.0, 3.0, 3.0, 5.0, 5.0], [2 1 1 2 2 2 2 2 2 2 2 3 3 2 4 4 3 3 5 5]
Region chr2:54-74: [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 4.0, 5.0, 5.0, 5.0, 4.0, 3.0, 3.0, 2.0, 3.0, 4.0], [1 1 1 1 2 3 2 3 3 3 4 5 5 5 4 3 3 2 3 4]
Region chr2:127-147: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0]
Region chr3:33-53: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
Region chr3:80-100: [2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0], [2 2 2 3 3 4 3 2 2 2 2 3 4 3 3 2 2 2 3 3]
Region chr4:13-33: 




# `from_region_files`

In [70]:
from seqdata import from_region_files

In [95]:
def test_from_region_files(
    *readers, 
    names,
    true_seqs,
    true_covs,
    temp_dir, 
    fixed_length=False, 
    length_dim=None
):
    """
    Tests the FlatFASTA class for a single input FASTA file.

    Parameters:
        input_fasta (str or Path): Path to the input FASTA file.
        temp_dir (str or Path): Directory where temporary files will be stored.
        batch_size (int): Batch size for the FlatFASTA reader.
        fixed_length (bool): Whether to use fixed-length sequences when writing to Zarr.
        length_dim (str): Name of the length dimension in the Zarr output.

    Raises:
        AssertionError: If any test fails.
    """
    temp_dir = Path(temp_dir)
    temp_dir.mkdir(parents=True, exist_ok=True)
    out = temp_dir / 'output.zarr'

    data = from_region_files(
        *readers,
        path=out,
        sequence_dim="_sequence",
        fixed_length=fixed_length,
        overwrite=True,
        length_dim=length_dim
    )

    # Verify that the data is the same
    for i, name in enumerate(names):
        if fixed_length:
            seqs = [''.join(row.astype(str)) for row in data[name].values]
        else:
            seqs = data[name].values.astype(str)
        print(name)
        print(seqs)
        print(true[i])
        assert np.array_equal(true[i], seqs), "Sequences do not match."

    # Clean up temporary files
    for f in temp_dir.iterdir():
        print(f)
        if f.is_dir():
            os.system(f'rm -r {f}')
        else:
            f.unlink()

    print("All tests passed successfully!")

In [137]:
fixed_length = 20

outfile = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data' / 'tmp' / 'output.zarr'

bed = pd.read_csv(fixed_bed_in, sep="\t", header=None)
bed

if fixed_length:
    midpoint = (bed[1] + bed[2]) // 2
    bed[1] = midpoint - fixed_length // 2
    bed[2] = midpoint + fixed_length // 2
bed

fasta_sequences = read_fasta(variable_fasta_in)

true_seqs = [fasta_sequences[chrom][start:end] for chrom, start, end in bed.values]
true_seqs

genome_fasta = GenomeFASTA(
    name="seq",
    fasta=variable_fasta_in,
    batch_size=50,
)
bam = BAM(
    name="cov",
    bams=bams,
    samples=[bam.name for bam in bams],
    batch_size=50,
)

data = from_region_files(
    genome_fasta, 
    bam, 
    bed=fixed_bed_in,
    path=outfile,
    fixed_length=20,
    length_dim="_length",
    sequence_dim="_sequence",
    overwrite=True
)

# Verify length and sequence dimensions
if fixed_length:
    assert data["seq"].shape[1] == fixed_length, "Length dimension is incorrect."
assert data["seq"].shape[0] == len(bed), "Sequence dimension is incorrect."

# Verify that the data is the same
if fixed_length:
    seqs = [''.join(row.astype(str)) for row in data["seq"].values]
else:
    seqs = data["seq"].values.astype(str)
print(seqs)
print(true_seqs)
assert np.array_equal(true_seqs, seqs), "Sequences do not match."

# Verify that the data matches the expected coverage for all regions
for file_index, bam_file in enumerate(bams):
    print(f"Testing {bam_file}")
    bam_name = bam_file.name
    for region_index, (region, coverage) in enumerate(new_cov[bam_name].items()):
        print(data['cov'][region_index].shape)
        print(f"Region {region}: {list(data['cov'].values[region_index, file_index])}, {coverage}")
        assert np.array_equal(coverage, data["cov"].values[region_index, file_index]), f"Region {region} in {bam_name} does not match."

# Clean up temporary files
for f in temp_dir.iterdir():
    if f.is_dir():
        os.system(f'rm -r {f}')
    else:
        f.unlink()

## `read_genome_fasta`

In [151]:
from seqdata import read_genome_fasta

In [160]:
temp_dir = Path(out)
temp_dir.mkdir(parents=True, exist_ok=True)
out = temp_dir / 'output.zarr'

sdata = read_genome_fasta(
    fasta=variable_fasta_in,
    bed=fixed_bed_in,
    out=out,
    name="seq",
    fixed_length=20,
    batch_size=50,
    overwrite=True,
)

# Verify length and sequence dimensions
fasta_sequences = read_fasta(variable_fasta_in)
true = [fasta_sequences[chrom][start:end] for chrom, start, end in bed.values]

if fixed_length:
    assert data["seq"].shape[1] == fixed_length, "Length dimension is incorrect."
assert data["seq"].shape[0] == len(bed), "Sequence dimension is incorrect."

# Verify that the data is the same
if fixed_length:
    seqs = [''.join(row.astype(str)) for row in data["seq"].values]
else:
    seqs = data["seq"].values.astype(str)
assert np.array_equal(true, seqs), "Sequences do not match."

# Clean up temporary files
for f in temp_dir.iterdir():
    if f.is_dir():
        os.system(f'rm -r {f}')
    else:
        f.unlink()

100%|██████████| 14/14 [00:00<00:00, 82472.27it/s]




# `read_bam`

In [150]:
from seqdata import read_bam

In [171]:
temp_dir = Path(out)
temp_dir.mkdir(parents=True, exist_ok=True)
out = temp_dir / 'output.zarr'

sdata = read_bam(
    fasta=variable_fasta_in,
    bed=fixed_bed_in,
    bams=bams,
    out=out,
    seq_name="seq",
    cov_name="cov",
    samples=[b.name for b in bams],
    fixed_length=20,
    batch_size=50,
    overwrite=True,
)

# Verify length and sequence dimensions
fasta_sequences = read_fasta(variable_fasta_in)
true = [fasta_sequences[chrom][start:end] for chrom, start, end in bed.values]

if fixed_length:
    assert data["seq"].shape[1] == fixed_length, "Length dimension is incorrect."
assert data["seq"].shape[0] == len(bed), "Sequence dimension is incorrect."

# Verify that the data is the same
if fixed_length:
    seqs = [''.join(row.astype(str)) for row in data["seq"].values]
else:
    seqs = data["seq"].values.astype(str)
assert np.array_equal(true, seqs), "Sequences do not match."

# Clean up temporary files
for f in temp_dir.iterdir():
    if f.is_dir():
        os.system(f'rm -r {f}')
    else:
        f.unlink()

100%|██████████| 14/14 [00:00<00:00, 123361.88it/s]
100%|██████████| 14/14 [00:00<00:00, 14135.83it/s]
100%|██████████| 14/14 [00:00<00:00, 11027.28it/s]


100%|██████████| 14/14 [00:00<00:00, 11056.35it/s]
100%|██████████| 14/14 [00:00<00:00, 13862.19it/s]
100%|██████████| 14/14 [00:00<00:00, 12959.67it/s]


# `read_bigwig`

In [172]:
from seqdata import read_bigwig

In [173]:
[b.name for b in bigwigs]

['simulated1.bw',
 'simulated2.bw',
 'simulated3.bw',
 'simulated4.bw',
 'simulated5.bw']

In [174]:
temp_dir = Path(out)
temp_dir.mkdir(parents=True, exist_ok=True)
out = temp_dir / 'output.zarr'

sdata = read_bigwig(
    fasta=variable_fasta_in,
    bed=fixed_bed_in,
    bigwigs=bigwigs,
    out=out,
    seq_name="seq",
    cov_name="cov",
    samples=[b.name for b in bigwigs],
    fixed_length=20,
    batch_size=50,
    overwrite=True,
)

# Verify length and sequence dimensions
fasta_sequences = read_fasta(variable_fasta_in)
true = [fasta_sequences[chrom][start:end] for chrom, start, end in bed.values]

if fixed_length:
    assert data["seq"].shape[1] == fixed_length, "Length dimension is incorrect."
assert data["seq"].shape[0] == len(bed), "Sequence dimension is incorrect."

# Verify that the data is the same
if fixed_length:
    seqs = [''.join(row.astype(str)) for row in data["seq"].values]
else:
    seqs = data["seq"].values.astype(str)
assert np.array_equal(true, seqs), "Sequences do not match."

# Clean up temporary files
for f in temp_dir.iterdir():
    if f.is_dir():
        os.system(f'rm -r {f}')
    else:
        f.unlink()

100%|██████████| 14/14 [00:00<00:00, 136877.05it/s]
100%|██████████| 14/14 [00:00<00:00, 16105.39it/s]
100%|██████████| 14/14 [00:00<00:00, 29360.13it/s]
100%|██████████| 14/14 [00:00<00:00, 24734.73it/s]
100%|██████████| 14/14 [00:00<00:00, 38990.87it/s]


100%|██████████| 14/14 [00:00<00:00, 26985.41it/s]


# DONE!

---

# Testing BAM

In [None]:
seq_num = 0
print(f"chrom: {bed.iloc[seq_num]['chrom']}, start: {bed.iloc[seq_num]['start']}, end: {bed.iloc[seq_num]['end']}")
cov = bam_reader._count_depth_only(
    f=pysam.AlignmentFile(inbam),
    contig=bed.iloc[seq_num]['chrom'],
    start=bed.iloc[seq_num]['start'],
    end=bed.iloc[seq_num]['end']
)
print(len(cov))
cov

chrom: chr1, start: 10, end: 30
20


array([2, 2, 2, 2, 3, 4, 4, 4, 4, 3, 3, 3, 4, 6, 6, 6, 6, 6, 6, 7],
      dtype=uint16)

In [None]:
_read = [seq for seq in iterator]
_read

[]

In [None]:
#inbam = "/cellar/users/aklie/data/datasets/SeqDatasets/K562_ATAC-seq/data/merged.bam"
#inbed = "/cellar/users/aklie/data/datasets/SeqDatasets/K562_ATAC-seq/data/ENCSR868FGK_K562_ATAC-seq_peaks.bed"

In [None]:
f = pysam.AlignmentFile(inbam)
bed = pd.read_csv(inbed, sep='\t', header=None)

In [None]:
f.count_coverage(
    contig=bed.iloc[1][0],
    start=bed.iloc[1][1],
    stop=bed.iloc[1][2],
    #read_callback='nofilter',
)

(array('L', [0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 6, 0, 0, 6, 0, 0, 0]),
 array('L', [0, 0, 2, 0, 2, 2, 0, 0, 4, 5, 0, 0, 0, 0, 6, 0, 0, 0, 6, 6]),
 array('L', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]),
 array('L', [4, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0, 7, 0, 5, 0, 0]))

In [None]:
# Read in the data using the reader
_read = [seq.decode('utf-8') for seq in iterator]

# Verify that the data is the same
assert np.array_equal(_read, true), "GenomeFASTA reader failed to read in the correct values."

In [None]:
# Open the BAM file for reading
with pysam.AlignmentFile(inbam, "rb") as bamfile:
    # Iterate over each read
    for read in bamfile.fetch():
        print(f"Read name: {read.query_name}")
        print(f"Reference: {bamfile.get_reference_name(read.reference_id)}")
        print(f"Start position: {read.reference_start}")
        print(f"Read sequence: {read.query_sequence}")
        print(f"CIGAR string: {read.cigarstring}")
        print("------")

Read name: read_chr1_9_19
Reference: chr1
Start position: 9
Read sequence: CCGACTAACT
CIGAR string: 10M
------
Read name: read_chr1_10_20
Reference: chr1
Start position: 10
Read sequence: CGACTAACTG
CIGAR string: 10M
------
Read name: read_chr1_14_24
Reference: chr1
Start position: 14
Read sequence: TAACTGACTG
CIGAR string: 10M
------
Read name: read_chr1_15_25
Reference: chr1
Start position: 15
Read sequence: AACTGACTGA
CIGAR string: 10M
------
Read name: read_chr1_20_30
Reference: chr1
Start position: 20
Read sequence: ACTGATGATG
CIGAR string: 10M
------
Read name: read_chr1_22_32
Reference: chr1
Start position: 22
Read sequence: TGATGATGAT
CIGAR string: 10M
------
Read name: read_chr1_23_33
Reference: chr1
Start position: 23
Read sequence: GATGATGATG
CIGAR string: 10M
------
Read name: read_chr1_23_33
Reference: chr1
Start position: 23
Read sequence: GATGATGATG
CIGAR string: 10M
------
Read name: read_chr1_9_19
Reference: chr1
Start position: 24
Read sequence: ATGATGATGC
CIGAR strin