# Generate sample data

In [20]:
import os
import pandas as pd
import numpy as np
import zarr
import pickle
from pathlib import Path
from seqdata import Table, FlatFASTA, GenomeFASTA, BigWig, BAM
import seqdata as sd
import xarray as xr
import pysam
import random
import pyBigWig

In [21]:
# If seqpro is not installed, pip install it
try:
    import seqpro as sp
except ImportError:
    !pip install seqpro
    import seqpro as sp

In [22]:
# Test fixtures live in <repo>/tests/data, located relative to the installed seqdata package
package_dir = Path(sd.__file__).resolve().parent
data_dir = package_dir.parent.joinpath('tests', 'data')
print(data_dir)

/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data


In [23]:
# Write a collection of sequences to a FASTA file; if names is None, then the sequences are named chr1, chr2, etc.
def write_fasta(arr, names=None, path='test.fasta', max_per_line=60):
    """Write sequences to a FASTA file and return the path that was written.

    Parameters:
        arr: Sized iterable of sequence strings (a list or a 1-D numpy array of str).
        names: Record names, one per sequence. Defaults to chr1, chr2, ...
            Pairing is done with ``zip``, so extra names or sequences are ignored.
        path: Destination file (str or Path). An existing file is overwritten.
        max_per_line: Number of bases written per sequence line.

    Returns:
        The ``path`` argument, unchanged.
    """
    if names is None:
        # len() works for lists as well as arrays; arr.shape only exists on numpy arrays
        names = [f'chr{i+1}' for i in range(len(arr))]
    with open(path, 'w') as f:
        for name, seq in zip(names, arr):
            f.write(f'>{name}\n')
            for i in range(0, len(seq), max_per_line):
                f.write(seq[i:i+max_per_line] + '\n')
            # A blank line separates consecutive records
            f.write('\n')
    return path

# Simulated

In [24]:
# Random 7 "chromosomes" (chr1, chr2, ..., chr7) of variable lengths
seq_lens = [120, 400, 110, 150, 300, 100, 200]
seqs = [''.join(sp.random_seqs((1, l), sp.DNA)[0].astype(str)) for l in seq_lens]
names = [f'chr{i+1}' for i in range(len(seq_lens))]

# For chr2, make exactly 50 distinct random basepairs lower case.
# replace=False matters: sampling with replacement returns duplicate positions,
# so fewer than 50 bases would actually be lower-cased.
indexes = set(np.random.choice(len(seqs[1]), 50, replace=False).tolist())
seqs[1] = ''.join(base.lower() if i in indexes else base for i, base in enumerate(seqs[1]))

# Build the name -> sequence lookup after masking so it matches what is written to disk
seq_dict = {name: seq for name, seq in zip(names, seqs)}

# For chr 6, make random basepairs N
#indexes = np.random.choice(range(len(seqs[5])), 80)
#seqs[5] = ''.join(['N' if i in indexes else seqs[5][i] for i in range(len(seqs[5]))])

## Fastas

### `variable.fa`

In [25]:
# Write the variable-length sequences to variable.fa
variable_fa = data_dir / 'variable.fa'
write_fasta(seqs, names=names, path=variable_fa)

# Build the .fai index next to it with pysam
pysam.faidx(str(variable_fa))

# Write variable.chrom.sizes: one "<chrom>\t<length>" line per chromosome
chrom_size_lines = [f'chr{idx}\t{length}\n' for idx, length in enumerate(seq_lens, start=1)]
with open(data_dir / 'variable.chrom.sizes', 'w') as sizes_out:
    sizes_out.writelines(chrom_size_lines)

### `fixed.fa`

In [26]:
# Grab the first 80bp of each sequence
fixed_seqs = [seq[:80] for seq in seqs]

# Save as a fasta file called fixed.fa
write_fasta(fixed_seqs, names=names, path=data_dir / 'fixed.fa')

# Index it with pysam
pysam.faidx(str(Path(data_dir) / 'fixed.fa'))

# Write fixed.chrom.sizes from the truncated sequences themselves, so the sizes
# describe fixed.fa rather than repeating the lengths of variable.fa
with open(data_dir / 'fixed.chrom.sizes', 'w') as f:
    for name, seq in zip(names, fixed_seqs):
        f.write(f'{name}\t{len(seq)}\n')

## BEDs

### `variable.bed`

In [27]:
def generate_regions(sequences, num_regions=2, min_len=20, max_len=30, seed=None, max_attempts=10_000):
    """Draw random, non-overlapping intervals on every sequence.

    Parameters:
        sequences: Mapping of chromosome name -> sequence (only its length is used).
        num_regions: Number of intervals to draw per chromosome.
        min_len, max_len: Inclusive bounds on the interval length. The upper bound
            is capped at the sequence length so an interval always fits.
        seed: Optional seed for a private random generator. When None, the
            module-level ``random`` state is used.
        max_attempts: Number of placement attempts per interval before giving up.

    Returns:
        List of ``(chrom, start, end)`` tuples (0-based, end-exclusive), grouped by
        chromosome in the iteration order of ``sequences``.

    Raises:
        ValueError: If a sequence is shorter than ``min_len``.
        RuntimeError: If an interval cannot be placed without overlap within
            ``max_attempts`` tries (instead of looping forever).
    """
    rng = random if seed is None else random.Random(seed)
    regions = []

    for chrom, seq in sequences.items():
        seq_len = len(seq)
        if num_regions > 0 and seq_len < min_len:
            raise ValueError(
                f"Sequence {chrom} is {seq_len} bp long, shorter than min_len={min_len}"
            )
        longest = min(max_len, seq_len)
        selected_regions = []

        for _ in range(num_regions):
            for _attempt in range(max_attempts):
                # Generate random length and start position
                region_length = rng.randint(min_len, longest)
                start = rng.randint(0, seq_len - region_length)
                end = start + region_length

                # Accept only if it is disjoint from every interval chosen so far
                if all(end <= r[1] or start >= r[2] for r in selected_regions):
                    selected_regions.append((chrom, start, end))
                    break
            else:
                raise RuntimeError(
                    f"Could not place {num_regions} non-overlapping regions on {chrom} "
                    f"(length {seq_len}) after {max_attempts} attempts"
                )

        regions.extend(selected_regions)

    return regions

In [28]:
# Draw 2 random, non-overlapping intervals per chromosome of variable.fa
regions = generate_regions(seq_dict)

# Tabulate the (chrom, start, end) tuples and write them, sorted, as a headerless BED
df = pd.DataFrame(regions, columns=['chrom', 'start', 'end'])
variable_bed = df.sort_values(['chrom', 'start'])
variable_bed.to_csv(data_dir / 'variable.bed', sep='\t', index=False, header=False)

### `fixed.bed`

In [29]:
# fixed.bed reuses the variable.bed starts but gives every interval a width of exactly 20bp
fixed_df = df[['chrom', 'start']].assign(end=df['start'] + 20)
fixed_sorted = fixed_df.sort_values(['chrom', 'start'])
fixed_sorted.to_csv(data_dir / 'fixed.bed', sep='\t', index=False, header=False)

## BAMs

In [30]:
# Inputs for the BAM simulation: the variable-length FASTA and its BED intervals
_tests_data = Path(sd.__file__).resolve().parent.parent / 'tests' / 'data'
infasta = _tests_data / 'variable.fa'
inbed = _tests_data / 'variable.bed'
infasta, inbed

(PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/variable.fa'),
 PosixPath('/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/variable.bed'))

In [31]:
def _make_mate(tid, name, seq, flag, start, mate_start, template_length):
    """Build one fully matched (CIGAR <len>M) mate of a simulated read pair."""
    segment = pysam.AlignedSegment()
    segment.query_name = name
    segment.query_sequence = seq
    segment.flag = flag
    segment.reference_id = tid
    segment.reference_start = start
    segment.mapping_quality = 60
    segment.cigartuples = [(0, len(seq))]  # operation 0 is an alignment match (M)
    segment.next_reference_id = tid
    segment.next_reference_start = mate_start
    segment.template_length = template_length
    # Assigned after query_sequence, in the same order the qualities were set originally
    segment.query_qualities = pysam.qualitystring_to_array("I" * len(seq))
    return segment


def simulate_bam(
    infasta: Path,
    inbed: Path,
    inbam: Path,
    read_len: int = 10,
    read_sep: int = 5,
    max_reads: int = 10,
    seed: int = 0,
):
    """Simulate paired-end reads around BED regions and write a sorted, indexed BAM.

    For every BED region, between 1 and ``max_reads`` read pairs are drawn. Read 1
    starts anywhere from ``read_len`` bases upstream of the region to the region
    end; read 2 starts ``read_sep`` bases after read 1 ends. Pairs that run off the
    chromosome or do not touch the region are discarded.

    Parameters:
        infasta: Indexed reference FASTA.
        inbed: Headerless, tab-separated BED file (chrom, start, end).
        inbam: Path of the BAM file to create (despite the name, this is the output).
        read_len: Length of each mate.
        read_sep: Gap between the end of read 1 and the start of read 2.
        max_reads: Upper bound on read pairs drawn per region.
        seed: Seed for a private random generator. It yields the same draws as
            ``random.seed(seed)`` would, without touching the global RNG state.

    Returns:
        Dict mapping ``"chrom:start-end"`` to a numpy array with the per-base read
        coverage of that region, accumulated over all simulated reads.
    """
    bed = pd.read_csv(inbed, sep='\t', header=None, names=['chrom', 'start', 'end'])
    rng = random.Random(seed)

    # Per-chromosome coverage, accumulated as reads are written
    true_coverage_arrays = {}

    # Reads are written unsorted to a scratch file first, so that samtools sort
    # never reads from and writes to the same path.
    unsorted_bam = f"{inbam}.unsorted"

    with pysam.FastaFile(str(infasta)) as fasta:
        with pysam.AlignmentFile(unsorted_bam, 'wb',
                                 reference_names=fasta.references,
                                 reference_lengths=fasta.lengths) as bamfile:
            for _, region in bed.iterrows():
                chrom = region['chrom']
                start = int(region['start'])
                end = int(region['end'])
                print(f"Simulating reads for {chrom}:{start}-{end}")

                chrom_len = fasta.get_reference_length(chrom)
                if chrom not in true_coverage_arrays:
                    true_coverage_arrays[chrom] = np.zeros(chrom_len, dtype=int)
                chrom_coverage = true_coverage_arrays[chrom]
                tid = bamfile.get_tid(chrom)

                num_reads = rng.randint(1, max_reads)  # Random number of read pairs
                for _ in range(num_reads):
                    # Read 1 may start up to read_len bases before the region (partial
                    # overlap) but must fit entirely on the chromosome.
                    read1_start = rng.randint(max(0, start - read_len), min(end, chrom_len - read_len))
                    read1_end = read1_start + read_len
                    read2_start = read1_end + read_sep
                    read2_end = read2_start + read_len

                    read1_seq = fasta.fetch(chrom, read1_start, read1_end)
                    read2_seq = fasta.fetch(chrom, read2_start, read2_end)

                    # Skip pairs truncated by the end of the chromosome
                    if len(read1_seq) < read_len or len(read2_seq) < read_len:
                        continue

                    # Keep the pair only if at least one mate overlaps the region
                    read1_overlaps = read1_start < end and read1_end > start
                    read2_overlaps = read2_start < end and read2_end > start
                    if not (read1_overlaps or read2_overlaps):
                        continue

                    chrom_coverage[read1_start:read1_end] += 1
                    chrom_coverage[read2_start:read2_end] += 1

                    pair_name = f"read_{chrom}_{read1_start}_{read1_end}"
                    template_len = read2_end - read1_start

                    # Flag 99: paired, proper pair, mate reverse, first in pair
                    bamfile.write(_make_mate(tid, pair_name, read1_seq, 99,
                                             read1_start, read2_start, template_len))
                    # Flag 147: paired, proper pair, read reverse, second in pair
                    bamfile.write(_make_mate(tid, pair_name, read2_seq, 147,
                                             read2_start, read1_start, -template_len))

    # Coordinate-sort into the requested path, then drop the scratch file
    try:
        pysam.sort("-o", str(inbam), unsorted_bam)
    finally:
        os.remove(unsorted_bam)

    # Index the BAM file
    pysam.index(str(inbam))

    # Extract coverage arrays for each BED region
    coverage_by_region = {}
    for _, region in bed.iterrows():
        chrom, start, end = region['chrom'], int(region['start']), int(region['end'])
        coverage_by_region[f"{chrom}:{start}-{end}"] = true_coverage_arrays[chrom][start:end]

    return coverage_by_region

In [32]:
# Simulate simulated1.bam ... simulated5.bam, seeding each run with its own index,
# and keep the per-region coverage of every file keyed by the BAM file name
coverages = {}
for i in range(1, 6):
    bam_name = f'simulated{i}.bam'
    coverages[bam_name] = simulate_bam(infasta, inbed, data_dir / bam_name, seed=i)

Simulating reads for chr1:4-28
Simulating reads for chr1:47-76
Simulating reads for chr2:46-72
Simulating reads for chr2:174-197
Simulating reads for chr3:18-43
Simulating reads for chr3:78-106
Simulating reads for chr4:35-60
Simulating reads for chr4:87-111
Simulating reads for chr5:40-62
Simulating reads for chr5:156-181
Simulating reads for chr6:19-49
Simulating reads for chr6:61-85
Simulating reads for chr7:12-34
Simulating reads for chr7:153-174
Simulating reads for chr1:4-28
Simulating reads for chr1:47-76
Simulating reads for chr2:46-72
Simulating reads for chr2:174-197
Simulating reads for chr3:18-43
Simulating reads for chr3:78-106
Simulating reads for chr4:35-60
Simulating reads for chr4:87-111
Simulating reads for chr5:40-62
Simulating reads for chr5:156-181
Simulating reads for chr6:19-49
Simulating reads for chr6:61-85
Simulating reads for chr7:12-34
Simulating reads for chr7:153-174
Simulating reads for chr1:4-28
Simulating reads for chr1:47-76
Simulating reads for chr2:4

In [33]:
# Persist the coverage arrays of all BAMs so tests can compare against them
bedcov_path = data_dir / 'variable.bedcov.pkl'
with bedcov_path.open('wb') as handle:
    pickle.dump(coverages, handle)

### `BigWig`

In [34]:
# Read variable.chrom.sizes back as the list of (chrom, size) tuples that pyBigWig's addHeader takes
with open(data_dir / 'variable.chrom.sizes') as sizes_in:
    chromsizes = [
        (chrom, int(size))
        for chrom, size in (line.strip().split() for line in sizes_in)
    ]
chromsizes

[('chr1', 120),
 ('chr2', 400),
 ('chr3', 110),
 ('chr4', 150),
 ('chr5', 300),
 ('chr6', 100),
 ('chr7', 200)]

In [35]:
# Create one bigwig file per simulated BAM from its coverage arrays.
# Entries are written in header order (chromosome rank, then start) rather than by
# sorting chromosome names as strings, which would put e.g. chr10 before chr2.
chrom_rank = {chrom: rank for rank, (chrom, _) in enumerate(chromsizes)}

for bam, coverage in coverages.items():
    outbw = data_dir / bam.replace('.bam', '.bw')

    # Parse every "chrom:start-end" key once and order the intervals
    intervals = []
    for region in coverage:
        chrom, span_text = region.split(':')
        start, end = (int(pos) for pos in span_text.split('-'))
        intervals.append((chrom_rank[chrom], start, end, chrom, region))
    intervals.sort()

    bw = pyBigWig.open(str(outbw), 'w')
    try:
        bw.addHeader(chromsizes, maxZooms=0)
        print(outbw)
        for _, start, end, chrom, region in intervals:
            print(region)
            values = coverage[region].astype("float32").tolist()
            # One value per base: explicit start positions with a span of 1
            bw.addEntries(chrom, list(range(start, end)), values=values, span=1)
    finally:
        # Close the handle even if an entry is rejected
        bw.close()

/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated1.bw
chr1:4-28
chr1:47-76
chr2:46-72
chr2:174-197
chr3:18-43
chr3:78-106
chr4:35-60
chr4:87-111
chr5:40-62
chr5:156-181
chr6:19-49
chr6:61-85
chr7:12-34
chr7:153-174
/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated2.bw
chr1:4-28
chr1:47-76
chr2:46-72
chr2:174-197
chr3:18-43
chr3:78-106
chr4:35-60
chr4:87-111
chr5:40-62
chr5:156-181
chr6:19-49
chr6:61-85
chr7:12-34
chr7:153-174
/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated3.bw
chr1:4-28
chr1:47-76
chr2:46-72
chr2:174-197
chr3:18-43
chr3:78-106
chr4:35-60
chr4:87-111
chr5:40-62
chr5:156-181
chr6:19-49
chr6:61-85
chr7:12-34
chr7:153-174
/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated4.bw
chr1:4-28
chr1:47-76
chr2:46-72
chr2:174-197
chr3:18-43
chr3:78-106
chr4:35-60
chr4:87-111
chr5:40-62
chr5:156-181
chr6:19-49
chr6:61-85
chr7:12-34
chr7:153-174
/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/simulated5.bw
c

In [36]:
# Sanity check: reopen the first simulated bigWig and show its header summary
simulated1_bw = data_dir / 'simulated1.bw'
bw = pyBigWig.open(str(simulated1_bw))
bw_header = bw.header()
print(bw_header)
bw.close()

{'version': 4, 'nLevels': 0, 'nBasesCovered': 348, 'minVal': 0, 'maxVal': 7, 'sumData': 904, 'sumSquared': 3438}


## Tabular

### `variable.tsv`

In [37]:
# Tabular fixture built from the variable-length sequences

# One scalar target per sequence, drawn from a standard normal distribution
targets = np.random.normal(size=(len(seqs), 1))

# Pair each sequence with its target and write variable.tsv (tab-separated, with header)
variable_columns = {'seq': seqs, 'target': targets.flatten()}
df = pd.DataFrame(variable_columns)
df.to_csv(data_dir / 'variable.tsv', sep='\t', index=False)

### `fixed.tsv`

In [38]:
# Truncate every sequence to its first 20bp
fixed_seqs = [full_seq[:20] for full_seq in seqs]

# Same targets as variable.tsv, paired with the truncated sequences, written to fixed.tsv
df = pd.DataFrame({'seq': fixed_seqs}).assign(target=targets.flatten())
df.to_csv(data_dir / 'fixed.tsv', sep='\t', index=False)

# K562 ATAC-seq data chr22

In [4]:
import pysam

In [5]:
def subsample_fasta_to_chr22(input_fasta, output_fasta, chromsizes_file, chrom="chr22", line_width=80):
    """
    Extracts a single chromosome from a FASTA file and writes the matching chromsizes file.

    The contig is validated and read before any output file is opened, so a
    missing contig no longer leaves behind (or truncates) the output files.

    Parameters:
        input_fasta (str): Path to the input FASTA file.
        output_fasta (str): Path to the output FASTA file containing only ``chrom``.
        chromsizes_file (str): Path to the output chromsizes file for ``chrom``.
        chrom (str): Name of the contig to extract. Defaults to "chr22".
        line_width (int): Number of bases per sequence line in the output FASTA.

    Raises:
        ValueError: If ``chrom`` is not present in the input FASTA.
    """
    with pysam.FastaFile(input_fasta) as fasta_in:
        if chrom not in fasta_in.references:
            raise ValueError(f"{chrom} not found in the input FASTA file.")

        sequence = fasta_in.fetch(chrom)
        length = fasta_in.get_reference_length(chrom)

    with open(output_fasta, 'w') as fasta_out:
        fasta_out.write(f">{chrom}\n")
        for i in range(0, len(sequence), line_width):  # Wrap sequence at line_width characters
            fasta_out.write(sequence[i:i+line_width] + "\n")

    # Write the contig's size to the chromsizes file
    with open(chromsizes_file, 'w') as chromsizes_out:
        chromsizes_out.write(f"{chrom}\t{length}\n")

In [6]:
# Source genome (full hg38) and the chr22-only outputs inside the test data folder
_k562_source_dir = "/cellar/users/aklie/data/datasets/SeqDatasets/K562_ATAC-seq/data"
_k562_test_dir = "/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/K562_ATAC-seq_chr22"
input_fasta = f"{_k562_source_dir}/hg38.fa"
output_fasta = f"{_k562_test_dir}/hg38.chr22.fa"
chromsizes_file = f"{_k562_test_dir}/hg38.chr22.chromsizes"

In [7]:
# Extract chr22 from the genome FASTA and write its chromsizes file
subsample_fasta_to_chr22(
    input_fasta=input_fasta,
    output_fasta=output_fasta,
    chromsizes_file=chromsizes_file,
)


Caching the list of root modules, please wait!
(This will only be done once - type '%rehashx' to reset cache!)



In [8]:
# Full peak set (input) and its chr22-only subset (output)
input_bed, output_bed = (
    "/cellar/users/aklie/data/datasets/SeqDatasets/K562_ATAC-seq/data/ENCSR868FGK_K562_ATAC-seq_peaks.bed",
    "/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/K562_ATAC-seq_chr22/ENCSR868FGK.chr22.bed",
)

In [9]:
# Subsample the BED file to include only chr22.
# The chromosome column is compared exactly: a prefix test would also keep
# alternate/unplaced contigs such as chr22_KI270731v1_random.
with open(input_bed, 'r') as bed_in, open(output_bed, 'w') as bed_out:
    for line in bed_in:
        fields = line.split(None, 1)
        if fields and fields[0] == "chr22":
            bed_out.write(line)

In [10]:
# Merged genome-wide alignments (input) and the chr22-only BAM to create (output)
_merged_bam_dir = '/cellar/users/aklie/data/datasets/SeqDatasets/K562_ATAC-seq/data'
_chr22_bam_dir = '/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/K562_ATAC-seq_chr22'
input_bam = '/'.join((_merged_bam_dir, 'merged.bam'))
output_bam = '/'.join((_chr22_bam_dir, 'ENCSR868FGK.chr22.bam'))

In [12]:
# Subsample bam file to include only one chromosome (chr22 by default) using pysam
def subsample_bam_to_chr22(input_bam, output_bam, chrom="chr22"):
    """
    Subsamples a BAM file to include only reads mapped to one chromosome.

    The output reuses the input header unchanged, so it still lists every
    reference sequence of the input even though only ``chrom`` has reads.

    Parameters:
        input_bam (str): Path to the input BAM file. NOTE(review): region fetching
            normally needs an index next to the BAM; confirm one exists.
        output_bam (str): Path to the output BAM file with only ``chrom`` reads.
        chrom (str): Name of the reference sequence to keep. Defaults to "chr22".
    """
    with pysam.AlignmentFile(input_bam, "rb") as bam_in:
        with pysam.AlignmentFile(output_bam, "wb", header=bam_in.header) as bam_out:
            # Copy every read fetched for the requested chromosome
            for read in bam_in.fetch(chrom):
                bam_out.write(read)

# Write the chr22-only BAM
subsample_bam_to_chr22(input_bam=input_bam, output_bam=output_bam)

In [13]:
# Build the index for the chr22-only BAM
pysam.index(str(output_bam))

''

%%bash
# use chrombpnet to get unstranded counts bigwig with correct shift (+4/-4)
source activate chrombpnet
script=/cellar/users/aklie/opt/chrombpnet/chrombpnet/helpers/preprocessing/reads_to_bigwig.py
chr22_dir=/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/K562_ATAC-seq_chr22

# Assemble the command as an argument array, print it, then run it
cmd=(
    python "$script"
    --genome "$chr22_dir/hg38.chr22.fa"
    --input-bam-file "$chr22_dir/ENCSR868FGK.chr22.bam"
    --chrom-sizes "$chr22_dir/hg38.chr22.chromsizes"
    --output-prefix "$chr22_dir/ENCSR868FGK.chr22"
    --data-type ATAC
)
echo "${cmd[@]}"
"${cmd[@]}"

%%bash
# use bam2bw to get bigwig
#bam2bw [-h] -s SIZES -n NAME [-ps POS_SHIFT] [-ns NEG_SHIFT] [-v] filename [filename ...]
chr22_dir=/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/K562_ATAC-seq_chr22

# Assemble the command as an argument array, print it, then run it
cmd=(
    bam2bw
    -s "$chr22_dir/hg38.chr22.chromsizes"
    -n "$chr22_dir/ENCSR868FGK.chr22.bam2bw"
    "$chr22_dir/ENCSR868FGK.chr22.bam"
)
echo "${cmd[@]}"
"${cmd[@]}"

# de Boer et al. sample data

In [16]:
import pandas as pd

wget https://zenodo.org/records/10633252/files/filtered_test_data_with_MAUDE_expression.txt?download=1  -O /cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/yeast_promoters/filtered_test_data_with_MAUDE_expression.txt

In [18]:
# Add seq and exp columns as headers.
# The file is rewritten in place, so first check whether the header is already there:
# re-running this cell would otherwise turn the header row into a data row.
yeast_path = '/cellar/users/aklie/projects/ML4GLand/SeqData/tests/data/yeast_promoters/filtered_test_data_with_MAUDE_expression.txt'
with open(yeast_path) as yeast_in:
    first_fields = yeast_in.readline().rstrip('\n').split('\t')

if first_fields == ['seq', 'exp']:
    # Header already present: just load the table
    df = pd.read_csv(yeast_path, sep='\t')
else:
    df = pd.read_csv(yeast_path, sep='\t', header=None, names=['seq', 'exp'])
    df.to_csv(yeast_path, sep='\t', index=False)

# DONE!

---