# Convert bigwigs to new data points for high granular training

In [None]:
import pyBigWig
import numpy as np
import sys, os

bed_file = os.path.expanduser('/home/sasse/UW/CutandRun/ImmGen_ATACpeak.final.bed6')
# read in bed file
bed_data = np.genfromtxt(bed_file, delimiter = '\t', dtype=str)
unique_chroms = np.unique(bed_data[:,0])



# Iterate over all regions in the bed file, extend them by 1000bp on each side, and merge them  if they overlap
binned_regions = []
for chrom in unique_chroms:
    print(f'Processing {chrom}')
    merged_regions = []
    chrom_data = bed_data[bed_data[:,0] == chrom]
    max_end = np.amax(chrom_data[:,2].astype(int)) + 1875
    print(f'Max end: {max_end}')
    covered_bases = np.zeros(np.amax(chrom_data[:,2].astype(int)) + 1875, dtype=bool)
    for i, region in enumerate(chrom_data):
        start = int(region[1]) - 1875
        end = int(region[2]) + 1875
        if start < 0:
            start = 0
        covered_bases[start:end] = True
    # find contiguous regions of True values in covered_bases
    in_region = False
    for i in range(len(covered_bases)):
        if covered_bases[i] and not in_region:
            in_region = True
            start = i
        elif not covered_bases[i] and in_region:
            in_region = False
            end = i
            merged_regions.append((chrom, start, end))
    if in_region:
        end = len(covered_bases)
        merged_regions.append((chrom, start, end))
    # split merged regions in 100bp bins, extending the region if its length cannot be divided by 100
    for region in merged_regions:
        chrom, start, end = region
        length = end - start
        if length % 100 != 0:
            start -= (100-(length % 100)) // 2
            if start < 0:
                start = 0
            end += (100 - (length % 100)) // 2
        for i in range(start, end, 100):
            binned_regions.append((chrom, i, i + 100))

binned_regions = np.array(binned_regions)
np.savetxt('/home/sasse/UW/CutandRun/ImmGen_ATACpeak.final.bed.ext4kb.100bp_bins.bed', binned_regions, fmt='%s', delimiter='\t')



Processing chr1
Max end: 195311101
Processing chr10
Max end: 130544336
Processing chr11
Max end: 121850122
Processing chr12
Max end: 120030318
Processing chr13
Max end: 120322911
Processing chr14
Max end: 124802771
Processing chr15
Max end: 103906926
Processing chr16
Max end: 98084753
Processing chr17
Max end: 94866872
Processing chr18
Max end: 90600219
Processing chr19
Max end: 61314476
Processing chr2
Max end: 182002468
Processing chr3
Max end: 159912168
Processing chr4
Max end: 156257635
Processing chr5
Max end: 151736177
Processing chr6
Max end: 149568645
Processing chr7
Max end: 145331380
Processing chr8
Max end: 129268158
Processing chr9
Max end: 124480328
Processing chrX
Max end: 170678472
Processing chrY
Max end: 90814906


In [12]:
import time 
# Read in extended bed file
binned_regions = np.genfromtxt('/home/sasse/UW/CutandRun/ImmGen_ATACpeak.final.bed.ext4kb.100bp_bins.bed', delimiter='\t', dtype=str)
binned_region_names = np.array([f"{r[0]}_{r[1]}-{r[2]}" for r in binned_regions])

# Load bigWig files
bw_path = os.path.expanduser('~/UW/CutandRun/ATACbw')
# Get list of all .bw files in this folder
bw_files = [f for f in os.listdir(bw_path) if f.endswith('.bw') and f.startswith('GSE')]
bw_files = [os.path.join(bw_path, f) for f in bw_files]
print(f"Found {len(bw_files)} bigWig files.")

data_matrix = np.zeros((binned_regions.shape[0], len(bw_files)), dtype=np.float32)
celltypes = []
row_mask = np.ones(binned_regions.shape[0], dtype=bool)
# Iterate over all the bigWig files and open them
for j, f in enumerate(bw_files):
    print(f"Opening {f}")
    bws = pyBigWig.open(f)
    ul = os.path.basename(f).replace('.bw','')
    ul = ul.split('_',1)[1]
    celltypes.append(ul)
    print(f"Processing file: {ul}")
    #get the chromosome sizes from the bigWig file
    chrom_sizes = bws.chroms()
    # iterate over the chromosomes and regions in the bed file
    start_time = time.time()
    for i, (chrom, start, end) in enumerate(binned_regions):
        # Get values from the bigWig file for this region
        start = int(start)
        end = int(end)
        if chrom not in chrom_sizes.keys():
            raise ValueError(f"Chromosome {chrom} not found in bigWig file {f}. Skipping.")
        if end <= chrom_sizes[chrom]:
            vals = bws.values(chrom, start, end, numpy=True)
            data_matrix[i, j] = np.around(np.nanmean(vals) * 100 if vals is not None else 0, 2)
        else:
            print(f"Region {chrom}:{start}-{end} exceeds chromosome size {chrom_sizes[chrom]}. Skipping this region.")
            row_mask[i] = False
    bws.close()
    print(f"Finished processing {f}")
    end_time = time.time()
    print(f"Time taken: {end_time - start_time:.2f} seconds")

# Save the data matrix to a file
np.savez_compressed('/home/sasse/UW/CutandRun/ATACbw/ImmGen_ATACpeak.final.bed.ext4kb.100bp_bins.bw_matrix.npz', counts=data_matrix, names=binned_region_names, celltypes=celltypes)

Found 90 bigWig files.
Opening /home/sasse/UW/CutandRun/ATACbw/GSE100738_DC.8+.Sp.bw
Processing file: DC.8+.Sp
Finished processing /home/sasse/UW/CutandRun/ATACbw/GSE100738_DC.8+.Sp.bw
Time taken: 153.22 seconds
Opening /home/sasse/UW/CutandRun/ATACbw/GSE100738_B.mem.Sp.bw
Processing file: B.mem.Sp


KeyboardInterrupt: 