# Merge bigwig files by lineage

In [None]:
import pyBigWig
import numpy as np
import sys, os

# Folder holding the per-sample bigWig tracks; merged outputs are written here too.
bw_path = os.path.expanduser('~/UW/CutandRun/ATACbw')
# Get list of all .bw files in this folder. Files named merged_sum_*.bw are the
# outputs of this cell, so they are skipped to keep a re-run from feeding
# earlier (possibly partial) results back in as inputs.
bw_files = sorted(
    os.path.join(bw_path, name)
    for name in os.listdir(bw_path)
    if name.endswith('.bw') and not name.startswith('merged_sum_')
)

lineage_file = os.path.expanduser('~/UW/CutandRun/atac_lineages.lineages.txt')
# Column 0 is the cell type, column 1 its lineage. atleast_2d keeps the column
# indexing below valid when the file contains a single row.
lineages = np.atleast_2d(np.genfromtxt(lineage_file, dtype=str, delimiter='\t'))

unique_lineages = np.unique(lineages[:, 1])

for u, ul in enumerate(unique_lineages):
    cell_types = lineages[lineages[:, 1] == ul, 0]
    print(f"Lineage {u}: {ul}, Cell Types: {len(cell_types)}")

    # Match cell types against the file name only: matching the full path would
    # select every file whenever a cell-type name occurs in a directory name.
    lineage_files = [
        f for f in bw_files
        if any(ct in os.path.basename(f) for ct in cell_types)
    ]
    print(f"Found {len(lineage_files)} bigWig files for lineage {ul}")
    if not lineage_files:
        # Nothing to merge; bws[0] below would otherwise raise IndexError.
        print(f"No files found for lineage {ul}, skipping...")
        continue

    bws = []
    out_bw = None
    try:
        # Open all files for this lineage
        for f in lineage_files:
            bws.append(pyBigWig.open(f))

        # Get chromosome sizes from the first file
        chroms = bws[0].chroms()
        print(f"Chromosomes: {chroms}")

        # Output file
        out_bw = pyBigWig.open(f"{bw_path}/merged_sum_{ul}.bw", "w")
        out_bw.addHeader(list(chroms.items()))

        # Iterate over chromosomes
        for chrom, length in chroms.items():
            print(f"Processing chromosome: {chrom}, length: {length}")
            if length == 0:
                continue

            # Accumulate one track at a time (NaN counted as 0) instead of
            # stacking all tracks into a (n_files, length) array, which needs
            # n_files times more memory.
            summed_vals = np.zeros(length, dtype=np.float64)
            for bw in bws:
                vals = bw.values(chrom, 0, length, numpy=True)
                summed_vals += np.where(np.isnan(vals), 0.0, vals)
            # Replace any remaining NaNs (e.g. inf + -inf) with 0
            summed_vals = np.nan_to_num(summed_vals, nan=0.0)

            # Run-length encode into bedGraph-like blocks (start, end, value):
            # `change` holds every index whose value differs from its predecessor.
            change = np.flatnonzero(summed_vals[1:] != summed_vals[:-1]) + 1
            run_starts = np.concatenate(([0], change))
            run_ends = np.concatenate((change, [length]))
            run_vals = summed_vals[run_starts]

            # Only keep non-zero runs to reduce file size
            keep = run_vals != 0.0
            starts_list = run_starts[keep].tolist()
            ends_list = run_ends[keep].tolist()
            values_list = run_vals[keep].tolist()

            # Add all entries for this chromosome at once
            if starts_list:
                print(f"Adding {len(starts_list)} intervals for {chrom}")
                out_bw.addEntries(
                    chroms=[chrom] * len(starts_list),
                    starts=starts_list,
                    ends=ends_list,
                    values=values_list,
                )
            else:
                print(f"No non-zero intervals found for {chrom}")
    finally:
        # Close files even when a chromosome fails part-way through
        for bw in bws:
            bw.close()
        if out_bw is not None:
            out_bw.close()

Lineage 0: B, Cell Types: 19
Found 19 bigWig files for lineage B
Chromosomes: {'chr1': 195471971, 'chr10': 130694993, 'chr11': 122082543, 'chr12': 120129022, 'chr13': 120421639, 'chr14': 124902244, 'chr15': 104043685, 'chr16': 98207768, 'chr17': 94987271, 'chr18': 90702639, 'chr19': 61431566, 'chr1_GL456210_random': 169725, 'chr1_GL456211_random': 241735, 'chr1_GL456212_random': 153618, 'chr1_GL456213_random': 39340, 'chr1_GL456221_random': 206961, 'chr2': 182113224, 'chr3': 160039680, 'chr4': 156508116, 'chr4_GL456216_random': 66673, 'chr4_GL456350_random': 227966, 'chr4_JH584292_random': 14945, 'chr4_JH584293_random': 207968, 'chr4_JH584294_random': 191905, 'chr4_JH584295_random': 1976, 'chr5': 151834684, 'chr5_GL456354_random': 195993, 'chr5_JH584296_random': 199368, 'chr5_JH584297_random': 205776, 'chr5_JH584298_random': 184189, 'chr5_JH584299_random': 953012, 'chr6': 149736546, 'chr7': 145441459, 'chr7_GL456219_random': 175968, 'chr8': 129401213, 'chr9': 124595110, 'chrM': 16299, 

KeyboardInterrupt: 

In [None]:
def merge_bigwig_by_lineage(bw_path, lineage_file, output_dir=None, window_size=None):
    """
    Merge BigWig files by lineage with optional windowing.

    For every lineage listed in `lineage_file`, the signal of all tracks whose
    file name contains one of the lineage's cell-type names is summed per base
    (NaN counted as 0) and written to a new BigWig file. Only non-zero runs of
    equal value are written.

    Parameters:
    - bw_path: Path to directory containing BigWig files (*.bw). Files named
      merged_sum_*.bw are ignored so that outputs of an earlier run are never
      used as inputs.
    - lineage_file: Path to tab-delimited file with cell types (column 0) and
      lineages (column 1)
    - output_dir: Output directory (defaults to bw_path)
    - window_size: If specified, the summed signal is averaged over windows of
      this many base pairs instead of being written at base-pair resolution

    Returns:
    - List of output file paths created (lineages without matching files are
      skipped and produce no output)
    """
    import pyBigWig
    import numpy as np
    import os

    if output_dir is None:
        output_dir = bw_path

    # Get list of all .bw input files in this folder (sorted for a stable order)
    bw_files = sorted(
        os.path.join(bw_path, name)
        for name in os.listdir(bw_path)
        if name.endswith('.bw') and not name.startswith('merged_sum_')
    )

    # Load lineage information; atleast_2d keeps the column indexing valid when
    # the file contains a single row.
    lineages = np.atleast_2d(np.genfromtxt(lineage_file, dtype=str, delimiter='\t'))
    unique_lineages = np.unique(lineages[:, 1])

    output_files = []

    for u, ul in enumerate(unique_lineages):
        cell_types = lineages[lineages[:, 1] == ul, 0]
        print(f"Lineage {u}: {ul}, Cell Types: {len(cell_types)}")

        # Match on the file name only: matching the full path would select every
        # file whenever a cell-type name occurs in a directory name.
        lineage_files = [
            f for f in bw_files
            if any(ct in os.path.basename(f) for ct in cell_types)
        ]
        print(f"Found {len(lineage_files)} bigWig files for lineage {ul}")

        if not lineage_files:
            print(f"No files found for lineage {ul}, skipping...")
            continue

        # Output file
        if window_size:
            output_file = os.path.join(output_dir, f"merged_sum_{ul}_win{window_size}.bw")
        else:
            output_file = os.path.join(output_dir, f"merged_sum_{ul}.bw")

        bws = []
        out_bw = None
        try:
            # Open all files for this lineage
            for f in lineage_files:
                bws.append(pyBigWig.open(f))

            # Get chromosome sizes from the first file
            chroms = bws[0].chroms()
            print(f"Chromosomes: {list(chroms.keys())}")

            out_bw = pyBigWig.open(output_file, "w")
            out_bw.addHeader(list(chroms.items()))

            # Iterate over chromosomes
            for chrom, length in chroms.items():
                print(f"Processing chromosome: {chrom}, length: {length}")
                if length == 0:
                    continue

                summed_vals = _sum_tracks(bws, chrom, length)

                # Apply windowing if specified; `step` converts an index of the
                # (possibly windowed) array back to a genomic coordinate.
                if window_size:
                    summed_vals = apply_windowing(summed_vals, window_size, length)
                    step = window_size
                else:
                    step = 1

                starts_list, ends_list, values_list = _nonzero_runs(summed_vals, step, length)

                # Add all entries for this chromosome at once
                if starts_list:
                    print(f"Adding {len(starts_list)} intervals for {chrom}")
                    out_bw.addEntries(
                        chroms=[chrom] * len(starts_list),
                        starts=starts_list,
                        ends=ends_list,
                        values=values_list,
                    )
                else:
                    print(f"No non-zero intervals found for {chrom}")
        finally:
            # Close files even when a chromosome fails part-way through
            for bw in bws:
                bw.close()
            if out_bw is not None:
                out_bw.close()

        output_files.append(output_file)
        print(f"Created: {output_file}")

    return output_files


def _sum_tracks(bws, chrom, length):
    """Return the per-base sum over all open tracks for `chrom`, NaN counted as 0."""
    import numpy as np

    # Accumulate one track at a time instead of stacking all tracks into a
    # (n_files, length) array, which needs n_files times more memory.
    total = np.zeros(length, dtype=np.float64)
    for bw in bws:
        vals = bw.values(chrom, 0, length, numpy=True)
        total += np.where(np.isnan(vals), 0.0, vals)
    # Replace any remaining NaNs (e.g. inf + -inf) with 0
    return np.nan_to_num(total, nan=0.0)


def _nonzero_runs(values, step, chrom_length):
    """Run-length encode `values` into (starts, ends, values) lists of its non-zero runs.

    Array indices are multiplied by `step` to obtain genomic coordinates and
    interval ends are clipped to `chrom_length` (the last window may be partial).
    """
    import numpy as np

    if len(values) == 0:
        return [], [], []

    # `change` holds every index whose value differs from its predecessor
    change = np.flatnonzero(values[1:] != values[:-1]) + 1
    run_starts = np.concatenate(([0], change))
    run_ends = np.concatenate((change, [len(values)]))
    run_vals = values[run_starts]

    # Only keep non-zero runs to reduce file size
    keep = run_vals != 0.0
    starts = run_starts[keep] * step
    ends = np.minimum(run_ends[keep] * step, chrom_length)
    return starts.tolist(), ends.tolist(), run_vals[keep].astype(np.float64).tolist()


def apply_windowing(values, window_size, original_length):
    """
    Reduce resolution by replacing each window of values with its mean.

    Parameters:
    - values: 1D numpy array of values
    - window_size: Size of each window in base pairs
    - original_length: Original length of the chromosome (not used by the
      computation)

    Returns:
    - 1D array with one mean per complete window, followed by the mean of the
      trailing partial window when the input length is not a multiple of
      window_size. Input shorter than one window yields a single mean
      (or [0.0] for empty input).
    """
    import numpy as np

    total = len(values)
    full_windows, leftover = divmod(total, window_size)

    # Input shorter than one window: collapse it to a single value
    if full_windows == 0:
        return np.array([np.mean(values)] if total > 0 else [0.0])

    # Mean of every complete window via a (windows, window_size) view
    covered = full_windows * window_size
    means = np.mean(values[:covered].reshape(full_windows, window_size), axis=1)

    # Trailing partial window, if any, contributes one extra mean
    if leftover > 0:
        means = np.append(means, np.mean(values[covered:]))

    return means

In [11]:
# Example invocations of merge_bigwig_by_lineage.

# Input locations: the folder of bigWig tracks and the tab-delimited table of
# cell types and lineages.
bw_path = os.path.expanduser('~/UW/CutandRun/ATACbw')
lineage_file = os.path.expanduser('~/UW/CutandRun/atac_lineages.lineages.txt')

# Example 1 (disabled): merge at base-pair resolution
#print("=== Merging at base-pair resolution ===")
#output_files = merge_bigwig_by_lineage(bw_path, lineage_file)
#print(f"Created files: {output_files}")

# Example 2: merge with the summed signal averaged over 100bp windows
print("\n=== Merging with 100bp windows ===")
output_files_100bp = merge_bigwig_by_lineage(
    bw_path=bw_path,
    lineage_file=lineage_file,
    window_size=100,
)
print(f"Created files: {output_files_100bp}")


=== Merging with 100bp windows ===
Lineage 0: B, Cell Types: 19
Found 19 bigWig files for lineage B
Chromosomes: ['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr1_GL456210_random', 'chr1_GL456211_random', 'chr1_GL456212_random', 'chr1_GL456213_random', 'chr1_GL456221_random', 'chr2', 'chr3', 'chr4', 'chr4_GL456216_random', 'chr4_GL456350_random', 'chr4_JH584292_random', 'chr4_JH584293_random', 'chr4_JH584294_random', 'chr4_JH584295_random', 'chr5', 'chr5_GL456354_random', 'chr5_JH584296_random', 'chr5_JH584297_random', 'chr5_JH584298_random', 'chr5_JH584299_random', 'chr6', 'chr7', 'chr7_GL456219_random', 'chr8', 'chr9', 'chrM', 'chrUn_GL456239', 'chrUn_GL456359', 'chrUn_GL456360', 'chrUn_GL456366', 'chrUn_GL456367', 'chrUn_GL456368', 'chrUn_GL456370', 'chrUn_GL456372', 'chrUn_GL456378', 'chrUn_GL456379', 'chrUn_GL456381', 'chrUn_GL456382', 'chrUn_GL456383', 'chrUn_GL456385', 'chrUn_GL456387', 'chrUn_GL456389', 'chrUn_GL456390', '