# Translate data matrices into bedgraph files for visualization

In [10]:
import numpy as np
import sys
import os 

# Input locations: the count matrix (.npz) lives under the user's home directory,
# so '~' is expanded for it.
data_path = os.path.expanduser('~/UW/CutandRun/Processed_Count_Matrix/Tier1_Tier2/')
data_file = os.path.join(data_path, 'ImmGen_ATACpeak.final_mouse_peak_heights.npz')

# These two paths are already absolute and contain no '~', so they are used verbatim.
bed_file = '/home/sasse/UW/CutandRun/ImmGen_ATACpeak.final.bed6'
lineage_file = '/home/sasse/Git/DRGwebserver/data/CutandRun_and_ATAC.lineages.modsep.tsv'

# Tab-separated lineage table, read entirely as strings.
lineage_array = np.genfromtxt(lineage_file, delimiter='\t', dtype=str)

# Load the count matrix with its row labels ('names') and column labels ('celltypes').
with np.load(data_file) as data:
    peak_counts = data['counts']
    peak_names = data['names'].astype(str)
    peak_cell_types = data['celltypes'].astype(str)

# Reorder the rows by peak name so they can be compared against the
# name-sorted BED table further down.
sorted_indices = peak_names.argsort()
peak_names = peak_names[sorted_indices]
peak_counts = peak_counts[sorted_indices]

# Distinct lineage labels, taken from the second column of the lineage table.
lineages = np.unique(lineage_array[:, 1])

# Load the BED table as strings and order its rows by the name column (index 3).
bed_data = np.genfromtxt(bed_file, dtype=str, delimiter='\t')
bed_data = bed_data[np.argsort(bed_data[:, 3])]

# Keep only the peaks that occur in both the count matrix and the BED table.
peak_mask = np.isin(peak_names, bed_data[:, 3])
bed_mask = np.isin(bed_data[:, 3], peak_names)

# Both sides are name-sorted, so after masking the name lists must be identical;
# anything else means the rows cannot be paired up and we stop here.
if not np.array_equal(peak_names[peak_mask], bed_data[bed_mask, 3]):
    print("Peaks and BED data do not match after masking.")
    sys.exit(1)

peak_counts = peak_counts[peak_mask]
peak_names = peak_names[peak_mask]
bed_data = bed_data[bed_mask]


# For every lineage, average the count matrix over the columns whose cell type
# belongs to that lineage and write the result as a bedGraph-style file
# (first three BED columns + mean value), next to the input .npz file.
for idx, lin in enumerate(lineages, start=1):
    print(f"Processing lineage {idx}/{len(lineages)}: {lin}")
    # Cell types annotated with the current lineage. Only the cell-type column
    # is kept so that the lineage label itself (column 1) can never be matched
    # against a data column by accident.
    # NOTE(review): assumes column 0 of the lineage table holds the cell-type
    # name as it appears in the 'celltypes' array of the .npz -- confirm.
    lineage_cell_types = lineage_array[lineage_array[:, 1] == lin, 0]
    print(f' Has {len(lineage_cell_types)} cell types in the lineage table')
    # Columns of the count matrix that belong to this lineage.
    lineage_mask = np.isin(peak_cell_types, lineage_cell_types)
    n_matched = int(lineage_mask.sum())
    print(f' Has {n_matched} matching columns in the count matrix')
    if n_matched == 0:
        # Nothing measured for this lineage in this matrix; no file is written.
        continue
    # Mean over the lineage's columns, one value per peak.
    lineage_peak_counts = peak_counts[:, lineage_mask].mean(axis=1)
    # save lineage bedgraph file
    lineage_bed_file = os.path.splitext(data_file)[0] + f"_{lin}.bedgraph"
    np.savetxt(
        lineage_bed_file,
        np.column_stack((bed_data[:, :3], lineage_peak_counts)),
        fmt='%s',
        delimiter='\t',
    )
    print(f"Saved lineage bedgraph file: {lineage_bed_file}")

Processing lineage 1/112: B.ATAC
 Has 33 cell types
 Has 18 cell types
Saved lineage bedgraph file: /home/sasse/UW/CutandRun/Processed_Count_Matrix/Tier1_Tier2/ImmGen_ATACpeak.final_mouse_peak_heights_B.ATAC.bedgraph
Processing lineage 2/112: B.CTCF
 Has 33 cell types
 Has 0 cell types
Processing lineage 3/112: B.H33
 Has 33 cell types
 Has 0 cell types
Processing lineage 4/112: B.H3K27ac
 Has 33 cell types
 Has 0 cell types
Processing lineage 5/112: B.H3K27me3
 Has 33 cell types
 Has 0 cell types
Processing lineage 6/112: B.H3K36me3
 Has 33 cell types
 Has 0 cell types
Processing lineage 7/112: B.H3K4me1
 Has 33 cell types
 Has 0 cell types
Processing lineage 8/112: B.H3K4me3
 Has 33 cell types
 Has 0 cell types
Processing lineage 9/112: Basophil.ATAC
 Has 2 cell types
 Has 0 cell types
Processing lineage 10/112: Basophil.CTCF
 Has 2 cell types
 Has 0 cell types
Processing lineage 11/112: Basophil.H33
 Has 2 cell types
 Has 0 cell types
Processing lineage 12/112: Basophil.H3K27ac
 Has