In [1]:
import h5py
import os
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.io import mmwrite

# Make the analysis project directory the working directory so that the
# relative "output/..." paths used in later cells resolve against it.
# NOTE(review): absolute, machine-specific mount point -- the notebook only
# runs where this volume is mounted at exactly this location.
os.chdir("/Volumes/salomonis2/LabFiles/Kyle/Analysis/2023_06_12_tea_seq_atac_processing/")

In [2]:
# Root output directory (relative to the working directory set above).
# The export loop writes one sub-directory per sample underneath it, each
# holding pmat.mtx, barcodes.tsv and peaks.tsv.
path_out_mtx_dirs = "output/tea_r7_pmat_macs2_p_0_001_tss_added/"

In [3]:
# Directory that holds the per-sample SNAP files (opened later with h5py).
path_snap_objects = (
    "/Volumes/salomonis2/LabFiles/Kyle/Analysis/2022_01_11_mouse_tea_seq_atac/input/snap_atac_input/"
)

# Sample label -> SNAP file name inside `path_snap_objects`.
# The label is also used as the name of the sample's output sub-directory.
snap_files = {
    "M1": "AS_TEAr_ML1_atac.snap",
    "M2": "AS_TEAr_ML2_atac.snap",
    "H1": "AS_TEAr_H1_atac.snap",
    "H2": "AS_TEAr_H2_atac.snap",
}

In [4]:
# Load the merged peak set: a tab-separated, header-less BED-style table, so
# pandas numbers the columns 0..5 (chrom, start, end, name, score, strand-like
# placeholder as seen in the displayed preview).
merged_peaks = pd.read_csv(
    "output/tea_r7_macs2_p_0_05_merged_peak_set/"
    "r7_tea_p_0_001_merged_peaks_with_tss_added.bed",
    sep="\t",
    header=None,
)

merged_peaks

Unnamed: 0,0,1,2,3,4,5
0,chr1,3002997,3003997,CLP1_Rrm2,3.43433,.
1,chr1,3014437,3015437,MultiLin_1_MEP,3.44355,.
2,chr1,3020384,3021384,proNeu_1,4.73524,.
3,chr1,3026360,3027360,DN4_DP_trans_Hist1h3c,6.88481,.
4,chr1,3037358,3038358,MDP_Irf8,3.77206,.
...,...,...,...,...,...,...
757535,chrY,90834621,90835621,pre_MultiLin_2,3.89979,.
757536,chrY,90836061,90837061,MultiLin_2_F13a1,3.85690,.
757537,chrY,90839304,90840304,ST_HSC,18.27197,.
757538,chrY,90841321,90842321,ETP_CC_4,3.69548,.


In [5]:
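# Optional, intentionally disabled: open one SNAP file by hand to inspect its
# HDF5 layout (root-level groups and the datasets of the peak matrix group).
# tmp_filename = os.path.join(path_snap_objects, snap_files["M1"])
# f = h5py.File(tmp_filename, "r")
# # Root level object names
# f.keys()
# # Peak matrix keys
# f['PM'].keys()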

In [6]:
# For every sample: read the peak matrix ('PM') and barcodes ('BD') stored in
# its SNAP file and write them out as
#   <path_out_mtx_dirs>/<sample>/pmat.mtx      sparse cell-by-peak counts
#   <path_out_mtx_dirs>/<sample>/barcodes.tsv  one barcode per matrix row
#   <path_out_mtx_dirs>/<sample>/peaks.tsv     the merged peak set (matrix columns)
# Raises ValueError when a SNAP file's peak count differs from `merged_peaks`.
for tmp_sample, tmp_snap_file in snap_files.items():
    print("Working on sample {}...".format(tmp_sample))
    tmp_path_to_snap = os.path.join(path_snap_objects, tmp_snap_file)
    tmp_outdir = os.path.join(path_out_mtx_dirs, tmp_sample)
    # Check and make output directory. makedirs (not mkdir) is used so the
    # parent `path_out_mtx_dirs` is created as well when it does not exist yet;
    # os.mkdir would raise FileNotFoundError in that case.
    if not os.path.isdir(tmp_outdir):
        print("\tGenerating output directory...")
        os.makedirs(tmp_outdir, exist_ok=True)
    else:
        print("\tWARNING! Output directory already exists... Will overwrite...")

    # Process SNAP file to pull out PMAT. Everything is copied into in-memory
    # numpy/pandas/scipy objects so the HDF5 handle can be closed before the
    # (slow) file writes below.
    with h5py.File(tmp_path_to_snap, "r") as snap_h5:
        # NOTE(review): if h5py hands back variable-length byte strings as
        # object arrays, astype(str) yields "b'...'" reprs -- confirm that
        # barcodes.tsv contains clean barcodes.
        tmp_peaks = pd.DataFrame({
            "chr": np.array(snap_h5['PM']['peakChrom']).astype(str),
            "start": np.array(snap_h5['PM']['peakStart']).astype(int),
            "end": np.array(snap_h5['PM']['peakEnd']).astype(int)})
        tmp_barcodes = np.array(snap_h5['BD']['name']).astype(str)

        # peaks.tsv and the matrix width both come from `merged_peaks`, so the
        # peak set stored in the SNAP file must have the same number of peaks;
        # otherwise the matrix columns would be labelled with the wrong peaks.
        if tmp_peaks.shape[0] != merged_peaks.shape[0]:
            raise ValueError(
                "Sample {}: SNAP file {} stores {} peaks but the merged peak "
                "set has {} rows.".format(
                    tmp_sample, tmp_path_to_snap,
                    tmp_peaks.shape[0], merged_peaks.shape[0]))

        # Rows are barcodes (idx), columns are peaks (idy). The stored indices
        # are shifted by -1 to become 0-based (presumably 1-based on disk).
        tmp_csr_matrix = csr_matrix(
            (
                np.array(snap_h5['PM']['count']).astype(int),
                (
                    np.array(snap_h5['PM']['idx']).astype(int) - 1,
                    np.array(snap_h5['PM']['idy']).astype(int) - 1)),
            shape=(len(tmp_barcodes), merged_peaks.shape[0]),
            dtype=np.int32)

    print("\tWriting out sparse mtx file...")
    mmwrite(
        os.path.join(tmp_outdir, "pmat.mtx"),
        tmp_csr_matrix)
    pd.Series(tmp_barcodes).to_csv(os.path.join(tmp_outdir, "barcodes.tsv"),
        sep="\t", header=False, index=False)
    merged_peaks.to_csv(os.path.join(tmp_outdir, "peaks.tsv"),
        sep="\t", header=False, index=False)

    print("")
        
        

Working on sample M1...
	Writing out sparse mtx file...

Working on sample M2...
	Writing out sparse mtx file...

Working on sample H1...
	Writing out sparse mtx file...

Working on sample H2...
	Writing out sparse mtx file...

