In [17]:
import os
import pandas as pd
import numpy as np
from scipy.io import mmread

# Work from the project root so the relative paths below resolve.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
os.chdir("/data/salomonis2/LabFiles/Kyle/Analysis/2023_06_12_tea_seq_atac_processing/")

# Directory holding the merged peak matrix files (pmat.mtx, peaks.tsv, barcodes.tsv).
path_merged_pmat = "output/tea_r7_pmat_macs2_p_0_001_tss_added/combined/"

# Directory that receives every table written by this notebook.
path_output = "output/tea_r7_pseudobulk_from_pmat/"

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before any later cell tries to write into it.
os.makedirs(path_output, exist_ok=True)

In [5]:
# Load the per-port R7 cell annotation tables and stack them into one frame
path_tea_anno_files = "output/tea_r7_anno_by_port/"

# Port label -> annotation file name
anno_samples = {
    "H1": "tea_seq_AS_TEAr_H1_r7_annotation.tsv",
    "H2": "tea_seq_AS_TEAr_H2_r7_annotation.tsv",
    "M1": "tea_seq_AS_TEAr_M1_r7_annotation.tsv",
    "M2": "tea_seq_AS_TEAr_M2_r7_annotation.tsv"}

per_port_anno = []
for port_label, anno_file in anno_samples.items():
    port_anno = pd.read_table(os.path.join(path_tea_anno_files, anno_file))
    port_anno["port"] = port_label
    # Key every cell as "<port>:<barcode>" so barcodes stay unique across ports
    port_anno.index = port_anno["port"] + ":" + port_anno["barcode"]
    per_port_anno.append(port_anno)

# Drop the loop temporary before building the combined table
del port_anno
tea_anno = pd.concat(per_port_anno)

tea_anno

Unnamed: 0,barcode,R7,port
H1:AAACAGCCAAACCCTA-1,AAACAGCCAAACCCTA-1,ST-HSC,H1
H1:AAACAGCCAATTTAGC-1,AAACAGCCAATTTAGC-1,HSCP-HPC_Tk1,H1
H1:AAACAGCCACCAGCAT-1,AAACAGCCACCAGCAT-1,LT-HSC_Mllt3,H1
H1:AAACAGCCAGCTCAAC-1,AAACAGCCAGCTCAAC-1,HSCP-HPC_Cenpf,H1
H1:AAACAGCCAGGAACTG-1,AAACAGCCAGGAACTG-1,eHSC-Pcna,H1
...,...,...,...
M2:TTTGTTGGTATTCGTC-1,TTTGTTGGTATTCGTC-1,pre-MultiLin-1,M2
M2:TTTGTTGGTCTATCGT-1,TTTGTTGGTCTATCGT-1,IG2-MP,M2
M2:TTTGTTGGTGACCTGG-1,TTTGTTGGTGACCTGG-1,immNeu-3,M2
M2:TTTGTTGGTGTCACGG-1,TTTGTTGGTGTCACGG-1,IG2-proNeu1,M2


In [8]:
# Load the peak annotation that accompanies the pmat
peaks_file = os.path.join(path_merged_pmat, "peaks.tsv")
peaks = pd.read_table(peaks_file, header=None)
# The file is read without a header row, so attach the column labels here
peaks.columns = ["chr", "start", "end", "name", "score", "strand"]

peaks

Unnamed: 0,chr,start,end,name,score,strand
0,chr1,3002997,3003997,CLP1_Rrm2,3.43433,.
1,chr1,3014437,3015437,MultiLin_1_MEP,3.44355,.
2,chr1,3020384,3021384,proNeu_1,4.73524,.
3,chr1,3026360,3027360,DN4_DP_trans_Hist1h3c,6.88481,.
4,chr1,3037358,3038358,MDP_Irf8,3.77206,.
...,...,...,...,...,...,...
757535,chrY,90834621,90835621,pre_MultiLin_2,3.89979,.
757536,chrY,90836061,90837061,MultiLin_2_F13a1,3.85690,.
757537,chrY,90839304,90840304,ST_HSC,18.27197,.
757538,chrY,90841321,90842321,ETP_CC_4,3.69548,.


In [10]:
# Load the cell barcodes that accompany the pmat
barcodes_file = os.path.join(path_merged_pmat, "barcodes.tsv")
barcodes = pd.read_table(barcodes_file, header=None)
# The file is read without a header row; it holds a single barcode column
barcodes.columns = ['barcode']

barcodes

Unnamed: 0,barcode
0,M1:AAACAGCCAACTGGCT-1
1,M1:AAACAGCCAATCGCAC-1
2,M1:AAACAGCCACAATGTT-1
3,M1:AAACAGCCACGTAAGG-1
4,M1:AAACAGCCACTCGCTC-1
...,...
63947,H2:TTTGTGTTCTTACTCG-1
63948,H2:TTTGTTGGTCAGTAAT-1
63949,H2:TTTGTTGGTCCTAGTT-1
63950,H2:TTTGTTGGTGCACGCA-1


In [15]:
# Keep only the annotated cells that are also present in the pmat barcodes
anno_cell_ids = tea_anno.index.values
pmat_cell_ids = barcodes['barcode'].values
# intersect1d yields the sorted, unique values common to both arrays
shared_barcodes = np.intersect1d(anno_cell_ids, pmat_cell_ids)

tea_anno = tea_anno.loc[shared_barcodes]
tea_anno

Unnamed: 0,barcode,R7,port
H1:AAACAGCCAAACCCTA-1,AAACAGCCAAACCCTA-1,ST-HSC,H1
H1:AAACAGCCAATTTAGC-1,AAACAGCCAATTTAGC-1,HSCP-HPC_Tk1,H1
H1:AAACAGCCACCAGCAT-1,AAACAGCCACCAGCAT-1,LT-HSC_Mllt3,H1
H1:AAACAGCCAGCTCAAC-1,AAACAGCCAGCTCAAC-1,HSCP-HPC_Cenpf,H1
H1:AAACAGCCAGGAACTG-1,AAACAGCCAGGAACTG-1,eHSC-Pcna,H1
...,...,...,...
M2:TTTGTTGGTATTCGTC-1,TTTGTTGGTATTCGTC-1,pre-MultiLin-1,M2
M2:TTTGTTGGTCTATCGT-1,TTTGTTGGTCTATCGT-1,IG2-MP,M2
M2:TTTGTTGGTGACCTGG-1,TTTGTTGGTGACCTGG-1,immNeu-3,M2
M2:TTTGTTGGTGTCACGG-1,TTTGTTGGTGTCACGG-1,IG2-proNeu1,M2


In [12]:
# Load the merged peak matrix (pmat) from its .mtx file
pmat_file = os.path.join(path_merged_pmat, "pmat.mtx")
pmat = mmread(pmat_file)
pmat

<63952x757540 sparse matrix of type '<class 'numpy.int64'>'
	with 524599877 stored elements in COOrdinate format>

In [14]:
# Convert to CSR sparse matrix
# (mmread returned the matrix in COO format, which does not support the
# row indexing `pmat[rows, :]` used by the per-cluster step; CSR does)
pmat = pmat.tocsr()

In [19]:
# Count the cells assigned to each R7 cluster and save the counts to disk
r7_cell_vcounts = tea_anno["R7"].value_counts()
r7_counts_file = os.path.join(
    path_output, "r7_cell_counts_tea_seq_merged_captures.csv")
# Written as "cluster,count" rows with no header line
r7_cell_vcounts.to_csv(r7_counts_file, header=False, index=True)
r7_cell_vcounts

IG2-proNeu1              5172
ST-HSC                   3640
HSCP-HPC_Tk1             2892
IG2-MP                   2488
MPP5-Egr1                2366
HSCP-ERP1                2317
MPP4-Hlf                 2235
MDP-Cpa3                 2180
MPP4-Nkx2-3              1781
eHSC                     1699
HSCP-HPC_Hist1h2af       1623
pre-MultiLin-1           1471
MDP-Irf8                 1415
MEP                      1246
MultiLin-2_F13a1         1220
LT-HSC_Mllt3             1182
eHSC-Pcna                1156
HSCP-HPC_Cenpf            833
HSCP-MKP                  759
MPP5-Flt3                 738
MultiLin-1                720
pre-MultiLin-2            692
BMCP                      641
proNeu-1                  624
ML-cell-cycle             584
ERP1                      565
MultiLin-1_MEP            483
MultiLin-2_Ms4a3          474
CLP1-Rrm2                 413
MKP                       266
HSC-Mac_Fcna              253
CD127-MP                  243
ETP-CC-4                  193
Baso      

In [42]:
# Build per-cluster pseudobulk profiles from the pmat.
#   pseudobulk_sums            : one row per pmat column, one column per cluster;
#                                counts summed over the cluster's cells
#   pseudobulk_binary_sums     : same layout; number of the cluster's cells with
#                                a non-zero count
#   cluster_read_counts        : total of all counts per cluster
#   cluster_binary_read_counts : total number of non-zero entries per cluster
pseudobulk_sums = {}
pseudobulk_binary_sums = {}
cluster_read_counts = {}
cluster_binary_read_counts = {}

# Rows of pmat are selected by their position in `barcodes`, so the two must
# line up one-to-one.
assert pmat.shape[0] == len(barcodes), \
    "pmat has {} rows but barcodes lists {} entries".format(
        pmat.shape[0], len(barcodes))

for tmp_cluster in r7_cell_vcounts.index.values:
    print("Working on cluster {}...".format(tmp_cluster))
    tmp_barcodes = tea_anno.loc[tea_anno["R7"] == tmp_cluster].index.values
    # Positional row numbers rather than index labels, so the selection stays
    # correct even if `barcodes` no longer carries its default 0..n-1 index.
    tmp_i = np.flatnonzero(barcodes["barcode"].isin(tmp_barcodes).values)
    # Slice the cluster's rows once; fancy indexing already returns a new
    # matrix, so no extra copy is needed before the dtype conversions.
    tmp_rows = pmat[tmp_i, :]
    tmp_mtx = tmp_rows.astype(np.int32)
    tmp_binary_mtx = (tmp_rows > 0).astype(np.int32)
    cluster_read_counts[tmp_cluster] = tmp_mtx.sum()
    cluster_binary_read_counts[tmp_cluster] = tmp_binary_mtx.sum()
    # sum(axis=0) collapses the cells (rows) and returns a 1 x n_columns matrix;
    # flatten it to a 1-D vector before wrapping it in a Series.
    pseudobulk_sums[tmp_cluster] = pd.Series(
        np.array(tmp_mtx.sum(axis=0)).reshape(-1))
    pseudobulk_binary_sums[tmp_cluster] = pd.Series(
        np.array(tmp_binary_mtx.sum(axis=0)).reshape(-1))

cluster_read_counts = pd.Series(cluster_read_counts)
cluster_binary_read_counts = pd.Series(cluster_binary_read_counts)
pseudobulk_sums = pd.DataFrame(pseudobulk_sums)
pseudobulk_binary_sums = pd.DataFrame(pseudobulk_binary_sums)

Working on cluster IG2-proNeu1...
Working on cluster ST-HSC...
Working on cluster HSCP-HPC_Tk1...
Working on cluster IG2-MP...
Working on cluster MPP5-Egr1...
Working on cluster HSCP-ERP1...
Working on cluster MPP4-Hlf...
Working on cluster MDP-Cpa3...
Working on cluster MPP4-Nkx2-3...
Working on cluster eHSC...
Working on cluster HSCP-HPC_Hist1h2af...
Working on cluster pre-MultiLin-1...
Working on cluster MDP-Irf8...
Working on cluster MEP...
Working on cluster MultiLin-2_F13a1...
Working on cluster LT-HSC_Mllt3...
Working on cluster eHSC-Pcna...
Working on cluster HSCP-HPC_Cenpf...
Working on cluster HSCP-MKP...
Working on cluster MPP5-Flt3...
Working on cluster MultiLin-1...
Working on cluster pre-MultiLin-2...
Working on cluster BMCP...
Working on cluster proNeu-1...
Working on cluster ML-cell-cycle...
Working on cluster ERP1...
Working on cluster MultiLin-1_MEP...
Working on cluster MultiLin-2_Ms4a3...
Working on cluster CLP1-Rrm2...
Working on cluster MKP...
Working on cluster H

In [43]:
# Display the total counts summed over all cells of each cluster
cluster_read_counts

IG2-proNeu1              79509404
ST-HSC                   43898896
HSCP-HPC_Tk1             57491240
IG2-MP                   38994609
MPP5-Egr1                31485399
HSCP-ERP1                34343789
MPP4-Hlf                 24999035
MDP-Cpa3                 30166580
MPP4-Nkx2-3              19261375
eHSC                     23408586
HSCP-HPC_Hist1h2af       33757909
pre-MultiLin-1           14734639
MDP-Irf8                 20752772
MEP                      12909754
MultiLin-2_F13a1         16449720
LT-HSC_Mllt3             13133989
eHSC-Pcna                16423445
HSCP-HPC_Cenpf           18043413
HSCP-MKP                 12872760
MPP5-Flt3                 8324587
MultiLin-1                9422930
pre-MultiLin-2            8476189
BMCP                      6792478
proNeu-1                 10731043
ML-cell-cycle             8413250
ERP1                     11416231
MultiLin-1_MEP            5429078
MultiLin-2_Ms4a3          6191822
CLP1-Rrm2                 4965866
MKP           

In [59]:
# log2(CPM + 1) per cluster, computed from the summed counts.
# Totals are taken in the column order of pseudobulk_sums and shaped as a
# single row so that each column is divided by its own cluster total.
cluster_depth_row = cluster_read_counts.loc[
    pseudobulk_sums.columns.values].values.reshape(1, -1)
counts_per_million = 1e6 * pseudobulk_sums / cluster_depth_row
pseudobulk_cpm = np.log2(counts_per_million + 1)

In [60]:
# log2(CPM + 1) per cluster, computed from the binarized sums.
# Totals are taken in the column order of pseudobulk_binary_sums and shaped as
# a single row so that each column is divided by its own cluster total.
cluster_binary_depth_row = cluster_binary_read_counts.loc[
    pseudobulk_binary_sums.columns.values].values.reshape(1, -1)
binary_counts_per_million = 1e6 * pseudobulk_binary_sums / cluster_binary_depth_row
pseudobulk_cpm_from_binary = np.log2(binary_counts_per_million + 1)

In [61]:
# Write the pseudobulk tables to path_output.
# The four per-peak tables are written with a header of cluster names and no
# index column. The two per-cluster total tables are written as
# "cluster,total" rows without a header.

# log2(CPM + 1) from summed counts
pseudobulk_cpm.to_csv(
    os.path.join(path_output, "r7_tea_pbulk_cpm_from_pmat.csv"),
    header=True, index=False)

# log2(CPM + 1) from binarized counts
pseudobulk_cpm_from_binary.to_csv(
    os.path.join(path_output, "r7_tea_pbulk_cpm_from_binary_pmat.csv"),
    header=True, index=False)

# Summed counts per cluster
pseudobulk_sums.to_csv(
    os.path.join(path_output, "r7_tea_pbulk_total_counts.csv"),
    header=True, index=False)

# Binarized sums per cluster: the counterpart of pseudobulk_sums, not the
# per-cluster totals, which are saved separately below.
pseudobulk_binary_sums.to_csv(
    os.path.join(path_output, "r7_tea_pbulk_total_counts_from_binary.csv"),
    header=True, index=False)

# Total counts per cluster
cluster_read_counts.to_csv(
    os.path.join(path_output, "r7_tea_cluster_read_counts.csv"),
    header=False, index=True)

# Total non-zero entries per cluster
cluster_binary_read_counts.to_csv(
    os.path.join(path_output, "r7_tea_cluster_read_counts_from_binary.csv"),
    header=False, index=True)

In [63]:
# Save the peak table that matches the pbulk count tables row for row
# (tab-separated, no header line and no index column)
peak_set_file = os.path.join(
    path_output, "r7_tea_cluster_peak_set_with_tss.bed")
peaks.to_csv(peak_set_file, sep="\t", header=False, index=False)