In [None]:
import h5py
import matplotlib.pyplot as plt
import os
import pandas as pd
import random
import subprocess
from anndata._io.specs import read_elem

**Read in the barcodes from the AnnData**

In [None]:
#folder holding the .h5ad and the per-sample fragment files; keep the trailing '/'
#because the paths built from my_dir in later cells use plain string concatenation
my_dir = '/Users/jason/Downloads/10x_multiome_samples/brain_pediatric/'

In [None]:
rna_file = f'{my_dir}fd808279-8829-4677-836d-ed43628a7a54.h5ad'
#open read-only explicitly: h5py releases before 3.0 defaulted to append mode,
#which would open the matrix file writable
with h5py.File(rna_file, 'r') as f:
    #only the obs group is read here; its index holds the cell barcodes
    barcodes = read_elem(f['obs']).index.to_series()
#preview up to 25 random barcodes to see how the library tag is attached
#(capped at the population size so small matrices do not raise ValueError)
random.sample(list(barcodes), min(25, len(barcodes)))

**Identify if the library tag is a prefix or suffix to the barcode**

In [None]:
appendage = 'suffix' #will need to update this

#any value other than 'prefix' used to fall through to the suffix branch,
#so a typo silently picked the wrong mode; fail loudly instead
if appendage not in ('prefix', 'suffix'):
    raise ValueError(f"appendage must be 'prefix' or 'suffix', got {appendage!r}")

#the pattern matches the 16-base barcode plus its '-1'; splitting on it leaves
#only the library tag, in column 0 (tag before the barcode) or column 1 (tag after)
if appendage == 'prefix':
    pattern = r"[ACGT]{16}-1$"  # for prefix
    col = 0
else:
    pattern = r"^[AGCT]{16}-1"  # for suffix
    col = 1

#barcodes that do not match are not split, which yields misleading tags below
#(or a KeyError on the column lookup when nothing matches) - report them first
n_unmatched = int((~barcodes.str.contains(pattern, regex=True, na=False)).sum())
if n_unmatched:
    print(f'WARNING: {n_unmatched} of {len(barcodes)} barcodes do not match {pattern!r} - check appendage')

adata_index_split = barcodes.str.split(pat = pattern, regex=True, expand=True)
barcode_apps = adata_index_split[col].unique()
list(barcode_apps)

In [None]:
#may change depending on file structure
#each fragment file name minus this ending is the sample's file-name prefix
frag_suffix = 'atac_fragments.tsv.gz'
sample_apps = [
    fname.replace(frag_suffix, '')
    for fname in os.listdir(my_dir)
    if fname.endswith(frag_suffix)
]
sample_apps

**Map the sample IDs to the IDs appended to the barcodes**

In [None]:
#(fragment-file prefix, library tag attached to the matrix barcodes) pairs;
#the order here is the order in which samples are processed and concatenated
app_map = dict([
    ('G120_F1_N_', '_1'),
    ('G133_D_FL_', '_4'),
    ('G210_D_', '_11'),
    ('G150_D_', '_7'),
    ('G171_D_', '_9'),
    ('G120_D_FL_', '_3'),
    ('G187_D_', '_10'),
    ('G159_D_', '_8'),
    ('G129_D_', '_6'),
    ('G133_N_FL_', '_5'),
    ('G120_D_TL_', '_2'),
])

In [None]:
#library tags found on the matrix barcodes that no sample in app_map maps to
mapped_tags = app_map.values()
[tag for tag in barcode_apps if tag not in mapped_tags]

In [None]:
#fragment-file prefixes found on disk that have no entry in app_map
[prefix for prefix in sample_apps if prefix not in app_map]

**May need to alter how `frag_file` is defined based on the file structure & naming**

In [None]:
def _min_and_rounded_mean(counts):
    """Return (min, rounded mean) of a Series of per-barcode fragment counts.

    Returns (nan, nan) for an empty Series; round() of the NaN mean would
    otherwise raise ValueError and abort the whole loop.
    """
    if counts.empty:
        return float('nan'), float('nan')
    return counts.min(), round(counts.mean())


ind_frag_files = []
stats = []
for sample_prefix, tag in app_map.items():
    print(sample_prefix)
    frag_file = f'{my_dir}{sample_prefix}atac_fragments.tsv.gz' #gonna need to update this pattern

    #read in the fragments; the file has no column header row, so names are given here
    frags_df = pd.read_csv(
        frag_file,
        comment='#',
        sep='\t',
        names=['chrom','start','end','barcode','readSupport']
    )

    #plot for QA: fragments per barcode before filtering
    counts = frags_df['barcode'].value_counts()
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    axes[0].hist(counts, range=(0,1000), bins=200)
    axes[0].set_ylim(ymin=0)
    axes[0].set_title('raw')

    #store stats for QA
    raw_min, raw_mean = _min_and_rounded_mean(counts)

    #update the barcode to match the CxG matrix obs index
    if appendage == 'prefix':
        frags_df['barcode'] = tag + frags_df['barcode'].astype(str)
    else:
        frags_df['barcode'] = frags_df['barcode'].astype(str) + tag

    #filter down to only barcodes in the CxG matrix
    frags_df = frags_df[frags_df['barcode'].isin(barcodes)]

    #plot for QA: fragments per barcode after filtering
    counts = frags_df['barcode'].value_counts()
    if counts.empty:
        #usually means the tag in app_map or the appendage setting is wrong
        print(f'WARNING: no barcodes from {sample_prefix} matched the matrix with tag {tag!r}')
    axes[1].hist(counts, range=(0,1000), bins=200)
    axes[1].set_ylim(ymin=0)
    axes[1].set_title('filtered')
    plt.show()
    #release the figure so one open figure per sample does not accumulate
    plt.close(fig)

    #store stats for QA
    filt_min, filt_mean = _min_and_rounded_mean(counts)
    stats.append({
        'sample': sample_prefix,
        'raw min': raw_min,
        'filt min': filt_min,
        'raw mean': raw_mean,
        'filt mean': filt_mean
    })
    #add in 'uniq barcodes': len(frags_df['barcode'].unique())?
    #could validate that the total equals cells in the RNA matrix - len(barcodes)

    #write the filtered fragments file next to the input; rename only the file
    #name so a directory containing 'atac_fragments' is left untouched
    frag_dir, frag_name = os.path.split(frag_file)
    output = os.path.join(frag_dir, frag_name.replace('atac_fragments', 'filtered_fragments'))
    frags_df.to_csv(output, compression='gzip', sep='\t', index=False, header=False)
    ind_frag_files.append(output)

pd.DataFrame(stats)

**Concatenate all of the outputs**

In [None]:
concat_frags = f'{my_dir}concatenated_filtered_fragments.tsv.gz'
#with no input files cat would read from stdin instead, so stop early
if not ind_frag_files:
    raise ValueError('no filtered fragment files to concatenate')
#pass the paths as an argument list (no shell) so spaces or shell
#metacharacters in my_dir cannot break or alter the command; check=True
#raises if cat fails instead of leaving a partial output unnoticed
with open(concat_frags, 'wb') as concat_out:
    subprocess.run(['cat', *ind_frag_files], stdout=concat_out, check=True)