In [None]:
import h5py
import matplotlib.pyplot as plt
import os
import pandas as pd
import random
import re
import subprocess
from anndata._io.specs import read_elem


# Regex matching the bare cell barcode: a run of 16 characters drawn from A/C/G/T
barcode_pattern = r'[ACGT]{16}'
# Placeholder swapped in for the barcode so only the surrounding prefix/suffix remains
replace_with = 'B@RCODE'

**Define the directory that contains the AnnData file and the unfiltered fragments files**

In [None]:
# Directory holding the AnnData file and the raw fragments files.
# Paths are built by plain concatenation (f'{my_dir}{name}'), so include the trailing '/'.
my_dir = ''

**Read in the barcodes from the AnnData & extract each prefix/suffix**\
These will be the values in `id_map` below

In [None]:
# Name of the AnnData (.h5ad) file inside `my_dir`
mx_file = ''

# Read only the obs table from the HDF5 file and keep its index (the cell barcodes)
with h5py.File(f'{my_dir}{mx_file}') as h5_handle:
    barcodes = read_elem(h5_handle['obs']).index.to_series()

# Collapse every barcode to its prefix/suffix pattern by masking the 16-base core
index_patterns = {
    re.sub(barcode_pattern, replace_with, obs_barcode)
    for obs_barcode in barcodes
}
index_patterns

**Define the ending that is expected on each unfiltered fragments file**

In [None]:
# Filename suffix used both to discover the raw fragments files and to rebuild their paths
file_end = 'atac_fragments.tsv.gz'

**Extract the library IDs in the names of the raw fragments files**\
These will be the keys in `id_map` below

In [None]:
# Library ID = fragments filename with the expected suffix stripped off
file_ids = [
    entry.replace(file_end, '')
    for entry in os.listdir(my_dir)
    if entry.endswith(file_end)
]
file_ids

**Map each file library ID to the corresponding barcode pattern found in the RNA file index**

In [None]:
# Keys: filename prefixes of the raw fragments files (the entries of `file_ids`).
# Values: barcode patterns from `index_patterns`; the B@RCODE placeholder is later
# replaced by the 16-base barcode found in each fragment row.
# Insertion order is the order in which the libraries are processed and concatenated.
id_map = {
    'G120_F1_N_': 'B@RCODE-1_1',
    'G133_D_FL_': 'B@RCODE-1_4',
    'G210_D_': 'B@RCODE-1_11',
    'G150_D_': 'B@RCODE-1_7',
    'G171_D_': 'B@RCODE-1_9',
    'G120_D_FL_': 'B@RCODE-1_3',
    'G187_D_': 'B@RCODE-1_10',
    'G159_D_': 'B@RCODE-1_8',
    'G129_D_': 'B@RCODE-1_6',
    'G133_N_FL_': 'B@RCODE-1_5',
    'G120_D_TL_': 'B@RCODE-1_2'
}

**Review any IDs not mapped**

In [None]:
# Barcode patterns present in the matrix index that no id_map entry points to
mapped_patterns = set(id_map.values())
[pattern for pattern in index_patterns if pattern not in mapped_patterns]

In [None]:
# Fragments files found on disk whose library ID is not a key of id_map
[file_id for file_id in file_ids if file_id not in id_map]

**Create a new fragments file for each raw file, amending barcodes and filtering some out**\
*This will also report statistics to review for QA of mappings*\
*May need to alter how `frag_file` is defined based on the file structure & naming*

In [None]:
# For every mapped library: read its raw fragments file, rewrite each barcode into the
# form used by the matrix obs index, keep only fragments whose barcode is in the matrix,
# write the result next to the input, and collect per-library QA statistics.
ind_frag_files = []
stats = []
for s,a in id_map.items():
    print(s)
    frag_file = f'{my_dir}{s}{file_end}'

    #read in the fragments ('#' header lines are skipped, columns are named explicitly)
    frags_df = pd.read_csv(
        frag_file,
        comment='#',
        sep='\t',
        names=['chrom','start','end','barcode','readSupport']
    )

    #plot for QA: number of fragment rows per raw barcode
    raw_counts = frags_df['barcode'].value_counts()
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    axes[0].hist(raw_counts, range=(0,1000), bins=200)
    axes[0].set_ylim(bottom=0)
    axes[0].set_title('raw')
    axes[0].set_xlabel('fragments per barcode')
    axes[0].set_ylabel('barcodes')

    #store stats for QA (NaN rather than an exception when the file has no rows)
    raw_min = raw_counts.min()
    raw_mean = round(raw_counts.mean()) if len(raw_counts) else float('nan')

    #update the barcode to match the CxG matrix obs index
    #Translate each DISTINCT raw barcode once and map the result back onto the rows,
    #instead of running a regex per fragment row. Barcodes that do not contain the
    #expected pattern get no translation (NaN) and are removed by the filter below,
    #rather than aborting the whole loop.
    barcode_lookup = {}
    for raw_barcode in raw_counts.index:
        found = re.search(barcode_pattern, raw_barcode)
        if found:
            barcode_lookup[raw_barcode] = a.replace(replace_with, found.group())
    n_unparsed = len(raw_counts) - len(barcode_lookup)
    if n_unparsed:
        print(f'  {n_unparsed} distinct barcodes did not match {barcode_pattern!r} and were dropped')
    frags_df['barcode'] = frags_df['barcode'].map(barcode_lookup)

    #filter down to only barcodes in the CxG matrix
    frags_df = frags_df[frags_df['barcode'].isin(barcodes)]

    #plot for QA: same histogram after translation and filtering
    counts = frags_df['barcode'].value_counts()
    axes[1].hist(counts, range=(0,1000), bins=200)
    axes[1].set_ylim(bottom=0)
    axes[1].set_title('filtered')
    axes[1].set_xlabel('fragments per barcode')
    plt.show()
    #release the figure so figures do not accumulate across libraries
    plt.close(fig)

    #store stats for QA
    #A wrong mapping can leave zero barcodes after filtering; report NaN for that
    #library so it shows up in the table instead of crashing on round(NaN).
    stats.append({
        'sample': s,
        'raw min': raw_min,
        'filt min': counts.min(),
        'raw mean': raw_mean,
        'filt mean': round(counts.mean()) if len(counts) else float('nan'),
        'unique barcodes': len(counts)
    })

    #write the filtered fragments file (uncompressed TSV, no header, no index column)
    output = frag_file.replace(file_end, 'filtered_fragments.tsv')
    frags_df.to_csv(output, sep='\t', index=False, header=False)
    ind_frag_files.append(output)

pd.DataFrame(stats)

**Concatenate all of the outputs**

In [None]:
# Compress every per-library output and join the archives into one file.
# With no inputs `cat` would block waiting on stdin, so fail early instead.
if not ind_frag_files:
    raise RuntimeError('no filtered fragments files to concatenate')

# Launch all gzip jobs first so they run in parallel. `-f` overwrites a .gz left behind
# by an earlier run; without it gzip refuses and the stale archive would be concatenated.
processes = [subprocess.Popen(['gzip', '-f', tsv_path]) for tsv_path in ind_frag_files]

# Wait for every job and stop if any of them failed
failed = [p.args for p in processes if p.wait() != 0]
if failed:
    raise RuntimeError(f'gzip failed for: {failed}')

ind_frag_files_gz = [tsv_path + '.gz' for tsv_path in ind_frag_files]
concat_frags = f'{my_dir}concatenated_filtered_fragments.tsv.gz'

# Pass the paths as an argument list (no shell) so spaces or shell metacharacters in
# `my_dir` cannot break or alter the command; check=True raises if cat fails.
with open(concat_frags, 'wb') as concat_handle:
    subprocess.run(['cat', *ind_frag_files_gz], stdout=concat_handle, check=True)