# scATAC-seq analysis

Set-up for running a scATAC-seq analysis with pycisTopic.

In [43]:
import os
import sys
import glob
import warnings
import pandas as pd
import pycisTopic
import pyranges as pr
import requests
warnings.simplefilter(action='ignore', category=FutureWarning)
_stderr = sys.stderr
null = open(os.devnull,'wb')

In [17]:
dataset_name = "CharacterizationMcGinnis_Dataset6"
combinomics_dir = "/cellar/users/aklie/data/igvf/topic_grn_links/combinomics"
results_dir = "/cellar/users/aklie/projects/igvf/topic_grn_links/grn_inference/scenicplus/results"
dataset_name = "CharacterizationMcGinnis_Dataset6"

In [18]:
#make a directory for to store the processed scATAC-seq data.
if not os.path.exists(os.path.join(results_dir, dataset_name, 'scATAC')):
    os.makedirs(os.path.join(results_dir, dataset_name, 'scATAC'))
tmp_dir = '/cellar/users/aklie/tmp/'

You will need to create a dictionary with values as paths to fragment files and keys as sample names.

In [23]:
fragment_files = os.path.join(combinomics_dir, dataset_name, "*tsv.gz")
fragment_files = sorted(glob.glob(fragment_files))
fragment_files

['/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_1.atac.filter.fragments.hg38.tsv.gz',
 '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_2.atac.filter.fragments.hg38.tsv.gz',
 '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_3.atac.filter.fragments.hg38.tsv.gz',
 '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_4.atac.filter.fragments.hg38.tsv.gz']

In [27]:

# Load in the paths to your fragment files and turn them into a dictionary
fragments_dict = dict(zip([os.path.basename(file).split(".")[0] for file in fragment_files], fragment_files))
fragments_dict

{'Dataset6_sample_1': '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_1.atac.filter.fragments.hg38.tsv.gz',
 'Dataset6_sample_2': '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_2.atac.filter.fragments.hg38.tsv.gz',
 'Dataset6_sample_3': '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_3.atac.filter.fragments.hg38.tsv.gz',
 'Dataset6_sample_4': '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_4.atac.filter.fragments.hg38.tsv.gz'}

Read in or grab the metadata from any previous methods for annotations

In [28]:
atac_metadata_glob = os.path.join(combinomics_dir, dataset_name, '*atac.qc.hg38.metadata.tsv')
atac_metadata_files = sorted(glob.glob(atac_metadata_glob))
atac_metadata_files

['/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_1.atac.qc.hg38.metadata.tsv',
 '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_2.atac.qc.hg38.metadata.tsv',
 '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_3.atac.qc.hg38.metadata.tsv',
 '/cellar/users/aklie/data/igvf/topic_grn_links/combinomics/CharacterizationMcGinnis_Dataset6/Dataset6_sample_4.atac.qc.hg38.metadata.tsv']

In [31]:
metadata = pd.concat([pd.read_csv(f, sep='\t', index_col=0) for f in atac_metadata_files])

In [38]:
# Chris McGinnis annotations
metadata_dir = "/cellar/users/aklie/data/igvf/topic_grn_links/metadata"
processed_metadata_file = os.path.join(metadata_dir, dataset_name, 'metadata_IGVF6.csv')
processed_metadata = pd.read_csv(processed_metadata_file, index_col=0)
index_split = processed_metadata.index.str.split("-")
processed_metadata.index = [x[0] + "_Dataset6_sample_" + x[1] for x in index_split]
processed_metadata

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,RNA_snn_res.0.25,seurat_clusters,PercentMito,lane,MULTI,RNA_snn_res.1,RNA_snn_res.0.5,sample,temp
AAACAGCCAAGGTAAC_Dataset6_sample_1,SeuratProject,5710,2647,10,9,0.066900,1,Bar64,9,8,S5_2D,9
AAACAGCCACCTCGCT_Dataset6_sample_1,SeuratProject,7103,2723,13,21,0.059693,1,Bar63,21,13,S4,21
AAACAGCCACTCGCTC_Dataset6_sample_1,SeuratProject,6402,2918,3,2,0.037176,1,Bar64,2,4,S5_2D,2
AAACAGCCAGCAATAA_Dataset6_sample_1,SeuratProject,1721,991,7,3,0.160372,1,Bar65,3,6,S5_2D3D,3
AAACAGCCAGCGCTTG_Dataset6_sample_1,SeuratProject,2853,1535,5,15,0.049772,1,Bar66,15,2,S6_2D,15
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGTCCAAA_Dataset6_sample_4,SeuratProject,5090,2357,2,0,0.038310,4,Bar66,0,1,S6_2D,0
TTTGTTGGTGTCCTGC_Dataset6_sample_4,SeuratProject,3570,1924,6,6,0.001681,4,Bar64,6,5,S5_2D,6
TTTGTTGGTGTTTGTC_Dataset6_sample_4,SeuratProject,1093,890,0,7,0.020128,4,Bar63,7,10,S4,7
TTTGTTGGTTAAGCGC_Dataset6_sample_4,SeuratProject,5066,2621,10,24,0.042835,4,Bar65,24,8,S5_2D3D,24


In [52]:
metadata.merge(processed_metadata, left_index=True, right_index=True, how="inner")

Unnamed: 0,fragments_promoter,reads_tss,reads_promoter,tss_enrichment,reads_tss_total,total,duplicate,unmapped,lowmapq,unique,...,nFeature_RNA,RNA_snn_res.0.25,seurat_clusters,PercentMito,lane,MULTI,RNA_snn_res.1,RNA_snn_res.0.5,sample,temp
AAACAGCCAAACGCGA_Dataset6_sample_4,17088,3423,33940,14.033577,3423.0,10210,2872,754,0,6584,...,457,7,23,0.187793,4,Bar63,23,12,S4,23
AAACAGCCAACCTAAT_Dataset6_sample_3,28363,6324,56294,14.527578,6324.0,14754,4991,830,0,8933,...,2503,1,12,0.082565,3,Bar66,12,0,S6_2D,12
AAACAGCCAACGTGCT_Dataset6_sample_2,1989,370,3937,11.817311,370.0,1280,341,121,0,818,...,4350,11,25,0.083430,2,Bar63,25,16,S4,25
AAACAGCCAACTGGCT_Dataset6_sample_2,20899,4449,41451,14.141093,4449.0,10961,3392,581,0,6988,...,2168,1,1,0.061338,2,Bar66,1,0,S6_2D,1
AAACAGCCAAGCCAGA_Dataset6_sample_4,42589,9141,84417,15.021569,9141.0,21966,6166,1153,0,14647,...,3478,1,12,0.054582,4,Bar66,12,0,S6_2D,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTTGGCCGA_Dataset6_sample_4,68800,13642,136343,12.724381,13642.0,37133,9709,3032,0,24392,...,1863,9,8,0.055872,4,Bar61,8,9,S1,8
TTTGTTGGTTTAACGG_Dataset6_sample_1,90339,18836,179149,15.021752,18836.0,63967,24015,3805,0,36147,...,1505,8,20,0.006016,1,Bar63,20,7,S4,20
TTTGTTGGTTTATTCG_Dataset6_sample_3,39529,9556,78510,21.901357,9556.0,21712,6899,1339,0,13474,...,1681,4,4,0.047019,3,Bar64,4,3,S5_2D,4
TTTGTTGGTTTGCAGA_Dataset6_sample_2,27188,5644,53924,16.484126,5644.0,14667,4342,905,0,9420,...,1046,0,7,0.034535,2,Bar63,7,10,S4,7


In [47]:
processed_metadata

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,RNA_snn_res.0.25,seurat_clusters,PercentMito,lane,MULTI,RNA_snn_res.1,RNA_snn_res.0.5,sample,temp
AAACAGCCAAGGTAAC_Dataset6_sample_1,SeuratProject,5710,2647,10,9,0.066900,1,Bar64,9,8,S5_2D,9
AAACAGCCACCTCGCT_Dataset6_sample_1,SeuratProject,7103,2723,13,21,0.059693,1,Bar63,21,13,S4,21
AAACAGCCACTCGCTC_Dataset6_sample_1,SeuratProject,6402,2918,3,2,0.037176,1,Bar64,2,4,S5_2D,2
AAACAGCCAGCAATAA_Dataset6_sample_1,SeuratProject,1721,991,7,3,0.160372,1,Bar65,3,6,S5_2D3D,3
AAACAGCCAGCGCTTG_Dataset6_sample_1,SeuratProject,2853,1535,5,15,0.049772,1,Bar66,15,2,S6_2D,15
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGTCCAAA_Dataset6_sample_4,SeuratProject,5090,2357,2,0,0.038310,4,Bar66,0,1,S6_2D,0
TTTGTTGGTGTCCTGC_Dataset6_sample_4,SeuratProject,3570,1924,6,6,0.001681,4,Bar64,6,5,S5_2D,6
TTTGTTGGTGTTTGTC_Dataset6_sample_4,SeuratProject,1093,890,0,7,0.020128,4,Bar63,7,10,S4,7
TTTGTTGGTTAAGCGC_Dataset6_sample_4,SeuratProject,5066,2621,10,24,0.042835,4,Bar65,24,8,S5_2D3D,24


Map the RNA barcodes from the anndata to the ATAC barcodes present in the fragment file

In [39]:
test_fragments = pd.read_csv(fragments_dict["Dataset6_sample_4"], sep="\t", header=None, nrows=1000)
metadata.index.isin(test_fragments[3]).sum()

We also need chromosome sizes in order to export to pseudobulk and to call peaks. Read these in. Make sure you choose the correct species for your data!

In [44]:
# Stream directly into memory using pandas
target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
chromsizes = pd.read_csv(target_url, sep='\t', header=None)
chromsizes.columns = ['Chromosome', 'End']
chromsizes['Start'] = [0]*chromsizes.shape[0]
chromsizes = chromsizes.loc[:,['Chromosome', 'Start', 'End']]
chromsizes = pr.PyRanges(chromsizes)

In [45]:
chromsizes

Unnamed: 0,Chromosome,Start,End
0,chr1,0,248956422
1,chr1_GL383518v1_alt,0,182439
2,chr1_GL383519v1_alt,0,110268
3,chr1_GL383520v2_alt,0,366580
4,chr1_KI270706v1_random,0,175055
...,...,...,...
450,chrX_KI270880v1_alt,0,284869
451,chrX_KI270881v1_alt,0,144206
452,chrX_KI270913v1_alt,0,274009
453,chrY,0,57227415


The next big step is to generate bed and bw files using all the fragments within each cell type. pycisTopic has a decent function for this that's a little finicky. A couple things to make sure of:
- You have a "barcode" column with barcodes that match the barcodes in your fragment files
- You have a "sample_id" column that maps cells to the samples contained in your fragment file dictionary
- Your chromosomes in the chromsizes object you created match the chromsomes in your fragment files (make sure you have the right species)
- You have enough total memory to handle ray's parallelization (if you use it)

In [46]:
metadata

Unnamed: 0_level_0,fragments_promoter,reads_tss,reads_promoter,tss_enrichment,reads_tss_total,total,duplicate,unmapped,lowmapq,unique,pct_dup,pct_unmapped,reads_peaks,fragment_peaks,pct_reads_promoter,pct_reads_peaks,pct_mito_reads
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAACAGCCAAACCTTG_Dataset6_sample_1,939,132,1849,3.308685,132.0,1366,440,218,0,708,0.383275,0.159590,290,187,135.359,0,29.3977
AAACAGCCAAACGCGA_Dataset6_sample_1,28,4,56,0.396040,4.0,31,4,3,0,24,0.142857,0.096774,16,11,180.645,0,40.3846
AAACAGCCAAACGGGC_Dataset6_sample_1,105,20,210,1.980198,20.0,60,4,5,0,51,0.072727,0.083333,40,23,350.000,0,53.3980
AAACAGCCAAAGCGCA_Dataset6_sample_1,29,1,57,0.099010,1.0,20,8,1,0,11,0.421053,0.050000,2,1,285.000,0,10.6145
AAACAGCCAAAGCTAA_Dataset6_sample_1,43,4,83,0.396040,4.0,41,13,7,0,21,0.382353,0.170732,28,18,202.439,0,30.8690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTTTGCAGA_Dataset6_sample_4,67,13,132,1.287129,13.0,23,0,5,0,18,0.000000,0.217391,10,6,573.913,0,100.0000
TTTGTTGGTTTGGCTT_Dataset6_sample_4,99,4,196,0.396040,4.0,59,1,13,0,45,0.021739,0.220339,33,23,332.203,0,91.0198
TTTGTTGGTTTGGGCG_Dataset6_sample_4,32,17,63,1.683168,17.0,25,3,5,0,17,0.150000,0.200000,8,6,252.000,0,57.1429
TTTGTTGGTTTGGGTA_Dataset6_sample_4,19,10,38,0.990099,10.0,16,2,1,0,13,0.133333,0.062500,13,7,237.500,0,31.9149


In [None]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk

# Run without ray since these are big files
bw_paths, bed_paths = export_pseudobulk(
    input_data = cell_data,
    variable = 'celltype', # variable by which to generate pseubulk profiles, in this case we want pseudobulks per celltype
    sample_id_col = 'sample_id',
    chromsizes = chromsizes,
    bed_path = os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/'),  # specify where pseudobulk_bed_files should be stored
    bigwig_path = os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bw_files/'), # specify where pseudobulk_bw_files should be stored
    path_to_fragments = fragments_dict, # location of fragment fiels
    n_cpu = 1, # specify the number of cores to use, we use ray for multi processing
    normalize_bigwig = True,
    remove_duplicates = True,
    _temp_dir = os.path.join(tmp_dir, 'ray_spill'),
    split_pattern = '-'
)

2023-01-14 22:54:59,585 cisTopic     INFO     Reading fragments from /cellar/users/aklie/data/igvf/topic_grn_links/mouse_adrenal/encode/fragments/ENCFF187VMN/encode_scatac_dcc_2/results/ENCSR525WPH-1/fragments/fragments.tsv.gz
2023-01-14 22:56:44,138 cisTopic     INFO     Reading fragments from /cellar/users/aklie/data/igvf/topic_grn_links/mouse_adrenal/encode/fragments/ENCFF035SPT/encode_scatac_dcc_2/results/ENCSR713FPX-1/fragments/fragments.tsv.gz
2023-01-14 22:58:25,383 cisTopic     INFO     Reading fragments from /cellar/users/aklie/data/igvf/topic_grn_links/mouse_adrenal/encode/fragments/ENCFF622EUO/encode_scatac_dcc_2/results/ENCSR858YSB-1/fragments/fragments.tsv.gz
2023-01-14 23:00:24,434 cisTopic     INFO     Reading fragments from /cellar/users/aklie/data/igvf/topic_grn_links/mouse_adrenal/encode/fragments/ENCFF119IVK/encode_scatac_dcc_2/results/ENCSR400PXQ-1/fragments/fragments.tsv.gz
2023-01-14 23:01:55,753 cisTopic     INFO     Reading fragments from /cellar/users/aklie/dat

Dump the paths to the bed and bw files for peak calling

In [None]:
import pickle

# Export the paths
pickle.dump(
    bed_paths,
    open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl'), 'wb')
)
pickle.dump(
    bw_paths,
    open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bw_files/bw_paths.pkl'), 'wb')
)

We can now run cell type specific peak calling using MACS2. pycisTopic has another useful function for this.

In [None]:
from pycisTopic.pseudobulk_peak_calling import peak_calling

# Run peak calling
macs_path='/cellar/users/aklie/opt/miniconda3/envs/scenicplus/bin/macs2'
narrow_peaks_dict = peak_calling(
    macs_path,
    bed_paths,
    os.path.join(work_dir, 'scATAC/consensus_peak_calling/MACS/'),
    genome_size='hs',
    n_cpu=12,
    input_format='BEDPE',
    shift=73,
    ext_size=146,
    keep_dup = 'all',
    q_value = 0.05,
    _temp_dir = os.path.join(tmp_dir, 'ray_spill')
)

2023-01-15 07:53:26,502	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[2m[36m(macs_call_peak_ray pid=2044428)[0m 2023-01-15 07:53:39,567 cisTopic     INFO     Calling peaks for Stromal with /cellar/users/aklie/opt/miniconda3/envs/scenicplus/bin/macs2 callpeak --treatment mouse_adrenal/scATAC/consensus_peak_calling/pseudobulk_bed_files/Stromal.bed.gz --name Stromal  --outdir mouse_adrenal/scATAC/consensus_peak_calling/MACS/ --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak_ray pid=2044425)[0m 2023-01-15 07:53:39,594 cisTopic     INFO     Calling peaks for Skeletal_muscle with /cellar/users/aklie/opt/miniconda3/envs/scenicplus/bin/macs2 callpeak --treatment mouse_adrenal/scATAC/consensus_peak_calling/pseudobulk_bed_files/Skeletal_muscle.bed.gz --name Skeletal_muscle  --outdir mouse_adrenal/scATAC/consensus_peak_calling/MACS/ --format BEDPE --gsize hs --qvalue 0.05 --nomodel --shift 73 --extsize 146 --keep-dup all --call-summits --nolambda
[2m[36m(macs_call_peak

In [None]:
# Dump the return object to a pickle
pickle.dump(
    narrow_peaks_dict,
    open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/MACS/narrow_peaks_dict.pkl'), 'wb')
)

In order to generate a region x cell matrix, we need a consensus peak set. There a lot of ways to do this, again, pycisTopic has a couple functions that we will take advantage of

In [None]:
from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [None]:
from pycisTopic.iterative_peak_calling import *

# Get consensus peaks
peak_half_width = 250
path_to_blacklist= os.path.join(work_dir, 'hg38-blacklist.v2.bed')
consensus_peaks=get_consensus_peaks(
    narrow_peaks_dict, 
    peak_half_width, 
    chromsizes=chromsizes, 
    path_to_blacklist=path_to_blacklist
)

2023-01-15 08:22:15,531 cisTopic     INFO     Extending and merging peaks per class
2023-01-15 08:32:59,529 cisTopic     INFO     Done!


Save the consensus regions as a bed file. This will be used for multiple downstream steps

In [None]:
consensus_peaks.to_bed(
    path = os.path.join(work_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed'),
    keep=True,
    compression='infer',
    chain=False
)

We can now perform QC on our scATAC data using the consensus peak set. We start by grabbing TSS annotations for protein coding genes. Again remember to choose the correct organism

In [None]:
import pybiomart as pbm

# Grab TSS annotations for mouse protein coding genes
dataset = pbm.Dataset(name='mmusculus_gene_ensembl',  host='http://www.ensembl.org')
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].to_numpy(dtype = str)
filter = annot['Chromosome/scaffold name'].str.contains('CHR|GL|JH|MT')
annot = annot[~filter]
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].str.replace(r'(\b\S)', r'chr\1')
annot.columns=['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
annot = annot[annot.Transcript_type == 'protein_coding']

  result = pd.read_csv(StringIO(response.text), sep='\t')


In [None]:
# Each sample will have the same regions here as we want to merge
path_to_regions = dict.fromkeys(fragments_dict.keys(), os.path.join(work_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed'))
path_to_regions

{'ENCFF187VMN': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF035SPT': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF622EUO': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF119IVK': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF683IBE': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF042ZJI': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF176LJV': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed',
 'ENCFF101BLM': 'mouse_adrenal/scATAC/consensus_peak_calling/consensus_regions.bed'}

Next is to calculate a set of metrics for each cell based on consensus regions. Note that this requires counting fragments again

In [None]:
from pycisTopic.qc import *

# Calculate QC stats for each sample
metadata_bc, profile_data_dict = compute_qc_stats(
    fragments_dict = fragments_dict,
    tss_annotation = annot,
    stats=['barcode_rank_plot', 'duplicate_rate', 'insert_size_distribution', 'profile_tss', 'frip'],
    label_list = None,
    path_to_regions = path_to_regions,
    n_cpu = 1,
    valid_bc = None,
    n_frag = 100,
    n_bc = None,
    tss_flank_window = 1000,
    tss_window = 50,
    tss_minimum_signal_window = 100,
    tss_rolling_window = 10,
    remove_duplicates = True,
    _temp_dir = os.path.join(tmp_dir + 'ray_spill')
)

# Dump the qc results in some objects
if not os.path.exists(os.path.join(work_dir, 'scATAC/quality_control')):
    os.makedirs(os.path.join(work_dir, 'scATAC/quality_control'))

pickle.dump(metadata_bc,
            open(os.path.join(work_dir, 'scATAC/quality_control/metadata_bc.pkl'), 'wb'))

pickle.dump(profile_data_dict,
            open(os.path.join(work_dir, 'scATAC/quality_control/profile_data_dict.pkl'), 'wb'))

2023-01-15 09:05:14,033 cisTopic     INFO     Reading ENCFF187VMN
2023-01-15 09:06:44,590 cisTopic     INFO     Computing barcode rank plot for ENCFF187VMN
2023-01-15 09:06:44,591 cisTopic     INFO     Counting fragments
2023-01-15 09:06:50,918 cisTopic     INFO     Marking barcodes with more than 100
2023-01-15 09:06:50,969 cisTopic     INFO     Returning plot data
2023-01-15 09:06:50,973 cisTopic     INFO     Returning valid barcodes
2023-01-15 09:06:57,171 cisTopic     INFO     Computing duplicate rate plot for ENCFF187VMN
2023-01-15 09:07:05,086 cisTopic     INFO     Return plot data
2023-01-15 09:07:05,317 cisTopic     INFO     Computing insert size distribution for ENCFF187VMN
2023-01-15 09:07:05,319 cisTopic     INFO     Counting fragments
2023-01-15 09:07:06,860 cisTopic     INFO     Returning plot data
2023-01-15 09:07:56,480 cisTopic     INFO     Computing TSS profile for ENCFF187VMN
2023-01-15 09:08:05,050 cisTopic     INFO     Formatting annnotation
2023-01-15 09:08:05,102 

We can plot a few QC plots and see how filtering would affect the cells called

In [None]:
from pycisTopic.qc import *

# Perform some filtering to keep only high quality cells
QC_filters = {
    'Log_unique_nr_frag': [2 , None],
    'FRIP':               [0.3, None],
    'TSS_enrichment':     [1   , None],
    'Dupl_rate':          [None, None]

}
bc_passing_filters = {}
for sample in fragments_dict:
    print("Sample", sample)
    FRIP_NR_FRAG_filter = plot_barcode_metrics(
        metadata_bc[sample],
        var_x='Log_unique_nr_frag',
        var_y='FRIP',
        min_x=QC_filters['Log_unique_nr_frag'][0],
        max_x=QC_filters['Log_unique_nr_frag'][1],
        min_y=QC_filters['FRIP'][0],
        max_y=QC_filters['FRIP'][1],
        return_cells=True,
        return_fig=False,
        plot=False
    )
    # Return figure to plot together with other metrics, and cells passing filters
    TSS_NR_FRAG_filter = plot_barcode_metrics(
        metadata_bc[sample],
        var_x='Log_unique_nr_frag',
        var_y='TSS_enrichment',
        min_x=QC_filters['Log_unique_nr_frag'][0],
        max_x=QC_filters['Log_unique_nr_frag'][1],
        min_y=QC_filters['TSS_enrichment'][0],
        max_y=QC_filters['TSS_enrichment'][1],
        return_cells=True,
        return_fig=False,
        plot=False
    )
    bc_passing_filters[sample] = list((set(FRIP_NR_FRAG_filter) & set(TSS_NR_FRAG_filter)))

Sample ENCFF187VMN



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF035SPT



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF622EUO



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF119IVK



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF683IBE



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF042ZJI



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF176LJV



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

Sample ENCFF101BLM



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

In [None]:
# Dump and print the cells that passed!
pickle.dump(
    bc_passing_filters,
    open(os.path.join(work_dir, 'scATAC/quality_control/bc_passing_filters.pkl'), 'wb')
)
for sample in bc_passing_filters:
    print(f"{len(bc_passing_filters[sample])} barcodes passed QC stats for {sample}")

3887 barcodes passed QC stats for ENCFF187VMN
5623 barcodes passed QC stats for ENCFF035SPT
5210 barcodes passed QC stats for ENCFF622EUO
4669 barcodes passed QC stats for ENCFF119IVK
6547 barcodes passed QC stats for ENCFF683IBE
4875 barcodes passed QC stats for ENCFF042ZJI
6208 barcodes passed QC stats for ENCFF176LJV
5236 barcodes passed QC stats for ENCFF101BLM
