In [5]:
import boto3
import json
import os
import re
from bs4 import BeautifulSoup

**[Cell Ranger multi](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-3p-outputs-cellplex)**

In [23]:
# Library-level files expected directly under outs/ for a Cell Ranger multi run.
multi_outs = (
    ['config.csv']
    # outputs for the library as a whole, kept under multi/count/
    + [
        'multi/count/feature_reference.csv',
        'multi/count/raw_cloupe.cloupe',
        'multi/count/raw_feature_bc_matrix.h5',
        'multi/count/raw_feature_bc_matrix.tar.gz',
        'multi/count/raw_molecule_info.h5',
        'multi/count/unassigned_alignments.bam',
        'multi/count/unassigned_alignments.bam.bai',
    ]
    + ['multi/multiplexing_analysis.tar.gz']
)

# Files expected under outs/per_sample_outs/<sample_id>/ for every sample of a multi run.
multi_per_sample_outs = (
    # per-sample count outputs
    [
        'count/analysis.tar.gz',
        'count/feature_reference.csv',
        'count/sample_cloupe.cloupe',
        'count/sample_alignments.bam',
        'count/sample_alignments.bam.bai',
        'count/sample_filtered_barcodes.csv',
        'count/sample_filtered_feature_bc_matrix.h5',
        'count/sample_filtered_feature_bc_matrix.tar.gz',
        'count/sample_molecule_info.h5',
    ]
    # per-sample summaries sitting next to count/
    + ['metrics_summary.csv', 'web_summary.html']
)

**[Cell Ranger multi - 10x Flex](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-flex-outputs-frp)**

In [None]:
# Library-level files expected directly under outs/ for a Cell Ranger multi (10x Flex) run.
multi_flex_outs = (
    ['config.csv']
    # outputs for the library as a whole, kept under multi/count/
    + [
        'multi/count/raw_cloupe.cloupe',
        'multi/count/raw_feature_bc_matrix.h5',
        'multi/count/raw_feature_bc_matrix.tar.gz',
        'multi/count/raw_molecule_info.h5',
        'multi/count/raw_probe_bc_matrix.h5',
    ]
    + ['multi/multiplexing_analysis.tar.gz']
)

# Files expected under outs/per_sample_outs/<sample_id>/ for every sample of a Flex run.
multi_flex_per_sample_outs = (
    # per-sample count outputs
    [
        'count/analysis.tar.gz',
        'count/probe_set.csv',
        'count/sample_cloupe.cloupe',
        'count/sample_filtered_barcodes.csv',
        'count/sample_filtered_feature_bc_matrix.h5',
        'count/sample_filtered_feature_bc_matrix.tar.gz',
        'count/sample_molecule_info.h5',
        'count/sample_raw_feature_bc_matrix.h5',
        'count/sample_raw_feature_bc_matrix.tar.gz',
        'count/sample_raw_probe_bc_matrix.h5',
    ]
    # per-sample summaries sitting next to count/
    + ['metrics_summary.csv', 'web_summary.html']
)

**[Cell Ranger count](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-gex-overview)**

In [None]:
# Files expected directly under outs/ for a Cell Ranger count run.
count_outs = (
    ['analysis.tar.gz', 'cloupe.cloupe']
    # filtered matrix, in both packaged forms
    + ['filtered_feature_bc_matrix.tar.gz', 'filtered_feature_bc_matrix.h5']
    + ['metrics_summary.csv', 'molecule_info.h5']
    # alignments and their index
    + ['possorted_genome_bam.bam', 'possorted_genome_bam.bam.bai']
    # raw matrix, in both packaged forms
    + ['raw_feature_bc_matrix.tar.gz', 'raw_feature_bc_matrix.h5']
    + ['web_summary.html']
)

**[Cell Ranger ARC (multiome)](https://www.10xgenomics.com/support/software/cell-ranger-arc/latest/analysis/outputs/understanding-output)**

In [None]:
# Files expected directly under outs/ for a Cell Ranger ARC (multiome) run.
multiome_outs = (
    # joint and gene-expression outputs
    [
        'analysis.tar.gz',
        'cloupe.cloupe',
        'filtered_feature_bc_matrix.tar.gz',
        'filtered_feature_bc_matrix.h5',
        'summary.csv',
        'per_barcode_metrics.csv',
        'gex_molecule_info.h5',
        'gex_possorted_bam.bam',
        'gex_possorted_bam.bam.bai',
        'raw_feature_bc_matrix.tar.gz',
        'raw_feature_bc_matrix.h5',
        'web_summary.html',
    ]
    # ATAC outputs
    + [
        'atac_possorted_bam.bam',
        'atac_possorted_bam.bam.bai',
        'atac_peaks.bed',
        'atac_peak_annotation.tsv',
        'atac_cut_sites.bigwig',
        'atac_fragments.tsv.gz',
        'atac_fragments.tsv.gz.tbi',
    ]
)

In [None]:
# Non-Flex "chemistry" values as they appear in web_summary.html.
# This is an attempt at enumerating them and may not be complete.
non_flex_chem = (
    # 5' chemistries
    ["Single Cell 5' R2-only v3", "Single Cell 5' R2-only"]
    # 3' chemistries
    + ["Single Cell 3' v4 (polyA)", "Single Cell 3' v3"]
)

**[ScaleRna](https://github.com/ScaleBio/ScaleRna/blob/master/docs/outputs.md)**

In [None]:
# Expected ScaleRna outputs, relative to ScaleRna.out/.
# <sample> and <libIndex2> are placeholders; entries ending in '/' are
# directories whose contents still need to be enumerated.
scale_rna = (
    # reports/
    [
        'reports/multiqc_report.html',
        'reports/<sample>.<libIndex2>.report.html',
        'reports/<sample>_libraries/',  # For QuantumScale runs, need files within
        'reports/allSamples.reportStatistics.csv',
        'reports/csv/',  # need files within
        'reports/library/library_<libIndex2>.report.html',
        'reports/library/csv/',  # need files within
    ]
    # samples/
    + [
        'samples/<sample>.<libIndex2>.filtered.matrix/',  # need files within
        'samples/<sample>.<libIndex2>.allCells.csv',
        'samples/<sample>_libraries/',  # For QuantumScale runs, need files within
    ]
    # fastq/
    + [
        'fastq/fastqc/*_fastqc.html',
        'fastq/Reports/',  # need files within
    ]
    # barcodes/
    + [
        'barcodes/split_bcparser_jobs/bcparser.<libIndex2>/<sample>.bam',
        'barcodes/<libIndex2>.metrics.json',
    ]
    # alignment/
    + [
        'alignment/<sample>.<libIndex2>/<sample>.star.align/',  # need files within
        'alignment/<sample>.<libIndex2>/<sample>.star.solo/',  # need files within
    ]
)

In [None]:
def parse_web_summ(f):
    """Read the run type, chemistry and feature-barcode tabs from a web_summary.html.

    The page carries its content as a JSON value assigned to ``const data``
    inside a ``<script>`` element. Two layouts are handled: one with a
    top-level ``library`` key (checked against cellranger-9.0.1) and one
    without it, read from ``joint_pipeline_info_table`` (checked against
    cellranger-arc-2.0.2).

    Args:
        f: path of a local web_summary.html file.

    Returns:
        A ``(sub, chem, extra)`` tuple:
        sub   -- ``data['sample']['subcommand']`` for the ``library`` layout,
                 otherwise None.
        chem  -- the 'Chemistry' value of the parameters / pipeline-info table.
        extra -- list holding 'CRISPR' and/or 'Antibody' when the matching tab
                 is populated; always empty for the non-``library`` layout.

    Raises:
        ValueError: no ``<script>`` element contains ``const data = ...``
            (json.JSONDecodeError, a ValueError subclass, if the value is
            not valid JSON).
        KeyError: the embedded data lacks one of the fields read below.

    Side effects:
        Prints an ``ERROR:`` line when the pipeline version differs from the
        expected one.
    """
    with open(f) as html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')

    data = None
    for x in soup.find_all('script'):
        text = x.string
        if not text:
            # .string is None for an empty <script> or one with several children
            continue
        match = re.search(r"const data = \s*", text)
        if match:
            # raw_decode stops at the end of the JSON value, so anything that
            # follows it in the script (e.g. a ';') does not break parsing
            data, _ = json.JSONDecoder().raw_decode(text, match.end())
    if data is None:
        raise ValueError(f'no "const data = ..." script found in {f}')

    extra = []
    if 'library' in data:
        sub = data['sample']['subcommand']
        lib_data = data['library']['data']

        # the parameters table is a list of [name, value] rows
        gex_tab = {row[0]: row[1] for row in lib_data['gex_tab']['content']['parameters_table']['rows']}
        chem = gex_tab['Chemistry']

        # a populated tab means that feature type was part of the run
        if lib_data.get('crispr_tab'):
            extra.append('CRISPR')
        if lib_data.get('antibody_tab'):
            extra.append('Antibody')

        #location of some additional info to QA (looked up but not checked yet)
        ref = gex_tab['Transcriptome']
        if chem != 'Flex Gene Expression':
            incl_int = gex_tab['Include Introns']
        cr_v = lib_data['header_info']['Pipeline Version']
        if cr_v != 'cellranger-9.0.1':
            print(f'ERROR:version is {cr_v} but should be 9.0.1')

    else:
        sub = None
        info = {row[0]: row[1] for row in data['joint_pipeline_info_table']['rows']}
        chem = info['Chemistry']

        #location of some additional info to QA (looked up but not checked yet)
        ref = info['Reference path']
        cr_v = info['Pipeline version']
        if cr_v != 'cellranger-arc-2.0.2':
            print(f'ERROR:version is {cr_v} but should be 2.0.2')

    return sub, chem, extra

In [None]:
# S3 client, plus a paginator so that listings spanning several response pages
# are read in full.
s3client = boto3.client('s3')
# ListObjectsV2 is the listing API AWS recommends over the original ListObjects;
# it accepts the same Bucket/Prefix arguments and its pages expose the same
# 'Contents' -> 'Key' entries.
paginator = s3client.get_paginator('list_objects_v2')


# define scope of QA
bucket_name = 'czi-psomagen'
lab = 'Marson'
proj = 'mapping-grns-perturb-seq'
# run directories under <lab>/<proj>/processed/cellranger/ to check
run_dates = [
    'Run_2025-05-23',
    'Run_2025-05-28',
    'Run_2025-06-03'
]

In [None]:
# build lists of files: every object key found under each run directory
files = []
for run_date in run_dates:
    my_dir = f'{lab}/{proj}/processed/cellranger/{run_date}/'
    for page in paginator.paginate(Bucket=bucket_name, Prefix=my_dir):
        # a page for a prefix without objects carries no 'Contents' entry
        files.extend(obj['Key'] for obj in page.get('Contents', []))

# build a list of libraries and their samples & run dates
# libs maps <library> -> {'date': <run directory>, 'samples': [<sample_id>, ...]}
# NOTE(review): libs is keyed by library name only, so a library name that
# appears under two run dates keeps the first date seen - confirm names are unique.
libs = {}
for key in files:
    # key layout: .../cellranger/<run_date>/<library>/outs/per_sample_outs/<sample_id>/...
    parts = key.split('/')
    datei = parts.index('cellranger') + 1

    # Only keys that sit inside a library directory identify a library. This
    # skips the run directory's own folder-marker key (empty library name) and
    # any file placed directly under the run directory.
    if len(parts) <= datei + 2 or not parts[datei + 1]:
        continue

    lib = parts[datei + 1]
    entry = libs.setdefault(lib, {'date': parts[datei], 'samples': []})

    # a sample is only known once there is a component after per_sample_outs/
    if len(parts) > datei + 4 and parts[datei + 3] == 'per_sample_outs':
        sample = parts[datei + 4]
        # an empty name comes from a 'per_sample_outs/' folder-marker key
        if sample and sample not in entry['samples']:
            entry['samples'].append(sample)

In [None]:
def _load_web_summ_data(path):
    """Return the JSON value a web_summary.html embeds as ``const data = ...``.

    Returns None when no <script> element carries such an assignment.
    """
    with open(path) as html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')
    found = None
    for script in soup.find_all('script'):
        text = script.string
        if not text:
            # .string is None for an empty <script> or one with several children
            continue
        match = re.search(r"const data = \s*", text)
        if match:
            # raw_decode stops at the end of the JSON value, ignoring trailing JavaScript
            found, _ = json.JSONDecoder().raw_decode(text, match.end())
    return found


def _expect(expected, name):
    """Add name to the expected-files list unless it is already listed."""
    if name not in expected:
        expected.append(name)


# Compare, library by library, the files present in S3 with the files expected
# for the run type reported in the library's web_summary.html.
all_keys = set(files)

for lib,v in libs.items():
    print(lib)
    run_date = v['date']
    samples = v['samples']
    outs_path = f'{lab}/{proj}/processed/cellranger/{run_date}/{lib}/outs/'

    # Libraries with per_sample_outs keep a web_summary.html per sample; the
    # count and multiome file lists place it directly under outs/ instead.
    if samples:
        file_path = f'{outs_path}per_sample_outs/{samples[0]}/web_summary.html'
    else:
        file_path = f'{outs_path}web_summary.html'
    if file_path not in all_keys:
        print(f'ERROR:{file_path} not found, cannot determine expected outputs')
        continue

    f = os.path.basename(file_path)
    s3client.download_file(bucket_name, file_path, f)
    try:
        sub,chem,extra = parse_web_summ(f)
        # parse_web_summ does not hand back the embedded data, and the
        # cell-annotation check below needs it, so load it here
        data = _load_web_summ_data(f) or {}
    finally:
        # always clean up the local copy, even when parsing fails
        os.remove(f)

    if chem == 'Single Cell Multiome ATAC + Gene Expression v1':
        outs_expected = multiome_outs.copy()
        per_samp_expected = []

    else:
        if sub == 'multi' and chem == 'Flex Gene Expression':
            outs_expected = multi_flex_outs.copy()
            per_samp_expected = multi_flex_per_sample_outs.copy()
        elif sub == 'multi' and chem in non_flex_chem:
            outs_expected = multi_outs.copy()
            per_samp_expected = multi_per_sample_outs.copy()
        elif sub == 'count':
            outs_expected = count_outs.copy()
            per_samp_expected = []
        else:
            # without this guard the expected lists would be undefined, or
            # silently left over from the previous library
            print(f'ERROR:unrecognized subcommand/chemistry {sub}/{chem}, skipping')
            continue

        #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-ab-overview
        #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-crispr-overview
        if 'CRISPR' in extra or 'Antibody' in extra:
            # NOTE(review): this multi/count/ path is also added for 'count'
            # runs, whose other expected files are not under multi/ - confirm.
            _expect(outs_expected, 'multi/count/feature_reference.csv')
            if sub == 'multi':
                _expect(per_samp_expected, 'count/feature_reference.csv')
            for feature, archive in (
                ('CRISPR', 'crispr_analysis.tar.gz'),
                ('Antibody', 'antibody_analysis.tar.gz'),
            ):
                if feature in extra:
                    if sub == 'count':
                        _expect(outs_expected, archive)
                    if sub == 'multi':
                        _expect(per_samp_expected, f'count/{archive}')

        #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/running-pipelines/cr-cell-annotation-pipeline
        # the experimental design CSV is scanned for a 'skip-cell-annotation,false' row
        design_csv = (data.get('experimental_design') or {}).get('csv') or ''
        for line in design_csv.split('\n'):
            if ',' in line:
                path = line.strip().split(',')
                if path[0] == 'skip-cell-annotation' and path[1] == 'false':
                    _expect(per_samp_expected, 'count/cell_types.tar.gz')
                    if sub == 'count':
                        _expect(outs_expected, 'count/web_summary_cell_types.html')

    # keys of this library, as an ordered list for reporting and a set for lookups
    actual = [key for key in files if key.startswith(outs_path)]
    actual_keys = set(actual)

    #check multi/lib
    missing = [name for name in outs_expected if f'{outs_path}{name}' not in actual_keys]
    if missing:
        print('missing')
        print(missing)
    unexpected = [
        key for key in actual
        if 'per_sample_outs' not in key and key[len(outs_path):] not in outs_expected
    ]
    if unexpected:
        print('extra')
        print(unexpected)

    #check per sample
    if per_samp_expected:
        for s in samples:
            print(s)
            samp_outs_path = f'{outs_path}per_sample_outs/{s}/'
            missing = [name for name in per_samp_expected if f'{samp_outs_path}{name}' not in actual_keys]
            if missing:
                print('missing')
                print(missing)
            unexpected = [
                key for key in actual
                if key.startswith(samp_outs_path) and key[len(samp_outs_path):] not in per_samp_expected
            ]
            if unexpected:
                print('extra')
                print(unexpected)