In [None]:
import json
import os

import boto3
import matplotlib.pyplot as plt
import pandas as pd

from qa_mods import *


# Shared S3 client; the cells below use it to list prefixes and download individual objects.
s3client = boto3.client('s3')
# Paginator over the 'list_objects' operation; used below to collect every key under an outs/ prefix.
paginator = s3client.get_paginator('list_objects')

In [None]:
# Choose data source: 'manifest' or 's3' (any other value makes the loading cell raise ValueError)
data_source = 's3'

# === Common parameters (needed for both modes) ===
order = ''      # Used in every output file name ({order}_errors.txt, ...) and, in 's3' mode, in the S3 prefix {proj}/{order}/
raw_assay = ''  # One of: 10x_viral_ORF, 10x, sci_jumbo, sci_plex, scale - needed for validation

# === Parameters for 's3' mode only ===
provider = ''  # psomagen, novogene; the bucket name is built as czi-{provider}
proj = ''  # First component of the S3 prefix {proj}/{order}/ (presumably the project name)
sub = '' # Was this pipeline run as "count" or "multi"? For cellranger-9.0.1 this is overwritten with the value parsed from the run report; for cellranger-10.0.0 it is set to "multi"

# === Parameters for 'manifest' mode only ===
manifest_path = ''        # Path to CSV/TSV manifest file
manifest_delimiter = '\t' # Use '\t' for TSV, ',' for CSV
manifest_s3_column = 0    # Column index containing S3 URIs (0-based)
manifest_has_header = False  # Whether manifest has a header row

In [None]:
### Go through the raw files, collect them, and report high-level missing-file problems in *_errors.txt
### The contents of each metadata.json are stored in read_metadata

# Accumulators filled by the loading cell below and read by the later QA cells.
fastq_log = {}              # group -> assay -> list of raw read file names
all_raw_files = []          # every raw object key found
all_proc_files = {}         # group -> list of keys under the processed outs/ directory
trimmer_failure_stats = {}  # filled by grab_trimmer_stats
read_metadata = {}          # metadata['filename'] -> parsed metadata.json

# All QA reports are written under this directory.
output_dir = 'qa_outs'
# exist_ok avoids the check-then-create race and makes re-running the cell a no-op.
os.makedirs(output_dir, exist_ok=True)

def _append_error(message):
    """Append one line to the order-level *_errors.txt report in output_dir."""
    with open(f'{output_dir}/{order}_errors.txt', 'a') as file:
        file.write(message + '\n')


def _list_s3_dir(bucket, prefix):
    """List a single '/'-delimited level under ``prefix``.

    Returns ``(subdirs, keys)``: the common prefixes and the object keys found
    directly under ``prefix``. The paginator is used so that levels holding more
    keys than a single list_objects response can carry are still listed in full.
    """
    subdirs, keys = [], []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        subdirs.extend(p['Prefix'] for p in page.get('CommonPrefixes', []))
        keys.extend(c['Key'] for c in page.get('Contents', []))
    return subdirs, keys


def _run_date_is_valid(date, year_digits=('5',)):
    """Return True when ``date`` looks like ``Run_202Y-MM-DD``.

    ``year_digits`` holds the accepted last digits of the year (only 2025 by
    default, as before). The length and separators are checked before any
    indexing or int() conversion so a malformed name is reported, not raised.
    """
    if len(date) != 14 or date[:7] != 'Run_202' or date[7] not in year_digits:
        return False
    if date[8] != '-' or date[11] != '-':
        return False
    month, day = date[9:11], date[12:14]
    if not (month.isdigit() and day.isdigit()):
        return False
    return int(month) <= 12 and int(day) <= 31


if data_source == 'manifest':
    # Load from manifest file
    all_raw_files, all_proc_files = load_files_from_manifest(
        manifest_path=manifest_path,
        delimiter=manifest_delimiter,
        s3_column=manifest_s3_column,
        has_header=manifest_has_header
    )
    print(f"Loaded {len(all_raw_files)} raw files from manifest")
    print(f"Loaded {sum(len(v) for v in all_proc_files.values())} processed files across {len(all_proc_files)} groups")

    # Note: When using manifest mode, the following are NOT available:
    # - fastq_log (requires parsing each file)
    # - trimmer_failure_stats (requires downloading and parsing trimmer files)
    # - read_metadata (requires downloading metadata JSON files)
    # These would require S3 access to populate.

elif data_source == 's3':
    bucket = f'czi-{provider}'

    ### Gathering all the raw files
    o = f'{proj}/{order}/'
    groups, _order_keys = _list_s3_dir(bucket, o)
    for g in groups:
        group_name = g.replace(o, '').rstrip('/')
        fastq_log[group_name] = {}
        subdirs, _group_keys = _list_s3_dir(bucket, g)
        if len(subdirs) > 2:
            print('EXTRA subdirs', g)
        if f'{g}raw/' not in subdirs:
            print('raw/ MISSING', g)
        else:
            run_prefixes, direct_keys = _list_s3_dir(bucket, f'{g}raw/')
            if direct_keys:
                # Files sit directly in raw/: treated as the 10x layout.
                # NOTE(review): as before, files in run sub-directories are
                # ignored whenever raw/ also holds files directly - confirm intended.
                raw_files = direct_keys
                tenX = True
            else:
                # Files are one level down, in per-run sub-directories.
                raw_files = []
                for run_prefix in run_prefixes:
                    _run_subdirs, run_keys = _list_s3_dir(bucket, run_prefix)
                    raw_files.extend(run_keys)
                tenX = False

            all_raw_files.extend(raw_files)
            for rf in raw_files:
                parsed = parse_raw_filename(rf, raw_assay)
                if parsed is not None:
                    run,group,assay,ug,barcode = parsed
                    if assay not in valid_assays:
                        _append_error(f'WRONG ASSAY: {assay} {rf}')
                    if group != group_name and tenX:
                        _append_error(f"WRONG GROUP: {group} {group_name} {rf}")
                    # Only the actual read files are counted per group/assay.
                    if (rf.endswith('.fastq.gz') and not rf.endswith('_sample.fastq.gz')) or \
                        (rf.endswith('.cram') and assay == 'viral_ORF') or \
                        (rf.endswith('.cram') and raw_assay == 'scale' and not rf.endswith('_unmatched.cram')):
                        fastq_log.setdefault(group, {}).setdefault(assay, []).append(rf.split('/')[-1])

                ### Go through metadata.json and trimmer logs and store information
                if rf.endswith('fastq.gz-metadata.json') and not rf.endswith('_sample.fastq.gz-metadata.json'):
                    s3client.download_file(bucket, rf, 'metadata.json')
                    with open('metadata.json') as metadata_file:
                        metadata = json.load(metadata_file)
                    read_metadata[metadata['filename']] = metadata
                    os.remove('metadata.json')
                elif rf.endswith('trimmer-failure_codes.csv') and not rf.endswith('merged_trimmer-failure_codes.csv'):
                    grab_trimmer_stats(trimmer_failure_stats, rf, bucket)

        ### Go through processed files
        if f'{g}processed/' not in subdirs:
            print('processed/ MISSING', g)
        else:
            proc_subdirs, proc_keys = _list_s3_dir(bucket, f'{g}processed/')
            if proc_keys:
                print('UNEXPECTED FILES', f'{g}processed/')
            if f'{g}processed/cellranger/' not in proc_subdirs:
                print('cellranger/ MISSING', g)
            else:
                run_dates, cr_keys = _list_s3_dir(bucket, f'{g}processed/cellranger/')
                if cr_keys:
                    print('UNEXPECTED FILES', f'{g}processed/cellranger/')
                for rd in run_dates:
                    date = rd.split('/')[-2]
                    if not _run_date_is_valid(date):
                        _append_error(f'INCORRECT DATE FORMAT: {date} {rd}')
                    outsdirs, date_keys = _list_s3_dir(bucket, rd)
                    for key in date_keys:
                        print('UNEXPECTED FILES', key)
                    if len(outsdirs) > 1:
                        print('EXTRA subdirs', rd)
                    if f'{rd}outs/' not in outsdirs:
                        print('NO outs/', rd)
                    else:
                        # No delimiter here: collect every key below outs/, at any depth.
                        files = []
                        for page in paginator.paginate(Bucket=bucket, Prefix=f'{rd}outs/'):
                            files.extend(obj['Key'] for obj in page.get('Contents', []))
                        # NOTE(review): with several Run_* directories the last one
                        # listed replaces the earlier ones for this group (unchanged).
                        all_proc_files[group_name] = files

else:
    raise ValueError(f"Invalid data_source: {data_source}. Must be 'manifest' or 's3'")

In [None]:
### Plot the distribution of sequencing and trimming fail rates
### Two histograms per key of trimmer_failure_stats: its 'rsq' values and its 'trimmer_fail' values

# (stats key, x-axis label, title prefix) for each histogram drawn per experiment
histogram_specs = [
    ('rsq', 'Percent fail sequencing (rsq file)', 'Distribution Failed Sequencing'),
    ('trimmer_fail', 'Percent fail trimming', 'Distribution Failed Trimming'),
]

for exp, stats in trimmer_failure_stats.items():
    for stat_key, xlabel, title in histogram_specs:
        fig, ax = plt.subplots()
        ax.hist(stats[stat_key], bins=30, color='skyblue', edgecolor='black')
        ax.set_ylabel('Frequency')
        ax.set_xlabel(xlabel)
        ax.set_title(f'{title} {exp}')
        plt.show()
        # Release the figure so a long list of experiments does not accumulate open figures.
        plt.close(fig)

In [None]:
### Check for mismatching number of files between modalities; mismatches are appended to *_errors.txt

for sample,v in fastq_log.items():
    # A modality with no files is counted as 0 instead of raising KeyError.
    n_gex = len(v.get('GEX',[]))
    n_hash = len(v.get('hash_oligo',[]))
    n_cri = len(v.get('CRI',[]))
    n_orf = len(v.get('viral_ORF',[]))

    if raw_assay == 'scale':
        if n_gex != n_hash:
            e = f"MISMATCH FQ COUNTS: {sample}: {n_gex} GEX, {n_hash} hash_oligo"
            with open(f'{output_dir}/{order}_errors.txt', 'a') as file:
                file.write(e + '\n')
    # NOTE(review): this check also runs for 'scale' and for GEX-only samples,
    # where CRI and viral_ORF are both 0, so it reports a mismatch there - confirm intended.
    if n_cri != n_gex and n_gex/2 != n_orf:
        e = f"MISMATCH FQ COUNTS: {sample}: {n_cri} CRI, {n_gex} GEX, {n_orf} viral_ORF"
        with open(f'{output_dir}/{order}_errors.txt', 'a') as file:
            file.write(e + '\n')

In [None]:
### Check for mismatching read counts between R1 and R2 of the same run; problems are appended to *_errors.txt
### If a file's metadata.json lists at least 1 error, its read_count is not evaluated
### Needs to be updated for Multiome

# Each problem is recorded once here and written once after the loop, so no
# message is overwritten, duplicated, or lost when an iteration ends early.
errors = []

# group -> assay -> total read_count over the non-R2 files
group_read_counts = {}
for f,meta in read_metadata.items():
    if '_R2_' in f:
        # R2 files are only looked at through their R1 partner below.
        continue
    if meta['errors']!=[]:
        print(f'WARNING: ERROR in metadata.json log for {f}')
        errors.append(f'METADATA.JSON ERROR: {f} has error in metadata.json:{meta["errors"]}\n')
        continue

    reads = meta['read_count']
    parsed = parse_raw_filename(f, raw_assay)
    if parsed is not None:
        run,group,assay,ug,barcode = parsed
        assay_counts = group_read_counts.setdefault(group, {})
        if assay in assay_counts:
            assay_counts[assay] += reads
        else:
            assay_counts[assay] = reads

    if '_R1_' in f:
        r2file = f.replace('_R1_','_R2_')
        if r2file in read_metadata:
            r2meta = read_metadata[r2file]
            if r2meta['errors']!=[]:
                print(f'WARNING: ERROR in metadata.json log for {r2file}')
                errors.append(f'METADATA.JSON ERROR: {r2file} has error in metadata.json:{r2meta["errors"]}\n')
                continue
            r2reads = r2meta.get('read_count')
            if reads != r2reads:
                errors.append(f'READ COUNT ERROR:{f}-{reads},{r2file}-{r2reads}\n')
            else:
                print(f'GOOD: Matching read counts R1 and R2: {reads} {r2reads}')

# Kept as a single string under the original name.
error = ''.join(errors)
if error:
    with open(f'{output_dir}/{order}_errors.txt', 'a') as file:
        file.write(error)

In [None]:
### Check that the expected raw files are present
### Every parsed file name gives a "beginning" (run/group/assay/ug/barcode); each beginning
### must be followed by every expected "ending" for its assay
### Will need to update the logging for "all good": a beginning is good when all of its files are present

# Set copy for O(1) membership tests; the list itself is left untouched.
raw_file_set = set(all_raw_files)

beginnings = {}
for fullpath in all_raw_files:
    parsed = parse_raw_filename(fullpath, raw_assay)
    if parsed is not None:
        run,group,assay,ug,barcode = parsed
        b = f'{run}-{group}_{assay}-{ug}-{barcode}'
        if b not in beginnings:
            raw_dir = '/'.join(fullpath.split('/')[:-1])
            endings = raw_expected[raw_assay]
            # GEX files of a 10x_viral_ORF order follow the plain 10x expectations.
            if raw_assay == '10x_viral_ORF' and assay == 'GEX':
                endings = raw_expected['10x']
            beginnings[b] = {
                'raw_dir': raw_dir,
                'endings': endings
            }

all_good = 0
raw_lost = []
raw_found = []
for b,v in beginnings.items():
    temp_missing = {'path': b}
    for e in v['endings']:
        f = f"{v['raw_dir']}/{b}{e}"
        if f not in raw_file_set:
            # A missing metadata.json is only reported when the file it describes exists.
            if e.endswith('-metadata.json') and f.replace('-metadata.json','') not in raw_file_set:
                continue
            temp_missing[e] = f
        else:
            raw_found.append(f)
    # 'path' is always present, so more than one entry means something is missing.
    if len(temp_missing) > 1:
        raw_lost.append(temp_missing)
    else:
        all_good += 1
print(f'{all_good} out of {len(beginnings)} are GOOD and do not have missing raw files')
df = pd.DataFrame(raw_lost)
if not df.empty:
    print("WARNING: There are missing raw files that are expected")
    df.to_csv(f'{output_dir}/{order}_raw_missing.csv', index=False)
df

In [None]:
### Check for extra raw files: anything that is neither an expected file (raw_found) nor one of the
### "optional" raw files, which are commonly missing, is listed in *_raw_extra.txt

# Set copies for O(1) membership tests.
raw_found_set = set(raw_found)
raw_file_set = set(all_raw_files)

for f in all_raw_files:
    if f in raw_found_set:
        continue
    # The metadata.json of a file that is present is never counted as extra.
    if f.endswith('-metadata.json') and f.replace('-metadata.json','') in raw_file_set:
        continue
    if raw_assay in raw_optional or raw_assay == '10x_viral_ORF':
        parsed = parse_raw_filename(f, raw_assay)
        if parsed is not None:
            run,group,assay,ug,barcode = parsed
            # Pick the optional endings only after parsing, so the assay of THIS file
            # is used and not a value left over from an earlier iteration or cell.
            endings = raw_optional.get(raw_assay,[])
            if raw_assay == '10x_viral_ORF' and assay == 'GEX':
                endings = raw_optional['10x']
            b = f'{run}-{group}_{assay}-{ug}-{barcode}'
            raw_dir = '/'.join(f.split('/')[:-1])
            if f.replace(f'{raw_dir}/{b}','') in endings:
                continue
    with open(f'{output_dir}/{order}_raw_extra.txt', 'a') as file:
        file.write(f + '\n')

In [None]:
### Go through the cellranger logs, check the run metadata, and check that the expected files are present
### Will need to update for Multiome

def _write_error(text):
    """Append ``text`` (already newline-terminated) to the order-level *_errors.txt report."""
    with open(f'{output_dir}/{order}_errors.txt', 'a') as file:
        file.write(text)


def _download_and_parse(proc_files, basename, parser):
    """Download the first key in ``proc_files`` named ``basename`` and parse it.

    Returns whatever ``parser`` returns, or None when no such key exists. The
    temporary local copy is removed even if the parser raises.
    """
    for key in proc_files:
        if key.split('/')[-1] == basename:
            # NOTE(review): the bucket is always czi-{provider}; in 'manifest' mode
            # provider may be unset - confirm how manifest keys should be resolved.
            s3client.download_file(f'czi-{provider}', key, basename)
            try:
                return parser(basename)
            finally:
                os.remove(basename)
    return None


proc_missing = []
alerts = []
for g,proc_files in all_proc_files.items():
    report = {}

    # Both summaries feed `report`; a missing one is logged and the group skipped
    # (previously this spun forever in a `while` loop).
    missing_summaries = []
    for basename, parser in (('web_summary.html', parse_web_summ),
                             ('metrics_summary.csv', parse_met_summ)):
        parsed_summary = _download_and_parse(proc_files, basename, parser)
        if parsed_summary is None:
            missing_summaries.append(basename)
        else:
            report.update(parsed_summary)
    if missing_summaries:
        _write_error(f"CR ERROR: {g} is missing {', '.join(missing_summaries)}; cellranger checks skipped\n")
        continue

    chem = report['chem']
    extra = report['extra']
    software = report['software']

    if software == 'cellranger-9.0.1':
        sub = report['sub']
    elif software == 'cellranger-10.0.0':
        sub = 'multi'
    else:
        # For any other version `sub` keeps its current value.
        _write_error(f'CR ERROR: {g} version is {software} but should be 9.0.1 or 10.0.0\n')

    if 'min-crispr-umi' in report:
        cri_umi = report['min-crispr-umi']
        if cri_umi != '3':
            _write_error(f'CR ERROR: {g} min-crispr-umi is {cri_umi} but should be 3\n')

    if 'incl_int' in report:
        intron = report['incl_int']
        if intron != 'true':
            _write_error(f'CR ERROR: {g} include-introns is {intron} but should be true\n')

    if 'create-bam' in report:
        bam = report['create-bam']
        if chem != 'flex' and bam != 'true':
            _write_error(f'CR ERROR: {g} create-bam is {bam} but should be true\n')

    for a in report['gex_alerts']:
        alert = {'group': g, 'modality': 'GEX'}
        alert.update(a)
        alerts.append(alert)
    for a in report.get('crispr_alerts',[]):
        alert = {'group': g, 'modality': 'CRI'}
        alert.update(a)
        alerts.append(alert)

    if sub == 'multi':
        layout = 'flex' if chem == 'flex' else 'nonflex'
        expected = cellranger_expected[software][layout]['outs'].copy()
        per_samp_expected = cellranger_expected[software][layout]['per_sample'].copy()
    elif sub == 'count':
        expected = cellranger_expected['count']['outs'].copy()
        per_samp_expected = []
    else:
        # Without a known pipeline mode there is no expected file list; do not
        # fall through with the previous group's list (or none at all).
        _write_error(f'CR ERROR: {g} pipeline mode is "{sub}" but should be count or multi\n')
        continue

    #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-ab-overview
    #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-crispr-overview
    #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/running-pipelines/cr-cell-annotation-pipeline
    if 'CRISPR' in extra or 'Antibody' in extra:
        if software!='cellranger-10.0.0':
            expected.append('multi/count/feature_reference.csv')
            per_samp_expected.append('count/feature_reference.csv')
        if 'CRISPR' in extra and software!='cellranger-10.0.0':
            per_samp_expected.append('count/crispr_analysis.tar.gz')
        if 'Antibody' in extra:
            per_samp_expected.append('count/antibody_analysis.tar.gz')
    if 'CellAnnotate' in extra:
        per_samp_expected.append('count/cell_types.tar.gz')

    if sub == 'multi' and report.get('multiplex'):
        if software!='cellranger-10.0.0':
            expected.append('multi/multiplexing_analysis.tar.gz')

    # Paths relative to outs/; curated.h5ad is not a cellranger output and is ignored.
    actual = [f.split('/outs/', 1)[1] for f in proc_files
              if '/outs/' in f and f.split('/')[-1] != 'curated.h5ad']
    per_samp_actual = [f for f in actual if f.startswith('per_sample_outs/')]
    missing = [f for f in expected if f not in actual]
    if missing:
        temp_missing = {'group': g}
        for m in missing:
            temp_missing[m] = 'Y'
        proc_missing.append(temp_missing)

    if per_samp_expected:
        # Sample names come from the outs/-relative paths, so this does not depend
        # on how many components the S3 prefix has.
        samples = sorted({f.split('/')[1] for f in per_samp_actual})
        for s in samples:
            expected_samp = [f'per_sample_outs/{s}/{f}' for f in per_samp_expected]
            expected.extend(expected_samp)
            missing_samp = [f for f in expected_samp if f not in actual]
            if missing_samp:
                temp_missing = {'group': f'{g}/{s}'}
                for m in missing_samp:
                    temp_missing[m.replace(f'per_sample_outs/{s}/','')] = 'Y'
                proc_missing.append(temp_missing)

    # Files present but not expected (manifest.json files are tolerated).
    extra_files = [f for f in actual if f not in expected and not f.endswith('manifest.json')]
    if extra_files:
        with open(f'{output_dir}/{order}_process_extra.txt', 'a') as file:
            file.write(g + '\n')
            file.write(','.join(extra_files) + '\n' + '\n')

    ### Reads per assay and comparing to metadata.json count
    for k,v in report.items():
        if k.endswith('_reads'):
            assay = k.replace('_reads','')
            if g in group_read_counts:
                if assay in group_read_counts[g]:
                    v2 = group_read_counts[g][assay]
                    if v != v2:
                        _write_error(f'READ COUNT ERROR: {g} {assay} {v} from proc,{v2} from raw,{v - v2} diff\n')
                    else:
                        print(f'GOOD: Matching total read counts: {v} {v2}')
                else:
                    ### Could not find the assay for this group
                    print('WARNING: Read count not found in metadata.json for',g,assay,'\n')
            else:
                ### Could not find the group
                print('WARNING: Read count not found in metadata.json for',g,'\n')

# Save the alerts collected from the run reports, when there are any.
if len(alerts) > 0:
    alerts_csv = f'{output_dir}/{order}_process_alerts.csv'
    pd.DataFrame(alerts).to_csv(alerts_csv, index=False)

# One row per group (or group/sample) with missing processed outputs;
# saved only when non-empty, and displayed as the cell output either way.
df = pd.DataFrame(proc_missing)
has_missing_rows = not df.empty
if has_missing_rows:
    print('\nWARNING: Table of missing items')
    missing_csv = f'{output_dir}/{order}_process_missing.csv'
    df.to_csv(missing_csv, index=False)
df