In [None]:
import json
import os

import boto3
import matplotlib.pyplot as plt
import pandas as pd

from qa_mods import *


# Shared S3 handles used by the cells below: a low-level client for listing
# and downloading, and a list_objects paginator for multi-page listings.
s3client = boto3.client(service_name='s3')
paginator = s3client.get_paginator(operation_name='list_objects')

In [None]:
# Sequencing provider; the bucket that gets audited is named czi-<provider>.
provider = "psomagen"

# Key prefix of the project to audit. It is passed as a listing Prefix together
# with Delimiter='/', so it is written with its trailing slash.
proj = "weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/"

In [None]:
# Walk <proj>/<order>/<group>/{raw,processed}/ in the provider bucket, report
# layout problems as they are found, and collect what the later cells need:
#   fastq_log             group -> assay -> list of read file names
#   all_raw_files         every key found under a group's raw/ folder
#   all_proc_files        group -> keys under processed/cellranger/<run>/outs/
#   trimmer_failure_stats filled in by grab_trimmer_stats (from qa_mods)
#   read_metadata         metadata['filename'] -> parsed *-metadata.json content


def _list_s3_dir(bucket_name, prefix):
    """Return (sub-folder prefixes, object keys) found directly under `prefix`.

    The listing goes through the shared paginator because a single
    list_objects call returns at most 1,000 entries; without pagination a
    large folder would be silently truncated. Missing 'CommonPrefixes' /
    'Contents' entries are treated as empty rather than raising KeyError.
    """
    subprefixes, keys = [], []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/'):
        subprefixes.extend(e['Prefix'] for e in page.get('CommonPrefixes', []))
        keys.extend(c['Key'] for c in page.get('Contents', []))
    return subprefixes, keys


def _is_valid_run_dir(name):
    """Return True when `name` has the form Run_2025-MM-DD with MM <= 12 and DD <= 31.

    Length and digit checks come first so that a malformed folder name is
    reported as invalid instead of raising IndexError/ValueError. Only the
    year 2025 is accepted, as in the original check.
    """
    if len(name) != 14 or not name.startswith('Run_2025-') or name[11] != '-':
        return False
    month, day = name[9:11], name[12:14]
    if not (month.isdecimal() and day.isdecimal()):
        return False
    return int(month) <= 12 and int(day) <= 31


fastq_log = {}
all_raw_files = []
all_proc_files = {}
trimmer_failure_stats = {}
read_metadata = {}

bucket = f'czi-{provider}'

orders, _ = _list_s3_dir(bucket, proj)
if not orders:
    print('NO orders found', proj)

for o in orders:
    groups, _ = _list_s3_dir(bucket, o)
    for g in groups:
        group_name = g.replace(o, '').rstrip('/')
        fastq_log[group_name] = {}
        subdirs, _ = _list_s3_dir(bucket, g)
        if len(subdirs) > 2:
            print('EXTRA subdirs', g)

        # ---- raw/ -------------------------------------------------------
        if f'{g}raw/' not in subdirs:
            print('raw MISSING', g)
        else:
            # Start from a fresh list for every group so files from a
            # previous group can never be re-processed by accident.
            run_dirs, raw_files = _list_s3_dir(bucket, f'{g}raw/')
            if run_dirs:
                print('EXTRA subdirs', g)
                # Sub-folders under raw/ are per-run folders; gather the
                # files of every run, not only the last one listed.
                for run_dir in run_dirs:
                    _, run_files = _list_s3_dir(bucket, run_dir)
                    raw_files.extend(run_files)

            all_raw_files.extend(raw_files)
            for rf in raw_files:
                run, group, assay = parse_raw_filename(rf)

                if assay not in valid_assays:
                    print('WRONG assay', assay, rf)
                if group != group_name:
                    print('WRONG group', group, group_name, rf)

                is_read_file = rf.endswith('.fastq.gz') and not rf.endswith('_sample.fastq.gz')
                is_orf_cram = rf.endswith('.cram') and assay == 'viral_ORF'
                if is_read_file or is_orf_cram:
                    # setdefault on the group as well: a file whose name
                    # carries a different group was just reported above and
                    # must not abort the crawl with a KeyError.
                    fastq_log.setdefault(group, {}).setdefault(assay, []).append(rf.split('/')[-1])

                if rf.endswith('fastq.gz-metadata.json') and not rf.endswith('_sample.fastq.gz-metadata.json'):
                    s3client.download_file(bucket, rf, 'metadata.json')
                    try:
                        with open('metadata.json') as fh:
                            metadata = json.load(fh)
                    finally:
                        # Always clean up the scratch copy, even on bad JSON.
                        os.remove('metadata.json')
                    read_metadata[metadata['filename']] = metadata
                elif rf.endswith('trimmer-failure_codes.csv') and not rf.endswith('merged_trimmer-failure_codes.csv'):
                    grab_trimmer_stats(trimmer_failure_stats, rf, bucket)

        # ---- processed/ -------------------------------------------------
        if f'{g}processed/' not in subdirs:
            print('processed MISSING', g)
        else:
            proc_dirs, proc_loose = _list_s3_dir(bucket, f'{g}processed/')
            if proc_loose:
                print('UNEXPECTED FILES', f'{g}processed/')
            if f'{g}processed/cellranger/' not in proc_dirs:
                print('cellranger MISSING', g)
            else:
                run_dates, cr_loose = _list_s3_dir(bucket, f'{g}processed/cellranger/')
                if cr_loose:
                    print('UNEXPECTED FILES', f'{g}processed/cellranger/')
                for rd in run_dates:
                    date = rd.split('/')[-2]
                    if not _is_valid_run_dir(date):
                        print('INCORRECT date format', date, rd)
                    outsdirs, date_loose = _list_s3_dir(bucket, rd)
                    for key in date_loose:
                        print('UNEXPECTED FILES', key)
                    if len(outsdirs) > 1:
                        print('TOO MANY OUTS', rd)
                    elif f'{rd}outs/' not in outsdirs:
                        print('NO OUTS', rd)
                    else:
                        # Recursive listing (no Delimiter) of everything in outs/.
                        files = []
                        for page in paginator.paginate(Bucket=bucket, Prefix=f'{rd}outs/'):
                            files.extend(obj['Key'] for obj in page.get('Contents', []))
                        # NOTE: when a group has several run-date folders the
                        # last one listed wins (unchanged from the original).
                        all_proc_files[group_name] = files

In [None]:
# Draw one histogram per failure metric for every key in trimmer_failure_stats
# (the original note says "per order" - the key granularity is decided by
# grab_trimmer_stats; TODO confirm).
# Each spec is (metric key, x-axis label, title prefix).
histogram_specs = [
    ('rsq', 'Percent fail sequencing (rsq file)', 'Distribution Failed Sequencing'),
    ('trimmer_fail', 'Percent fail trimming', 'Distribution Failed Trimming'),
]

for exp, stats in trimmer_failure_stats.items():
    for metric, xlabel, title in histogram_specs:
        fig, ax = plt.subplots()
        ax.hist(stats[metric], bins=30, color='skyblue', edgecolor='black')
        ax.set_ylabel('Frequency')
        ax.set_xlabel(xlabel)
        ax.set_title(f'{title} {exp}')
        plt.show()
        # Release the figure so a long list of experiments does not pile up
        # open figures in the kernel.
        plt.close(fig)

In [None]:
# Flag a sample only when neither relation holds: CRI file count equal to the
# GEX file count, or viral_ORF file count equal to half the GEX file count.
for sample_name, assay_files in fastq_log.items():
    n_cri = len(assay_files.get('CRI', []))
    n_gex = len(assay_files.get('GEX', []))
    n_orf = len(assay_files.get('viral_ORF', []))
    if n_cri == n_gex or n_gex / 2 == n_orf:
        continue
    print(f"MISMATCH FQ counts: {sample_name}: {n_cri} CRI, {n_gex} GEX, {n_orf} viral_ORF")

In [None]:
# Step 1: derive one "<run>-<group_assay>-<ug>-<barcode>" stem per raw file set
# and remember the folder it was first seen in plus the file endings expected
# for it (raw_expected comes from qa_mods).
beginnings = {}
for fullpath in all_raw_files:
    raw_dir, _, filename = fullpath.rpartition('/')
    components = filename.split('-')
    if len(components) <= 3:
        continue
    run, group_assay, ug = components[:3]
    barcode = components[3].split('_')[0].split('.')[0]
    b = f'{run}-{group_assay}-{ug}-{barcode}'
    if b in beginnings:
        continue
    expected_key = 'viralORF' if group_assay.endswith('viral_ORF') else 'standard'
    beginnings[b] = {
        'raw_dir': raw_dir,
        'endings': raw_expected[expected_key],
    }

# Step 2: check every expected file. Membership is tested against a set and the
# matched files are dropped in one pass afterwards, instead of an O(n) list
# lookup plus list.remove() for every expected file.
present = set(all_raw_files)
matched = set()
raw_lost = []
for b, v in beginnings.items():
    temp_missing = {'path': b}
    for e in v['endings']:
        f = f"{v['raw_dir']}/{b}{e}"
        if f in present:
            matched.add(f)
        else:
            temp_missing[e] = f
    if len(temp_missing) > 1:
        raw_lost.append(temp_missing)

# NOTE: this deliberately shrinks all_raw_files in place - whatever remains is
# what the later cell prints as 'extra' files. Re-running this cell without
# re-running the S3 crawl therefore starts from the already-reduced list.
all_raw_files[:] = [f for f in all_raw_files if f not in matched]

pd.DataFrame(raw_lost)

In [None]:
# Sum R1 read counts per group and assay, and list every R1 file whose R2 mate
# is absent from read_metadata or reports a different read count.
group_read_counts = {}
r1r2_errors = []
for fname, info in read_metadata.items():
    if '_R1_' not in fname:
        continue

    run, group, assay = parse_raw_filename(fname)
    r1_reads = info['read_count']

    per_assay = group_read_counts.setdefault(group, {})
    if assay in per_assay:
        per_assay[assay] += r1_reads
    else:
        per_assay[assay] = r1_reads

    # The mate's name is the R1 name with the read tag swapped.
    r2_file = fname.replace('_R1_', '_R2_')
    r2_reads = read_metadata[r2_file]['read_count'] if r2_file in read_metadata else 'n/a'

    if r1_reads != r2_reads:
        r1r2_errors.append({
            'R1 file': fname,
            'R1 reads': r1_reads,
            'R2 file': r2_file,
            'R2 reads': r2_reads,
        })

# TODO: add read_length metric here

pd.DataFrame(r1r2_errors)

In [None]:
# Per the original note, the entries of all_raw_files listed here are the
# 'extra' files; print them one per line.
if all_raw_files:
    print('\n'.join(all_raw_files))

In [None]:
# For every group with cellranger outputs: parse the run summaries, check the
# run settings, compare the files present under outs/ with the expected layout
# (cellranger_expected from qa_mods) and cross-check read counts against the
# raw metadata. Groups with missing files end up as rows of proc_missing.


def _parse_first_match(files, filename, parser):
    """Download the first key in `files` whose basename is `filename` and parse it.

    Returns whatever `parser(local_path)` returns, or None when no key has that
    basename. The local scratch copy is removed even if the parser raises.
    """
    key = next((f for f in files if f.split('/')[-1] == filename), None)
    if key is None:
        return None
    s3client.download_file(f'czi-{provider}', key, filename)
    try:
        return parser(filename)
    finally:
        os.remove(filename)


proc_missing = []
for g, proc_files in all_proc_files.items():
    report = {}

    # A missing summary used to spin forever in a `while not found` loop;
    # report it, record it, and move on to the next group instead.
    summaries_found = True
    for summary_name, parser in (('web_summary.html', parse_web_summ),
                                 ('metrics_summary.csv', parse_met_summ)):
        parsed = _parse_first_match(proc_files, summary_name, parser)
        if parsed is None:
            print(f'ERROR {g}:{summary_name} not found in outs\n')
            proc_missing.append({'group': g, summary_name: 'Y'})
            summaries_found = False
        else:
            report.update(parsed)
    if not summaries_found:
        continue

    sub = report['sub']
    chem = report['chem']
    extra_features = report['extra']
    software = report['software']

    # ---- run settings ---------------------------------------------------
    if software != 'cellranger-9.0.1':
        print(f'ERROR {g}:version is {software} but should be 9.0.1\n')

    if 'min-crispr-umi' in report:
        cri_umi = report['min-crispr-umi']
        if cri_umi != '3':
            print(f'ERROR {g}:min-crispr-umi is {cri_umi} but should be 3\n')

    if 'incl_int' in report:
        intron = report['incl_int']
        if intron != 'true':
            print(f'ERROR {g}:include-introns is {intron} but should be true\n')

    if 'create-bam' in report:
        bam = report['create-bam']
        if chem != 'flex' and bam != 'true':
            print(f'ERROR {g}:create-bam is {bam} but should be true\n')

    for a in report.get('gex_alerts', []):
        print('GEX ALERTS', g)
        print(a, '\n')
    for a in report.get('crispr_alerts', []):
        print('CRI ALERTS', g)
        print(a, '\n')

    # ---- expected file layout -------------------------------------------
    if sub == 'multi':
        layout = cellranger_expected['flex' if chem == 'flex' else 'nonflex']
        expected = layout['outs'].copy()
        per_samp_expected = layout['per_sample'].copy()
    elif sub == 'count':
        expected = cellranger_expected['count']['outs'].copy()
        per_samp_expected = []
    else:
        # Without this guard the lists from the previous group (or no lists
        # at all) would be used for the comparison below.
        print(f'ERROR {g}:unrecognized cellranger subcommand {sub}\n')
        continue

    #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-ab-overview
    #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-crispr-overview
    #https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/running-pipelines/cr-cell-annotation-pipeline
    if 'CRISPR' in extra_features or 'Antibody' in extra_features:
        expected.append('multi/count/feature_reference.csv')
        per_samp_expected.append('count/feature_reference.csv')
        if 'CRISPR' in extra_features:
            per_samp_expected.append('count/crispr_analysis.tar.gz')
        if 'Antibody' in extra_features:
            per_samp_expected.append('count/antibody_analysis.tar.gz')
    if 'CellAnnotate' in extra_features:
        per_samp_expected.append('count/cell_types.tar.gz')

    if sub == 'multi' and report.get('multiplex'):
        expected.append('multi/multiplexing_analysis.tar.gz')

    # ---- actual files, as paths relative to outs/ -----------------------
    rel_paths = [f.split('/outs/', 1)[1] for f in proc_files]
    actual = [p for p in rel_paths if p.split('/')[-1] != 'curated.h5ad']
    actual_set = set(actual)

    missing = [f for f in expected if f not in actual_set]
    if missing:
        temp_missing = {'group': g}
        for m in missing:
            temp_missing[m] = 'Y'
        proc_missing.append(temp_missing)

    if per_samp_expected:
        # Sample names are read from the path relative to outs/ rather than
        # from fixed positions in the full key, so the check does not depend
        # on how deep the project prefix is. Sorted for a stable row order.
        samples = sorted({p.split('/')[1] for p in rel_paths if p.startswith('per_sample_outs/')})
        for s in samples:
            expected_samp = [f'per_sample_outs/{s}/{f}' for f in per_samp_expected]
            expected.extend(expected_samp)
            missing_samp = [f for f in expected_samp if f not in actual_set]
            if missing_samp:
                temp_missing = {'group': f'{g}/{s}'}
                for m in missing_samp:
                    temp_missing[m.replace(f'per_sample_outs/{s}/', '')] = 'Y'
                proc_missing.append(temp_missing)

    expected_set = set(expected)
    unexpected = [f for f in actual if f not in expected_set]
    if unexpected:
        print('EXTRA', g)
        print(unexpected, '\n')

    # ---- read counts: cellranger report vs. raw metadata ----------------
    for k, v in report.items():
        if k.endswith('_reads'):
            assay = k.replace('_reads', '')
            if g in group_read_counts:
                if assay in group_read_counts[g]:
                    if v != group_read_counts[g][assay]:
                        print('READ COUNT ERROR:', g, assay,
                              v, 'from proc',
                              group_read_counts[g][assay], 'from raw',
                              v - group_read_counts[g][assay], 'diff\n')
                else:
                    print('ERROR:assay not found', g, assay, '\n')
            else:
                print('ERROR:group not found', g, '\n')

pd.DataFrame(proc_missing)