In [40]:
import boto3
import matplotlib.pyplot as plt
import os
import pandas as pd
from qa_mods import *
import multiprocessing
import h5py
import fsspec
import numpy as np


# Shared S3 client used by the S3 listing code in the cells below
s3client = boto3.client(service_name='s3')
# Module-level paginator for the ListObjects operation
paginator = s3client.get_paginator(operation_name='list_objects')

In [44]:
# Run configuration: edit the values in this cell before executing the cells below.

# Choose data source: 'manifest' or 's3'
# NOTE(review): the S3 cells visible below never branch on data_source — confirm
# where 'manifest' mode is handled.
data_source = 's3'

# === Common parameters (needed for both modes) ===
order = 'AN00025549'      # Order ID; needed for output file naming, and used as the sub-directory under the project prefix in S3

# === Parameters for 's3' mode only ===
provider = 'psomagen'  # psomagen, novogene; selects the bucket 'czi-<provider>'
proj = 'weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond'  # Project prefix inside the bucket

In [45]:
def find_raw_filtered(bucket_name, prefix, suffix='filtered_feature_bc_matrix.h5', client=None):
    """
    List the filtered feature-barcode matrix files stored under an S3 prefix.

    Every object under ``prefix`` whose key ends with ``suffix`` is collected.
    If at least one of those keys contains "per_sample_outs", only those keys
    are returned. If none do, it is assumed the run was not produced with
    "multi" and every matching key is returned.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket to list.
    prefix : str
        Key prefix (directory) to search under.
    suffix : str, optional
        Key ending to match. Defaults to 'filtered_feature_bc_matrix.h5'.
    client : optional
        Object exposing ``get_paginator('list_objects_v2')``. Defaults to the
        module-level ``s3client``.

    Returns
    -------
    list of str
        Matching object keys (without the bucket name), in listing order.
        Empty when nothing under ``prefix`` matches.
    """
    if client is None:
        client = s3client

    # Use a paginator to handle cases with more than 1000 objects
    pages = client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=prefix)

    # S3 can only filter by prefix, so the suffix is filtered client-side.
    # Pages without a 'Contents' entry (nothing listed) contribute no keys.
    all_keys = [
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])
        if obj['Key'].endswith(suffix)
    ]

    # Plain substring test: no regex (and no implicit `re` import) is needed
    per_sample_keys = [key for key in all_keys if 'per_sample_outs' in key]
    return per_sample_keys or all_keys

In [46]:
# Gather raw filtered h5 cellranger output files

bucket = f'czi-{provider}'
order_dir = f'{proj}/{order}/'

# List the immediate sub-directories ("groups") of the order directory.
# The listing is paginated because a single list call returns at most 1000
# entries, so any further group prefixes would otherwise be dropped silently.
group_pages = s3client.get_paginator('list_objects_v2').paginate(
    Bucket=bucket, Prefix=order_dir, Delimiter='/'
)
groups = [
    common_prefix['Prefix']
    for page in group_pages
    for common_prefix in page.get('CommonPrefixes', [])
]

# Collect the filtered h5 keys of every group, in listing order
all_raw_h5 = []
for g in groups:
    all_raw_h5.extend(find_raw_filtered(bucket, g))

if not groups:
    # Make an empty result visible instead of silently reporting zero cells later
    print(f"WARNING: no group directories found under s3://{bucket}/{order_dir}")

In [47]:
# Sanity check: display the S3 keys of the raw filtered h5 files that will be counted,
# so the selection can be reviewed before any file is opened

all_raw_h5

['weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_1/count/sample_filtered_feature_bc_matrix.h5',
 'weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_2/count/sample_filtered_feature_bc_matrix.h5',
 'weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_3/count/sample_filtered_feature_bc_matrix.h5',
 'weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_4/count/sample_filtered_feature_bc_matrix.h5',
 'weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/double_positive_1/count/sampl

In [48]:
def process_raw_filtered(s3_uri):
    """
    Count the cells recorded in one filtered feature-barcode H5 file on S3.

    The file is opened directly from S3 using fsspec and h5py, and only the
    shape of the ``matrix/barcodes`` dataset is queried; its first dimension
    is reported as the cell count.

    Parameters
    ----------
    s3_uri : str
        Full URI of the H5 file, e.g. 's3://bucket/key'.

    Returns
    -------
    dict
        ``{"uri", "observations_cells", "status"}``. On success ``status`` is
        "Success". If anything raises, the error is reported instead of
        propagated: ``status`` is "Error: <message>" and
        ``observations_cells`` is 0, so callers must check ``status`` before
        trusting a zero count.
    """
    try:
        # Message and newline are sent in a single write so that lines from
        # concurrently running workers are less likely to run together.
        print(f"Accessing metadata for {s3_uri}\n", end="", flush=True)

        with fsspec.open(s3_uri, 'rb') as f, h5py.File(f, 'r') as h5:
            cell_count = h5['matrix']['barcodes'].shape[0]

        return {
            "uri": s3_uri,
            "observations_cells": cell_count,
            "status": "Success"
        }

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the batch
        print(f"[{os.getpid()}] Error processing {s3_uri}: {e}\n", end="", flush=True)
        return {
            "uri": s3_uri,
            "observations_cells": 0,
            "status": f"Error: {str(e)}"
        }

In [49]:
# Process the raw filtered h5 files in parallel; raise NUM_PROCESSES when there
# are many h5 files that need to be analyzed

NUM_PROCESSES = 4

# Full s3:// URI for every key gathered above
s3_h5_files = [f's3://{bucket}/{key}' for key in all_raw_h5]

with multiprocessing.Pool(NUM_PROCESSES) as pool:
    # pool.map hands each entry of s3_h5_files to process_raw_filtered and
    # collects the returned dictionaries into a list
    all_file_results = pool.map(process_raw_filtered, s3_h5_files)

Accessing metadata for s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_1/count/sample_filtered_feature_bc_matrix.h5Accessing metadata for s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/double_positive_5/count/sample_filtered_feature_bc_matrix.h5Accessing metadata for s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/double_positive_1/count/sample_filtered_feature_bc_matrix.h5Accessing metadata for s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/top_30p_lipid_1/count/sample_filtered_feature_bc_matrix.h5



Accessing metadata for s3://c

In [50]:
# Print a summary report per file, followed by the final total

print("\n--- Summary Report ---")
for result in all_file_results:
    print(f"File: {result['uri']} | Observations: {result['observations_cells']} | Status: {result['status']}")

total_observations = sum(result['observations_cells'] for result in all_file_results)
print(f"\nTotal observations across all files: {total_observations}")

# Files that failed to open are reported with 0 cells, so the total above would
# be silently understated; make that visible.
failed_results = [result for result in all_file_results if result['status'] != "Success"]
if failed_results:
    print(f"WARNING: {len(failed_results)} of {len(all_file_results)} file(s) failed and were counted as 0; the total above is incomplete.")


--- Summary Report ---
File: s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_1/count/sample_filtered_feature_bc_matrix.h5 | Observations: 6481 | Status: Success
File: s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_2/count/sample_filtered_feature_bc_matrix.h5 | Observations: 6595 | Status: Success
File: s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_lipid_3/count/sample_filtered_feature_bc_matrix.h5 | Observations: 5992 | Status: Success
File: s3://czi-psomagen/weissman-scaling-in-vivo-perturb-seq-in-the-liver-and-beyond/AN00025549/RSJS_fast_1/processed/cellranger/Run_2025_08_16/outs/per_sample_outs/bottom_30p_li