In [23]:
import boto3
import numpy as np
import os
import pandas as pd
import scanpy as sc
import tarfile
import requests
import re
import json
import argparse
import sys
import fsspec

from io import BytesIO
from dataclasses import dataclass
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
from cellxgene_schema.write_labels import AnnDataLabelAppender
from urllib.parse import quote
from cellxgene_ontology_guide.supported_versions import CXGSchema, load_supported_versions
from cellxgene_ontology_guide.ontology_parser import OntologyParser
# Module-level ontology parser.
# NOTE(review): map_ontologies() builds its own local OntologyParser, so this
# instance is not referenced by any code visible in this notebook.
ontology_parser = OntologyParser()
# Shared S3 filesystem handle: URIPath uses it to list objects (fs.ls) and
# Experiment.download_files uses it to fetch them (fs.get).
fs = fsspec.filesystem('s3')

In [20]:
# Scratch check: build a URIPath for one SeqSuite run directory.
# NOTE: URIPath is defined in a later cell of this notebook, so that cell has
# to be executed before this one on a fresh kernel.
x = 's3://czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/'
uri = URIPath(x)

In [24]:

files = ['R109-25-0361.QSR-8_anndata.h5ad','R109-25-0422.QSR-1_anndata.h5ad','R109-25-0361.QSR-1_anndata.h5ad']
pattern = r"\.([^_]+)_"

# Set comprehension: duplicates collapse, so this yields {'QSR-8', 'QSR-1'}.
# Each filename is searched only once; names without a match are skipped.
results_set = {m.group(1) for m in (re.search(pattern, f) for f in files) if m}


In [51]:
# S3 location (bucket + prefix) of the run directory to curate.
# Constructing the URIPath also lists the run's samples/ directory on S3.
uri = 's3://czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/'
uri_obj = URIPath(full_uri=uri)

# Load the 'sample template' tab of the Lattice metadata sheet into a dataframe.
# NOTE(review): sheet_id is an empty string here - presumably removed before
# sharing; a real Google Sheet id is needed for the tab lookup to work.
sample_metadata = LatticeMetadata(sheet_id='', tab_name='sample template')
sample_metadata.metadata_df


In [77]:
# Pick the experiment flavour from what is on S3: when is_single() reports
# exactly one sublibrary use SingleSublibrary, otherwise MultiSublibrary.
experiment_cls = SingleSublibrary if is_single(uri_obj) else MultiSublibrary
experiment_04 = experiment_cls(metadata=sample_metadata, uri=uri_obj)

In [78]:
# Inspect the S3 keys selected for download (set in the subclass __post_init__).
experiment_04.files_to_download

['czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0255.QSR-8.allCells.csv',
 'czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0255.QSR-8_anndata.h5ad',
 'czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0256.QSR-8.allCells.csv',
 'czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0256.QSR-8_anndata.h5ad',
 'czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0257.QSR-8.allCells.csv',
 'czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0257.QSR-8_anndata.h5ad',
 'czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples/R096-25-0258.QSR-8.allCells.csv',
 'czi-novogene/trapnell-seahub-bcp/NVUS202410170

In [79]:
# Fetch every selected file into DOWN_DIR; files already present locally are skipped.
experiment_04.download_files()

Downloading from czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/samples.csv to temp_plate/samples.csv


In [76]:
# Local directory that Experiment.download_files() writes into (created on demand).
# NOTE: the module-level download_files() helper further down uses the separate
# lowercase `down_dir` variable instead of this constant.
DOWN_DIR = 'temp_plate'

@dataclass
class URIPath:
    '''
    Location of the output files of a single SeqSuite run.

    ``full_uri`` is the complete URI of the rundate directory, bucket name included.
    Ex: s3://czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/

    Set after construction:
        bucket_name   third '/'-separated piece of the URI (the bucket for an s3:// URI)
        prefix        everything after the bucket (ends in '/' when non-empty)
        sample_files  listing of the run's samples/ directory as returned by fs.ls
    '''

    full_uri: str

    def __post_init__(self):
        # Guarantee a trailing slash so 'samples/' can be appended to the prefix directly
        if not self.full_uri.endswith('/'):
            self.full_uri = self.full_uri + '/'

        # 's3://bucket/a/b/' -> ['s3:', '', 'bucket', 'a/b/']
        pieces = self.full_uri.split('/', 3)
        self.bucket_name = pieces[2]
        self.prefix = ''.join(pieces[3:])

        samples_dir = 's3://' + self.bucket_name + '/' + self.prefix + 'samples/'
        self.sample_files = fs.ls(samples_dir)


@dataclass
class LatticeMetadata:
    '''
    Dataclass to hold one tab of a Lattice metadata Google Sheet as a dataframe.

    Constructing the object performs two HTTP requests: one to resolve
    ``tab_name`` to its gid and one to export that tab as CSV.

    Fields:
        sheet_id  id of the Google Sheet (used in the /spreadsheets/d/<id> URL)
        tab_name  name of the tab to load

    Set after construction:
        gid          gid of ``tab_name`` within the sheet
        metadata_df  contents of the tab, every value read as a string
    '''

    sheet_id: str
    tab_name: str

    def get_gid(self):
        '''
        Given sheet id and tab name, return gid.

        The sheet's HTML page is fetched and the ``bootstrapData`` JSON embedded in
        its <script> tags is parsed to build a tab-name -> gid lookup.

        :returns str: gid of the tab called ``tab_name``
        :raises KeyError: if the sheet has no tab called ``tab_name``
        '''
        sheet_url = f'https://docs.google.com/spreadsheets/d/{self.sheet_id}'
        req = Request(sheet_url, headers={'User-Agent' : "Magic Browser"})
        # Parse inside the context manager so the HTTP response is always closed
        with urlopen(req) as page:
            soup = BeautifulSoup(page, 'html.parser')
        tab_ids = {}
        pattern = re.compile('var bootstrapData = (.*?)};')
        for script in soup.find_all('script'):
            match = pattern.search(str(script))
            if match is None:
                continue
            # The pattern consumes the closing brace of the JSON object; put it back
            data = json.loads(match.group(1) + '}')
            for t in data['changes']['topsnapshot']:
                u = t[1].split('"')
                if len(u) > 5:
                    tab_ids[u[5]] = u[1]
        if self.tab_name not in tab_ids:
            raise KeyError(
                f"Tab '{self.tab_name}' not found in sheet '{self.sheet_id}'; "
                f"tabs found: {sorted(tab_ids)}"
            )
        return tab_ids[self.tab_name]

    def get_metadata_df(self):
        '''
        Given sheet id and gid, return lattice metadata in a dataframe.

        Text following a '#' is treated as a comment by the CSV reader, all values
        are read as strings and columns that are entirely empty are dropped.

        :returns dataframe: the tab's contents
        :raises requests.HTTPError: if the CSV export request fails
        '''
        url = f'https://docs.google.com/spreadsheets/d/{self.sheet_id}/export?format=csv&gid={self.gid}'
        response = requests.get(url)
        # Fail loudly on an HTTP error instead of parsing an error page as CSV
        response.raise_for_status()
        sample_df = pd.read_csv(BytesIO(response.content), comment="#", dtype=str).dropna(axis=1,how='all')
        return sample_df

    def __post_init__(self):
        # gid has to be resolved first: get_metadata_df() builds its URL from it
        self.gid = self.get_gid()
        self.metadata_df = self.get_metadata_df()


@dataclass
class Experiment:
    '''
    Dataclass to hold various aspects of a single experiment, including
    the pseudosamples and corresponding sample names found in SeqSuite run.

    download_files() reads ``self.files_to_download`` (a list of S3 keys), which
    this base class does not set; SingleSublibrary and MultiSublibrary set it in
    their own __post_init__, which replaces the one defined here.
    '''
    metadata: LatticeMetadata
    uri: URIPath

    def map_sample_ids(self):
        '''
        Given a lattice metadata and a samples.csv file, map the pseudosample and sample ids
        using 'RT_index' and 'barcodes' columns, respectively.

        The mapping has not been written yet, so this raises rather than
        returning an undefined name.

        :raises NotImplementedError: always, until the mapping is implemented
        '''
        raise NotImplementedError('Experiment.map_sample_ids is not implemented yet')

    def download_files(self):
        '''
        Download all files needed for this experiment into DOWN_DIR.

        Each file is saved under its base name; files that already exist
        locally are not downloaded again.
        '''
        os.makedirs(DOWN_DIR, exist_ok=True)
        for s3_uri in self.files_to_download:
            filename = s3_uri.split('/')[-1]
            local_path = os.path.join(DOWN_DIR, filename)
            if not os.path.exists(local_path):
                print(f'Downloading from {s3_uri} to {local_path}')
                fs.get(s3_uri, local_path)

    def __post_init__(self):
        # Call through self: a bare map_sample_ids() is not a name in this scope
        self.sample_ids: list = self.map_sample_ids()


class SingleSublibrary(Experiment):
    '''
    Experiment made of a single sublibrary.

    Besides the sublibrary-specific h5ad files, the *allCells.csv files are
    needed, plus the run-level samples.csv.
    '''

    def determine_files(self):
        '''
        Build the list of S3 keys to download: every entry of the run's samples/
        listing ending in 'allCells.csv' or '_anndata.h5ad', followed by the
        run-level samples.csv.
        '''
        wanted_suffixes = ('allCells.csv', '_anndata.h5ad')
        selected = [path for path in self.uri.sample_files if path.endswith(wanted_suffixes)]
        selected.append(f'{self.uri.bucket_name}/{self.uri.prefix}samples.csv')
        return selected

    def __post_init__(self):
        # Replaces Experiment.__post_init__; only the download list is prepared here
        self.files_to_download = self.determine_files()


class MultiSublibrary(Experiment):
    '''
    Plate Experiment to store specific variables and methods for an experiment that has multiple sublibraries,
    which will include the sample merged h5ad files
    '''
    def determine_files(self):
        '''
        Return the actual file URIs of files that need to be downloaded:
        every entry of the run's samples/ listing that ends in 'anndata.h5ad' and
        does not contain 'QSR' (i.e. is not sublibrary-specific), followed by the
        run-level samples.csv.
        '''
        want_files = [
            file for file in self.uri.sample_files
            if file.endswith('anndata.h5ad') and 'QSR' not in file
        ]
        want_files.append(f'{self.uri.bucket_name}/{self.uri.prefix}samples.csv')
        return want_files

    def __post_init__(self):
        # Replaces Experiment.__post_init__; only the download list is prepared here
        self.files_to_download: list = self.determine_files()


def is_single(uripath_obj):
    '''
    Tell whether the run behind ``uripath_obj`` is a single-sublibrary experiment.

    The 'QSR-#' label (# in 1-8) is pulled out of every '<name>.QSR-#_anndata.h5ad'
    entry in ``uripath_obj.sample_files``; the run counts as single when exactly
    one distinct label is found.

    :returns bool: True for exactly one distinct sublibrary, False otherwise
    '''
    sublibrary_re = re.compile(r'\.(QSR-[1-8])_anndata.h5ad')

    found = set()
    for path in uripath_obj.sample_files:
        hit = sublibrary_re.search(path)
        if hit:
            found.add(hit.group(1))
    return len(found) == 1


# Map each Scale "sample" name (the value found in adata.obs['sample']) to the
# Lattice metadata "pseudosample" it corresponds to.
# experiment_name is the corresponding experiment in the Lattice spreadsheet
sample_map = {
    # Disabled alternative mapping - presumably for the R096 QSR-8 run; confirm before re-enabling:
#     'R096-25-0255.QSR-8':'bl1',
#     'R096-25-0256.QSR-8':'bl2',
#     'R096-25-0257.QSR-8':'bl3',
#     'R096-25-0258.QSR-8':'bl4'
    # Active mapping:
    'R109-25-0422':'25.0422',
    'R109-25-0423':'25.0423',
    'R109-25-0424':'25.0424',
    'R109-25-0425':'25.0425',
    'R109-25-0426':'25.0426',
    'R109-25-0427':'25.0427',
    'R109-25-0428':'25.0428',
    'R109-25-0429':'25.0429',
    'R109-25-0430':'25.0430',
    'R109-25-0431':'25.0431',
    'R109-25-0432':'25.0432',
    'R109-25-0433':'25.0433',
    'R109-25-0434':'25.0434'
}

# Rundate directory name; download_files() refers to it when more than one rundate exists
set_rundate = 'Run_2025-12-15_biohub'
# Despite the name, these are the Scale sample names (keys of sample_map); they are used to build file names
pseudosamples = list(sample_map.keys())
# Value matched against the 'experiment_name' column of the sample metadata
experiment_name = 'CHEM4_R109'
# Local directory the h5ad / allCells.csv files are downloaded to and read from
down_dir = 'temp_plate'

    

def download_files(s3client, bucket, prefix):
    '''
    given s3 directory, bucket and prefix (proj/order/experiment_name), download the following files, one set for each pseudosample:
        - *_anndata.h5ad
        - *.merged.allCells.csv

    Files are written to ``down_dir``; files that already exist locally are skipped.
    The rundate directory is resolved once from ``<prefix>/processed/``: when there is
    exactly one it is used, when there are several ``set_rundate`` is used instead.

    :param s3client: client exposing ``download_file(bucket, key, local_path)``
    :param str bucket: name of the S3 bucket
    :param str prefix: key prefix of the experiment, without trailing slash

    :raises FileNotFoundError: if no rundate directory is found under ``<prefix>/processed/``
    '''
    files = ['_anndata.h5ad',
             '.merged.allCells.csv']

    os.makedirs(down_dir, exist_ok=True)

    # The rundate does not depend on the pseudosample, so look it up once
    rundates = get_subdirectories(s3client, bucket, prefix+'/processed/')
    if not rundates:
        raise FileNotFoundError(f'No rundate directories found under {prefix}/processed/')
    if len(rundates)>1:
        print(f'WARNING: more than one rundate: {rundates}')
        rundate = set_rundate
    else:
        # Entries look like '<prefix>/processed/<rundate>/', so the name is the second-to-last piece
        rundate = rundates[0].split('/')[-2]

    for pseudosample in pseudosamples:
        print(f'Downloading for {pseudosample}')
        for file in files:
            file_path = f'{prefix}/processed/{rundate}/samples/{pseudosample}{file}'
            down_path = f'{down_dir}/{pseudosample}{file}'
            if not os.path.exists(down_path):
                s3client.download_file(bucket, file_path, down_path)


def assign_oligo(adata, hash_csv):
    '''
    Return a copy of ``adata`` restricted to cells with a usable scaleplex hash
    assignment, with the assignment reformatted into ``obs['assigned_hash_index']``.

    If ``adata.obs`` already has an "assigned_scaleplex" column it is used as is and
    ``hash_csv`` is not read. Otherwise "assigned_scaleplex" is merged in from the
    allCells.csv, matching its "cell_id" column against the obs index.

    The well number and letter are swapped (eg: 7E -> E7) and prefixed with "SCALE-"
    to match the Lattice metadata spreadsheet. Cells that have the following values
    as "assigned_scaleplex" are dropped:
     - Indeterminate
     - Max_Fail
     - Unexpected

    :param obj adata: anndata object; it is not modified
    :param str hash_csv: path to the allCells.csv with "cell_id" and "assigned_scaleplex" columns

    :returns obj adata_subset: filtered copy with obs['assigned_hash_index'] added
    '''
    drop_values = ['Indeterminate','Max_Fail','Unexpected']

    if 'assigned_scaleplex' in adata.obs.columns:
        source = adata
    else:
        # Fall back to the CSV only when the assignment is not in obs yet
        hash_df = pd.read_csv(hash_csv, index_col = 'cell_id')
        source = adata.copy()
        source.obs = source.obs.join(hash_df[['assigned_scaleplex']], how='left')

    keep = ~source.obs['assigned_scaleplex'].isin(drop_values)
    # Copy after subsetting so the new column is written to an independent object
    adata_subset = source[keep].copy()
    adata_subset.obs['assigned_hash_index'] = (
        adata_subset.obs['assigned_scaleplex']
        .str.replace(r'(\d+)([a-zA-Z])', r'SCALE-\2\1', regex=True)
        .astype('string')
    )
    return adata_subset


def cxg_add_labels(adata):
    '''
    Add CELLxGENE schema labels and schema version information to ``adata`` in place.

    A placeholder 'cell_type_ontology_term_id' of 'unknown' is set before
    AnnDataLabelAppender is run - presumably because the labeler requires that
    column (TODO confirm) - and the placeholder plus the 'cell_type' column are
    dropped again afterwards.

    :param obj adata: the anndata object to label; modified in place, nothing is returned
    '''
    adata.obs['cell_type_ontology_term_id'] = 'unknown'
    labeler = AnnDataLabelAppender(adata)
    # NOTE(review): _add_labels is a private method of cellxgene_schema and may change between releases
    labeler._add_labels()
    adata.obs.drop(columns=['cell_type_ontology_term_id','cell_type'],inplace=True)

    # Record which schema version the labels were generated against
    schema_v = labeler.schema_version
    adata.uns['schema_version'] = schema_v
    adata.uns['schema_reference'] = labeler._build_schema_reference_url(schema_v)


def add_guide_metadata(adata, sheet, guide_gid):
    '''
    Add guide metadata into adata.uns from Lattice wrangling sheet

    One entry per guide is stored in adata.uns['genetic_perturbations'], keyed by guide_id,
    with 'role', 'protospacer_sequence', 'protospacer_adjacent_motif' and, when available,
    'target_genomic_regions' and 'target_features' (gene id -> gene name).

    :param obj adata: the anndata object that is being transformed into the curated matrix
    :param str sheet: id of the Google Sheet holding the wrangling sheet
    :param str guide_gid: gid of the tab containing guide metadata

    :returns obj adata: modified adata to contain guide metadata
    :raises requests.HTTPError: if the CSV export request fails
    '''
    url = f'https://docs.google.com/spreadsheets/d/{sheet}/export?format=csv&gid={guide_gid}'
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing an error page as CSV
    response.raise_for_status()
    guide_df = pd.read_csv(BytesIO(response.content), comment="#", dtype=str)
    genetic_perturbations = {}

    for row in guide_df.itertuples():
        perturbation = {
            'role': 'targeting' if row.guide_role == 'Targeting a Gene' else 'control',
            'protospacer_sequence': row.guide_protospacer,
            'protospacer_adjacent_motif': row.guide_PAM,
        }
        # Only build a region when every coordinate is present; a partially filled
        # row would otherwise put the text 'nan' into the region string
        if not pd.isna([row.start,row.end,row.strand]).any() and not pd.isna(row.chromosome):
            chr_loc = str(row.chromosome).replace("chr","") + ":" + str(row.start) + "-" + str(row.end) + "(" + str(row.strand) + ")"
            perturbation['target_genomic_regions'] = [chr_loc]
        if not pd.isna(row.overlapping_gene_ids):
            # Split once; ids and names are ';'-separated lists paired by position
            gene_ids = row.overlapping_gene_ids.split(";")
            gene_names = row.overlapping_gene_names.split(";")
            perturbation['target_features'] = {
                gene_id: gene_names[i] for i, gene_id in enumerate(gene_ids)
            }
        genetic_perturbations[row.guide_id] = perturbation

    adata.uns['genetic_perturbations'] = genetic_perturbations

    return adata


def determine_perturbation_strategy(adata):
    '''
    Fill in obs['genetic_perturbation_strategy'] for control and unperturbed cells.

    Each cell's obs['genetic_perturbation_id'] (guide ids joined by ' || ') is assessed:
    if all guides assigned to a single cell are control, the strategy is 'control';
    if the cell has no guides, it is 'no perturbations'. Cells with at least one
    targeting guide are left untouched by this function; they are expected to carry
    one of the following:
        - "CRISPR activation screen"
        - "CRISPR interference screen"
        - "CRISPR knockout mutant"
        - "CRISPR knockout screen"

    NOTE(review): genetic_perturbation_id is presumably derived from feature_call in
    protospacer_calls_per_cell.csv - confirm against the cell that builds it.

    :param obj adata: the anndata object that is being transformed into the curated matrix;
                      adata.uns['genetic_perturbations'] must hold a 'role' for every guide id

    :returns obj adata: modified adata to contain perturbation_strategy as cell metadata
    '''
    # Work in a scratch column so genetic_perturbation_id itself is left unchanged
    adata.obs['genetic_perturbation_strategy_calculated'] = adata.obs['genetic_perturbation_id']
    # Step 1: split the guide ids into a list; cells without guides get a string marker instead
    adata.obs['genetic_perturbation_strategy_calculated'] = adata.obs['genetic_perturbation_strategy_calculated'].apply(
        lambda x: x.split(' || ') if pd.notna(x) else 'no perturbations'
    )
    # Step 2: replace each guide id with its role; the string marker passes through
    adata.obs['genetic_perturbation_strategy_calculated'] = adata.obs['genetic_perturbation_strategy_calculated'].apply(
        lambda x: [adata.uns['genetic_perturbations'][i]['role'] for i in x] if isinstance(x, list)
            else x        
    )
    # Step 3: a role list without any 'targeting' entry collapses to 'control';
    # lists that contain 'targeting' stay lists and match neither comparison below
    adata.obs['genetic_perturbation_strategy_calculated'] = adata.obs['genetic_perturbation_strategy_calculated'].apply(
         lambda x: 'control' if isinstance(x, list) and 'targeting' not in set(x)
            else x
    )
    # Only control / unperturbed rows are written; other rows of genetic_perturbation_strategy are not assigned here
    adata.obs.loc[adata.obs['genetic_perturbation_strategy_calculated']=='control', 'genetic_perturbation_strategy'] = 'control'
    adata.obs.loc[adata.obs['genetic_perturbation_strategy_calculated']=='no perturbations', 'genetic_perturbation_strategy'] = 'no perturbations'
    adata.obs.drop(columns=['genetic_perturbation_strategy_calculated'], inplace=True)
    
    return adata



def map_ontologies(sample_df):
    '''
    Takes the sample metadata dataframe and standardizes ontologies
    Also checks that standard fields are only filled out for appropriate organism

    Each label column listed in ``col_ont_map`` is replaced by a
    ``<column>_ontology_term_id`` column. 'unknown' and 'na' labels, non-tissue
    entries in 'tissue' and self_reported_ethnicity for non-human organisms are passed
    through unchanged. ``sample_df`` is modified in place and also returned.

    If any label cannot be mapped, or donor_living_at_sample_collection is filled in
    for a non-human sample, the problem is printed and ``sys.exit()`` is called.

    :param dataframe sample_df: the sample metadata from given google sheet

    :returns dataframe sample_df: sample metadata with ontologies added
    '''
    # Order matters: 'organism' is mapped first because the organism-specific
    # columns below read the resulting 'organism_ontology_term_id'
    col_ont_map = {
        'organism':'NCBITaxon',
        'sex':'PATO',
        'self_reported_ethnicity':{'NCBITaxon:9606':'HANCESTRO',
                                   'other':'none'},
        'disease':'MONDO',
        'assay':'EFO',
        'development_stage':{'NCBITaxon:6239':'WBls', # C. Elegans
                             'NCBITaxon:7227':'FBdv', # Drosophila
                             'NCBITaxon:10090':'MmusDv', # Mouse
                             'NCBITaxon:7955':'ZFS', # Zebrafish
                             'other':'HsapDv' # For all other organisms, use HsapDv
                            },
        'tissue':{'NCBITaxon:6239':'WBbt', # C. Elegans
                  'NCBITaxon:7227':'FBbt', # Drosophila
                  'NCBITaxon:7955':'ZFA', # Zebrafish
                  'other':'UBERON' # For all other organisms, use UBERON
                 }
    }
    ontology_parser = OntologyParser()
    ont_err_lst = []

    for col in col_ont_map:
        map_dict = {}
        for label in sample_df[col].unique():
            term_id = None
            # Reset for every label so an organism looked up for an earlier label or
            # column can never leak into this label's error reporting
            org_term_id = None
            if col == 'disease' and label == 'normal': # Normal is not in MONDO ontology
                term_id = 'PATO:0000461'
            elif label in ['unknown','na']: # Unknown and na won't be in ontologies, pass along
                map_dict[label] = label
                continue
            elif col in ['tissue','development_stage','self_reported_ethnicity']:
                if col == 'tissue':
                    # Find what tissue type is at label row
                    if sample_df.loc[sample_df[col] == label, 'tissue_type'].tolist()[0] != 'tissue':
                        map_dict[label] = label # Don't map cell type in tissue
                        continue
                # Find what organism term id is at label row
                org_term_id = sample_df.loc[sample_df[col] == label, 'organism_ontology_term_id'].tolist()[0]
                if org_term_id in col_ont_map[col]:
                    # Get ontology of specific organism
                    ontology = col_ont_map[col][org_term_id]
                else:
                    ontology = col_ont_map[col]['other']
                    if ontology == 'none':
                        # No ontology applies to this organism, pass the label along
                        map_dict[label] = label
                        continue
                term_id = ontology_parser.get_term_id_by_label(label, ontology)
            else:
                ontology = col_ont_map[col]
                term_id = ontology_parser.get_term_id_by_label(label, ontology)
            if term_id is None:
                print(f'{label}\t{col}\t{term_id}\t{org_term_id}')
                ont_err_lst.append(f"Error: Matching '{ontology}' term id not found for label '{label}' in column '{col}'")
                # Keep the raw label so later columns can still be processed and all errors are collected
                map_dict[label] = label
                continue
            map_dict[label] = term_id
        sample_df[col + '_ontology_term_id'] = sample_df[col].map(map_dict)
        del sample_df[col]

    ### Print out any errors from ontologizing
    if ont_err_lst:
        for e in ont_err_lst:
            print(e)
        sys.exit()

    ### Convert string to boolean for is_pilot_data and donor_living_at_sample_collection
    ### Check that donor_living_at_sample_collection is not filled out for non-human
    b_type = ['is_pilot_data','donor_living_at_sample_collection']
    for c in b_type:
        if c in sample_df.columns:
            if c == 'donor_living_at_sample_collection':
                for val in sample_df[c].unique():
                    if val != 'na' and sample_df.loc[sample_df[c] == val,
                    'organism_ontology_term_id'].tolist()[0] != 'NCBITaxon:9606':
                        print(f"ERROR: donor_living_at_sample_collection for non-human data should be 'na' but '{val}' is present")
                        sys.exit()
            # Assign the converted column back (a bare comparison here would discard the result)
            sample_df[c] = sample_df[c].replace({'FALSE':False, 'TRUE':True})

    ### Blank fields in worksheet result in NaN values in dataframe, replacing these with na?
    ### Could also replace with unknown for certain columns using fillna options?
    sample_df.fillna('na', inplace=True)
    sample_df.drop(columns=[c for c in sample_df.columns if c.startswith('!')], inplace=True)

    return sample_df


In [5]:
# Read in the h5ad file of every entry in `pseudosamples` from `down_dir`,
# then drop cells without a usable scale hash assignment and add
# obs['assigned_hash_index'] (see assign_oligo).

adatas = []
for pseudosample in pseudosamples:
    print(f'Reading in {pseudosample}')
    adata = sc.read_h5ad(f'{down_dir}/{pseudosample}_anndata.h5ad')
    adata = assign_oligo(adata, f'{down_dir}/{pseudosample}.merged.allCells.csv')
    adatas.append(adata)

Reading in R109-25-0422
Reading in R109-25-0423
Reading in R109-25-0424
Reading in R109-25-0425
Reading in R109-25-0426
Reading in R109-25-0427
Reading in R109-25-0428
Reading in R109-25-0429
Reading in R109-25-0430
Reading in R109-25-0431
Reading in R109-25-0432
Reading in R109-25-0433
Reading in R109-25-0434


In [7]:
# For each adata, determine which sample (scale_sample) it is for, then subset sample_df for:
# - the corresponding sample (lab_sample)
# - the experiment name of the SeqSuite run
# Then merge on hash_index and assigned_hash_index
# NOTE(review): sample_df is not assigned in any cell visible in this notebook -
# presumably the output of map_ontologies() on the Lattice sample metadata; confirm
# and define it explicitly before this cell.

for adata in adatas:
    # Assumes each adata holds a single Scale sample, so the first unique value identifies it
    scale_sample = adata.obs['sample'].unique()[0]
    print(f'Curating:\t{scale_sample}')
    lab_sample = sample_map[scale_sample]
    sample_df_subset = sample_df[(sample_df['pseudosample']==lab_sample) & (sample_df['experiment_name']== experiment_name)]
    # The column-on-column merge discards the obs index, so the original cell index is restored afterwards.
    # how='left' keeps cells whose hash has no metadata row; their metadata columns are NaN.
    adata.obs = pd.merge(adata.obs, sample_df_subset, left_on='assigned_hash_index', right_on='hash_index', how='left').set_index(adata.obs.index)
    adata.obs['is_primary_data'] = True

    # Set uns metadata
    # organism_ontology_term_id is moved from obs to uns (first unique value only)
    adata.uns['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].unique()[0]
    adata.obs.drop(columns=['organism_ontology_term_id'], inplace=True)
    prefix = f'{adata.obs["experiment_name"].unique()[0]}__{adata.obs["sample"].unique()[0]}'
    adata.uns['title'] = prefix

    # Write to file
    # One gzip-compressed h5ad per sample, named <experiment_name>__<sample>.h5ad
    out = 'curated_matrices'
    if os.path.exists(out) == False:
        os.mkdir(out)
    adata.write(f'{out}/{prefix}.h5ad', compression="gzip")

Curating:	R096-25-0255.QSR-8
Curating:	R096-25-0256.QSR-8
Curating:	R096-25-0257.QSR-8
Curating:	R096-25-0258.QSR-8


In [8]:
# Validate the obs of the first curated matrix with the lattice-tools helpers.
# NOTE(review): sys is already imported in the first cell; the path below is an
# absolute, machine-specific location, and the star import hides where
# evaluate_obs_schema comes from - consider an explicit import in the top cell.
import sys
sys.path.insert(1, '/home/jovyan/lattice-tools/cellxgene_resources')
from cellxgene_mods import *
evaluate_obs_schema(adatas[0].obs, labels=False)

assay_ontology_term_id ['EFO:0022490', nan]

[1m[31mERROR: cell_type_ontology_term_id not in obs
[0m
development_stage_ontology_term_id ['ZFS:0000035', nan]

disease_ontology_term_id ['PATO:0000461', nan]

self_reported_ethnicity_ontology_term_id ['na', nan]

sex_ontology_term_id ['unknown', nan]

tissue_ontology_term_id ['ZFA:0000103', nan]

donor_id ['CHEM13_SCALE_E7_Bl1', 'CHEM13_SCALE_D8_Bl1', 'CHEM13_SCALE_G10_Bl1', 'CHEM13_SCALE_H1_Bl1', 'CHEM13_SCALE_A3_Bl1', 'CHEM13_SCALE_E9_Bl1', 'CHEM13_SCALE_H8_Bl1', 'CHEM13_SCALE_H3_Bl1', 'CHEM13_SCALE_B7_Bl1', 'CHEM13_SCALE_F2_Bl1', 'CHEM13_SCALE_D2_Bl1', 'CHEM13_SCALE_D3_Bl1', 'CHEM13_SCALE_C10_Bl1', 'CHEM13_SCALE_D1_Bl1', 'CHEM13_SCALE_E1_Bl1', 'CHEM13_SCALE_E8_Bl1', 'CHEM13_SCALE_A2_Bl1', 'CHEM13_SCALE_G8_Bl1', 'CHEM13_SCALE_F5_Bl1', 'CHEM13_SCALE_H10_Bl1', 'CHEM13_SCALE_B9_Bl1', 'CHEM13_SCALE_E6_Bl1', 'CHEM13_SCALE_G4_Bl1', 'CHEM13_SCALE_F3_Bl1', 'CHEM13_SCALE_C4_Bl1', 'CHEM13_SCALE_A1_Bl1', 'CHEM13_SCALE_D7_Bl1', 'CHEM13_SCALE_A4_B