In [None]:
import boto3
import numpy as np
import os
import pandas as pd
import scanpy as sc
import tarfile
import requests
import re
import json
import argparse
import sys
import fsspec
import anndata as ad

from io import BytesIO
from dataclasses import dataclass, field
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
from cellxgene_schema.write_labels import AnnDataLabelAppender
from urllib.parse import quote
from cellxgene_ontology_guide.supported_versions import CXGSchema, load_supported_versions
from cellxgene_ontology_guide.ontology_parser import OntologyParser
ontology_parser = OntologyParser()
fs = fsspec.filesystem('s3')

In [None]:
# Instantiate URIPath and read the metadata spreadsheet
# NOTE(review): URIPath, LatticeMetadata, is_single, SingleSublibrary, MultiSublibrary,
# add_sample_metadata and cxg_add_labels are defined in a later cell of this notebook,
# so that cell has to be executed before this one on a fresh kernel.
uri = 's3://czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/'
uri_obj = URIPath(full_uri=uri)
# NOTE(review): sheet_id is an empty string here - presumably a placeholder to fill in before running
sample_metadata = LatticeMetadata(sheet_id='', tab_name='sample template', uri=uri_obj)
# Bare expression: it is not the last statement of the cell, so nothing is displayed for it
sample_metadata.metadata_df

# Create experiment object (single- vs multi-sublibrary is decided from the files listed on S3)
if is_single(uri_obj):
    experiment_04 = SingleSublibrary(metadata=sample_metadata, uri=uri_obj)
else:
    experiment_04 = MultiSublibrary(metadata=sample_metadata, uri=uri_obj)

# Download files, map pseudosample and sample ids, and create pseudosamples for experiment
# Order matters: get_sample_ids_map reads the samples.csv that download_files fetches,
# and add_pseudosamples iterates over the sample_map that get_sample_ids_map fills in
experiment_04.download_files()
experiment_04.get_sample_ids_map()
experiment_04.add_pseudosamples()

# Add sample level metrics and ontology labels
for pseudos in experiment_04.all_pseudosamples:
    add_sample_metadata(pseudos)
    cxg_add_labels(pseudos.adata)

In [None]:
# Write to file: one gzip-compressed '<sample_id>_curated.h5ad' per pseudosample, inside DOWN_DIR
# NOTE(review): DOWN_DIR is assigned in a later cell of this notebook; that cell must have been run first
for pseudos in experiment_04.all_pseudosamples:
    pseudos.adata.write(f'{DOWN_DIR}/{pseudos.sample_id}_curated.h5ad', compression='gzip')

In [None]:
# Check the obs table of one pseudosample with evaluate_obs_schema from cellxgene_mods
# NOTE(review): the sys.path entry is an absolute, machine-specific path, and the star import
# hides which names cellxgene_mods brings into the notebook namespace
import sys
sys.path.insert(1, '/home/jovyan/lattice-tools/cellxgene_resources')
from cellxgene_mods import *
# Index 3 is hard-coded: this raises IndexError when the experiment has fewer than four pseudosamples
evaluate_obs_schema(experiment_04.all_pseudosamples[3].adata.obs, labels=False)

In [None]:
# Local directory used by this notebook for downloaded S3 files and for the curated h5ad output
DOWN_DIR = 'temp_plate'

@dataclass
class URIPath:
    '''
    Location of a single SeqSuite run on S3, broken into the pieces the rest of the notebook uses.
    full_uri has to be the complete URI, bucket name included, of the SeqSuite rundate output files.
    Ex: s3://czi-novogene/trapnell-seahub-bcp/NVUS2024101701-04/RNA3_096/processed/Run_2025-12-15_biohub/

    Attributes filled in by __post_init__:
        bucket_name: third '/'-separated component of full_uri
        prefix: everything after the bucket name, ending in '/'
        sample_files: listing of '<prefix>samples/' plus the run's samples.csv and
            reports/allSamples.reportStatistics.csv, all without the 's3://' scheme
        experiment: sixth '/'-separated component of full_uri
    '''

    full_uri: str

    def __post_init__(self):
        # A trailing slash is required so the prefix can be concatenated with file names below
        if not self.full_uri.endswith('/'):
            self.full_uri += '/'
        uri_parts = self.full_uri.split('/')
        self.bucket_name: str = uri_parts[2]
        self.prefix: str = '/'.join(uri_parts[3:])
        run_root = f'{self.bucket_name}/{self.prefix}'
        self.sample_files = fs.ls(f's3://{run_root}samples/')
        # These two run-level files live outside samples/, so they are added by hand
        self.sample_files.extend([
            f'{run_root}samples.csv',
            f'{run_root}reports/allSamples.reportStatistics.csv',
        ])
        self.experiment = uri_parts[5]


@dataclass
class LatticeMetadata:
    '''
    Dataclass holding one tab of a Lattice metadata Google sheet.
    On construction the tab name is resolved to its gid and the tab is read into
    metadata_df, restricted to the rows of the experiment named by uri.experiment.

    sheet_id: id of the Google sheet (the part of the sheet URL after '/d/')
    tab_name: name of the tab to read
    uri: URIPath of the run; only uri.experiment is used here
    '''
    
    sheet_id: str
    tab_name: str
    uri: URIPath

    def get_gid(self):
        '''
        Given sheet id and tab name, return gid

        Raises KeyError when no tab called tab_name is found in the sheet
        '''
        sheet_url = f'https://docs.google.com/spreadsheets/d/{self.sheet_id}'
        req = Request(sheet_url, headers={'User-Agent' : "Magic Browser"})
        # The context manager closes the HTTP response even when parsing fails
        with urlopen(req) as response:
            soup = BeautifulSoup(response, 'html.parser')
        tab_ids = {}
        pattern = re.compile('var bootstrapData = (.*?)};')
        for script in soup.find_all('script'):
            match = pattern.search(str(script))
            if not match:
                continue
            # The lazy group stops just before the closing '};', so the brace is put back for json
            data = json.loads(match.group(1) + '}')
            for t in data['changes']['topsnapshot']:
                u = t[1].split('"')
                if len(u) > 5:
                    tab_ids[u[5]] = u[1]
        if self.tab_name not in tab_ids:
            raise KeyError(f"tab '{self.tab_name}' not found in sheet '{self.sheet_id}'; tabs found: {sorted(tab_ids)}")
        return tab_ids[self.tab_name]

    def get_metadata_df(self):
        '''
        Given sheet id and gid, return lattice metadata in a dataframe, subset by experiment name

        All values are read as strings, lines starting with '#' are skipped and
        columns that are entirely empty are dropped
        '''
        url = f'https://docs.google.com/spreadsheets/d/{self.sheet_id}/export?format=csv&gid={self.gid}'
        response = requests.get(url)
        # Fail here on an HTTP error instead of trying to parse an error page as CSV
        response.raise_for_status()
        sample_df = pd.read_csv(BytesIO(response.content), comment="#", dtype=str).dropna(axis=1,how='all')
        sample_df = sample_df[sample_df['experiment_name'] == self.uri.experiment]
        return sample_df

    def __post_init__(self):
        # gid has to be resolved first, get_metadata_df builds its export URL from it
        self.gid = self.get_gid()
        self.metadata_df = self.get_metadata_df()


@dataclass
class Pseudosample:
    '''
    The pooled pseudosample entity for which there is a single h5ad file.
    This object tracks the associated metadata and the AnnData object (self.adata).

    local_h5ad: local path of the h5ad file to load
    pseudosample_id: pseudosample name from the metadata spreadsheet
    sample_id: matching sample name from samples.csv
    allcells: local path of the *allCells.csv file, or None when it is not needed
    metadata_sample: metadata rows of this pseudosample, needs a 'hash_index' column
    parent: the Experiment this pseudosample belongs to
    '''
    local_h5ad: str
    pseudosample_id: str
    sample_id: str
    allcells: 'str | None'
    metadata_sample: pd.DataFrame
    parent: 'Experiment'

    def load_h5ad(self):
        '''
        Load h5ad as AnnData object using local_h5ad variable, set uns variables
        '''
        adata = ad.read_h5ad(self.local_h5ad)
        adata.uns['title'] = f'{self.parent.uri.experiment}_{self.pseudosample_id}_{self.sample_id}'
        return adata

    def assign_oligo(self):
        '''
        For SingleSublibrary experiments, the observations need to have the hash oligo mapped using the *allCells.csv file.
        Adds 'passing_scaleplex' and 'assigned_scaleplex' to obs, joined on the 'cell_id' column of the csv.

        Raises ValueError when no allCells.csv path was given
        '''
        if self.allcells is None:
            raise ValueError(f'no allCells.csv file available for {self.pseudosample_id} {self.sample_id}')
        hash_df = pd.read_csv(self.allcells, index_col = 'cell_id')
        self.adata.obs = pd.merge(self.adata.obs, hash_df[['passing_scaleplex', 'assigned_scaleplex']], left_index=True, right_index=True, how='left')
        self.adata.obs['assigned_scaleplex'] = self.adata.obs['assigned_scaleplex'].astype('string')
    
    def drop_unassigned(self):
        '''
        Drop cells that have one of the following values as "assigned_scaleplex" in obs:
         - Indeterminate
         - Max_Fail
         - Unexpected
        Cells with a missing "assigned_scaleplex" are kept by this step.
        '''
        drop_values = ['Indeterminate','Max_Fail','Unexpected']
        # One copy of the kept cells is enough, no need to copy the full object beforehand
        self.adata = self.adata[~self.adata.obs['assigned_scaleplex'].isin(drop_values)].copy()

    def rename_cols(self):
        '''
        For the cell level metrics, prepend 'cell_' to field name, as there will be sample level metrics
        '''
        metrics_list = [
            'counts',
            'genes',
            'totalReads',
            'countedReads',
            'mappedReads',
            'geneReads',
            'exonReads',
            'intronReads',
            'antisenseReads',
            'mitoReads',
            'countedMultiGeneReads',
            'Saturation',
            'mitoProp'
        ]
        # Single rename call; metrics that are absent from obs are ignored
        self.adata.obs.rename(columns = {col: f'cell_{col}' for col in metrics_list}, inplace = True)
    
    def map_metadata(self):
        '''
        Add additional metadata to obs from spreadsheet
        Cells whose hash index is not on the spreadsheet for this pseudosample, or that have
        no hash assignment at all, are dropped and a message listing the unmatched values is printed
        Example variables:
            assigned_scaleplex: 10C
            assigned_hash_index: SCALE-C10 (converted from assigned_scaleplex)
            hash_index: SCALE-C10

        Raises ValueError when a hash_index occurs more than once in metadata_sample
        '''
        self.adata.obs['assigned_hash_index'] = self.adata.obs['assigned_scaleplex'].str.replace(r'(\d+)([a-zA-Z])', r'SCALE-\2\1', regex=True).astype('string')
        known_hashes = self.metadata_sample['hash_index'].dropna().unique().tolist()
        # Vectorised membership test: missing assignments simply count as "not found"
        hash_col = self.adata.obs['assigned_hash_index']
        is_known = hash_col.isin(known_hashes)
        if not is_known.all():
            not_found = hash_col[~is_known].unique().tolist()
            print(f'ERROR: no hash_index found for {self.pseudosample_id} {self.sample_id}: {not_found} ({int((~is_known).sum())} cells dropped)')
        # Copy so obs is replaced on a real object rather than on a view
        self.adata = self.adata[is_known].copy()
        obs_index = self.adata.obs.index
        merged = pd.merge(self.adata.obs, self.metadata_sample, left_on='assigned_hash_index', right_on='hash_index', how='left')
        if len(merged) != len(obs_index):
            raise ValueError(f'duplicated hash_index in metadata for {self.pseudosample_id} {self.sample_id}')
        # merge resets the index, so the cell ids are put back before obs is replaced
        self.adata.obs = merged.set_index(obs_index).drop(columns=['assigned_hash_index'])
    

    def __post_init__(self):
        self.adata = self.load_h5ad()
        # Only single sublibrary runs need the hash assignment pulled in from allCells.csv
        if isinstance(self.parent, SingleSublibrary):
            self.assign_oligo()
        self.drop_unassigned()
        self.rename_cols()
        self.map_metadata()


@dataclass
class Experiment:
    '''
    Dataclass to hold various aspects of a single experiment, including
    the pseudosamples and corresponding sample names found in SeqSuite run.

    metadata: LatticeMetadata whose metadata_df has 'RT_index' and 'pseudosample' columns
    uri: URIPath of the SeqSuite run
    all_pseudosamples: Pseudosample objects, filled by add_pseudosamples
    sample_map: pseudosample id -> sample id, filled by get_sample_ids_map
    file_dict: S3 URI -> local path of the files to download; the subclasses fill this in
    '''
    metadata: LatticeMetadata
    uri: URIPath
    all_pseudosamples: list[Pseudosample] = field(init=False, default_factory=list)
    sample_map: dict[str, str] = field(init=False, default_factory=dict)

    def expand_ranges(self, value):
        '''
        Given a value found in 'barcodes' column in samples.csv, return the expanded form, where no letters are skipped (ex: 1A-1G)
        Entries are separated by ';'. Non-string values (such as NaN) are returned unchanged,
        and a range that cannot be parsed is kept as it is rather than dropped.
        '''
        if not isinstance(value, str):
            return value
        expanded = []
        for sub_string in value.split(';'):
            if '-' not in sub_string:
                expanded.append(sub_string)
                continue
            # Match the prefix and the range of letters (e.g., 1A-1C)
            # Group 1: Number prefix (e.g., '1')
            # Group 2: Start letter (e.g., 'A')
            # Group 3: Optional number before the end letter, not used
            # Group 4: End letter (e.g., 'C')
            match = re.match(r'(\d+)([A-Z])-(\d+)?([A-Z])', sub_string)
            if match:
                prefix, start_char, _, end_char = match.groups()
                # Iterate through the ASCII values of the characters
                for char_code in range(ord(start_char), ord(end_char) + 1):
                    expanded.append(f"{prefix}{chr(char_code)}")
            else:
                # Keeping the raw text makes the mismatch show up in get_sample_ids_map
                expanded.append(sub_string)
        return ';'.join(expanded)

    
    def get_sample_ids_map(self):
        '''
        Given a lattice metadata and a samples.csv file, map the pseudosample and sample ids using 'RT_index' and 'barcodes' columns, respectively.
        Need to split metadata by '-' and then swap plate row/column, and then also expand sample csv if there is shorthand (ex: 1A-1G,8A-8G)
        Reads samples.csv from DOWN_DIR, so download_files has to be called first. The result is stored in self.sample_map.
        '''
        samples_csv_df = pd.read_csv(f'{DOWN_DIR}/samples.csv', index_col='sample')
        metadata_df = self.metadata.metadata_df[['RT_index','pseudosample']].drop_duplicates().set_index('pseudosample')
        
        # After reading in csv and metadata df, parse barcode/RT_index so that they are comparable and store in dictionary
        metadata_df['RT_transformed'] =  metadata_df['RT_index'].str.replace(r'[A-Za-z0-9]+-([A-H])([0-9]+)', r'\2\1', regex=True)
        metadata_df['RT_transformed'] = metadata_df['RT_transformed'].apply(lambda x: ';'.join(sorted(x.split(','))) if isinstance(x, str) else x)
        metadata_map = metadata_df['RT_transformed'].to_dict()
        samples_csv_df['barcodes_transformed'] = samples_csv_df['barcodes'].apply(self.expand_ranges)
        samples_csv_df['barcodes_transformed'] = samples_csv_df['barcodes_transformed'].apply(lambda x: ';'.join(sorted(x.split(';'))) if isinstance(x, str) else x)
        samples_csv_map = samples_csv_df['barcodes_transformed'].to_dict()

        # Compare index values and return mapping of sample IDs
        # This also checks that the samples.csv file is matching the metadata spreadsheet for wells
        sample_map = {}
        for pseudosample, value in metadata_map.items():
            sample_ids = [k for k, v in samples_csv_map.items() if v==value]
            if (len(sample_ids)>1):
                print(f'ERROR: multimapping indices\t{pseudosample}\t{value}')
            elif (len(sample_ids)==0):
                print(f'ERROR: indices not matching\t{pseudosample}\t{value}')
            else:
                sample_map[pseudosample] = sample_ids[0]
        self.sample_map = sample_map

    
    def download_files(self):
        '''
        Download all files needed for this experiment
        Add samples.csv and allSamples.reportStatistics.csv to download, as these are needed for all experiments
        Files that already exist locally are not downloaded again.
        '''
        os.makedirs(DOWN_DIR, exist_ok=True)
        self.file_dict[f'{self.uri.full_uri}samples.csv'] = f'{DOWN_DIR}/samples.csv'
        self.file_dict[f'{self.uri.full_uri}reports/allSamples.reportStatistics.csv'] = f'{DOWN_DIR}/allSamples.reportStatistics.csv'
        for s3_uri, local_path in self.file_dict.items():
            if not os.path.exists(local_path):
                print(f'Downloading from {s3_uri} to {local_path}')
                fs.get(s3_uri, local_path)


    def add_pseudosamples(self):
        '''
        Create all pseudosample object given sample_map and determine metadata for each.
        Only load allcells for SingleSublibrary object

        Raises FileNotFoundError when no h5ad file is known for a sample
        '''
        for pseudo_id, samp_id in self.sample_map.items():
            # First subset metadata df and map ontologies
            print(f'Loading {samp_id}')
            # Copy the slice: map_ontologies edits the frame it is given
            metadata_df_subset = self.metadata.metadata_df[self.metadata.metadata_df['pseudosample']==pseudo_id].copy()
            metadata_df_subset = map_ontologies(metadata_df_subset)
            h5ad_path = find_file(self.file_dict, samp_id, 'h5ad')
            if h5ad_path is None:
                raise FileNotFoundError(f'no h5ad file found for sample {samp_id} (pseudosample {pseudo_id})')
            allcells_val = find_file(self.file_dict, samp_id, 'allCells.csv') if isinstance(self, SingleSublibrary) else None
            
            pseudosample = Pseudosample(
                local_h5ad = h5ad_path,
                pseudosample_id = pseudo_id,
                sample_id = samp_id,
                allcells = allcells_val,
                metadata_sample = metadata_df_subset,
                parent = self
            )
            self.all_pseudosamples.append(pseudosample)
        
    def __post_init__(self):
        # sample_map cannot be computed yet (samples.csv is only available after download_files),
        # so the base class just starts with an empty download list; subclasses replace it
        self.file_dict: dict[str, str] = {}


class SingleSublibrary(Experiment):
    '''
    Experiment made of one sublibrary. Besides the anndata h5ad files, the
    *allCells.csv files of the run are downloaded as well.
    '''

    def determine_files(self):
        '''
        Build the download list for this experiment.
        Returns a dictionary where key is the file listed for the run and value is its local path in DOWN_DIR
        '''
        wanted_suffixes = ('allCells.csv','_anndata.h5ad')
        return {
            remote: os.path.join(DOWN_DIR, remote.split('/')[-1])
            for remote in self.uri.sample_files
            if remote.endswith(wanted_suffixes)
        }

    def __post_init__(self):
        self.file_dict = self.determine_files()


class MultiSublibrary(Experiment):
    '''
    Experiment made of several sublibraries. Only anndata h5ad files without
    'QSR' in their path are downloaded.
    '''

    def determine_files(self):
        '''
        Build the download list for this experiment.
        Returns a dictionary where key is the file listed for the run and value is its local path in DOWN_DIR
        '''
        file_dict = {}
        for remote in self.uri.sample_files:
            if not remote.endswith('anndata.h5ad'):
                continue
            # Paths containing 'QSR' are left out
            if re.search('QSR', remote):
                continue
            file_dict[remote] = os.path.join(DOWN_DIR, remote.split('/')[-1])
        return file_dict

    def __post_init__(self):
        self.file_dict = self.determine_files()


def is_single(uripath_obj):
    '''
    Tell whether the run behind an URI object is a single sublibrary experiment.
    The 'QSR-#' part (# from 1 to 8) is collected from every '.QSR-#_anndata.h5ad'
    name in uripath_obj.sample_files; the answer is True when exactly one distinct value is seen.
    '''
    pattern = r'\.(QSR-[1-8])_anndata.h5ad'

    seen = set()
    for path in uripath_obj.sample_files:
        hit = re.search(pattern, path)
        if hit:
            seen.add(hit.group(1))
    return len(seen) == 1


def find_file(file_dict, prefix, suffix):
    '''
    Given prefix and suffix, return file in file_dict

    file_dict: dictionary whose values are local file paths
    prefix: start of the file name (last path component), typically a sample id
    suffix: end of the path

    Returns the first path whose name starts with prefix and whose path ends with suffix, or None.
    When several paths qualify, one where the prefix is followed by a separator (or nothing) is
    preferred, so that sample 'S1' does not pick up the file of sample 'S10'.
    '''
    candidates = [
        path for path in file_dict.values()
        if path.split('/')[-1].startswith(prefix) and path.endswith(suffix)
    ]
    for path in candidates:
        remainder = path.split('/')[-1][len(prefix):]
        # A letter or digit right after the prefix means a longer id that merely shares the prefix
        if not remainder or not remainder[0].isalnum():
            return path
    return candidates[0] if candidates else None


def add_sample_metadata(pseudosamp):
    '''
    Given a Pseudosample object, add the sample level metadata from allSamples.reportStatistics.csv

    The csv is read from DOWN_DIR. The column used is the one named exactly like the sample id,
    or otherwise the first one whose name starts with it. Rows of the 'Reads', 'Cells' and
    'Scaleplex' categories are kept, and each becomes an obs column named
    '<Category>_<metric in lower case with underscores>' holding the same value for every cell.
    The obs index is left as it was. A message is printed when no column matches the sample.
    '''
    samples_df = pd.read_csv(f'{DOWN_DIR}/allSamples.reportStatistics.csv')
    desired_rows = [
        'Reads',
        'Cells',
        'Scaleplex'
    ]
    sample_id = pseudosamp.sample_id
    basenames = {c: c.split('/')[-1] for c in samples_df.columns}
    # An exact name wins, so sample 'S1' is not matched to an earlier 'S10' column
    col = next((c for c, base in basenames.items() if base == sample_id), None)
    if col is None:
        col = next((c for c, base in basenames.items() if base.startswith(sample_id)), None)
    if not col:
        print(f'ERROR: cannot find sample metadata for\t{pseudosamp.sample_id}')
        return

    # Copy the filtered rows before adding a column to them
    metrics = samples_df[samples_df['Category'].isin(desired_rows)].copy()
    metrics['new_index'] = metrics['Category']+'_'+metrics['Metric'].str.lower().str.replace(' ', '_')
    metrics = metrics.set_index('new_index')[[col]]

    obs = pseudosamp.adata.obs
    # Cross join against the single transposed row repeats the sample metrics for every cell
    merged = obs.merge(metrics.T, how='cross')
    # merge resets the index; put the cell ids back without renaming the index
    merged.index = obs.index
    pseudosamp.adata.obs = merged


def cxg_add_labels(adata):
    '''
    Add ontology labels to adata.obs, and also fill in standard metadata fields in obs

    Works in place on adata and returns nothing:
     - uns['organism_ontology_term_id'] is set to the first unique value of the obs column
       of the same name, and that obs column is then dropped
     - obs['is_primary_data'] is set to True for every cell
     - uns['schema_version'] and uns['schema_reference'] are taken from the labeler

    NOTE(review): calling this twice on the same object raises KeyError, because the obs
    column 'organism_ontology_term_id' is gone after the first call.
    '''
    # Only the first unique value is used; nothing here checks that there is just one organism
    adata.uns['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].unique()[0]
    adata.obs.drop(columns=['organism_ontology_term_id'],inplace=True)
    adata.obs['is_primary_data'] = True
    
    # Placeholder cell type; this column and 'cell_type' are removed again after labelling
    # (presumably the labeler needs the term id column to be present - TODO confirm)
    adata.obs['cell_type_ontology_term_id'] = 'unknown'
    labeler = AnnDataLabelAppender(adata)
    # NOTE(review): _add_labels and _build_schema_reference_url are private methods of
    # cellxgene_schema's AnnDataLabelAppender and may change between releases
    labeler._add_labels()
    adata.obs.drop(columns=['cell_type_ontology_term_id','cell_type'],inplace=True)

    schema_v = labeler.schema_version
    adata.uns['schema_version'] = schema_v
    adata.uns['schema_reference'] = labeler._build_schema_reference_url(schema_v)


def add_guide_metadata(adata, sheet, guide_gid):
    '''
    Add guide metadata into adata.uns from Lattice wrangling sheet

    :param obj adata: the anndata object that is being transformed into the curated matrix
    :param str sheet: id of the Google sheet holding the guide metadata
    :param str guide_gid: gid of the tab with the guide metadata

    :returns obj adata: modified adata to contain guide metadata in uns['genetic_perturbations']

    :raises ValueError: when a guide has fewer overlapping gene names than overlapping gene ids
    '''
    url = f'https://docs.google.com/spreadsheets/d/{sheet}/export?format=csv&gid={guide_gid}'
    response = requests.get(url)
    # Fail here on an HTTP error instead of trying to parse an error page as CSV
    response.raise_for_status()
    guide_df = pd.read_csv(BytesIO(response.content), comment="#", dtype=str)
    genetic_perturbations = {}

    for row in guide_df.itertuples():
        guide = {
            'role': 'targeting' if row.guide_role == 'Targeting a Gene' else 'control',
            'protospacer_sequence': row.guide_protospacer,
            'protospacer_adjacent_motif': row.guide_PAM,
        }

        # A region is only written when every coordinate is present, so 'nan' never ends up in it
        chromosome = getattr(row, 'chromosome', None)
        coords_missing = pd.isna([chromosome, row.start, row.end, row.strand])
        if not coords_missing.any():
            chr_loc = str(chromosome).replace("chr","") + ":" + str(row.start) + "-" + str(row.end) + "(" + str(row.strand) + ")"
            guide['target_genomic_regions'] = [chr_loc]
        elif not coords_missing.all():
            print(f'ERROR: incomplete genomic coordinates for guide\t{row.guide_id}')

        if not pd.isna(row.overlapping_gene_ids):
            gene_ids = row.overlapping_gene_ids.split(";")
            gene_names = [] if pd.isna(row.overlapping_gene_names) else row.overlapping_gene_names.split(";")
            if len(gene_names) < len(gene_ids):
                raise ValueError(f'guide {row.guide_id}: {len(gene_ids)} overlapping gene ids but {len(gene_names)} gene names')
            # Ids and names are paired by position
            guide['target_features'] = dict(zip(gene_ids, gene_names))

        genetic_perturbations[row.guide_id] = guide

    adata.uns['genetic_perturbations'] = genetic_perturbations

    return adata


def determine_perturbation_strategy(adata):
    '''
    Fill obs['genetic_perturbation_strategy'] for cells without targeting guides.

    obs['genetic_perturbation_id'] holds the guide ids of a cell separated by ' || '.
    The role of each guide is looked up in uns['genetic_perturbations']:
        - no guide ids at all: 'no perturbations'
        - guide ids present and none of them is 'targeting': 'control'
        - at least one targeting guide: the value is left untouched by this function

    :param obj adata: the anndata object that is being transformed into the curated matrix

    :returns obj adata: modified adata to contain perturbation_strategy as cell metadata
    '''
    scratch_col = 'genetic_perturbation_strategy_calculated'

    def split_ids(value):
        # Missing ids mean the cell carries no guide
        return value.split(' || ') if pd.notna(value) else 'no perturbations'

    def ids_to_roles(value):
        if isinstance(value, list):
            return [adata.uns['genetic_perturbations'][guide]['role'] for guide in value]
        return value

    def collapse_controls(value):
        if isinstance(value, list) and 'targeting' not in set(value):
            return 'control'
        return value

    # The scratch column is transformed step by step and removed again at the end
    adata.obs[scratch_col] = adata.obs['genetic_perturbation_id']
    for step in (split_ids, ids_to_roles, collapse_controls):
        adata.obs[scratch_col] = adata.obs[scratch_col].apply(step)

    for outcome in ('control', 'no perturbations'):
        adata.obs.loc[adata.obs[scratch_col]==outcome, 'genetic_perturbation_strategy'] = outcome
    adata.obs.drop(columns=[scratch_col], inplace=True)

    return adata



def map_ontologies(sample_df):
    '''
    Takes the sample metadata dataframe and standardizes ontologies
    Also checks that standard fields are only filled out for appropriate organism

    Each mapped column '<col>' is replaced by '<col>_ontology_term_id'. The input frame is
    not modified, the work is done on a copy. Blank cells are left alone during mapping and
    end up as 'na'. Columns whose name starts with '!' are dropped.

    :param dataframe sample_df: the sample metadata from given google sheet

    :returns dataframe sample_df: sample metadata with ontologies added

    Exits (SystemExit) after printing the problems when a label has no matching term id, or
    when donor_living_at_sample_collection is filled out for a non-human organism
    '''
    col_ont_map = {
        'organism':'NCBITaxon',
        'sex':'PATO',
        'self_reported_ethnicity':{'NCBITaxon:9606':'HANCESTRO',
                                   'other':'none'},
        'disease':'MONDO',
        'assay':'EFO',
        'development_stage':{'NCBITaxon:6239':'WBls', # C. Elegans
                             'NCBITaxon:7227':'FBdv', # Drosophila
                             'NCBITaxon:10090':'MmusDv', # Mouse
                             'NCBITaxon:7955':'ZFS', # Zebrafish
                             'other':'HsapDv' # For all other organisms, use HsapDv
                            },
        'tissue':{'NCBITaxon:6239':'WBbt', # C. Elegans
                  'NCBITaxon:7227':'FBbt', # Drosophila
                  'NCBITaxon:7955':'ZFA', # Zebrafish
                  'other':'UBERON' # For all other organisms, use UBERON
                 }
    }
    organism_specific = ['tissue','development_stage','self_reported_ethnicity']
    ontology_parser = OntologyParser()
    ont_err_lst = []
    # Callers often pass a slice of the full metadata sheet; never edit their frame
    sample_df = sample_df.copy()

    # 'organism' comes first in col_ont_map, so organism_ontology_term_id exists
    # by the time the organism specific columns are handled
    for col in col_ont_map:
        map_dict = {}
        for label in sample_df[col].unique():
            if pd.isna(label): # Blank cell, becomes 'na' at the end
                continue
            # Reset per label so a value from an earlier label or column is never reused
            org_term_id = None
            ontology = col_ont_map[col]
            if col == 'disease' and label == 'normal': # Normal is not in MONDO ontology
                map_dict[label] = 'PATO:0000461'
                continue
            if label in ['unknown','na']: # Unknown and na won't be in ontologies, pass along
                map_dict[label] = label
                continue
            if col in organism_specific:
                label_rows = sample_df[col] == label
                # Find what tissue type is at label row
                if col == 'tissue' and sample_df.loc[label_rows, 'tissue_type'].tolist()[0] != 'tissue':
                    map_dict[label] = label # Don't map cell type in tissue
                    continue
                # Find what organism term id is at label row, then the ontology for that organism
                org_term_id = sample_df.loc[label_rows, 'organism_ontology_term_id'].tolist()[0]
                ontology = col_ont_map[col].get(org_term_id, col_ont_map[col]['other'])
                if ontology == 'none': # No ontology for this organism, pass along
                    map_dict[label] = label
                    continue
            term_id = ontology_parser.get_term_id_by_label(label, ontology)
            if term_id is None:
                print(f'{label}\t{col}\t{term_id}\t{org_term_id}')
                ont_err_lst.append(f"Error: Matching '{ontology}' term id not found for label '{label}' in column '{col}'")
                map_dict[label] = label
                continue
            map_dict[label] = term_id
        sample_df[col + '_ontology_term_id'] = sample_df[col].map(map_dict)
        sample_df = sample_df.drop(columns=col)

    ### Print out any errors from ontologizing
    if ont_err_lst:
        for e in ont_err_lst:
            print(e)
        sys.exit(1)

    ### Convert string to boolean for is_pilot_data and donor_living_at_sample_collection
    ### Check that donor_living_at_sample_collection is not filled out for non-human
    bool_values = {'FALSE':False, 'TRUE':True}
    b_type = ['is_pilot_data','donor_living_at_sample_collection']
    for c in b_type:
        if c not in sample_df.columns:
            continue
        if c == 'donor_living_at_sample_collection':
            for val in sample_df[c].dropna().unique():
                if val != 'na' and sample_df.loc[sample_df[c] == val,
                'organism_ontology_term_id'].tolist()[0] != 'NCBITaxon:9606':
                    print(f"ERROR: donor_living_at_sample_collection for non-human data should be 'na' but '{val}' is present")
                    sys.exit(1)
        # Assign the converted values back; a column made only of TRUE/FALSE becomes a real bool
        # column, one that also holds 'na' stays an object column
        sample_df[c] = sample_df[c].map(lambda v: bool_values.get(v, v)).infer_objects()

    ### Blank fields in worksheet result in NaN values in dataframe, replacing these with na?
    ### Could also replace with unknown for certain columns using fillna options?
    sample_df = sample_df.fillna('na')
    sample_df = sample_df.drop(columns=[c for c in sample_df.columns if c.startswith('!')])

    return sample_df


In [None]:
# Check the obs table of the first pseudosample with evaluate_obs_schema from cellxgene_mods
# `adatas` is not defined anywhere in this notebook, so the check now reads from
# experiment_04, the object the other cells work with
import sys
sys.path.insert(1, '/home/jovyan/lattice-tools/cellxgene_resources')
from cellxgene_mods import *
evaluate_obs_schema(experiment_04.all_pseudosamples[0].adata.obs, labels=False)