In [1]:
import boto3
import numpy as np
import os
import pandas as pd
import scanpy as sc
import tarfile
from cellxgene_schema.write_labels import AnnDataLabelAppender
from urllib.parse import quote
from cellxgene_ontology_guide.ontology_parser import OntologyParser
ontology_parser = OntologyParser(schema_version="v6.0.0")
from cellxgene_ontology_guide.supported_versions import CXGSchema, load_supported_versions
import requests
from io import BytesIO


s3client = boto3.client('s3')

In [2]:
def download_files(s_dir):
    mx_h5 = f'{s_dir}/count/sample_filtered_feature_bc_matrix.h5'
    metrics_csv = f'{s_dir}/metrics_summary.csv'
    cri_tar = f'{s_dir}/count/crispr_analysis.tar.gz'
    
    for file_path in [mx_h5, metrics_csv, cri_tar]:
        f = file_path.split('/')[-1]
        s3client.download_file(bucket_name, file_path, f)


def custom_var_to_obs(adata):
    moved = []
    for gene_index in adata.var[
        (adata.var['feature_types'] == 'Gene Expression') &
        (adata.var['gene_ids'].str.startswith('ENS') == False)
    ].index:
        gene_id = adata.var.loc[gene_index]['gene_ids']
        gene_values = adata.X[:, adata.var.index.get_loc(gene_index)]
        adata.obs[gene_id] = gene_values.A.flatten() if hasattr(gene_values, 'A') else gene_values.toarray().flatten().tolist()
        moved.append(gene_id)

    adata.var.set_index('gene_ids', inplace=True)
    var_to_keep = [i for i in adata.var.index if i not in moved]
    adata = adata[:, var_to_keep]
    adata.var = adata.var.replace('', np.nan).dropna(axis=1, how='all')

    return adata


def gather_crispr():
    tar = tarfile.open('crispr_analysis.tar.gz', 'r:gz')
    f = tar.extractfile('protospacer_calls_per_cell.csv')
    df = pd.read_csv(f).rename(columns={'num_umis':'num_umis_guide_id'})
    tar.close()
    df['genetic_perturbation_guide_id'] = df['feature_call'].apply(lambda x: x.replace('|',' || '))
    df = df[['cell_barcode','genetic_perturbation_guide_id','num_umis_guide_id']].set_index('cell_barcode')

    return df


def gather_metrics(samp):
    df = pd.read_csv('metrics_summary.csv')
    df['Metric'] = df.apply(lambda x: f"{x['Metric Name']}, {x['Library Type']}, {x['Category'].replace('Cells','Sample')}", axis=1)

    probe_barcodes = ' || '.join(df[
        (df['Metric Name'] == 'Sample ID') &
        (df['Metric Value'] == samp)
    ]['Group Name'].unique())

    df = df[
        (df['Grouped By'].isin(['Fastq ID','Probe barcode ID']) == False) &
        (df['Category'].isin(['Library','Cells']))
    ]
    df = df[['Metric','Metric Value']].set_index('Metric').transpose()

    for c in df.columns:
        v = df[c].iloc[0]
        if v.endswith('%'):
            df[c] = df[c].apply(lambda x: float(x.rstrip('%')) / 100)
        elif '%' in v:
            df.drop(columns=c, inplace=True)
        else:
            df[c] = df[c].apply(lambda x: int(x.replace(',','')))

    df['Probe barcode IDs'] = probe_barcodes

    keep = {
        'Confidently mapped reads in cells, Gene Expression, Sample': 'fraction_mapped_reads_in_cells_gex',
        'Median genes per cell, Gene Expression, Sample': 'median_genes_per_cell',
        'Cells, CRISPR Guide Capture, Sample': 'cell_count_cri',
        'Cells, Gene Expression, Sample': 'cell_count_gex',
        'Valid GEM barcodes, CRISPR Guide Capture, Library': 'fraction_valid_barcodes_cri',
        'Valid GEM barcodes, Gene Expression, Library': 'fraction_valid_barcodes_gex',
        'Reads confidently mapped to probe set, Gene Expression, Sample': 'frac_reads_mapped_gex',
        'Confidently mapped to genome, Gene Expression, Library': 'frac_reads_mapped_gex',
        'Mean reads per cell, Gene Expression, Sample': 'mean_reads_per_cell_gex'
    }

    df = df[[f for f in keep.keys() if f in df.columns]].rename(columns=keep)

    return df


def cxg_add_labels(adata):
    adata.obs['cell_type_ontology_term_id'] = 'unknown'
    labeler = AnnDataLabelAppender(adata)
    labeler._add_labels()
    adata.obs.drop(columns=['cell_type_ontology_term_id','cell_type'],inplace=True)

    schema_v = labeler.schema_version
    adata.uns['schema_version'] = schema_v
    adata.uns['schema_reference'] = labeler._build_schema_reference_url(schema_v)

In [3]:
#read in the sample metadata
sheet = ''
gid = ''
url = f'https://docs.google.com/spreadsheets/d/{sheet}/export?format=csv&gid={gid}'
response = requests.get(url)
sample_df = pd.read_csv(BytesIO(response.content), comment="#", dtype=str).dropna(axis=1,how='all')

In [4]:
# Mapping ontologies

col_ont_map = {
    'organism':'NCBITaxon',
    'sex':'PATO',
    'self_reported_ethnicity':'HANCESTRO',
    'disease':'MONDO',
    'assay':'EFO',
    'development_stage':{'NCBITaxon:6239':'WBls', # C. Elegans
                         'NCBITaxon:7227':'FBdv', # Drosophila
                         'NCBITaxon:10090':'MmusDv', # Mouse
                         'NCBITaxon:7955':'ZFS', # Zebrafish
                         'other':'HsapDv' # For all other organisms, use HsapDv
                        },
    'tissue':{'NCBITaxon:6239':'WBbt', # C. Elegans
              'NCBITaxon:7227':'FBbt', # Drosophila
              'NCBITaxon:7955':'ZFA', # Zebrafish
              'other':'UBERON' # For all other organisms, use UBERON
             }
}


for col in col_ont_map:
    map_dict = {}
    for label in sample_df[col].unique():
        if col == 'disease' and label == 'normal': # normal is not in MONDO ontology
            map_dict[label] = 'PATO:0000461'
        elif label in ['unknown','n/a']: # unknown and n/a won't be in ontologies, pass along
            map_dict[label] = label
        elif col in ['tissue','development_stage']:
            if col == 'tissue':
                # Find what tissue type is at label row
                if sample_df.loc[sample_df[col] == label, 'tissue_type'].tolist()[0] != 'tissue':
                    map_dict[label] = label # Don't map cell type in tissue
                    continue
            # Find what organism term id is at label row
            org_term_id = sample_df.loc[sample_df[col] == label, 'organism_ontology_term_id'].tolist()[0]
            if org_term_id in col_ont_map[col]:
                # Get ontology of specific organism and map label
                species_ont = col_ont_map[col][sample_df.loc[org_term_id]]
                term_id = ontology_parser.get_term_id_by_label(label, species_ont)
            else:
                term_id = ontology_parser.get_term_id_by_label(label, col_ont_map[col]['other'])
        else:
            term_id = ontology_parser.get_term_id_by_label(label, col_ont_map[col])
        if term_id == None:
            print(f"Matching '{col_ont_map[col]}' term id not found for label '{label}' in column '{col}'")
        map_dict[label] = term_id
    sample_df[col + '_ontology_term_id'] = sample_df[col].map(map_dict)
    del sample_df[col]
    
# Blank fields in worksheet result in NaN values in dataframe, replacing these with n/a ? 
# Could also replace with unknown for certain columns using fillna options?
sample_df.fillna('n/a', inplace=True)
sample_df

Matching 'EFO' term id not found for label '10x flex' in column 'assay'


Unnamed: 0,sample_name,sample_probe_barcode,donor_id,tissue_type,suspension_type,perturbation_strategy,experimental_perturbation_type,experimental_perturbation,experimental_perturbation_time_point,organism_ontology_term_id,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,assay_ontology_term_id,development_stage_ontology_term_id,tissue_ontology_term_id
0,CD4i_R1L01_D1_Rest,BC001+CR001|BC002+CR002|BC003+CR003|BC004+CR004,CE0008162,primary cell culture,cell,CRISPRi,stimulation,IL2,Rest,NCBITaxon:9606,PATO:0000383,HANCESTRO:0014,HANCESTRO:0568,,HsapDv:0000123,"CD4-positive, alpha-beta T cell"
1,CD4i_R1L01_D1_Stim8hr,BC009+CR009|BC010+CR010|BC011+CR011|BC012+CR012,CE0008162,primary cell culture,cell,CRISPRi,stimulation,IL2,Stim8hr,NCBITaxon:9606,PATO:0000383,HANCESTRO:0014,HANCESTRO:0568,,HsapDv:0000123,"CD4-positive, alpha-beta T cell"
2,CD4i_R1L01_D2_Rest,BC005+CR005|BC006+CR006|BC007+CR007|BC008+CR008,CE0010866,primary cell culture,cell,CRISPRi,stimulation,IL2,Rest,NCBITaxon:9606,PATO:0000383,HANCESTRO:0568,HANCESTRO:0568,,HsapDv:0000116,"CD4-positive, alpha-beta T cell"
3,CD4i_R1L01_D2_Stim8hr,BC013+CR013|BC014+CR014|BC015+CR015|BC016+CR016,CE0010866,primary cell culture,cell,CRISPRi,stimulation,IL2,Stim8hr,NCBITaxon:9606,PATO:0000383,HANCESTRO:0568,HANCESTRO:0568,,HsapDv:0000116,"CD4-positive, alpha-beta T cell"
4,CD4i_R1L02_D1_Rest,BC001+CR001|BC002+CR002|BC003+CR003|BC004+CR004,CE0008162,primary cell culture,cell,CRISPRi,stimulation,IL2,Rest,NCBITaxon:9606,PATO:0000383,HANCESTRO:0014,HANCESTRO:0568,,HsapDv:0000123,"CD4-positive, alpha-beta T cell"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,CD4i_R1L22_D2_Stim8hr,BC013+CR013|BC014+CR014|BC015+CR015|BC016+CR016,CE0010866,primary cell culture,cell,CRISPRi,stimulation,IL2,Stim8hr,NCBITaxon:9606,PATO:0000383,HANCESTRO:0568,HANCESTRO:0568,,HsapDv:0000116,"CD4-positive, alpha-beta T cell"
88,CD4i_R1L23_D1_Rest,BC001+CR001|BC002+CR002|BC003+CR003|BC004+CR004,CE0008162,primary cell culture,cell,CRISPRi,stimulation,IL2,Rest,NCBITaxon:9606,PATO:0000383,HANCESTRO:0014,HANCESTRO:0568,,HsapDv:0000123,"CD4-positive, alpha-beta T cell"
89,CD4i_R1L23_D1_Stim8hr,BC009+CR009|BC010+CR010|BC011+CR011|BC012+CR012,CE0008162,primary cell culture,cell,CRISPRi,stimulation,IL2,Stim8hr,NCBITaxon:9606,PATO:0000383,HANCESTRO:0014,HANCESTRO:0568,,HsapDv:0000123,"CD4-positive, alpha-beta T cell"
90,CD4i_R1L23_D2_Rest,BC005+CR005|BC006+CR006|BC007+CR007|BC008+CR008,CE0010866,primary cell culture,cell,CRISPRi,stimulation,IL2,Rest,NCBITaxon:9606,PATO:0000383,HANCESTRO:0568,HANCESTRO:0568,,HsapDv:0000116,"CD4-positive, alpha-beta T cell"


In [5]:
bucket_name = 'czi-psomagen'
lab = 'weissman'
proj = 'scaling-in-vivo-perturb-seq-in-the-liver-and-beyond'

In [6]:
my_dir = f'{lab}-{proj}/'
r = s3client.list_objects(Bucket=bucket_name, Prefix=my_dir, Delimiter='/')
orders = [o['Prefix'].replace(my_dir,'') for o in r['CommonPrefixes']]

samples = []
for o in orders:
    r = s3client.list_objects(Bucket=bucket_name, Prefix=f'{my_dir}{o}', Delimiter='/')
    libs = [l['Prefix'].replace(f'{my_dir}{o}','') for l in r['CommonPrefixes']]
    for l in libs:
        r = s3client.list_objects(Bucket=bucket_name, Prefix=f'{my_dir}{o}{l}processed/cellranger/', Delimiter='/')
        dates = [d['Prefix'].replace(f'{my_dir}{o}{l}processed/cellranger/','') for d in r['CommonPrefixes']]
        for d in dates:
            r = s3client.list_objects(Bucket=bucket_name, Prefix=f'{my_dir}{o}{l}processed/cellranger/{d}outs/per_sample_outs/', Delimiter='/')
            subs = [s['Prefix'].replace(f'{my_dir}{o}{l}processed/cellranger/{d}outs/per_sample_outs/','') for s in r['CommonPrefixes']]
            for s in subs:
                samples.append({
                    'order': o,
                    'library': l.rstrip('/'),
                    'date': d,
                    'sample': s.rstrip('/')
                })

ClientError: An error occurred (AccessDenied) when calling the ListObjects operation: Access Denied

In [None]:
#identify which sample to curate
n = 0
s = samples[n]
s

In [None]:
order = s['order']
lib = s['library']
run_date = s['date']
samp = s['sample']

s_dir = f'{lab}-{proj}/{order}{lib}/processed/cellranger/{run_date}outs/per_sample_outs/{samp}'
download_files(s_dir)

adata = sc.read_10x_h5('sample_filtered_feature_bc_matrix.h5', gex_only=True)

adata = custom_var_to_obs(adata)

adata.obs['lab'] = lab
adata.obs['project'] = proj
adata.obs['sample_name'] = samp

adata.obs = adata.obs.merge(sample_df, on='sample_name', how='left').set_index(adata.obs.index)
adata.uns['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].unique()[0]
adata.obs.drop(columns=['organism_ontology_term_id'],inplace=True)

crispr_df = gather_crispr()
adata.obs = adata.obs.merge(
    crispr_df, left_index=True, right_index=True, how='left'
).set_index(adata.obs.index)

metrics_df = gather_metrics(samp)
for c in metrics_df.columns:
    adata.obs[c] = metrics_df[c].values[0]

cxg_add_labels(adata)

#startswith('mt-') works for human & mouse, more attn needed for other organisms
adata.var['mt'] = adata.var['feature_name'].str.lower().str.startswith('mt-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

adata.write(filename=f'{samp}_curated.h5ad', compression='gzip')

for f in ['sample_filtered_feature_bc_matrix.h5','crispr_analysis.tar.gz','metrics_summary.csv']:
    os.remove(f)