In [None]:
import boto3
import numpy as np
import os
import pandas as pd
import scanpy as sc
import tarfile
from cellxgene_schema.write_labels import AnnDataLabelAppender

In [None]:
# S3 client plus a paginator used to walk the bucket listing page by page.
# ListObjectsV2 is the listing operation AWS recommends over the legacy
# ListObjects; each page still carries its entries under 'Contents' with a 'Key'.
s3client = boto3.client('s3')
paginator = s3client.get_paginator('list_objects_v2')

# Bucket and path components; keys are built as
# <lab>/<proj>/processed/cellranger/<run_date>/...
bucket_name = 'czi-psomagen'
lab = 'Marson'
proj = 'mapping-grns-perturb-seq'
run_dates = [
    'Run_2025-05-23',
    'Run_2025-05-28',
    'Run_2025-06-03',
]

In [None]:
def download_files(samp, library=None, date=None):
    """Download one sample's Cell Ranger outputs from S3 into the working directory.

    Three objects are fetched: the filtered feature-barcode matrix, the metrics
    summary and the CRISPR analysis archive. Each is saved under its base name
    in the current working directory.

    Parameters
    ----------
    samp : str
        Sample directory name under ``per_sample_outs``.
    library : str, optional
        Library directory name. Defaults to the module-level ``lib``.
    date : str, optional
        Run directory name. Defaults to the module-level ``run_date``.
    """
    # Fall back to the notebook globals so existing `download_files(samp)`
    # calls keep working; passing the values explicitly avoids relying on
    # whatever a previous cell last assigned to them.
    if library is None:
        library = lib
    if date is None:
        date = run_date

    sample_dir = f'{lab}/{proj}/processed/cellranger/{date}/{library}/outs/per_sample_outs/{samp}'
    keys = [
        f'{sample_dir}/count/sample_filtered_feature_bc_matrix.h5',
        f'{sample_dir}/metrics_summary.csv',
        f'{sample_dir}/count/crispr_analysis.tar.gz',
    ]

    for key in keys:
        local_name = key.rsplit('/', 1)[-1]
        s3client.download_file(bucket_name, key, local_name)


def custom_var_to_obs(adata):
    """Move non-Ensembl 'Gene Expression' features out of the matrix and into obs.

    Every feature typed 'Gene Expression' whose ``gene_ids`` value does not
    start with 'ENS' has its per-cell values copied into ``adata.obs`` (one
    column per feature, named by its ``gene_ids`` value) and is then removed
    from the variables. ``var`` is re-indexed by ``gene_ids`` and var columns
    that are entirely empty are dropped.

    The object passed in is modified too (new obs columns, var re-indexed);
    callers should use the returned object.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``X``, ``obs`` and ``var``; ``var`` must
        have ``gene_ids`` and ``feature_types`` columns.

    Returns
    -------
    A copy of ``adata`` restricted to the remaining features.
    """
    var = adata.var
    # na=True keeps features with a missing gene id in the matrix, matching
    # the behaviour of comparing the startswith() result against False.
    is_custom = (
        (var['feature_types'] == 'Gene Expression')
        & ~var['gene_ids'].str.startswith('ENS', na=True)
    ).to_numpy()

    moved = []
    # Work by position so duplicated var names cannot turn a lookup into a
    # multi-row selection.
    for pos in np.flatnonzero(is_custom):
        gene_id = var['gene_ids'].iloc[pos]
        column = adata.X[:, pos]
        if hasattr(column, 'toarray'):
            # Sparse column -> dense; a dense X already yields an ndarray
            column = column.toarray()
        adata.obs[gene_id] = np.asarray(column).ravel()
        moved.append(gene_id)

    adata.var.set_index('gene_ids', inplace=True)
    moved_ids = set(moved)
    var_to_keep = [gid for gid in adata.var.index if gid not in moved_ids]
    # Copy explicitly: assigning .var on a view would otherwise trigger an
    # implicit copy.
    adata = adata[:, var_to_keep].copy()
    adata.var = adata.var.replace('', np.nan).dropna(axis=1, how='all')

    return adata


def gather_crispr():
    """Load per-cell guide calls from the downloaded CRISPR analysis archive.

    Reads ``protospacer_calls_per_cell.csv`` from ``crispr_analysis.tar.gz`` in
    the current working directory.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``cell_barcode`` with two columns:
        ``genetic_perturbation_guide_id`` (``feature_call`` with every '|'
        rewritten as ' || ') and ``num_umis_guide_id`` (the ``num_umis``
        column, values unchanged).
    """
    # The context manager closes the archive even if extraction or parsing fails
    with tarfile.open('crispr_analysis.tar.gz', 'r:gz') as tar:
        member = tar.extractfile('protospacer_calls_per_cell.csv')
        calls = pd.read_csv(member).rename(columns={'num_umis': 'num_umis_guide_id'})

    # NOTE(review): only feature_call is re-delimited; num_umis_guide_id keeps
    # whatever delimiter the file uses - confirm that is intended.
    calls['genetic_perturbation_guide_id'] = calls['feature_call'].str.replace('|', ' || ', regex=False)

    return calls[
        ['cell_barcode', 'genetic_perturbation_guide_id', 'num_umis_guide_id']
    ].set_index('cell_barcode')


def _parse_metric_value(raw):
    """Convert one 'Metric Value' cell; returns ``(keep, value)``.

    * values ending in '%' become a fraction (divided by 100)
    * other values containing '%' are flagged to be dropped (``keep`` False)
    * remaining values are parsed as int, then float, after removing commas
    * anything that is not a string, or cannot be parsed, is kept unchanged
    """
    if not isinstance(raw, str):
        return True, raw
    if raw.endswith('%'):
        return True, float(raw.rstrip('%').replace(',', '')) / 100
    if '%' in raw:
        return False, None

    digits = raw.replace(',', '')
    for cast in (int, float):
        try:
            return True, cast(digits)
        except ValueError:
            continue
    # Not numeric: keep the text rather than aborting the whole sample
    return True, raw


def gather_metrics(samp):
    """Summarise ``metrics_summary.csv`` (working directory) as a one-row frame.

    Each retained metric becomes a column named
    ``"<Metric Name>, <Library Type>, <Category>"`` (with 'Cells' in the
    category spelled 'Sample'). Rows grouped by 'Fastq ID' or
    'Probe barcode ID', and rows whose category is neither 'Library' nor
    'Cells', are excluded.

    Parameters
    ----------
    samp : str
        Sample ID; the 'Group Name' of every 'Sample ID' row whose value
        equals ``samp`` is collected into the 'Probe barcode IDs' column,
        joined with ' || '.

    Returns
    -------
    pandas.DataFrame
        A single row (index 'Metric Value') with one column per metric plus
        'Probe barcode IDs'.
    """
    summary = pd.read_csv('metrics_summary.csv')
    summary['Metric'] = [
        f"{name}, {lib_type}, {category.replace('Cells','Sample')}"
        for name, lib_type, category in zip(
            summary['Metric Name'], summary['Library Type'], summary['Category']
        )
    ]

    is_this_sample = (summary['Metric Name'] == 'Sample ID') & (summary['Metric Value'] == samp)
    # dropna/astype(str) so a missing group name cannot break the join
    probe_barcodes = ' || '.join(
        summary.loc[is_this_sample, 'Group Name'].dropna().astype(str).unique()
    )

    keep_rows = (
        ~summary['Grouped By'].isin(['Fastq ID', 'Probe barcode ID'])
        & summary['Category'].isin(['Library', 'Cells'])
    )
    wide = summary.loc[keep_rows, ['Metric', 'Metric Value']].set_index('Metric').transpose()

    # Iterate over a snapshot of the labels because columns may be dropped
    for label in list(wide.columns):
        keep, value = _parse_metric_value(wide[label].iloc[0])
        if keep:
            wide[label] = value
        else:
            wide = wide.drop(columns=label)

    wide['Probe barcode IDs'] = probe_barcodes

    return wide


def cxg_add_labels(adata):
    """Run the cellxgene-schema label appender on ``adata`` and record the schema used.

    ``cell_type_ontology_term_id`` is set to 'unknown' for every cell before
    the appender runs (presumably because the appender expects that column -
    confirm). That column and ``cell_type`` are dropped from ``adata.obs``
    afterwards. The schema version and its reference URL are written to
    ``adata.uns``. Nothing is returned.
    """
    temporary_columns = ['cell_type_ontology_term_id', 'cell_type']
    adata.obs[temporary_columns[0]] = 'unknown'

    appender = AnnDataLabelAppender(adata)
    appender._add_labels()
    adata.obs.drop(columns=temporary_columns, inplace=True)

    # Optional provenance, kept because it pertains to add-labels
    version = appender.schema_version
    adata.uns['schema_version'] = version
    adata.uns['schema_reference'] = appender._build_schema_reference_url(version)

In [None]:
# Sample metadata currently lives in a Google Sheet; the URL below requests
# the named tab as CSV.
sheet = '1VhASg4SaisOuZ3LUiTwkSkpaxdInBXquF0Lbkik5_os'
tab = 'samples'
url = f'https://docs.google.com/spreadsheets/d/{sheet}/gviz/tq?tqx=out:csv&sheet={tab}'

# Temporary workaround: swap terms that are not in the CxG schema so that
# add-labels does not fail.
term_substitutions = {
    'self_reported_ethnicity_ontology_term_id': {
        'HANCESTRO:0612': 'HANCESTRO:0014'
    }
}

sample_df = (
    pd.read_csv(url)
    .dropna(axis=1, how='all')  # discard columns with no values at all
    .replace(term_substitutions)
)

In [None]:
# Build the list of object keys under each run directory, then derive the
# sample -> {library, date} mapping from the key paths.
files = []
for date_dir in run_dates:
    my_dir = f'{lab}/{proj}/processed/cellranger/{date_dir}/'
    for page in paginator.paginate(Bucket=bucket_name, Prefix=my_dir):
        # Pages for an empty prefix carry no 'Contents' entry
        files.extend(obj['Key'] for obj in page.get('Contents', []))

# Path components relative to the 'cellranger' element:
#   +1 run date, +2 library, +4 'per_sample_outs', +5 sample
subs = {}
for key in files:
    parts = key.split('/')
    date_pos = parts.index('cellranger') + 1

    # The sample component sits at date_pos + 4, so the path must be longer
    # than that before it can be indexed.
    if len(parts) <= date_pos + 4:
        continue
    if parts[date_pos + 3] != 'per_sample_outs':
        continue

    sample = parts[date_pos + 4]
    if not sample:
        # Folder placeholder key ending in 'per_sample_outs/': no sample name
        continue

    entry = {
        'library': parts[date_pos + 1],
        'date': parts[date_pos],
    }
    if subs.get(sample, entry) != entry:
        # Same sample name under another run/library: the later key wins
        print(f'WARNING: sample {sample} found in more than one location; '
              f'replacing {subs[sample]} with {entry}')
    subs[sample] = entry

In [None]:
# Process every discovered sample: download its files, build the annotated
# AnnData object, then delete the downloaded files.
downloaded_names = [
    'sample_filtered_feature_bc_matrix.h5',
    'crispr_analysis.tar.gz',
    'metrics_summary.csv',
]

for samp, v in subs.items():
    # download_files reads these two module-level names
    lib = v['library']
    run_date = v['date']
    print(samp)

    try:
        download_files(samp)

        adata = sc.read_10x_h5('sample_filtered_feature_bc_matrix.h5', gex_only=True)

        adata = custom_var_to_obs(adata)

        adata.obs['lab'] = lab #this can go away once it is no longer in the directory structure
        adata.obs['project'] = proj
        #ADD IN REFERENCE INFO - genome/annotation
        adata.obs['sample_id'] = samp
        adata.obs['library_id'] = lib
        # merge() resets the index, so the cell barcodes are restored afterwards
        adata.obs = adata.obs.merge(sample_df, on='sample_id', how='left').set_index(adata.obs.index)
        #STARTING 6.0.0
        #adata.uns['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].unique()[0]
        #adata.obs.drop(columns=['organism_ontology_term_id'],inplace=True)

        # Per-cell guide calls, matched on cell barcode
        crispr_df = gather_crispr()
        adata.obs = adata.obs.merge(
            crispr_df, left_index=True, right_index=True, how='left'
        ).set_index(adata.obs.index)

        # Sample-level metrics are copied onto every cell
        metrics_df = gather_metrics(samp)
        for c in metrics_df.columns:
            adata.obs[c] = metrics_df[c].values[0]

        #ADD IN MITO %
        #ADD IN DOUBLETS
        #maybe https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.calculate_qc_metrics.html

        #add labels - somehow
        cxg_add_labels(adata)

        #adata.write(filename=f'{samp}_curated.h5ad', compression='gzip')
        #adata.write_zarr(f'{samp}_curated.zarr')
        #WHAT TO NAME IT? WHERE TO SEND IT?
    finally:
        # Remove the downloads even when a step above fails, so a failed
        # sample does not leave its files behind; skip any file that was
        # never downloaded.
        for f in downloaded_names:
            if os.path.exists(f):
                os.remove(f)