In [1]:
import os
from cellxgene_mods import CxG_API


# Configure the API client. Per the output recorded below, this sets the
# 'SITE_URL', 'API_URL_BASE' and 'ACCESS_TOKEN' environment variables.
CxG_API.config() # set env='dev' or 'staging' if working in either of those test environments



Set 'SITE_URL' env var to https://cellxgene.cziscience.com
Set 'API_URL_BASE' env var to https://api.cellxgene.cziscience.com
Successfully set 'ACCESS_TOKEN' env var!


**Specify the Collection to upload to**<br>
If a Revision, use the Revision ID, not the Published ID

In [2]:
# ID of the Collection that every Dataset in this notebook is uploaded to
# (for a Revision, this is the Revision ID rather than the Published ID).
collection_id = '9c9d04c4-8899-417f-bb6f-6107dcadf14f'

**List the existing Datasets in the Collection**

In [3]:
# Fetch the Collection once, then index its Datasets by ID so later cells can
# look up each Dataset's title and processing status.
coll_datasets = CxG_API.get_collection(collection_id)['datasets']
status = {}
titles = {}
for dataset in coll_datasets:
    ds_id = dataset['dataset_id']
    status[ds_id] = dataset['processing_status']
    titles[ds_id] = dataset['title']
titles

{'91f31e05-56d8-46fc-b408-d90c9228a81b': 'Single-cell RNA-seq of the Adult Human Kidney (Version 2.0)',
 '7ff0197b-d175-49bf-b4fa-150fe0995d93': 'Single-nucleus RNA-seq of the Adult Human Kidney (Version 2.0)'}

**Set the directory that the files to upload are in**

In [4]:
# Folder holding the files to upload; later cells build file paths from it.
# Note: os.path.expanduser only rewrites a leading '~', so an absolute path
# like this one is returned unchanged.
directory = os.path.expanduser('/Users/brianmott/Documents/Curation/CellxGene/CXG-708/KPMP_version2_SingleCell_CELLxGENE/')

**List certain files in that directory**<br>
The curation_qa notebook saves files with a `_revised.h5ad` suffix

In [5]:
# Show only the files this notebook can upload: revised matrices and
# fragment files.
upload_suffixes = ('_revised.h5ad', 'fragments.tsv.gz')
for filename in os.listdir(directory):
    if filename.endswith(upload_suffixes):
        print(filename)

single_cell_kpmp_revised.h5ad
single_cell_kpmp_normalized_revised.h5ad


**Fill in Dataset ID and file names to upload**<br>
Use `new` for the _dataset_id_ if adding a Dataset, rather than replacing an existing Dataset\
_fragments_ is optional\
Use `existing` for _anndata_ if adding fragments to an existing Dataset without re-uploading the .h5ad

In [6]:
# One entry per Dataset to upload. Keys read by the cells below:
#   'dataset_id' - an existing Dataset ID from the Collection, or 'new'
#   'anndata'    - path to the .h5ad file, or 'existing' to keep the current matrix
#   'fragments'  - (optional) path to a fragments file
# Paths are built with os.path.join so they are correct whether or not
# `directory` ends with a path separator.
datasets = [
    {
        'dataset_id': '91f31e05-56d8-46fc-b408-d90c9228a81b',
        'anndata': os.path.join(directory, 'single_cell_kpmp_normalized_revised.h5ad')
    },
]

**Confirm the files are specified correctly, etc.**

In [7]:
# Sanity-check every entry in `datasets` before uploading. Problems are only
# printed (nothing is raised), so read the output before running the next cell.
all_ids = [spec['dataset_id'] for spec in datasets]
for position, spec in enumerate(datasets):
    h5ad_path = spec['anndata']
    ds_id = spec['dataset_id']
    reuse_matrix = h5ad_path == 'existing'

    # Every local file named in the entry must exist on disk.
    if not reuse_matrix and not os.path.exists(h5ad_path):
        print(f"Invalid file: {h5ad_path}")
    if 'fragments' in spec and not os.path.exists(spec['fragments']):
        print(f"Invalid file: {spec['fragments']}")

    # Keeping the existing matrix only makes sense when fragments are being
    # added to a Dataset that already exists.
    if reuse_matrix:
        if 'fragments' not in spec:
            print(f"Must define fragments if revising an existing matrix for datasets[{position}]")
        if ds_id == 'new':
            print(f"Must define either dataset_id or anndata file to upload for datasets[{position}]")

    # The remaining checks apply only to Datasets already in the Collection.
    if ds_id == 'new':
        continue

    if all_ids.count(ds_id) > 1:
        print(f"Repeated dataset: {ds_id}")
    if ds_id not in titles:
        print(f"Invalid dataset: {ds_id}")
        continue

    current_status = status[ds_id]
    if current_status != 'SUCCESS':
        print(f"{ds_id} is processing_status:{current_status}, must wait for SUCCESS")

**Upload each Dataset**

In [8]:
for entry in datasets:
    # A 'new' entry has no Dataset yet: create one and store its ID on the
    # entry itself so the calls below (and any re-run) target that Dataset.
    if entry['dataset_id'] == 'new':
        entry['dataset_id'] = CxG_API.create_dataset(collection_id)
    target_id = entry['dataset_id']

    if entry['anndata'] != 'existing':
        # Upload the local .h5ad and start a fresh manifest from the result
        # (the recorded output shows an s3:// URI under the 'anndata' key).
        uploaded_matrix = CxG_API.upload_local_datafile(entry['anndata'], collection_id, target_id)
        manifest = {'anndata': uploaded_matrix}
    else:
        # Keep the matrix already attached to the Dataset.
        manifest = CxG_API.get_dataset_manifest(collection_id, target_id)

    # Fragments are optional; when given, upload them and add them to the manifest.
    if 'fragments' in entry:
        manifest['atac_fragment'] = CxG_API.upload_local_datafile(entry['fragments'], collection_id, target_id)

    CxG_API.upload_datafiles_from_manifest(manifest, collection_id, target_id)

Full S3 write path is s3://cellxgene-dataset-submissions/super/9c9d04c4-8899-417f-bb6f-6107dcadf14f/91f31e05-56d8-46fc-b408-d90c9228a81b/single_cell_kpmp_normalized_revised.h5ad


Uploading /Users/brianmott/Documents/Curation/CellxGene/CXG-708/KPMP_version2_SingleCell_CELLxGENE/single_cell_kpmp_normalized_revised.h5ad to Collection 9c9d04c4-8899-417f-bb6f-6107dcadf14f with dataset_id '91f31e05-56d8-46fc-b408-d90c9228a81b'...

9c9d04c4-8899-417f-bb6f-6107dcadf14f/91f31e05-56d8-46fc-b408-d90c9228a81b: 100.0% uploaded
SUCCESS

UPLOAD COMPLETE.

[1m[38;5;10mSUCCESS[0m

Uploading Dataset with id '91f31e05-56d8-46fc-b408-d90c9228a81b' to Collection https://cellxgene.cziscience.com/collections/9c9d04c4-8899-417f-bb6f-6107dcadf14f sourcing from manifest: {'anndata': 's3://cellxgene-dataset-submissions/super/9c9d04c4-8899-417f-bb6f-6107dcadf14f/91f31e05-56d8-46fc-b408-d90c9228a81b/single_cell_kpmp_normalized_revised.h5ad'}
