**This notebook provides a sample curation workflow of a Visium dataset towards CELLxGENE standards starting with SpaceRanger outputs**\
The `WS_PLA_S9101764` data example can be downloaded from [E-MTAB-12698](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-12698)

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from PIL import Image
from scipy import sparse

Specify the folder that includes these files...
- raw_feature_bc_matrix.h5
- spatial/
  - scalefactors_json.json
  - tissue_hires_image.png
  - tissue_lowres_image.png*
  - tissue_positions_list.csv

\*The lowres image is required for `read_visium()` to load the folder; it is removed from the AnnData object later in this notebook.

In [None]:
#folder holding the SpaceRanger outputs listed above
folder = 'WS_PLA_S9101764'

#REQUIRED to include empty spots, so point read_visium() at raw_feature_bc_matrix.h5
adata = sc.read_visium(path=folder, count_file='raw_feature_bc_matrix.h5')

#re-index var by its gene_ids column
adata.var.set_index(keys='gene_ids', inplace=True)

In [None]:
#SpaceRanger will not output spots that have zero reads mapped, so fill those in with 0s, if needed
#the positions file has one row per spot on the slide, so it is used to decide which barcodes are absent
#(this replaces a fixed 4992 spot count, which only fits one slide layout)
try:
    all_barcodes = pd.read_csv(folder + '/spatial/tissue_positions_list.csv', header=None)
except FileNotFoundError:
    #newer SpaceRanger releases write tissue_positions.csv with a header row instead
    all_barcodes = pd.read_csv(folder + '/spatial/tissue_positions.csv', header=0)
    #use positional column labels so the column selection below works for both layouts
    all_barcodes.columns = range(all_barcodes.shape[1])

#column 0 holds the barcode; columns 1-3 become the same obs fields that adata.obs is given below
missing_barcodes = (
    all_barcodes[~all_barcodes[0].isin(adata.obs_names)]
    .set_index(0)
    .rename(columns={1: 'in_tissue', 2: 'array_row', 3: 'array_col'})
)

if not missing_barcodes.empty:
    #all-zero counts for the absent spots; match the dtype of X so concat does not upcast the matrix
    empty_matrix = sparse.csr_matrix((missing_barcodes.shape[0], adata.n_vars), dtype=adata.X.dtype)
    missing_adata = ad.AnnData(empty_matrix, var=adata.var, obs=missing_barcodes[['in_tissue','array_row','array_col']])
    comb_adata = ad.concat([adata, missing_adata], uns_merge='first', merge='first')
    #rebuild obsm['spatial'] in the concatenated row order: existing coordinates first,
    #then columns 5 and 4 of the positions file for the added spots
    comb_adata.obsm['spatial'] = np.concatenate((adata.obsm['spatial'], missing_barcodes[[5,4]].values))
    adata = comb_adata

In [None]:
#the spatial entries used below are keyed by library_id; take the first key under uns['spatial']
library_id = list(adata.uns['spatial'])[0]

In [None]:
#REQUIRED flag that distinguishes single section data from integrated data
adata.uns['spatial'].update({'is_single': True})

In [None]:
#PREFERRED to include fullres image
fullres_path = 'WS_PLA_S9101764.tif'

#open inside a with-block so the file handle is closed once the pixels have been copied into the array
#NOTE: Pillow may warn about or refuse very large images (see PIL.Image.MAX_IMAGE_PIXELS)
with Image.open(fullres_path) as fullres_image:
    adata.uns['spatial'][library_id]['images']['fullres'] = np.asarray(fullres_image)

In [None]:
#store the library_id as the title in uns
adata.uns.update(title=library_id)

In [None]:
#OPTIONAL to move metadata before deleting it
#pop() is used instead of del so that re-running this cell (or a missing key) does not raise KeyError
library_spatial = adata.uns['spatial'][library_id]
if 'metadata' in library_spatial:
    adata.uns['spatial_metadata'] = library_spatial.pop('metadata')

#MUST NOT SUBMIT metadata or lowres or fiducial_diameter_fullres in uns.spatial
library_spatial['images'].pop('lowres', None)
#the lowres scale factor is dropped together with the lowres image
library_spatial['scalefactors'].pop('tissue_lowres_scalef', None)
library_spatial['scalefactors'].pop('fiducial_diameter_fullres', None)

In [None]:
#REQUIRED obs metadata - will be the same for all Visium Datasets
visium_obs_fields = {
    'suspension_type': 'na',
    'assay_ontology_term_id': 'EFO:0010961',
}

#each scalar is assigned to the whole obs column
for field, value in visium_obs_fields.items():
    adata.obs[field] = value

In [None]:
#REQUIRED obs metadata - most likely the same value for all obs
#update based on the given donor/sample
sample_obs_fields = {
    'donor_id': 'donor_A',
    'organism_ontology_term_id': 'NCBITaxon:9606', #NCBITaxon:9606 for human, NCBITaxon:10090 for mouse
    'sex_ontology_term_id': 'PATO:0000383', #PATO:0000383 for female, PATO:0000384 for male
    'development_stage_ontology_term_id': 'HsapDv:0000137', #HsapDv or MmusDv term
    'self_reported_ethnicity_ontology_term_id': 'HANCESTRO:0022', #HANCESTRO term
    'disease_ontology_term_id': 'PATO:0000461', #PATO:0000461 for normal, MONDO term for disease
    'tissue_type': 'tissue', #tissue, organoid
    'tissue_ontology_term_id': 'UBERON:0001115', #UBERON term
}

#each scalar is assigned to the whole obs column, in the order listed above
for field, value in sample_obs_fields.items():
    adata.obs[field] = value

In [None]:
#REQUIRED obs.cell_type_ontology_term_id
#cell labels are not included in the SpaceRanger output and thus must be prepared separately
#this example reads in a csv with barcodes in the first column and CL terms in the second column
cell_types = pd.read_csv('cell_annotations.csv', names=['barcode','cell_type_ontology_term_id'])

#a barcode listed twice would be ambiguous, so fail early with a clear message
if cell_types['barcode'].duplicated().any():
    raise ValueError('cell_annotations.csv lists at least one barcode more than once')

#look up each obs barcode directly instead of merging, so no extra barcode column is left in obs
#and the cell can be re-run; barcodes without an annotation are labelled unknown
barcode_to_cell_type = cell_types.set_index('barcode')['cell_type_ontology_term_id']
adata.obs['cell_type_ontology_term_id'] = (
    barcode_to_cell_type.reindex(adata.obs_names).fillna('unknown').to_numpy()
)

In [None]:
#QA by plotting with the hires image
#spots are colored by the cell type term assigned above
sc.pl.spatial(adata, color='cell_type_ontology_term_id', library_id=library_id)

In [None]:
#same plot with fullres image, if present
library_images = adata.uns['spatial'][library_id]['images']
if 'fullres' in library_images:
    #scale_factor=1 is passed explicitly together with the fullres image key
    sc.pl.spatial(
        adata,
        img_key='fullres',
        scale_factor=1,
        color='cell_type_ontology_term_id',
        library_id=library_id,
    )

In [None]:
#save the curated object as a gzip-compressed h5ad named after the library_id
adata.write(filename=f'{library_id}_curated.h5ad', compression='gzip')