**This notebook provides a sample curation workflow of a Visium dataset towards CELLxGENE standards starting with Space Ranger outputs**\
*Outline of tasks included*\
...\
...

The example is from [He et al 2022](https://doi.org/10.1016/j.cell.2022.11.005)\
The Space Ranger outputs `6332STDY10289523.tar.gz` can be downloaded from [E-MTAB-11265](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11265)\
The fullres image `V10S24-031_D1.jpg` can also be downloaded from [E-MTAB-11265](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11265)\
`6332STDY10289523.220627.h5ad` with normalized data and cell2location proportions can be downloaded from the pcw19 Dataset at the [Fetal Lung portal](https://fetal-lung.cellgeni.sanger.ac.uk/visium.html)

In [None]:
import anndata as ad
import numpy as np
import os
import pandas as pd
import scanpy as sc
import squidpy as sq
from PIL import Image
from pyometiff import OMETIFFReader
from scipy import sparse

Specify the SpaceRanger output folder. It must include at least these files...
- raw_feature_bc_matrix.h5
- spatial/
  - scalefactors_json.json
  - tissue_hires_image.png
  - tissue_lowres_image.png*
  - tissue_positions_list.csv

\*the lowres image will be deleted later in this notebook, but it is required for `sq.read.visium()`

In [None]:
#SpaceRanger output folder for this section
folder = 'A2'

#REQUIRED to include background spots, so the raw (unfiltered) matrix is loaded
#instead of the filtered one
adata = sq.read.visium(
    path=folder,
    counts_file='raw_feature_bc_matrix.h5',
)

#use the gene_ids column as the var index
adata.var = adata.var.set_index('gene_ids')

In [None]:
#SpaceRanger will not output spots that have zero reads mapped, so fill those in with 0s, if needed
#tissue_positions_list.csv is used as the reference list of barcodes rather than a hard-coded
#spot count, so slides with a different number of spots are handled as well
#the file has no header row: column 0 is the barcode, columns 1-3 are in_tissue/array_row/array_col
all_barcodes = pd.read_csv(os.path.join(folder, 'spatial', 'tissue_positions_list.csv'), header=None)

#barcodes listed in the positions file that are absent from the count matrix
#built as a new frame (no inplace edits on a filtered slice) to avoid SettingWithCopy issues
missing_barcodes = (
    all_barcodes.loc[~all_barcodes[0].isin(adata.obs.index)]
    .set_index(0)
    .rename(columns={1: 'in_tissue', 2: 'array_row', 3: 'array_col'})
)

if not missing_barcodes.empty:
    #all-zero sparse matrix: one row per missing spot, one column per gene
    empty_matrix = sparse.csr_matrix((missing_barcodes.shape[0], adata.var.shape[0]))
    missing_adata = ad.AnnData(
        empty_matrix,
        var=adata.var,
        obs=missing_barcodes[['in_tissue', 'array_row', 'array_col']],
    )

    #keep uns (images, scalefactors) and var annotations from the original object
    comb_adata = ad.concat([adata, missing_adata], uns_merge='first', merge='first')

    #obsm['spatial'] is rebuilt by hand: existing coordinates first, then columns 5 and 4
    #of the positions file for the appended spots, matching the row order of the concat above
    comb_adata.obsm['spatial'] = np.concatenate(
        (adata.obsm['spatial'], missing_barcodes[[5, 4]].values)
    )
    adata = comb_adata

In [None]:
#the first key of uns['spatial'] is taken as the library id
spatial_keys = list(adata.uns['spatial'])
library_id = spatial_keys[0]

In [None]:
#REQUIRED flag that marks this as single section data rather than integrated data
adata.uns['spatial'].update({'is_single': True})

In [None]:
#PREFERRED to include fullres image
#this is a SpaceRanger input, not an output
fullres_path = 'A2.jpg'

#some of the fullres images require expanding the limit
Image.MAX_IMAGE_PIXELS = 699408640

#.ome.tif examples - https://www.heartcellatlas.org/
if fullres_path.endswith('.ome.tif'):
    reader = OMETIFFReader(fpath=fullres_path)
    fullres_np, metadata, xml_metadata = reader.read()

    #may need to transpose the image if its an invalid shape
    fullres_np = np.transpose(fullres_np, (1,2,0))

    #optional to store image metadata in the dataset
    adata.uns['fullres_xml_metadata'] = xml_metadata
elif fullres_path.split('.')[-1] in ['tif','tiff','jpg']:
    #close the file handle once the pixel data has been copied into the array
    with Image.open(fullres_path) as fullres_img:
        fullres_np = np.asarray(fullres_img)
else:
    #fail here with a clear message instead of a NameError on fullres_np further down
    raise ValueError(
        f"Unsupported fullres image extension for '{fullres_path}'; "
        "expected .ome.tif, .tif, .tiff or .jpg"
    )

#may need to rotate the image to align with embeddings
#k is the number of times to rotate the image 90 degrees counter-clockwise
fullres_np  = np.rot90(fullres_np, k=3)

#the image is stored once, after any transpose/rotation, for every input format
adata.uns['spatial'][library_id]['images']['fullres'] = fullres_np

In [None]:
#the library id doubles as the dataset title
adata.uns.update({'title': library_id})

In [None]:
#shorthand for this library's entry in uns.spatial (same dict object, not a copy)
library_spatial = adata.uns['spatial'][library_id]

#OPTIONAL to keep the metadata under a separate uns key before deleting it
adata.uns['spatial_metadata'] = library_spatial['metadata']

#MUST NOT SUBMIT metadata, the lowres image, tissue_lowres_scalef
#or fiducial_diameter_fullres in uns.spatial
del library_spatial['metadata']
del library_spatial['images']['lowres']
for scalefactor_key in ('tissue_lowres_scalef', 'fiducial_diameter_fullres'):
    del library_spatial['scalefactors'][scalefactor_key]

In [None]:
#REQUIRED obs metadata - these values will be the same for all Visium Datasets
visium_obs_values = {
    'suspension_type': 'na',
    'assay_ontology_term_id': 'EFO:0010961',
}
for obs_column, obs_value in visium_obs_values.items():
    adata.obs[obs_column] = obs_value

In [None]:
#REQUIRED obs metadata - most likely the same value for all obs
#update each value based on the given donor/sample
sample_obs_values = {
    'donor_id': 'donor_A',
    'organism_ontology_term_id': 'NCBITaxon:9606', #NCBITaxon:9606 for human, NCBITaxon:10090 for mouse
    'sex_ontology_term_id': 'PATO:0000383', #PATO:0000383 for female, PATO:0000384 for male
    'development_stage_ontology_term_id': 'HsapDv:0000137', #HsapDv or MmusDv term
    'self_reported_ethnicity_ontology_term_id': 'HANCESTRO:0022', #HANCESTRO term, 'na' for mouse
    'disease_ontology_term_id': 'PATO:0000461', #PATO:0000461 for normal, MONDO term for disease
    'tissue_type': 'tissue', #tissue, organoid
    'tissue_ontology_term_id': 'UBERON:0001115', #UBERON term
}

#every obs row gets the same value for each column
for obs_column, obs_value in sample_obs_values.items():
    adata.obs[obs_column] = obs_value

In [None]:
#REQUIRED obs.cell_type_ontology_term_id
#cell labels are not included in the SpaceRanger output and thus must be prepared separately
#this example reads in a csv with barcodes in the first column and CL terms in the second column
cell_types = pd.read_csv('cell_annotations.csv', names=['barcode','cell_type_ontology_term_id'])

#only barcodes present in this dataset are relevant; each must be annotated at most once
cell_types = cell_types[cell_types['barcode'].isin(adata.obs.index)]
if cell_types['barcode'].duplicated().any():
    raise ValueError('cell_annotations.csv contains more than one row for the same barcode')

#look labels up by barcode instead of merging, so obs keeps its own index, no extra
#'barcode' column is added to obs, and the cell can be re-run without creating _x/_y columns
term_by_barcode = cell_types.set_index('barcode')['cell_type_ontology_term_id']

#background spots will likely not have labels, so annotate those as 'unknown'
adata.obs['cell_type_ontology_term_id'] = (
    term_by_barcode.reindex(adata.obs.index).fillna('unknown').to_numpy()
)

In [None]:
#arguments shared by both QA plots
scatter_kwargs = {'library_id': library_id, 'color': 'in_tissue'}

#QA by plotting the in_tissue flag with the hires image
sq.pl.spatial_scatter(adata, **scatter_kwargs)

#same plot with fullres image, if present
stored_images = adata.uns['spatial'][library_id]['images']
if 'fullres' in stored_images:
    sq.pl.spatial_scatter(
        adata,
        img_res_key='fullres',
        scale_factor=1.0,
        **scatter_kwargs,
    )

In [None]:
#write the curated object as a gzip-compressed h5ad named after the library id
curated_filename = library_id + '_curated.h5ad'
adata.write(filename=curated_filename, compression='gzip')