**This notebook provides a sample curation workflow of a Visium dataset towards CELLxGENE standards starting with Space Ranger outputs**\
*Outline of tasks included*\
...\
...

The example is from [He et al 2022](https://doi.org/10.1016/j.cell.2022.11.005)\
The Space Ranger outputs `6332STDY10289523.tar.gz` can be downloaded from [E-MTAB-11265](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11265)\
The fullres image `V10S24-031_D1.jpg` can also be downloaded from [E-MTAB-11265](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11265)\
`6332STDY10289523.220627.h5ad` with normalized data and cell2location proportions can be downloaded from the pcw19 Dataset at the [Fetal Lung portal](https://fetal-lung.cellgeni.sanger.ac.uk/visium.html)

In [None]:
import anndata as ad
import numpy as np
import os
import pandas as pd
import scanpy as sc
import squidpy as sq
from PIL import Image
from pyometiff import OMETIFFReader
from scipy import sparse

Specify the Space Ranger output folder. It must include at least these files...
- raw_feature_bc_matrix.h5
- spatial/
  - scalefactors_json.json
  - tissue_hires_image.png
  - tissue_lowres_image.png*
  - tissue_positions_list.csv / tissue_positions.csv

*the lowres image will be deleted but is required for `read_visium()`

In [None]:
#path to the Space Ranger output folder; the later cells read raw_feature_bc_matrix.h5 and spatial/ from here
sr_outs = '6332STDY10289523/outs'

squidpy consumes tissue_positions_list.csv, but Space Ranger v2.0 and later write tissue_positions.csv instead, which also adds a header row

In [None]:
#derive the header-less tissue_positions_list.csv from tissue_positions.csv when only the latter exists
legacy_positions_path = sr_outs + '/spatial/tissue_positions_list.csv'
if not os.path.exists(legacy_positions_path):
    positions = pd.read_csv(sr_outs + '/spatial/tissue_positions.csv')
    #drop the header row and the pandas index so the output matches the legacy layout
    positions.to_csv(legacy_positions_path, header=False, index=False)

**REQUIRED** to include background spots, so specify `raw_feature_bc_matrix.h5`

In [None]:
#load the unfiltered matrix so that background (off-tissue) spots are kept
adata = sq.read.visium(
    sr_outs,
    counts_file='raw_feature_bc_matrix.h5',
)

In [None]:
#index var by the gene_ids column instead of the default var index
adata.var = adata.var.set_index('gene_ids')

Space Ranger will not output spots that have zero reads mapped\
Fill those in with 0s, if needed

In [None]:
#tissue_positions_list.csv is treated as the complete list of spot barcodes for the slide,
#so a barcode listed there but absent from adata is a spot that needs to be filled in with 0s
#(comparing against the file rather than a fixed count of 4992 also covers slides with a different number of spots)
all_barcodes = pd.read_csv(sr_outs + '/spatial/tissue_positions_list.csv', header=None)
#.copy() so the subset can be reshaped without pandas warning about modifying a slice
missing_barcodes = all_barcodes[~all_barcodes[0].isin(adata.obs.index)].copy()

if not missing_barcodes.empty:
    #positional columns of the header-less file: 0 is the barcode, 1-3 become the obs fields,
    #4 and 5 hold the pixel coordinates appended to obsm['spatial'] below
    missing_barcodes = missing_barcodes.set_index(0).rename(columns={1: 'in_tissue', 2:'array_row', 3:'array_col'})

    #all-zero float32 count rows for the missing spots, using the same var as adata
    empty_matrix = sparse.csr_matrix((missing_barcodes.shape[0], adata.var.shape[0]), dtype=np.float32)
    missing_adata = ad.AnnData(empty_matrix, var=adata.var, obs=missing_barcodes[['in_tissue','array_row','array_col']])

    #existing obs first, missing obs appended; uns and var annotations are taken from the first object
    comb_adata = ad.concat([adata, missing_adata], uns_merge='first', merge='first')

    #rebuild the spatial coordinates in the same obs order as the concat above
    comb_adata.obsm['spatial'] = np.concatenate((adata.obsm['spatial'],missing_barcodes[[5,4]].values))
    adata = comb_adata

Occasionally, in_tissue:1 observations have all 0s, indicating that they are not truly in tissue\
These can be updated to in_tissue:0...

In [None]:
#find obs flagged as in tissue whose counts sum to zero and revise them to in_tissue:0
gene_sum_df = adata.obs.copy()
#row sums are taken on X directly; this avoids building a dense copy of the whole matrix
#np.asarray(...).ravel() flattens the (n_obs, 1) result a sparse matrix returns into a 1-D array
gene_sum_df['total_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
in_tissue_zeroes = gene_sum_df[(gene_sum_df['total_counts'] == 0) & (gene_sum_df['in_tissue'] != 0)]
if not in_tissue_zeroes.empty:
    adata.obs.loc[in_tissue_zeroes.index, 'in_tissue'] = 0
    print(in_tissue_zeroes.shape[0],'obs revised to in_tissue:0')

In [None]:
#the library id is the first key under uns['spatial']
spatial_keys = list(adata.uns['spatial'])
library_id = spatial_keys[0]

In [None]:
#use the library id as the dataset title
adata.uns['title'] = library_id

In [None]:
#REQUIRED to distinguish single section data from integrated data
#the flag is stored directly under uns['spatial'], next to the library_id entry
adata.uns['spatial']['is_single'] = True

PREFERRED to include fullres image\
This is a Space Ranger input, not an output

In [None]:
fullres_path = 'V10S24-031_D1.jpg'

#some of the fullres images require expanding the limit
Image.MAX_IMAGE_PIXELS = 699408640

#.ome.tif must be tested before the generic extensions: its last extension is also 'tif',
#so checking 'tif' first would send OME-TIFF files down the plain-image path
#.ome.tif examples - https://www.heartcellatlas.org/
if fullres_path.endswith('.ome.tif'):
    reader = OMETIFFReader(fpath=fullres_path)
    fullres_np, metadata, xml_metadata = reader.read()

    #may need to transpose the image if its an invalid shape
    fullres_np = np.transpose(fullres_np, (1,2,0))

    #optional to store image metadata in the dataset
    adata.uns['fullres_xml_metadata'] = xml_metadata

elif fullres_path.split('.')[-1] in ['tif','tiff','jpg']:
    fullres_np = np.asarray(Image.open(fullres_path))

else:
    #fail here with a clear message rather than with a NameError on fullres_np further down
    raise ValueError('Unsupported fullres image format: ' + fullres_path)

#may need to rotate the image to align with embeddings
#k is the number of times to rotate the image 90 degrees counter-clockwise
#fullres_np  = np.rot90(fullres_np, k=3)

#single place where the image is stored, whichever reader produced it
adata.uns['spatial'][library_id]['images']['fullres'] = fullres_np

In [None]:
library_uns = adata.uns['spatial'][library_id]

#OPTIONAL to keep the metadata: it is moved to a top-level uns key as it is removed from uns.spatial
adata.uns['spatial_metadata'] = library_uns.pop('metadata')

#MUST NOT SUBMIT metadata or lowres or fiducial_diameter_fullres in uns.spatial
library_uns['images'].pop('lowres')
for scalefactor_key in ('tissue_lowres_scalef', 'fiducial_diameter_fullres'):
    library_uns['scalefactors'].pop(scalefactor_key)

REQUIRED obs metadata - will be the same for all Visium Datasets

In [None]:
#constant obs fields, written to every obs
visium_obs = {
    'suspension_type': 'na',
    'assay_ontology_term_id': 'EFO:0010961',
}
for field, value in visium_obs.items():
    adata.obs[field] = value

REQUIRED obs metadata - most likely the same value for all obs\
update based on the given donor/sample

In [None]:
#sample-level values, written to every obs; edit this mapping for the given donor/sample
sample_obs = {
    'donor_id': 'HDBR15773',
    'organism_ontology_term_id': 'NCBITaxon:9606', #NCBITaxon:9606 for human, NCBITaxon:10090 for mouse
    'sex_ontology_term_id': 'PATO:0000384', #PATO:0000383 for female, PATO:0000384 for male
    'development_stage_ontology_term_id': 'HsapDv:0000056', #HsapDv or MmusDv term
    'self_reported_ethnicity_ontology_term_id': 'HANCESTRO:0022', #HANCESTRO term, 'na' for mouse
    'disease_ontology_term_id': 'PATO:0000461', #PATO:0000461 for normal, MONDO term for disease
    'tissue_type': 'tissue', #tissue, organoid
    'tissue_ontology_term_id': 'UBERON:0002048', #UBERON term
}
for field, value in sample_obs.items():
    adata.obs[field] = value

cell annotation example\
...

In [None]:
#path to the h5ad that is read below for the annotations and the normalized data
final_mx = '6332STDY10289523.220627.h5ad'

In [None]:
#author labels grouped under the CL term they map to
labels_by_cl_term = {
    'CL:4028006': ['Adventitial fibro', 'Alveolar fibro'], #alveolar adventitial fibroblast
    'CL:0002062': ['AT1'], #pulmonary alveolar type 1 cell
    'CL:0002063': ['AT2'], #pulmonary alveolar type 2 cell
    'CL:0000138': ['ASPN+ chondrocyte', 'Interm chondrocyte'], #chondrocyte
    'CL:0000186': ['Myofibro 2'], #myofibroblast cell
    'CL:0000064': ['Ciliated', 'MUC16+ ciliated'], #ciliated cell
    'CL:0000192': ['Late airway SMC'], #smooth muscle cell
    'CL:0000359': ['Vascular SMC 2'], #vascular associated smooth muscle cell
    'CL:0011026': ['Late airway progenitor'], #progenitor cell
    'CL:0000057': ['Mid fibro'], #fibroblast
    'CL:0002573': ['Mid Schwann'], #Schwann cell
    'CL:0000151': ['Proximal secretory 2'], #secretory cell
    'CL:0000423': ['Late tip'], #tip cell
    'CL:0000158': ['Club'], #club cell
    'CL:0000540': ['KCNIP4+ neuron', 'SST+ neuron'], #neuron
    'CL:0002138': ['SCG3+ lymphatic endothelial'], #endothelial cell of lymphatic vessel
    'CL:4033044': ['Deuterosomal'], #deuterosomal cell
    'CL:0000646': ['Proximal basal', 'Late basal'], #basal cell
}

#flatten to the label -> CL term lookup used when mapping obs annotations
cl_map = {
    label: cl_term
    for cl_term, labels in labels_by_cl_term.items()
    for label in labels
}

In [None]:
final_adata = sc.read_h5ad(final_mx)
final_adata.var.set_index('gene_ids', inplace=True)

#prefix shared by the per-cell-type abundance columns in final_adata.obs
prefix = 'q05cell_abundance_w_sf_'
#obs column that will hold the label of the most abundant cell type per obs
max_field = 'annotation'

#update the obs index values to match the Space Ranger outputs (<barcode>-1), if needed
final_adata.obs.index = [i[0] for i in final_adata.obs.index.str.split('_')]

#merge over the final_adata obs to adata.obs
#columns already present in adata.obs are dropped first so the merge does not create _x/_y duplicates
final_adata.obs.drop(columns=[c for c in final_adata.obs.columns if c in adata.obs.columns], inplace=True)
adata.obs = adata.obs.merge(final_adata.obs, left_index=True, right_index=True, how='left').set_index(adata.obs.index)

#abundance columns, renamed to the bare label by removing the leading prefix
abundance = adata.obs[[c for c in final_adata.obs.columns if c.startswith(prefix)]]
abundance = abundance.rename(columns=lambda c: c[len(prefix):])

#obs that are absent from final_adata are all-NaN after the left merge
#idxmax is restricted to obs with at least one value because idxmax on all-NA rows is deprecated in recent pandas
#the assignment aligns on the index, so the remaining obs are left as NaN
has_abundance = abundance.notna().any(axis='columns')
adata.obs[max_field] = abundance.loc[has_abundance].idxmax(axis='columns')

#labels without a cl_map entry, and obs without a label, end up as 'unknown'
adata.obs['cell_type_ontology_term_id'] = adata.obs[max_field].map(cl_map)
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].fillna('unknown')

OPTIONAL to add normalized data layer\
Fills in all zeroes for barcodes filtered out of the normalized data (like in_tissue:0), so this may not be appropriate depending on the normalization/scaling of final layer

In [None]:
#barcodes present in adata but absent from final_adata
barcodes_add = [e for e in adata.obs.index if e not in final_adata.obs.index]

#all-zero rows for those barcodes, with the same var as final_adata
empty_matrix = sparse.csr_matrix((len(barcodes_add), final_adata.var.shape[0]))
#NOTE(review): obs is passed as a plain list; the set_index(0) below assumes AnnData turns it into a one-column frame named 0 - confirm for the installed anndata version
missing_adata = ad.AnnData(empty_matrix, var=final_adata.var, obs=barcodes_add)
missing_adata.obs.set_index(0, inplace=True)
#append the zero-filled barcodes after the existing final_adata obs
final_adata = ad.concat([final_adata, missing_adata], join='outer')

#genes present in adata but absent from final_adata, appended after the final_adata genes
genes_add = [e for e in adata.var.index if e not in final_adata.var.index]
all_genes = final_adata.var.index.to_list()
all_genes.extend(genes_add)
new_var = pd.DataFrame(index=all_genes)

#reuse the CSR buffers of final_adata.X under the shape of adata, so the extra trailing columns hold no values
#NOTE(review): assumes final_adata.X is CSR and now has the same number of rows as adata - confirm
new_matrix = sparse.csr_matrix((final_adata.X.data, final_adata.X.indices, final_adata.X.indptr), shape = adata.shape)

new_adata = ad.AnnData(X=new_matrix, obs=final_adata.obs, var=new_var, obsm=final_adata.obsm)

#keep the current adata (X as it is at this point) as .raw before X is replaced below
adata.raw = adata

#reorder new_adata to the obs order, then the var order, of adata before copying X across
new_adata = new_adata[adata.obs.index.to_list(), :]
new_adata = new_adata[:, adata.var.index.to_list()]
adata.X = new_adata.X


#True for the genes that were absent from final_adata and added above
adata.var['feature_is_filtered'] = np.where(adata.var.index.isin(genes_add), True, False)

OPTIONAL to add non-spatial embeddings

In [None]:
#copy every obsm entry whose key does not contain 'spatial' from new_adata to adata
non_spatial_keys = [key for key in new_adata.obsm if 'spatial' not in key]
for key in non_spatial_keys:
    adata.obsm[key] = new_adata.obsm[key]

QA by plotting with the hires image and fullres image, if present

In [None]:
sc.set_figure_params(figsize=(8, 8))


def _plot_qa_pair(**image_kwargs):
    """Plot in_tissue and max_field for the library, passing image_kwargs to both plots.

    The colors entry that plotting max_field leaves in adata.uns is deleted afterwards.
    """
    sq.pl.spatial_scatter(
        adata,
        library_id=library_id,
        color='in_tissue',
        figsize=(12,4),
        **image_kwargs
    )
    sq.pl.spatial_scatter(
        adata,
        library_id=library_id,
        color=max_field,
        figsize=(12,4),
        legend_fontsize=10,
        **image_kwargs
    )
    del adata.uns[max_field + '_colors']


#first pass uses the default image settings
_plot_qa_pair()

#second pass only when a fullres image is stored; it is plotted at scale factor 1.0
if 'fullres' in adata.uns['spatial'][library_id]['images']:
    _plot_qa_pair(img_res_key='fullres', scale_factor=1.0)

Write to file

In [None]:
#output file is named after the library id
curated_path = library_id + '_curated.h5ad'
adata.write(filename=curated_path, compression='gzip')