In [None]:
import anndata as ad
import json
import numpy as np
import os
import pandas as pd
import re
import requests
import scanpy as sc
import subprocess
from PIL import Image
from random import randint
from scipy import sparse
from urllib.parse import quote

Specify the folder that contains the following files:
- raw_feature_bc_matrix.h5
- spatial/
  - scalefactors_json.json
  - tissue_hires_image.png
  - tissue_lowres_image.png*
  - tissue_positions_list.csv

*The lowres image is required by `read_visium()`, but it is removed from the AnnData object later in this notebook.

In [None]:
# Visium output folder for this sample (must contain the files listed above)
folder = 'WS_PLA_S9101764'

# REQUIRED: read raw_feature_bc_matrix.h5 so that empty spots are included
adata = sc.read_visium(path=folder, count_file='raw_feature_bc_matrix.h5')

In [None]:
# Pad the object with all-zero rows for every barcode that is listed in the
# positions file but absent from the count matrix.
# NOTE(review): 4992 is presumably the total spot count of the slide - confirm.
if adata.n_obs < 4992:
    all_barcodes = pd.read_csv(f'{folder}/spatial/tissue_positions_list.csv', header=None)

    # Column 0 holds the barcode; columns 1-3 are renamed to the obs fields used below
    missing_barcodes = (
        all_barcodes[~all_barcodes[0].isin(adata.obs_names)]
        .set_index(0)
        .rename(columns={1: 'in_tissue', 2: 'array_row', 3: 'array_col'})
    )

    # One empty sparse row per missing barcode, sharing the existing var table
    empty_matrix = sparse.csr_matrix((len(missing_barcodes), adata.n_vars))
    missing_adata = ad.AnnData(
        X=empty_matrix,
        obs=missing_barcodes[['in_tissue', 'array_row', 'array_col']],
        var=adata.var,
    )

    comb_adata = ad.concat([adata, missing_adata], merge='first', uns_merge='first')

    # Coordinates for the padded rows come from positions-file columns 5 and 4 (in that order),
    # appended after the coordinates of the spots that were already present
    comb_adata.obsm['spatial'] = np.vstack(
        [adata.obsm['spatial'], missing_barcodes[[5, 4]].to_numpy()]
    )
    adata = comb_adata

In [None]:
# The first key stored under uns['spatial'] is used as the library identifier
library_id = list(adata.uns['spatial'])[0]

In [None]:
# Flag stored alongside the library entry inside uns['spatial']
adata.uns['spatial'].update(is_single=True)

In [None]:
#PREFERRED to include fullres image
fullres_path = 'WS_PLA_S9101764.tif'

# Image.open() is lazy and keeps the file open; the context manager closes the
# handle once the pixel data has been copied into the array.
with Image.open(fullres_path) as fullres_image:
    adata.uns['spatial'][library_id]['images']['fullres'] = np.asarray(fullres_image)

In [None]:
# The dataset title is set to the library identifier
adata.uns.update(title=library_id)

In [None]:
#OPTIONAL: the metadata is kept, but under its own top-level uns key
#MUST NOT SUBMIT metadata or lowres or fiducial_diameter_fullres,
#so each of them is removed from the library's spatial entry
adata.uns['spatial_metadata'] = adata.uns['spatial'][library_id].pop('metadata')
adata.uns['spatial'][library_id]['images'].pop('lowres')

# The lowres scale factor is removed together with the lowres image
for scalefactor_key in ('tissue_lowres_scalef', 'fiducial_diameter_fullres'):
    adata.uns['spatial'][library_id]['scalefactors'].pop(scalefactor_key)

In [None]:
#consistent for all Visium Datasets
for obs_column, obs_value in {
    'suspension_type': 'na',
    'assay_ontology_term_id': 'EFO:0010961',
}.items():
    # Scalar assignment fills every row of obs with the same value
    adata.obs[obs_column] = obs_value

In [None]:
#consistent metadata for all obs
for obs_column, obs_value in {
    'is_primary_data': True,
    'donor_id': 'donor_A',
    'organism_ontology_term_id': 'NCBITaxon:9606',
    'sex_ontology_term_id': 'PATO:0000383',
    'development_stage_ontology_term_id': 'HsapDv:0000137',
    'self_reported_ethnicity_ontology_term_id': 'HANCESTRO:0022',
    'tissue_type': 'tissue',
}.items():
    # Scalar assignment fills every row of obs with the same value
    adata.obs[obs_column] = obs_value

In [None]:
#usually consistent metadata for all obs
for obs_column, obs_value in {
    'tissue_ontology_term_id': 'UBERON:0001115',
    'disease_ontology_term_id': 'PATO:0000461',
}.items():
    # Scalar assignment fills every row of obs with the same value
    adata.obs[obs_column] = obs_value

In [None]:
#map in cell_types
cell_types = pd.read_csv('cell_annotations.csv', names=['barcode','cell_type_ontology_term_id'])
adata.obs = adata.obs.merge(cell_types, left_index=True, right_on='barcode', how='left').set_index(adata.obs.index)

adata.obs.fillna({'cell_type_ontology_term_id': 'unknown'}, inplace=True)

In [None]:
# Sanity check: number of obs rows per cell type term (including 'unknown')
adata.obs.loc[:, 'cell_type_ontology_term_id'].value_counts()

In [None]:
# Reference inputs: the approved feature IDs and the gene-ID map from gene_map_v44.json
approved = pd.read_csv('ref_files/genes_approved.csv', dtype='str')
# Context manager so the JSON file handle is closed after loading
with open('../gene_ID_mapping/gene_map_v44.json') as gene_map_file:
    v44_gene_map = json.load(gene_map_file)

# Move the current var index into a column (dropped below as 'index') so that
# gene_ids can become the index
adata.var.reset_index(inplace=True)

# `x in Series` tests the Series *index*, not its values, so membership has to be
# checked against the actual IDs. Keep a mapping only when its source ID is present
# and its target ID is not, and never hand the same target to two features, so the
# replacement cannot introduce duplicate gene_ids.
present_gene_ids = set(adata.var['gene_ids'])
claimed_gene_ids = set()
my_gene_map = {}
for k, v in v44_gene_map.items():
    if k in present_gene_ids and v not in present_gene_ids and v not in claimed_gene_ids:
        my_gene_map[k] = v
        claimed_gene_ids.add(v)
adata.var.replace({'gene_ids': my_gene_map}, inplace=True)

# Keep only the features whose (possibly remapped) ID is on the approved list.
# .copy() turns the subset into a real AnnData instead of a view before var is modified.
var_to_keep = adata.var[adata.var['gene_ids'].isin(approved['feature_id'])].index
adata = adata[:, var_to_keep].copy()
adata.var.set_index('gene_ids', inplace=True)
adata.var.drop(columns=['feature_types','index'], inplace=True)

adata.var['feature_is_filtered'] = False

In [None]:
# Spatial plot coloured by the in_tissue flag, without specifying an image key
sc.pl.spatial(adata, color='in_tissue', library_id=library_id)

In [None]:
# Same plot drawn on the 'fullres' image, with the scale factor set to 1
sc.pl.spatial(
    adata,
    color='in_tissue',
    library_id=library_id,
    img_key='fullres',
    scale_factor=1,
)

In [None]:
# Output file is named after the library and written with gzip compression
new_one = f'{library_id}_revised.h5ad'
adata.write(new_one, compression='gzip')

In [None]:
# Run the cellxgene-schema validator on the written file, capturing both output streams
validate_process = subprocess.run(
    ['cellxgene-schema', 'validate', new_one],
    capture_output=True,
)

# Echo stdout first, then stderr, one line at a time
for captured_stream in (validate_process.stdout, validate_process.stderr):
    for line in captured_stream.decode('utf-8').split('\n'):
        print(line)