In [29]:
# Input locations of the raw benchmark datasets
seqfish_dir = '../data/seqFISH'
visium_dir = '../data/Visium'

# Output locations: intermediate H5AD files for Tangram and the final result tables
data_dir = '../data/methods/Tangram'
results_dir = '../results'

# H5AD files written by the preprocessing cells and read back by the benchmark:
# one single-cell ("sc") and one spatial ("st") file per dataset.
seqfish_sc_file, seqfish_st_file = (
    f'{data_dir}/seqFISH_{modality}.h5ad' for modality in ('sc', 'st')
)
visium_sc_file, visium_st_file = (
    f'{data_dir}/visium_{modality}.h5ad' for modality in ('sc', 'st')
)

## Process seqFISH+ data

In [30]:
import os
import anndata
import pandas as pd

# Load the seqFISH+ CSV files into pandas DataFrames
sc_counts = pd.read_csv(f'{seqfish_dir}/sc_counts.csv')
sc_labels = pd.read_csv(f'{seqfish_dir}/sc_labels.csv')
st_counts = pd.read_csv(f'{seqfish_dir}/st_counts.csv')
st_coords = pd.read_csv(f'{seqfish_dir}/st_coords.csv')

print(f"Single-cell counts: {sc_counts.shape}")
print(f"Single-cell labels: {sc_labels.shape}")
print(f"Spatial counts: {st_counts.shape}")
print(f"Spatial coordinates: {st_coords.shape}")
print(st_counts.dtypes)

# Move 'cell_id' into the row index so that only gene columns remain in the table
sc_counts = sc_counts.set_index('cell_id')

# Map cell types to 1-based numeric cluster ids.
# The labels are sorted first: iterating a bare set of strings depends on hash
# order, which changes between kernel sessions and would make the cluster
# numbering irreproducible. key=str keeps the sort total even if a label is
# not a string (e.g. a missing value).
cell_type = sorted(set(sc_labels.cell_type), key=str)
cell_type_dict = dict(enumerate(cell_type, start=1))
meta_cell_dict = {str(cluster_id): name for cluster_id, name in cell_type_dict.items()}

# Assign the numeric cluster of each cell via a dict lookup instead of a
# linear list.index() scan per row
cluster_of = {name: cluster_id for cluster_id, name in cell_type_dict.items()}
sc_labels['cluster'] = [cluster_of[name] for name in sc_labels.cell_type]

# Create the output directory (including missing parents); no error if it already exists
os.makedirs(data_dir, exist_ok=True)

# Prepare and save spatial transcriptomics data as an AnnData object.
# The first column of st_counts is skipped: it is not a gene column.
# NOTE(review): rows of st_coords are assumed to be in the same order as the
# rows of st_counts - nothing here checks it; confirm against the data.
st_expr = st_counts.iloc[:, 1:]
obs = pd.DataFrame({
    'x': st_coords.x,
    'y': st_coords.y,
    'index': st_coords.bin_id.astype(str)
})
var_names = st_expr.columns
var = pd.DataFrame(index=var_names)

# AnnData expects string-valued obs/var names
obs.index = obs.index.astype(str)
var.index = var.index.astype(str)

X = st_expr.values
st_anndata = anndata.AnnData(X, obs=obs, var=var, dtype='int32')
st_anndata.write(seqfish_st_file)

# Prepare and save single-cell transcriptomics data as an AnnData object.
# NOTE(review): rows of sc_labels are assumed to be in the same order as the
# rows of sc_counts - nothing here checks it; confirm against the data.
obs = pd.DataFrame({
    'cell_type': sc_labels.cell_type,
    'cluster': sc_labels.cluster,
    'index': sc_labels.cell_id.astype(str)
})
var_names = sc_counts.columns
var = pd.DataFrame(index=var_names)

obs.index = obs.index.astype(str)
var.index = var.index.astype(str)

X = sc_counts.values
sc_adata = anndata.AnnData(X, obs=obs, var=var, dtype='int32')
sc_adata.write(seqfish_sc_file)

Single-cell counts: (576, 10001)
Single-cell labels: (576, 2)
Spatial counts: (94, 10001)
Spatial coordinates: (94, 3)
bin_id           int64
1700022a21rik    int64
1700025g04rik    int64
4933401b06rik    int64
5830417i10rik    int64
                 ...  
pde6b            int64
zp1              int64
dlx4             int64
opn1sw           int64
pramef12         int64
Length: 10001, dtype: object


## Process Visium data

In [31]:
# Load the Visium CSV files into pandas DataFrames
sc_counts = pd.read_csv(f'{visium_dir}/filtered_sc_counts.csv', index_col=0)
sc_labels = pd.read_csv(f'{visium_dir}/filtered_sc_annotations.csv')
st_counts = pd.read_csv(f'{visium_dir}/filtered_st_counts.csv', index_col=0)
st_coords = pd.read_csv(f'{visium_dir}/filtered_st_locations.csv')

# Transpose both count matrices so that their columns can serve as the
# variable (gene) names of the AnnData objects built below
sc_counts = sc_counts.T
st_counts = st_counts.T

# Keep only the id and annotation columns of sc_labels and give them the
# column names used by the rest of the notebook
sc_labels = sc_labels[['Cell.ID', 'annotation_benchmark']].rename(columns={
    'Cell.ID': 'cell_id',
    'annotation_benchmark': 'cell_type'
})

print(f"Single-cell counts: {sc_counts.shape}")
print(f"Single-cell labels: {sc_labels.shape}")
print(f"Spatial counts: {st_counts.shape}")
print(f"Spatial coordinates: {st_coords.shape}")

# Map cell types to 1-based numeric cluster ids.
# The labels are sorted first: iterating a bare set of strings depends on hash
# order, which changes between kernel sessions and would make the cluster
# numbering irreproducible. key=str keeps the sort total even if a label is
# not a string (e.g. a missing value).
cell_type = sorted(set(sc_labels.cell_type), key=str)
cell_type_dict = dict(enumerate(cell_type, start=1))
meta_cell_dict = {str(cluster_id): name for cluster_id, name in cell_type_dict.items()}

# Assign the numeric cluster of each cell via a dict lookup instead of a
# linear list.index() scan per row
cluster_of = {name: cluster_id for cluster_id, name in cell_type_dict.items()}
sc_labels['cluster'] = [cluster_of[name] for name in sc_labels.cell_type]

# Create the output directory (including missing parents); no error if it already exists
os.makedirs(data_dir, exist_ok=True)

# Prepare and save spatial transcriptomics data as an AnnData object.
# NOTE(review): spots are named by their row position in st_coords, not by
# the spot ids in st_counts.index, and the two tables are assumed to share the
# same row order - confirm this is intended.
obs = pd.DataFrame({
    'x': st_coords.x,
    'y': st_coords.y,
    'index': st_coords.index.astype(str)
}).set_index('index')
var_names = st_counts.columns
var = pd.DataFrame(index=var_names)

# AnnData expects string-valued obs/var names
obs.index = obs.index.astype(str)
var.index = var.index.astype(str)

X = st_counts.values
st_anndata = anndata.AnnData(X, obs=obs, var=var, dtype='int32')
st_anndata.write(visium_st_file)

# Prepare and save single-cell transcriptomics data as an AnnData object.
# NOTE(review): rows of sc_labels are assumed to be in the same order as the
# rows of the transposed sc_counts - nothing here checks it; confirm.
obs = pd.DataFrame({
    'cell_type': sc_labels.cell_type,
    'cluster': sc_labels.cluster,
    'index': sc_labels.cell_id.astype(str)
})
var_names = sc_counts.columns
var = pd.DataFrame(index=var_names)

obs.index = obs.index.astype(str)
var.index = var.index.astype(str)

X = sc_counts.values
sc_adata = anndata.AnnData(X, obs=obs, var=var, dtype='int32')
sc_adata.write(visium_sc_file)

Single-cell counts: (8147, 31053)
Single-cell labels: (8147, 2)
Spatial counts: (2987, 31053)
Spatial coordinates: (2987, 3)


## Benchmark Tangram

In [32]:
import pandas as pd
import numpy as np
import scanpy as sc
import tangram as tg
import time


def get_results(sc_file, st_file, db_name, cell_type_key='cell_type', n_markers=200):
    """Run the Tangram cluster-mode benchmark for one dataset and save the result.

    The single-cell data are normalized, marker genes are ranked per cell type,
    the markers shared with the spatial data are used as training genes, and
    the per-spot cell-type predictions are row-normalized and written to
    ``{results_dir}/{db_name}_Tangram.csv``. Progress and the elapsed time are
    printed.

    Parameters
    ----------
    sc_file : str
        Path of the single-cell H5AD file.
    st_file : str
        Path of the spatial H5AD file.
    db_name : str
        Dataset name used as prefix of the output CSV file.
    cell_type_key : str, default 'cell_type'
        Column of ``ad_sc.obs`` holding the cell-type labels.
    n_markers : int, default 200
        Number of top-ranked marker genes kept per cell type.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``n_markers`` is smaller than 1.
    """
    # Local import so the function does not rely on another cell having imported os
    import os

    if n_markers < 1:
        raise ValueError(f"n_markers must be >= 1, got {n_markers}")

    # Start a timer to measure the execution time of the benchmark
    start = time.time()

    # Make sure the output directory exists *before* the expensive mapping, so
    # that a missing directory cannot throw away a finished run at save time
    os.makedirs(results_dir, exist_ok=True)

    # Load single-cell and spatial transcriptomics data from H5AD files
    ad_sc = sc.read_h5ad(sc_file)
    ad_st = sc.read_h5ad(st_file)

    # Normalize the total count to make data comparable
    sc.pp.normalize_total(ad_sc)

    # Count how many samples exist for each cell type and drop types with less than 2 samples
    cell_type_counts = ad_sc.obs[cell_type_key].value_counts()
    cell_type_drop = cell_type_counts.index[cell_type_counts < 2]

    if cell_type_drop.size > 0:
        print(f"Drop cell type(s) {', '.join(list(cell_type_drop))}, as they have less than 2 samples.")
        ad_sc = ad_sc[~ad_sc.obs[cell_type_key].isin(cell_type_drop),].copy()

    # Rank genes within groups defined by cell type and keep the top markers of each group
    sc.tl.rank_genes_groups(ad_sc, groupby=cell_type_key, use_raw=False)
    markers_df = pd.DataFrame(ad_sc.uns["rank_genes_groups"]["names"]).iloc[0:n_markers, :]
    print("Top 10 markers for each cell type:")
    print(markers_df.head(10))

    # Collect the unique marker genes over all cell types
    genes_sc = np.unique(markers_df.melt().value.values)

    # Retrieve gene names from spatial data
    genes_st = ad_st.var_names.values

    # Training genes: markers also measured in the spatial data. Sorted so the
    # list passed on does not depend on set iteration order.
    genes = sorted(set(genes_sc).intersection(genes_st))

    # Preprocess the AnnData objects to filter and align them by the intersecting gene list
    tg.pp_adatas(ad_sc, ad_st, genes=genes)

    # Map single-cell data onto spatial locations using the Tangram algorithm
    ad_map = tg.map_cells_to_space(
        ad_sc,
        ad_st,
        mode='clusters',
        cluster_label=cell_type_key)

    # Project annotations from single-cell data to spatial data
    tg.project_cell_annotations(ad_map, ad_st, annotation=cell_type_key)

    # Normalize the predictions so that each row sums to 1
    cell_type_density = ad_st.obsm['tangram_ct_pred']
    cell_type_density = cell_type_density.div(cell_type_density.sum(axis=1), axis=0)

    # Save the cell-type density data to a CSV file
    cell_type_density.to_csv(f'{results_dir}/{db_name}_Tangram.csv')

    # Calculate and print the total execution time
    end = time.time()
    print(f"Execution time: {end - start:.2f} seconds")
    
# Run the Tangram benchmark on each dataset in turn, printing a heading first.
# Each entry: (heading, single-cell H5AD file, spatial H5AD file, dataset name).
benchmark_runs = (
    ("Results for seqFISH data:", seqfish_sc_file, seqfish_st_file, 'seqFISH'),
    ("\nResults for Visium data:", visium_sc_file, visium_st_file, 'Visium'),
)
for heading, sc_path, st_path, dataset in benchmark_runs:
    print(heading)
    get_results(sc_file=sc_path, st_file=st_path, db_name=dataset)

Results for seqFISH data:


INFO:root:1118 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:10000 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 1118 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Top 10 markers for each cell type:
  astrocyte    endo  eneuron ineuron  microglia      olig
0      gja1    pltp  slc17a7  slc32a1     csf1r    cldn11
1    atp1b2  col4a2   clstn1   pcp4l1     itgb5       mag
2     fgfr3     eng   lingo1   diras1       grn       cnp
3     htra1  slc7a5     ngef    cend1    laptm5      gjc2
4   gpr37l1   epas1    icam5    rpp25    cx3cr1   gal3st1
5   fam107a    nid1    car10   eef1a2    selplg      fa2h
6  slc9a3r1  apcdd1     gnaz    rcan2      ctsd       gsn
7    acsbg1   cldn5    prrt3    kcnc1     itgam       mog
8     ttyh1   igf1r    stx1a   slc6a1      bin2  arhgef10
9   slc27a1   ltbp4    mmp17  zfp385a    coro1a     abca2
Score: 0.610, KL reg: 0.464
Score: 0.894, KL reg: 0.000
Score: 0.896, KL reg: 0.000
Score: 0.896, KL reg: 0.000
Score: 0.896, KL reg: 0.000
Score: 0.897, KL reg: 0.000
Score: 0.897, KL reg: 0.000
Score: 0.897, KL reg: 0.000
Score: 0.897, KL reg: 0.000
Score: 0.897, KL reg: 0.000


INFO:root:Saving results..
INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.


Execution time: 3.75 seconds

Results for Visium data:
Top 10 markers for each cell type:
  Astro_AMY Astro_AMY_CTX Astro_CTX Astro_HPC Astro_HYPO Astro_STR  \
0       Ntm          Gpc5      Gpc5     Nrxn1        Ntm      Gpc5   
1    Slc1a2         Nrxn1     Lsamp      Gpc5      Nrxn1       Ezr   
2     Wdr17        Slc1a2       Ntm       Ntm      Npas3     Pipox   
3      Msi2         Lsamp    Slc1a2    Slc1a2       Apoe      Lgi4   
4      Gpc5           Ntm     Nrxn1     Wdr17       Gpc5      Sox9   
5     Nrxn1         Wdr17      Msi2      Msi2     Slc4a4     Tex12   
6     Npas3          Msi2     Wdr17       Cpe       Msi2    Cyp2u1   
7    Atp1a2        Tspan7    Gm3764     Lsamp      Trpm3      Pld2   
8      Apoe        Slc1a3     Luzp2    Slc1a3      Wdr17   Slc41a1   
9    Slc1a3         Npas3    Slc1a3    Gabrb1     Gm3764      Gldc   

  Astro_THAL_hab Astro_THAL_lat Astro_THAL_med Astro_WM  ... Inh_Meis2  \
0         Slc4a4          Trpm3          Trpm3     Cst3  ...     

INFO:root:2682 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:21058 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 2682 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.440, KL reg: 0.196
Score: 0.605, KL reg: 0.000
Score: 0.606, KL reg: 0.000
Score: 0.607, KL reg: 0.000
Score: 0.607, KL reg: 0.000
Score: 0.607, KL reg: 0.000
Score: 0.607, KL reg: 0.000
Score: 0.607, KL reg: 0.000
Score: 0.607, KL reg: 0.000
Score: 0.607, KL reg: 0.000


INFO:root:Saving results..
INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.


Execution time: 177.28 seconds
