# Data Preparation

- **Creator**: Sebastian Birk (<sebastian.birk@helmholtz-munich.de>)
- **Date of Creation:** 01.10.2022
- **Date of Last Modification:** 29.01.2025 (Sebastian Birk; <sebastian.birk@helmholtz-munich.de>)

- Run this notebook in the nichecompass-reproducibility environment, installable from ```('../../envs/environment.yaml')```.

## 1. Setup

### 1.1 Import Libraries

In [None]:
import gc
import os
import warnings
from copy import deepcopy

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp
import seaborn as sns
import squidpy as sq
import tiledb
import tiledbsoma
from sklearn.preprocessing import LabelEncoder

### 1.2 Run Notebook Setup

In [None]:
# Notebook-wide display settings: show every DataFrame column when printing
# and silence all warnings
pd.options.display.max_columns = None
warnings.filterwarnings(action="ignore")

### 1.3 Configure Paths and Create Directories

In [None]:
# Root folder of the spatially resolved transcriptomics data and its three
# sub-folders (bronze, silver and gold)
st_data_folder_path = "../../datasets/st_data"
st_data_bronze_folder_path = st_data_folder_path + "/bronze"
st_data_silver_folder_path = st_data_folder_path + "/silver"
st_data_gold_folder_path = st_data_folder_path + "/gold"

# Make sure all three sub-folders exist before anything is read or written
for tier_folder_path in (st_data_bronze_folder_path,
                         st_data_silver_folder_path,
                         st_data_gold_folder_path):
    os.makedirs(tier_folder_path, exist_ok=True)

## 2. Manuscript Data

This section contains all datasets used for the manuscript.

### 2.1 seqFISH Mouse Organogenesis

- **Publication:** Lohoff, T. et al. Integration of spatial and single-cell transcriptomic data elucidates mouse organogenesis. Nat. Biotechnol. 40, 74–85 (2022): https://www.nature.com/articles/s41587-021-01006-2.
- **Data Access:** https://marionilab.cruk.cam.ac.uk/SpatialMouseAtlas/
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze/seqfish_mouse_organogenesis  
         │  ├── metadata.Rds (metadata)
         │  ├── counts.Rds (raw counts)
         │  ├── exprs.Rds (log normalized counts)
         │  ├── imputed.h5 (imputed gene expression)
         │  ├── imputed_row_names.Rds (row names of imputed gene expression)
         │  ├── imputed_column_names.Rds (column names of imputed gene expression)
         ├── ../datasets/st_data/silver
         │  ├── seqfish_mouse_organogenesis.h5ad (preprocessed gene expression)
         │  ├── seqfish_mouse_organogenesis_imputed.h5ad (preprocessed imputed gene expression)
      ```
- **Summary:**
    - Sagittal tissue sections of three 8-12 somite stage mouse embryos
        - Two sections per embryo that can be combined horizontally to get three samples, one for each embryo
    - 19,451 observations (embryo 1), 14,891 observations (embryo 2) and 23,194 observations (embryo 3) on cell-level with cell-type annotations
    - 351 probed genes
    - Dataset is also available with imputed genes based on scRNA-seq

In [None]:
# seqFISH mouse organogenesis: obs column that holds the cell-type labels and
# the dataset identifier used in all file names
cell_type_key = "celltype_mapped_refined"
dataset = "seqfish_mouse_organogenesis"

#### 2.1.1 Load & Preprocess Raw Data

##### 2.1.1.1 Spatial Transcriptomics Data

Preprocessing is done with ```../scripts/data_preparation/seqfish_mouse_organogenesis_data_preparation.R``` and preprocessed version is stored under ```../datasets/st_data/silver/seqfish_mouse_organogenesis.h5ad```.

In [None]:
# Read preprocessed data
adata = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}.h5ad")

print(f"Step 1: Number of cells: {len(adata)}")
print(f"Step 1: Number of genes: {len(adata.var_names)}")

# Filter low quality cells. Copy the result so that all edits below are made
# on a standalone AnnData object rather than on a view of the unfiltered data
adata = adata[adata.obs[cell_type_key] != "Low quality"].copy()
print(f"Step 2: Number of cells: {len(adata)}")

# Format adata: move the x / y columns from obs into obsm["spatial"] and
# store the expression matrices as float32
adata.obsm["spatial"] = np.column_stack((adata.obs.x, adata.obs.y))
adata.obs.drop(["x", "y"], axis=1, inplace=True)
adata.X = adata.X.astype(np.float32)
adata.layers["counts"] = adata.layers["counts"].astype(np.float32)

# Add sample & batch columns. Both are derived from the observation names:
# the sample is the embryo contained in the name, the batch is the embryo
# prefix combined with the z2 / z5 suffix
embryos = ["embryo1", "embryo2", "embryo3"]
sections = ["z2", "z5"]
obs_names = adata.obs.index
for embryo in embryos:
    adata.obs.loc[obs_names.str.contains(embryo), "sample"] = embryo
    for section in sections:
        in_batch = (obs_names.str.startswith(embryo) &
                    obs_names.str.endswith(section))
        adata.obs.loc[in_batch, "batch"] = f"{embryo}_{section}"

# Write adata to disk separated by batches (files are numbered 1 to 6 in the
# order embryo1_z2, embryo1_z5, embryo2_z2, ...)
batches = [f"{embryo}_{section}" for embryo in embryos for section in sections]
batch_indeces = list(range(1, len(batches) + 1))

for batch_idx, batch in zip(batch_indeces, batches):
    adata_batch = adata[adata.obs["batch"] == batch]
    adata_batch.write(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad")

# Create adata for embryo 2 for single sample method benchmarking
adata_embryo2 = adata[adata.obs["sample"] == "embryo2"]
adata_embryo2.write(f"{st_data_gold_folder_path}/{dataset}_embryo2.h5ad")

##### 2.1.1.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Create subsamples of all batches for method benchmarking.
# Reading and formatting the data does not depend on the subsample size, so
# it is done once up front instead of once per subsample percentage.

# Read preprocessed data
adata = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}.h5ad")

# Filter low quality cells. Copy the result so that all edits below are made
# on a standalone AnnData object rather than on a view of the unfiltered data
adata = adata[adata.obs[cell_type_key] != "Low quality"].copy()

# Format adata: move the x / y columns from obs into obsm["spatial"] and
# store the expression matrices as float32
adata.obsm["spatial"] = np.column_stack((adata.obs.x, adata.obs.y))
adata.obs.drop(["x", "y"], axis=1, inplace=True)
adata.X = adata.X.astype(np.float32)
adata.layers["counts"] = adata.layers["counts"].astype(np.float32)

# Add sample & batch columns, both derived from the observation names
embryos = ["embryo1", "embryo2", "embryo3"]
sections = ["z2", "z5"]
obs_names = adata.obs.index
for embryo in embryos:
    adata.obs.loc[obs_names.str.contains(embryo), "sample"] = embryo
    for section in sections:
        in_batch = (obs_names.str.startswith(embryo) &
                    obs_names.str.endswith(section))
        adata.obs.loc[in_batch, "batch"] = f"{embryo}_{section}"

batches = [f"{embryo}_{section}" for embryo in embryos for section in sections]
batch_indeces = list(range(1, len(batches) + 1))

for subsample_pct in [50, 25, 10, 5, 1]:
    # Write adata to disk separated by batches. A subsample keeps the first
    # `subsample_pct` percent of the observations in their stored order
    for batch_idx, batch in zip(batch_indeces, batches):
        adata_batch = adata[adata.obs["batch"] == batch]
        adata_batch = adata_batch[:int(subsample_pct/100 * len(adata_batch)), :]
        adata_batch.write(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct_batch{batch_idx}.h5ad")

    # Create adata for embryo 2 for single sample method benchmarking
    adata_embryo2 = adata[adata.obs["sample"] == "embryo2"]
    adata_embryo2 = adata_embryo2[:int(subsample_pct/100 * len(adata_embryo2)), :]
    adata_embryo2.write(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct_embryo2.h5ad")

##### 2.1.1.3 Imputed Data

In [None]:
# Read preprocessed imputed adata
adata_imputed = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}_imputed.h5ad")

print(f"Step 1: Number of genes: {len(adata_imputed.var_names)}")

# Filter low quality cells. Copy the result so that all edits below are made
# on a standalone AnnData object rather than on a view of the unfiltered data
adata_imputed = adata_imputed[adata_imputed.obs[cell_type_key] != "Low quality"].copy()

# Format imputed adata and reverse log normalization: the stored values are
# turned back into rounded counts via round(exp(X) - 1)
adata_imputed.obsm["spatial"] = np.column_stack((adata_imputed.obs.x, adata_imputed.obs.y))
adata_imputed.obs.drop(["x", "y"], axis=1, inplace=True)
adata_imputed.X = sp.csr_matrix(np.round((np.exp(adata_imputed.X.toarray()) - 1)))
adata_imputed.X = adata_imputed.X.astype(np.float32)
# Store an independent copy so that later in-place changes to X cannot
# silently alter the counts layer
adata_imputed.layers["counts"] = adata_imputed.X.copy()

# Keep only genes whose maximum count is below 141, which was the maximum
# across all genes in the non-imputed dataset. Copy so that the obs
# assignments below are not made on a view
adata_imputed = adata_imputed[:, np.max(adata_imputed.layers["counts"].toarray(), axis=0) < 141].copy()

# Add sample & batch columns, both derived from the observation names
embryos = ["embryo1", "embryo2", "embryo3"]
sections = ["z2", "z5"]
obs_names = adata_imputed.obs.index
for embryo in embryos:
    adata_imputed.obs.loc[obs_names.str.contains(embryo), "sample"] = embryo
    for section in sections:
        in_batch = (obs_names.str.startswith(embryo) &
                    obs_names.str.endswith(section))
        adata_imputed.obs.loc[in_batch, "batch"] = f"{embryo}_{section}"

# Write adata to disk separated by batches. These files are read again by the
# imputed subsampling and exploration cells further down, so they must be
# written here
batches = [f"{embryo}_{section}" for embryo in embryos for section in sections]
batch_indeces = list(range(1, len(batches) + 1))

for batch_idx, batch in zip(batch_indeces, batches):
    adata_imputed_batch = adata_imputed[adata_imputed.obs["batch"] == batch]
    adata_imputed_batch.write(f"{st_data_gold_folder_path}/{dataset}_imputed_batch{batch_idx}.h5ad")

# Create adata for embryo 2 for single sample method benchmarking
adata_embryo2 = adata_imputed[adata_imputed.obs["sample"] == "embryo2"]
adata_embryo2.write(f"{st_data_gold_folder_path}/{dataset}_imputed_embryo2.h5ad")

##### 2.1.1.4 Imputed Data Subsamples

In [None]:
# Create subsamples of all imputed batches. Each batch file is read only once
# and all subsample sizes are derived from it; a subsample keeps the first
# `subsample_pct` percent of the observations in their stored order
batch_indeces = [1, 2, 3, 4, 5, 6]
for batch_idx in batch_indeces:
    # Read preprocessed data
    adata_imputed_batch = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_imputed_batch{batch_idx}.h5ad")
    for subsample_pct in [50, 25, 10, 5, 1]:
        adata = adata_imputed_batch[:int(subsample_pct/100 * len(adata_imputed_batch)), :].copy()
        adata.write(f"{st_data_gold_folder_path}/{dataset}_imputed_subsample_{subsample_pct}pct_batch{batch_idx}.h5ad")

#### 2.1.2 Explore Data

##### 2.1.2.1 Spatial Transcriptomics Data

In [None]:
# Load every stored batch, report its size and plot its cells in physical
# space colored by cell type
batch_indeces = list(range(1, 7))
for batch_idx in batch_indeces:
    batch_name = f"{dataset}_batch{batch_idx}"
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{batch_name}.h5ad")
    n_cells, n_genes = adata.X.shape

    print(f"Exploring dataset {batch_name}.")
    print(f"Number of nodes (cells): {n_cells}")
    print(f"Number of node features (genes): {n_genes}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata,
                          color=cell_type_key,
                          shape=None,
                          figsize=(12, 12))

##### 2.1.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Explore the subsamples of the (non-imputed) spatial transcriptomics data.
# Only the first batch is plotted to keep the number of figures manageable
batch_indeces = [1, 2, 3, 4, 5, 6]
for batch_idx in [1]:
    for subsample_pct in [50, 25, 10, 5, 1]:
        # Read the subsample files written in section 2.1.1.2
        subsample_name = f"{dataset}_subsample_{subsample_pct}pct_batch{batch_idx}"
        adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{subsample_name}.h5ad")

        print(f"Exploring dataset {subsample_name}.")
        print(f"Number of nodes (cells): {adata.X.shape[0]}")
        print(f"Number of node features (genes): {adata.X.shape[1]}")

        # Visualize cell-level annotated data in physical space
        sq.pl.spatial_scatter(adata, color=cell_type_key, shape=None, figsize=(12, 12))

##### 2.1.2.3 Imputed Data

In [None]:
# Load every stored imputed batch, report its size and plot its cells in
# physical space colored by cell type
batch_indeces = [1, 2, 3, 4, 5, 6]
for batch_idx in batch_indeces:
    adata_imputed = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_imputed_batch{batch_idx}.h5ad")

    # Log the dataset name (not the AnnData object itself)
    print(f"Exploring dataset {dataset}_imputed_batch{batch_idx}.")
    print(f"Number of nodes (cells): {adata_imputed.X.shape[0]}")
    print(f"Number of node features (genes): {adata_imputed.X.shape[1]}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata_imputed, color=cell_type_key, shape=None, figsize=(12, 12))

In [None]:
# Load the imputed embryo 2 sample, report its size and plot its cells in
# physical space colored by cell type
adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_imputed_embryo2.h5ad")
n_cells, n_genes = adata.X.shape

print(f"Exploring dataset {dataset}_imputed_embryo2.")
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Visualize cell-level annotated data in physical space
sq.pl.spatial_scatter(adata,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

##### 2.1.2.4 Imputed Data Subsamples

In [None]:
# Load every imputed subsample of every batch, report its size and plot its
# cells in physical space colored by cell type
batch_indeces = [1, 2, 3, 4, 5, 6]
for subsample_pct in [50, 25, 10, 5, 1]:
    for batch_idx in batch_indeces:
        subsample_name = f"{dataset}_imputed_subsample_{subsample_pct}pct_batch{batch_idx}"
        adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{subsample_name}.h5ad")

        # Log the file that is actually being explored
        print(f"Exploring dataset {subsample_name}.")
        print(f"Number of nodes (cells): {adata.X.shape[0]}")
        print(f"Number of node features (genes): {adata.X.shape[1]}")

        # Visualize cell-level annotated data in physical space
        sq.pl.spatial_scatter(adata, color=cell_type_key, shape=None, figsize=(12, 12))

### 2.2 Slide-seqV2 Mouse Hippocampus

- **Publication**: Stickels, R. R. et al. Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. Nat. Biotechnol. 39, 313–319 (2021). [doi:10.1038/s41587-020-0739-1](https://doi.org/10.1038/s41587-020-0739-1)
- **Data Access:** Preprocessed version from squidpy API (original source https://singlecell.broadinstitute.org/single_cell/study/SCP815/highly-sensitive-spatial-transcriptomics-at-near-cellular-resolution-with-slide-seqv2)
- **Summary:**
    - Mouse hippocampus puck
    - 41,786 observations on cell-level with cell-type annotations
    - 4,000 probed genes

In [None]:
# Slide-seqV2 mouse hippocampus: obs column that holds the cell-type labels
# and the dataset identifier used in all file names
cell_type_key = "cell_type"
dataset = "slideseqv2_mouse_hippocampus"

#### 2.2.1 Load & Preprocess Raw Data

##### 2.2.1.1 Spatial Transcriptomics Data

In [None]:
print(f"Loading and preprocessing dataset '{dataset}'.")

# Fetch the AnnData object through the squidpy datasets API
adata = sq.datasets.slideseqv2()

n_cells, n_genes = len(adata), len(adata.var_names)
print(f"Step 1: Number of cells: {n_cells}")
print(f"Step 1: Number of genes: {n_genes}")

# Start from an empty AnnData of the same shape so that only the fields
# copied over explicitly below end up in the stored object
empty_matrix = sp.csr_matrix(adata.shape, dtype=np.float32)
adata_new = sc.AnnData(empty_matrix)
adata_new.X = adata.X
adata_new.obs_names = adata.obs_names
adata_new.var_names = adata.var_names
adata_new.obs["cell_type"] = adata.obs["cluster"].values
adata_new.obsm["spatial"] = adata.obsm["spatial"]
adata_new.obs["batch"] = "sample1"

# Store raw counts in adata.layers, reconstructed as round(exp(X) - 1)
expr = adata.X.toarray()
adata_new.layers["counts"] = sp.csr_matrix(np.round(np.exp(expr) - 1))

# Store data to disk
output_file_path = f"{st_data_gold_folder_path}/{dataset}.h5ad"
adata_new.write(output_file_path)

##### 2.2.1.2 Spatial Transcriptomics Data Subsamples

Data is not spatially ordered so we need to use coordinates to subsample.

In [None]:
# Margin that is cut off at both the lower and the upper end of the y range
# for each subsample size; the remaining central band forms the subsample
y_diff_list = [1406, 1920, 2223, 2319, 2398] # 50%, 25%, 10%, 5%, 1%

# The y range of the full dataset is the same for every subsample, so it is
# computed once (vectorized) instead of in every iteration.
# NOTE(review): this relies on `adata` still being the object returned by
# `sq.datasets.slideseqv2()` (with `obs.y` and `obs.cluster`) — confirm that
# no cell overwriting `adata` has been run in between
y_min = adata.obs.y.min()
y_max = adata.obs.y.max()

for subsample_pct, y_diff in zip([50, 25, 10, 5, 1], y_diff_list):
    y_max_sample = y_max - y_diff
    y_min_sample = y_min + y_diff
    adata_sample = adata[(adata.obs.y < y_max_sample) & (adata.obs.y > y_min_sample)]

    # Create new adata to drop all unnecessary columns
    adata_new_sample = sc.AnnData(sp.csr_matrix(
        (adata_sample.shape[0], adata_sample.shape[1]),
        dtype=np.float32))
    adata_new_sample.X = adata_sample.X
    adata_new_sample.var_names = adata_sample.var_names
    adata_new_sample.obs_names = adata_sample.obs_names
    adata_new_sample.obsm["spatial"] = adata_sample.obsm["spatial"]
    adata_new_sample.obs["cell_type"] = adata_sample.obs["cluster"].values

    # Store raw counts in adata.layers, reconstructed as round(exp(X) - 1)
    adata_new_sample.layers["counts"] = sp.csr_matrix(np.round((np.exp(adata_new_sample.X.toarray()) - 1)))
    adata_new_sample.obs["batch"] = "sample1"

    # Store data to disk
    adata_new_sample.write(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct.h5ad")

#### 2.2.2 Explore Data

##### 2.2.2.1 Spatial Transcriptomics Data

In [None]:
# Load the stored dataset, report its size and plot its cells in physical
# space colored by cell type
adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}.h5ad")
n_cells, n_genes = adata.X.shape

print(f"Exploring dataset {dataset}.")
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Visualize cell-level annotated data in physical space
sq.pl.spatial_scatter(adata,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

##### 2.2.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Load every stored subsample, report its size and plot its cells in physical
# space colored by cell type
for subsample_pct in (50, 25, 10, 5, 1):
    subsample_name = f"{dataset}_subsample_{subsample_pct}pct"
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{subsample_name}.h5ad")
    n_cells, n_genes = adata.X.shape

    print(f"Exploring dataset {subsample_name}.")
    print(f"Number of nodes (cells): {n_cells}")
    print(f"Number of node features (genes): {n_genes}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata,
                          color=cell_type_key,
                          shape=None,
                          figsize=(12, 12))

### 2.3 Vizgen MERFISH Mouse Liver

- **Publication**: Vizgen MERFISH Mouse Liver Map January 2022
- **Data Access:** https://info.vizgen.com/mouse-liver-access
    - Animal 1 replicate 1 is used
    - 2 animals with 2 replicates each can be downloaded
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze/vizgen_merfish_mouse_liver  
         │  ├── Liver1Slice1_cell_by_gene.csv (gene expression)
         │  ├── Liver1Slice1_cell_metadata.csv (metadata) 
      ```
- **Preprocessing Vignette:** https://squidpy.readthedocs.io/en/latest/external_tutorials/tutorial_vizgen_mouse_liver.html
- **Summary:**
    - Liver tissue section of mouse liver map
    - 367,335 observations on cell-level with cell-type annotations
    - 347 probed genes

In [None]:
# Vizgen MERFISH mouse liver: obs column that holds the cell-type labels, the
# dataset identifier used in all file names, and the folder with the raw files
cell_type_key = "Cell_Type"
dataset = "vizgen_merfish_mouse_liver"
merfish_dir = st_data_bronze_folder_path + "/" + dataset + "/"

#### 2.3.1 Load & Preprocess Raw Data

##### 2.3.1.1 ST Data

In [None]:
# Load the Vizgen MERFISH files, apply basic quality filtering and keep the
# raw counts in a separate layer
print(f"Loading and preprocessing dataset '{dataset}'.")

# Read adata from files (gene expression and cell metadata CSVs located in
# `merfish_dir`)
adata = sq.read.vizgen(merfish_dir,
                       counts_file="Liver1Slice1_cell_by_gene.csv",
                       meta_file="Liver1Slice1_cell_metadata.csv")

print(f"Step 1: Number of cells: {len(adata)}")
print(f"Step 1: Number of genes: {len(adata.var_names)}")

# Preprocess as per squidpy vignette: flag genes whose name starts with "mt-",
# compute QC metrics, then drop cells with fewer than 50 counts and genes
# found in fewer than 10 cells
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"],percent_top=(50, 100, 200, 300), inplace=True)
sc.pp.filter_cells(adata, min_counts=50)
sc.pp.filter_genes(adata, min_cells=10)

print(f"Step 2: Number of cells: {len(adata)}")
print(f"Step 2: Number of genes: {len(adata.var_names)}")

# Store raw counts in `layers` (copied, so that later changes to X do not
# affect them)
adata.layers["counts"] = adata.X.copy()

# Store data to disk
# NOTE(review): writing the silver-tier file is currently disabled — confirm
# whether this is intentional
#adata.write(f"{st_data_silver_folder_path}/{dataset}.h5ad")

# Determine cell annotation as per squidpy vignette.
# The steps modify `adata` in place and have to run in exactly this order:
# normalize each cell to 1e4 total counts, log1p-transform, scale (values
# capped at 10), compute a PCA, build the neighbor graph on 20 PCs with 10
# neighbors, then compute the UMAP embedding and Leiden clusters
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=1.5)

# Plot the UMAP colored by Leiden cluster (cluster labels are stored in
# adata.obs["leiden"], which the annotation code below relies on)
sc.set_figure_params(figsize=(10,10))
sc.pl.umap(adata, color=["leiden"], size=5)

# Assign a cell type to every Leiden cluster based on a reference marker gene
# panel, then transfer the cluster-level labels to the single cells.

# Load the reference gene panel from an Excel file (downloaded from the URL
# below); rows from the second one onwards and the first column are kept and
# the column is renamed to "Function"
gene_panel = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41421-021-00266-1/MediaObjects/41421_2021_266_MOESM1_ESM.xlsx"
df_ref_panel_ini = pd.read_excel(gene_panel, index_col=0)
df_ref_panel = df_ref_panel_ini.iloc[1:,:1]
df_ref_panel.index.name = None
df_ref_panel.columns = ["Function"]

# Assign marker gene metadata using reference dataset: marker genes are the
# panel entries whose "Function" contains the substring "marker"
marker_genes = df_ref_panel[df_ref_panel["Function"].str.contains("marker")].index.tolist()

# Marker genes that are also measured in this dataset
# NOTE(review): `meta_gene` is rebuilt from scratch further below, so of the
# next lines only `common_marker_genes` is used later; the bare
# `value_counts()` expression has no effect on any variable
meta_gene = deepcopy(adata.var)
common_marker_genes = list(set(meta_gene.index.tolist()).intersection(marker_genes))
meta_gene.loc[common_marker_genes, "Markers"] = df_ref_panel.loc[common_marker_genes, "Function"]
meta_gene["Markers"] = meta_gene["Markers"].apply(lambda x: "N.A." if "marker" not in str(x) else x)
meta_gene["Markers"].value_counts()

# Number of cells per Leiden cluster
ser_counts = adata.obs["leiden"].value_counts()
ser_counts.name = "cell counts"
meta_leiden = pd.DataFrame(ser_counts)

# Mean expression per gene for every Leiden cluster; after the transpose the
# rows are genes and the columns are clusters named "Leiden-<cluster>"
cat_name = "leiden"
sig_leiden = pd.DataFrame(columns=adata.var_names, index=adata.obs[cat_name].cat.categories)
for clust in adata.obs[cat_name].cat.categories:
    sig_leiden.loc[clust] = adata[adata.obs[cat_name].isin([clust]),:].X.mean(0)
sig_leiden = sig_leiden.transpose()
leiden_clusters = ["Leiden-" + str(x) for x in sig_leiden.columns.tolist()]
sig_leiden.columns = leiden_clusters
# NOTE(review): the index of `meta_leiden` (order of `value_counts()`) is
# overwritten positionally with the cluster names in category order; this
# assumes both orders agree — confirm
meta_leiden.index = sig_leiden.columns.tolist()
meta_leiden["leiden"] = pd.Series(meta_leiden.index.tolist(), index=meta_leiden.index.tolist())

# Gene-level metadata: marker function for the common marker genes, "N.A."
# for all other genes
meta_gene = pd.DataFrame(index=sig_leiden.index.tolist())
meta_gene["info"] = pd.Series("", index=meta_gene.index.tolist())
meta_gene["Markers"] = pd.Series("N.A.", index=sig_leiden.index.tolist())
meta_gene.loc[common_marker_genes, "Markers"] = df_ref_panel.loc[common_marker_genes, "Function"]

# For every cluster, take the `num_top_genes` genes with the highest mean
# expression and use the most frequent marker category among them as cell
# type; ties are joined alphabetically with "_"
meta_leiden["Cell_Type"] = pd.Series("N.A.", index=meta_leiden.index.tolist())
num_top_genes = 30
for inst_cluster in sig_leiden.columns.tolist():
    top_genes = sig_leiden[inst_cluster].sort_values(ascending=False).index.tolist()[:num_top_genes]

    # Marker categories of the top genes, ignoring non-marker genes
    inst_ser = meta_gene.loc[top_genes, "Markers"]
    inst_ser = inst_ser[inst_ser != "N.A."]
    ser_counts = inst_ser.value_counts()

    # NOTE(review): if none of the top genes is a marker gene, `ser_counts`
    # is empty and the cluster presumably ends up with an empty label —
    # confirm
    max_count = ser_counts.max()

    # Turn e.g. "<name> marker" into "<name>" and replace spaces by hyphens
    max_cat = "_".join(sorted(ser_counts[ser_counts == max_count].index.tolist()))
    max_cat = max_cat.replace(" marker", "").replace(" ", "-")

    print(inst_cluster, max_cat)
    meta_leiden.loc[inst_cluster, "Cell_Type"] = max_cat

# Rename clusters to "<Cell_Type>_<Leiden cluster name>"
meta_leiden["name"] = meta_leiden.apply(lambda x: x["Cell_Type"] + "_" + x["leiden"] , axis=1)
leiden_names = meta_leiden["name"].values.tolist()
meta_leiden.index = leiden_names

# Transfer cell type labels to single cells via a lookup table indexed by the
# "Leiden-<cluster>" name
leiden_to_cell_type = deepcopy(meta_leiden)
leiden_to_cell_type.set_index("leiden", inplace=True)
leiden_to_cell_type.index.name = None

adata.obs["Cell_Type"] = adata.obs["leiden"].apply(lambda x: leiden_to_cell_type.loc["Leiden-" + str(x), "Cell_Type"])
adata.obs["Cluster"] = adata.obs["leiden"].apply(lambda x: leiden_to_cell_type.loc["Leiden-" + str(x), "name"])

# Keep a copy of the fully annotated object; the object that is written to
# disk is rebuilt from it below with only the required fields
adata_old = adata.copy()

adata = ad.AnnData(sp.csr_matrix(adata_old.X))
adata.obs_names = adata_old.obs_names
adata.var_names = adata_old.var_names

adata.layers["counts"] = adata_old.layers["counts"]
adata.obsm["spatial"] = adata_old.obsm["spatial"]
adata.obs["Cell_Type"] = adata_old.obs["Cell_Type"]

# Remove negative probes (variables whose name contains "Blank"). Copy the
# result so that the assignments below are made on a standalone AnnData
# object rather than on a view
adata = adata[:, ~adata.var_names.str.contains("Blank")].copy()

print(f"Step 3: Number of cells: {len(adata)}")
print(f"Step 3: Number of genes: {len(adata.var_names)}")

adata.obs["batch"] = "animal1_z2"

# Store gene expression in sparse row format
adata.layers["counts"] = sp.csr_matrix(adata.layers["counts"])

# Add field of view (positional assignment; only variables were filtered
# above, so the observations still line up with `adata_old`)
adata.obs["fov"] = adata_old.obs["fov"].values

# Store data to disk
adata.write(f"{st_data_gold_folder_path}/{dataset}.h5ad")

##### 2.3.1.2 ST Data Subsamples

In [None]:
# Create subsamples for method benchmarking. The full dataset is read only
# once and all subsample sizes are derived from it; a subsample keeps the
# first `subsample_pct` percent of the observations in their stored order
# Read preprocessed data
adata_full = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}.h5ad")
for subsample_pct in [50, 25, 10, 5, 1]:
    adata = adata_full[:int(subsample_pct/100 * len(adata_full)), :].copy()
    adata.write(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct.h5ad")

#### 2.3.2 Explore Data

##### 2.3.2.1 ST Data

In [None]:
# Load the stored dataset, report its size and plot its cells in physical
# space colored by cell type
adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}.h5ad")
n_cells, n_genes = adata.X.shape

print(f"Exploring dataset {dataset}.")
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Visualize cell-level annotated data in physical space
sq.pl.spatial_scatter(adata,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

##### 2.3.2.2 ST Data Subsamples

In [None]:
# Load every stored subsample, report its size and plot its cells in physical
# space colored by cell type
for subsample_pct in [50, 25, 10, 5, 1]:
    subsample_name = f"{dataset}_subsample_{subsample_pct}pct"
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{subsample_name}.h5ad")

    # Log which subsample is being explored
    print(f"Exploring dataset {subsample_name}.")
    print(f"Number of nodes (cells): {adata.X.shape[0]}")
    print(f"Number of node features (genes): {adata.X.shape[1]}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata, color=cell_type_key, shape=None, figsize=(12, 12))

### 2.4 nanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

- **Publication:** 
    - He, S. et al. High-plex imaging of RNA and proteins at subcellular resolution in fixed tissue by spatial molecular imaging. Nat. Biotechnol. 40, 1794–1806 (2022)
    - Tang, Z., Zhang, T., Yang, B., Su, J. & Song, Q. SiGra: Single-cell spatial elucidation through image-augmented graph transformer. bioRxiv (2022). [doi:10.1101/2022.08.18.504464v1](https://www.biorxiv.org/content/10.1101/2022.08.18.504464v1)
- **Data Access:**
    - https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/
    - Preprocessed version available from SiGra: https://purdue0-my.sharepoint.com/:f:/g/personal/tang385_purdue_edu/EoJcJv8OZHRIhLyplj5r1PABW-UQfD1p1YU00gAdZNeK7A?e=K3Mmqg (lung 9 replicate 1 only; not used in the manuscript)
- **Data Structure**:
    - ```
         ├── ../datasets/st_data/bronze/nanostring_cosmx_human_nsclc
         │  └── raw         
         │    └── Lung5_Rep1
         │      └── Lung5_Rep1-Flat_files_and_images (unannotated data)
         │    └── Lung5_Rep2
         │      └── Lung5_Rep2-Flat_files_and_images (unannotated data)
         │    └── Lung5_Rep3
         │      └── Lung5_Rep3-Flat_files_and_images (unannotated data)
         │    └── Lung6
         │      └── Lung6-Flat_files_and_images (unannotated data)
         │    └── Lung9_Rep1
         │      └── Lung9_Rep1-Flat_files_and_images (unannotated data)
         │    └── Lung9_Rep2
         │      └── Lung9_Rep2-Flat_files_and_images (unannotated data)
         │    └── Lung12
         │      └── Lung12-Flat_files_and_images (unannotated data)
         │    └── Lung13
         │      └── Lung13-Flat_files_and_images (unannotated data)
         │  └── metadata_giotto.csv
      ```
- **Summary:**
    - 8 tissue sections of human non-small-cell lung cancer from 5 patients
        - 800,559 observations at cellular resolution without annotation
            - 93,206 observations lung5_rep1
            - 97,487 observations lung5_rep2
            - 91,691 observations lung5_rep3
            - 83,723 observations lung6
            - 77,391 observations lung9_rep1
            - 115,676 observations lung9_rep2
            - 66,489 observations lung12
            - 76,536 observations lung13
        - 83,621 observations at cellular resolution with annotations
    - 960 genes

In [None]:
# nanoString CosMx human NSCLC: obs column that holds the cell-type labels and
# the dataset identifier used in all file names
cell_type_key = "cell_type"
dataset = "nanostring_cosmx_human_nsclc"

#### 2.4.1 Load & Preprocess Raw Data

##### 2.4.1.1 ST Data

In [None]:
# Build one annotated AnnData object per tissue section (batch) from the flat
# expression and metadata files and write it to the gold folder
batches = ["Lung5_Rep1",
           "Lung5_Rep2",
           "Lung5_Rep3",
           "Lung6",
           "Lung9_Rep1",
           "Lung9_Rep2",
           "Lung12",
           "Lung13"]

# Cell-level annotations for all batches (merged via the "cell_ID" column
# below)
annotation_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/metadata_giotto.csv", index_col=0)

for batch_idx, batch in enumerate(batches):
    # NOTE(review): the documented folder structure places the batch folders
    # under a `raw/` folder, which these paths do not include — confirm
    gene_expr_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/{batch}/{batch}-Flat_files_and_images/{batch}_exprMat_file.csv")
    metadata_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/{batch}/{batch}-Flat_files_and_images/{batch}_metadata_file.csv")

    # All columns except "fov" and "cell_ID" are treated as gene expression;
    # `columns.difference` is used for both the values and the variable names
    # below, so their order matches
    adata = ad.AnnData(gene_expr_df[gene_expr_df.columns.difference(["fov", "cell_ID"])].values,
                       obs=gene_expr_df[["fov", "cell_ID"]],
                       dtype="float32")

    print(f"Step 1: Number of cells: {len(adata)}")
    print(f"Step 1: Number of genes: {len(adata.var_names)}")

    adata.var_names = gene_expr_df.columns.difference(["fov", "cell_ID"])
    adata.obs["batch"] = batch.lower()

    # Add spatial coordinates from metadata (left join, so observations
    # without metadata keep missing values)
    adata.obs = pd.merge(adata.obs, metadata_df, on=["fov", "cell_ID"], how="left")
    adata.obsm["spatial"] = np.array(adata.obs[["CenterX_global_px", "CenterY_global_px"]])

    # Drop obs without metadata (any row with a missing value in obs)
    adata.obs.reset_index(drop=True, inplace=True)
    adata = adata[adata.obs.index.isin(adata.obs.dropna().index), :].copy()

    print(f"Step 2: Number of cells: {len(adata)}")
    print(f"Step 2: Number of genes: {len(adata.var_names)}")

    # Add cell type annotations, remove cells without annotations, and make fov unique across batches.
    # The cell ID is rebuilt as "c_<batch number>_<fov>_<cell_ID>" before it
    # is used as merge key.
    # NOTE(review): "fov_x" presumably is the left-hand `fov` column, renamed
    # by the merge because `annotation_df` also has a `fov` column — confirm
    adata.obs["cell_ID"] = f"c_{batch_idx + 1}_" + adata.obs["fov"].astype("str") + "_" + adata.obs["cell_ID"].astype("str")
    adata.obs = pd.merge(adata.obs, annotation_df, on="cell_ID", how="left")
    adata.obs["fov"] = adata.obs["batch"] + "_" + adata.obs["fov_x"].astype(str)
    adata.obs = adata.obs[["cell_ID", "patient", "batch", "fov", "cell_type", "niche"]]
    adata.obs.index = adata.obs.index.astype(str)
    adata = adata[adata.obs.index.isin(adata.obs.dropna().index), :].copy()
    adata.obs.reset_index(drop=True, inplace=True)

    print(f"Step 3: Number of cells: {len(adata)}")
    print(f"Step 3: Number of genes: {len(adata.var_names)}")

    # Convert cell type annotations to coarser resolution. The original label
    # is kept in "cell_type_original"; the assignments run in order, so a
    # later one overrides an earlier one for labels matching both
    adata.obs["cell_type_original"] = adata.obs["cell_type"].astype(str)
    # Any label containing an uppercase "T"
    adata.obs.loc[adata.obs["cell_type_original"].apply(
        lambda x: "T" in x),"cell_type"] = "NK/T cell"
    adata.obs.loc[adata.obs["cell_type_original"].apply(
        lambda x: "tumor" in x),"cell_type"] = "tumor"
    adata.obs.loc[adata.obs["cell_type_original"] == "NK","cell_type"] = "NK/T cell"
    adata.obs.loc[adata.obs["cell_type_original"].apply(
        lambda x: "DC" in x),"cell_type"] = "DC"
    # Monocytes, macrophages and the cells just labeled "DC" are merged into
    # "myeloid"
    adata.obs.loc[(adata.obs["cell_type_original"] == "monocyte") |
                  (adata.obs["cell_type_original"] == "macrophage") |
                  (adata.obs["cell_type"] == "DC"),"cell_type"] = "myeloid"

    # Remove negative probes (variables whose name contains "NegPrb")
    adata.var.index = adata.var.index.map(str)
    adata = adata[:, ~adata.var_names.str.contains("NegPrb")].copy()

    # Remove low quality cells (fewer than 50 counts)
    sc.pp.filter_cells(adata, min_counts=50)

    # Store gene expression in sparse format and keep a copy of the counts
    # before normalization
    adata.X = sp.csr_matrix(np.array(adata.X))
    adata.layers["counts"] = adata.X.copy()

    # Log normalize counts (X only; the "counts" layer stays untouched)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    print(f"Step 4: Number of cells: {len(adata)}")
    print(f"Step 4: Number of genes: {len(adata.var_names)}")
    print("==========")

    # Batch files are numbered from 1 in the order of `batches`
    adata.write(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx + 1}.h5ad")

##### 2.4.1.2 ST Data Subsamples

In [None]:
# Create subsamples of all batches for method benchmarking.
#
# Each batch is loaded and preprocessed once and all subsample fractions are
# then cut from the same filtered object. Loading and preprocessing do not
# depend on the subsample fraction, so this writes the same files as running
# the full pipeline once per fraction while reading the raw CSV files only once
# per batch.
subsample_pcts = [50, 25, 10, 5, 1]
batch_indeces = list(np.arange(8) + 1)
batches = ["Lung5_Rep1",
           "Lung5_Rep2",
           "Lung5_Rep3",
           "Lung6",
           "Lung9_Rep1",
           "Lung9_Rep2",
           "Lung12",
           "Lung13"]

# Cell-level annotations shared by all batches (merged on 'cell_ID' below)
annotation_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/metadata_giotto.csv", index_col=0)

for batch_idx, batch in enumerate(batches):
    print(f"Loading, preprocessing and saving batch '{batch}'.")
    flat_files_dir = f"{st_data_bronze_folder_path}/{dataset}/{batch}/{batch}-Flat_files_and_images"
    gene_expr_df = pd.read_csv(f"{flat_files_dir}/{batch}_exprMat_file.csv")
    metadata_df = pd.read_csv(f"{flat_files_dir}/{batch}_metadata_file.csv")

    # All columns except the two identifier columns hold expression values
    gene_columns = gene_expr_df.columns.difference(["fov", "cell_ID"])
    adata = ad.AnnData(gene_expr_df[gene_columns].values,
                       obs=gene_expr_df[["fov", "cell_ID"]],
                       dtype="float32")
    adata.var_names = gene_columns
    adata.obs["batch"] = batch.lower()

    # Add spatial coordinates from metadata
    adata.obs = pd.merge(adata.obs, metadata_df, on=["fov", "cell_ID"], how="left")
    adata.obsm["spatial"] = np.array(adata.obs[["CenterX_global_px", "CenterY_global_px"]])

    # Drop obs without metadata
    adata.obs.reset_index(drop=True, inplace=True)
    adata = adata[adata.obs.index.isin(adata.obs.dropna().index), :].copy()

    # Add cell type annotations, remove cells without annotations, and make fov unique across batches
    adata.obs["cell_ID"] = f"c_{batch_idx + 1}_" + adata.obs["fov"].astype("str") + "_" + adata.obs["cell_ID"].astype("str")
    adata.obs = pd.merge(adata.obs, annotation_df, on="cell_ID", how="left")
    # 'fov_x' is the left-hand 'fov' column after the merge above
    adata.obs["fov"] = adata.obs["batch"] + "_" + adata.obs["fov_x"].astype(str)
    adata.obs = adata.obs[["cell_ID", "patient", "batch", "fov", "cell_type", "niche"]]
    adata.obs.index = adata.obs.index.astype(str)
    adata = adata[adata.obs.index.isin(adata.obs.dropna().index), :].copy()
    adata.obs.reset_index(drop=True, inplace=True)

    # Convert cell type annotations to coarser resolution.
    # The order of these assignments matters: later rules overwrite earlier ones.
    adata.obs["cell_type_original"] = adata.obs["cell_type"].astype(str)
    adata.obs.loc[adata.obs["cell_type_original"].apply(
        lambda x: "T" in x),"cell_type"] = "NK/T cell"
    adata.obs.loc[adata.obs["cell_type_original"].apply(
        lambda x: "tumor" in x),"cell_type"] = "tumor"
    adata.obs.loc[adata.obs["cell_type_original"] == "NK","cell_type"] = "NK/T cell"
    adata.obs.loc[adata.obs["cell_type_original"].apply(
        lambda x: "DC" in x),"cell_type"] = "DC"
    adata.obs.loc[(adata.obs["cell_type_original"] == "monocyte") |
                  (adata.obs["cell_type_original"] == "macrophage") |
                  (adata.obs["cell_type"] == "DC"),"cell_type"] = "myeloid"

    # Remove negative probes
    adata.var.index = adata.var.index.map(str)
    adata = adata[:, ~adata.var_names.str.contains("NegPrb")].copy()

    # Remove low quality cells
    sc.pp.filter_cells(adata, min_counts=50)

    # Keep the fully filtered batch so that every fraction is cut from the same object
    adata_filtered = adata
    n_filtered_cells = len(adata_filtered)

    for subsample_pct in subsample_pcts:
        print(f"Processing subsample {subsample_pct}pct...")

        # Filter subsample: take the first 'subsample_pct' percent of the filtered cells
        adata = adata_filtered[:int((subsample_pct/100) * n_filtered_cells), :].copy()

        # Store gene expression in sparse format
        adata.X = sp.csr_matrix(np.array(adata.X))
        adata.layers["counts"] = adata.X.copy()

        # Log normalize counts
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        adata.write(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct_batch{batch_idx + 1}.h5ad")

##### 2.4.1.3 ST Data Modified

This is not used in the manuscript.

In [None]:
# Write a modified copy of every batch in which the cells of two tumor niches
# are removed; batch 4 is written unchanged.
# A previously used alternative (disabled) instead removed the fovs 20, 25 and
# 30 of 'lung5_rep1' and 'lung5_rep2'.
batch_indeces = list(np.arange(8) + 1)
for batch_idx in batch_indeces:
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad")
    if batch_idx == 4:
        adata_modified = adata
    else:
        in_removed_niche = adata.obs["niche"].isin(["tumor interior", "tumor-stroma boundary"])
        adata_modified = adata[~in_removed_niche]
    adata_modified.write(f"{st_data_gold_folder_path}/{dataset}_modified_batch{batch_idx}.h5ad")

#### 2.4.2 Explore Data

##### 2.4.2.1 ST Data

In [None]:
# Report the size of every processed batch and plot its cell type annotations
batch_indeces = list(np.arange(8) + 1)
for batch_idx in batch_indeces:
    gold_file_path = f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad"
    adata = sc.read_h5ad(gold_file_path)
    n_cells, n_genes = adata.X.shape

    print(f"Exploring dataset {dataset}_batch{batch_idx}.")
    print(f"Number of nodes (cells): {n_cells}")
    print(f"Number of node features (genes): {n_genes}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata, shape=None, color=cell_type_key, figsize=(12, 12))

##### 2.4.2.2 ST Data Subsamples

In [None]:
# Report the size of every subsample of batch 5 and plot its cell type annotations
for batch_idx in [5]:
    for subsample_pct in [50, 25, 10, 5, 1]:
        adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct_batch{batch_idx}.h5ad")

        # Name the subsample in the label so the five outputs can be told apart
        print(f"Exploring dataset {dataset}_subsample_{subsample_pct}pct_batch{batch_idx}.")
        print(f"Number of nodes (cells): {adata.X.shape[0]}")
        print(f"Number of node features (genes): {adata.X.shape[1]}")

        # Visualize cell-level annotated data in physical space
        sq.pl.spatial_scatter(adata, color=cell_type_key, shape=None, figsize=(12, 12))

##### 2.4.2.3 ST Data Modified

This is not used in the manuscript.

In [None]:
# Report the size of every modified batch and plot its cell type annotations
batch_indeces = list(np.arange(8) + 1)
for batch_idx in batch_indeces:
    adata_modified = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_modified_batch{batch_idx}.h5ad")

    # The label names the modified file that is actually loaded
    print(f"Exploring dataset {dataset}_modified_batch{batch_idx}.")
    print(adata_modified.obs["patient"])
    print(f"Number of nodes (cells): {adata_modified.X.shape[0]}")
    print(f"Number of node features (genes): {adata_modified.X.shape[1]}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata_modified, color=cell_type_key, shape=None, figsize=(12, 12))

### 2.5 STARmap PLUS Mouse Central Nervous System

- **Publication:** Shi, H. et al. Spatial Atlas of the Mouse Central Nervous System at Molecular Resolution. bioRxiv 2022.06.20.496914 (2022). [doi:10.1101/2022.06.20.496914](https://doi.org/10.1101/2022.06.20.496914)
- **Data Access:** https://singlecell.broadinstitute.org/single_cell/study/SCP1830
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze/starmap_plus_mouse_cns  
         │  ├── sagittal1raw_expression_pd.csv (gene expression)
         │  ├── imputation_sagittal1.h5ad (imputed gene expression)
         │  ├── sagittal1_spatial.csv (spatial coordinates)
         │  ├── ...
      ```
- **Summary:**
    - Individual Sample Method Benchmarking
        - Sagittal section of mouse brain (batch1)
        - 91,246 observations on cell level with cell type annotations
        - 1022 genes
    - Sample Integration Method Benchmarking
        - batch1, ..., batch20
        - x observations on cell level with cell type annotations
        - 1022 genes    
    - Analysis
        - 20 transduced CNS tissue slices
        - 1,091,527 observations on cell level with cell type annotations
        - 11,844 imputed genes

In [None]:
# Dataset identifier; used below to build the bronze input and gold output file paths
dataset = "starmap_plus_mouse_cns"
# Observation column used to color the spatial scatter plots
cell_type_key = "Main_molecular_cell_type"
# Folder with the raw files of this dataset; the trailing slash is needed
# because file names are appended to this string directly
starmap_plus_dir = f"{st_data_bronze_folder_path}/{dataset}/"

#### 2.5.1 Load & Preprocess Raw Data

##### 2.5.1.1 Spatial Transcriptomics Data

In [None]:
# Batch names in processing order; the i-th batch (1-based) is written to
# '<dataset>_batch<i>.h5ad'
batch_indeces = list(np.arange(20) + 1)
batches = [
    "sagittal1", "sagittal2", "sagittal3", "spinalcord",
    "well01OB", "well01brain", "well03", "well04",
    "well05", "well06", "well07", "well08",
    "well09", "well10", "well11", "well1_5",
    "well2_5", "well3_5", "well7_5", "well10_5",
]

for batch_idx, batch in zip(batch_indeces, batches):
    print(f"Loading, preprocessing and saving batch '{batch}'.")

    # Read counts, index them by the 'GENE' column and transpose so that the
    # columns of the file become the observations
    counts = pd.read_csv(f"{starmap_plus_dir}{batch}raw_expression_pd.csv").set_index("GENE").T

    # Read metadata indexed by 'NAME'; the second line of the file is skipped
    metadata = pd.read_csv(f"{starmap_plus_dir}{batch}_spatial.csv", skiprows=[1]).set_index("NAME")

    # Attach metadata to the observations, move the spatial coordinates to
    # `obsm` and keep the raw counts in `layers`
    adata = ad.AnnData(counts, dtype=np.float32)
    adata.obs = adata.obs.join(metadata, how="left")
    spatial_coords = adata.obs[["X", "Y"]].values / 1000 # smaller scale for plotting
    adata.obsm["spatial"] = spatial_coords
    adata.obs = adata.obs.drop(columns=["X", "Y", "Z"])
    adata.X = sp.csr_matrix(adata.X) # sparse row format
    adata.layers["counts"] = adata.X.copy()

    # Log normalize counts
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Label observations with their batch and dataset
    adata.obs["batch"] = batch
    adata.obs["dataset"] = "cns"

    # Store adata to disk
    adata.write(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad")

##### 2.5.1.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Create subsamples of all batches for method benchmarking.
#
# Each batch is loaded and preprocessed once and every subsample fraction is
# then sliced from the same preprocessed object. Loading and preprocessing do
# not depend on the fraction, so this writes the same files as re-running the
# full pipeline for every fraction while reading the raw CSV files only once
# per batch.
subsample_pcts = [50, 25, 10, 5, 1]
batch_indeces = list(np.arange(20) + 1)
batches = ["sagittal1",
           "sagittal2",
           "sagittal3",
           "spinalcord",
           "well01OB",
           "well01brain",
           "well03",
           "well04",
           "well05",
           "well06",
           "well07",
           "well08",
           "well09",
           "well10",
           "well11",
           "well1_5",
           "well2_5",
           "well3_5",
           "well7_5",
           "well10_5"]

for batch_idx, batch in zip(batch_indeces, batches):
    print(f"Loading, preprocessing and saving batch '{batch}'.")

    # Read counts and change format
    counts = pd.read_csv(starmap_plus_dir + f"{batch}raw_expression_pd.csv")
    counts.set_index("GENE", inplace=True)
    counts = counts.T

    # Read metadata and change format
    metadata = pd.read_csv(starmap_plus_dir + f"{batch}_spatial.csv", skiprows=[1])
    metadata.set_index("NAME", inplace=True)

    # Merge counts with metadata and store counts in `layers` and spatial coords in `obsm`
    adata = ad.AnnData(counts, dtype=np.float32)
    adata.obs = pd.merge(adata.obs, metadata, how="left", left_index=True, right_index=True)
    adata.obsm["spatial"] = adata.obs[["X", "Y"]].values / 1000 # smaller scale for plotting
    adata.obs.drop(columns=["X", "Y", "Z"], inplace=True)
    adata.X = sp.csr_matrix(adata.X) # sparse row format
    adata.layers["counts"] = adata.X.copy()

    # Preprocess adata
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.obs["batch"] = batch

    # Keep the preprocessed batch so that every fraction is sliced from the same object
    adata_full = adata
    n_cells_full = len(adata_full)

    for subsample_pct in subsample_pcts:
        print(f"Processing subsample {subsample_pct}pct...")

        # Take the first 'subsample_pct' percent of the cells
        adata = adata_full[:int(subsample_pct/100 * n_cells_full),:]

        # Store adata to disk
        adata.write(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct_batch{batch_idx}.h5ad")

##### 2.5.1.3 Imputed Data

This is not used in the manuscript.

In [None]:
# Convert the imputed expression of the three sagittal batches into sparse
# AnnData files that carry the same metadata and spatial coordinates as the
# measured data. Requires the gold files '<dataset>_batch<i>.h5ad' of these
# batches to exist already (they are read below to derive the sparsity level).
batch_indeces = [1, 2, 3]
batches = ["sagittal1", "sagittal2", "sagittal3"]
for batch_idx, batch in zip(batch_indeces, batches):
    print(f"Loading, preprocessing and saving batch '{batch}'.")    
    
    # Read adata
    adata_imputed = sc.read_h5ad(starmap_plus_dir + f"imputation_{batch}.h5ad")

    # Read metadata and change format (the second line of the file is skipped)
    metadata = pd.read_csv(starmap_plus_dir + f"{batch}_spatial.csv", skiprows=[1])
    metadata.set_index("NAME", inplace=True)

    # Merge adata with metadata
    adata_imputed.obs = pd.merge(adata_imputed.obs,
                                 metadata,
                                 how="left",
                                 left_index=True,
                                 right_index=True)
    
    # Format adata
    # NOTE(review): the steps below treat 'X' as a dense array that was scaled
    # with sc.pp.scale() - confirm against the imputation output
    adata_imputed.var_names = adata_imputed.var["Gene"]
    adata_imputed.X = adata_imputed.X.astype(np.float32)
    adata_imputed.X = (
        adata_imputed.X * np.array(adata_imputed.var["std"])) # Undo std scaling of data with sc.pp.scale()
    # Shift every gene by the absolute value of its minimum. This approximates
    # undoing the mean centering: 'var["mean"]' is not used, so the original
    # means are not restored exactly.
    adata_imputed.X = (
        adata_imputed.X + np.abs(np.array(adata_imputed.X.min(axis=0)))) # Undo mean scaling of data with sc.pp.scale()
    
    # Threshold gene expression values to create a sparse row matrix
    # The measured data of the same batch provides the target sparsity: the
    # average number of non-zero entries per gene.
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad")
    avg_nnz_per_gene = round(adata.X.nnz / adata.X.shape[1]) 
    zero_threshs = np.partition(
        adata_imputed.X, -avg_nnz_per_gene, axis=0)[-avg_nnz_per_gene,:] # threshhold based on the 'avg_nnz_per_gene'th
                                                                         # largest value per gene to arrive at an equal nnz per gene
    # Values below the per-gene threshold are zeroed; values equal to the
    # threshold are kept, so ties can leave more than 'avg_nnz_per_gene' entries
    adata_imputed.X[adata_imputed.X < zero_threshs] = 0
    adata_imputed.X = sp.csr_matrix(adata_imputed.X)
    adata_imputed.var.drop(columns=["Gene", "mean", "std"], inplace=True)
    adata_imputed.obsm["spatial"] = adata_imputed.obs[["X", "Y"]].values / 1000 # smaller scale for plotting with squidpy
    adata_imputed.obs.drop(columns=["X", "Y", "Z"], inplace=True)
    adata_imputed.obs["batch"] = batch

    # Store adata to disk
    adata_imputed.write(f"{st_data_gold_folder_path}/{dataset}_imputed_batch{batch_idx}.h5ad")

#### 2.5.2 Explore Data

##### 2.5.2.1 Spatial Transcriptomics Data

In [None]:
# Report the size of every processed batch and plot its cell type annotations
batch_indeces = list(np.arange(20) + 1)
for batch_idx in batch_indeces:
    gold_file_path = f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad"
    adata = sc.read_h5ad(gold_file_path)
    n_cells, n_genes = adata.X.shape

    print(f"Exploring dataset {dataset}_batch{batch_idx}.")
    print(f"Number of nodes (cells): {n_cells}")
    print(f"Number of node features (genes): {n_genes}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata, shape=None, color=cell_type_key, figsize=(12, 12))

##### 2.5.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Report the size of every subsample of batch 1 and plot its cell type annotations
for batch_idx in [1]:
    for subsample_pct in [50, 25, 10, 5, 1]:
        adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_subsample_{subsample_pct}pct_batch{batch_idx}.h5ad")

        # Name the subsample in the label so the five outputs can be told apart
        print(f"Exploring dataset {dataset}_subsample_{subsample_pct}pct_batch{batch_idx}.")
        print(f"Number of nodes (cells): {adata.X.shape[0]}")
        print(f"Number of node features (genes): {adata.X.shape[1]}")

        # Visualize cell-level annotated data in physical space
        sq.pl.spatial_scatter(adata, color=cell_type_key, shape=None, figsize=(12, 12))

##### 2.5.2.3 Imputed Data

In [None]:
# Report the size of every imputed batch and plot its cell type annotations
batch_indeces = [1, 2, 3]
for batch_idx in batch_indeces:
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_imputed_batch{batch_idx}.h5ad")

    # The label names the imputed file that is actually loaded
    print(f"Exploring dataset {dataset}_imputed_batch{batch_idx}.")
    print(f"Number of nodes (cells): {adata.X.shape[0]}")
    print(f"Number of node features (genes): {adata.X.shape[1]}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata, color=cell_type_key, shape=None, figsize=(12, 12))

### 2.6 Xenium Human Breast Cancer

- **Publication:** Janesick, A. et al. High resolution mapping of the breast cancer tumor microenvironment using integrated single cell, spatial and in situ analysis of FFPE tissue. bioRxiv 2022.10.06.510405 (2022) doi:10.1101/2022.10.06.510405.
- **Data Access:** 
    - https://www.10xgenomics.com/products/xenium-in-situ/preview-dataset-human-breast
    - Only sample 1 has been used but there is an additional sample
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze/xenium_human_breast_cancer
         │  ├── sample1/rep1
         │      ├── cell_feature_matrix.h5
         │      ├── cells.csv.gz
         │  ├── sample1/rep2
         │      ├── cell_feature_matrix.h5
         │      ├── cells.csv.gz
      ```
- **Summary:**
    - 2 FFPE tissue slice replicates of human breast cancer
    - 896,638 observations at cellular resolution without annotations
        - Sample 1, Replicate 1: 164,000 observations
        - Sample 1, Replicate 2: 118,363 observations
    - 313 probed genes

In [None]:
# Dataset identifier; used below to build the bronze input and gold output file paths
dataset = "xenium_human_breast_cancer"
# Observation column used to color the spatial scatter plots; this column is
# created further below from the Leiden clusters
cell_type_key = "cell_states"

#### 2.6.1 Load & Preprocess Raw Data

In [None]:
# Load both replicates of sample 1, attach the cell metadata and combine them
xenium_dir = f"{st_data_bronze_folder_path}/{dataset}/"
rep1_dir = xenium_dir + "sample1/rep1/"
rep2_dir = xenium_dir + "sample1/rep2/"

adata_rep1 = sc.read_10x_h5(filename=rep1_dir + "cell_feature_matrix.h5")
metadata_rep1 = pd.read_csv(rep1_dir + "cells.csv.gz")
adata_rep2 = sc.read_10x_h5(filename=rep2_dir + "cell_feature_matrix.h5")
metadata_rep2 = pd.read_csv(rep2_dir + "cells.csv.gz")

# Replicate 1: the metadata rows are matched to the cells by position, then
# the cell centroids are stored as spatial coordinates
metadata_rep1.index = adata_rep1.obs_names
adata_rep1.obs = metadata_rep1.copy()
adata_rep1.obsm["spatial"] = adata_rep1.obs[["x_centroid", "y_centroid"]].to_numpy(copy=True)

print(f"Step 1 Rep 1: Number of cells: {len(adata_rep1)}")
print(f"Step 1 Rep 1: Number of genes: {len(adata_rep1.var_names)}")

# Replicate 2: same treatment
metadata_rep2.index = adata_rep2.obs_names
adata_rep2.obs = metadata_rep2.copy()
adata_rep2.obsm["spatial"] = adata_rep2.obs[["x_centroid", "y_centroid"]].to_numpy(copy=True)

print(f"Step 1 Rep 2: Number of cells: {len(adata_rep2)}")
print(f"Step 1 Rep 2: Number of genes: {len(adata_rep2.var_names)}")

# Combine the replicates on their shared genes; the replicate of each cell is
# recorded in the 'replicates' column
adata = adata_rep1.concatenate(
    adata_rep2,
    batch_key='replicates',
    batch_categories=['Rep_1', 'Rep_2'],
    join='inner')

print(f"Step 1: Number of cells: {len(adata)}")
print(f"Step 1: Number of genes: {len(adata.var_names)}")

In [None]:
# Perform basic QC analysis à la Squidpy
sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True)

# Express the control probe and control codeword counts as a percentage of
# the total counts over all cells
total_transcripts = adata.obs["total_counts"].sum()
cprobes = adata.obs["control_probe_counts"].sum() / total_transcripts * 100
cwords = adata.obs["control_codeword_counts"].sum() / total_transcripts * 100
print(f"Negative DNA probe count % : {cprobes}")
print(f"Negative decoding count % : {cwords}")

In [None]:
# Plot distribution of total transcripts per cell, unique transcripts per cell,
# area of segmented cells and the ratio of nuclei area to their cells
fig, axs = plt.subplots(1, 4, figsize=(15, 4))

# Both areas of the nucleus ratio must come from the combined object: pandas
# aligns the two series on the cell index, and dividing by the replicate-1
# areas would not cover the cells of the combined object
qc_distributions = [
    ("Total transcripts per cell", adata.obs["total_counts"]),
    ("Unique transcripts per cell", adata.obs["n_genes_by_counts"]),
    ("Area of segmented cells", adata.obs["cell_area"]),
    ("Nucleus ratio", adata.obs["nucleus_area"] / adata.obs["cell_area"]),
]

for ax, (title, values) in zip(axs, qc_distributions):
    ax.set_title(title)
    sns.histplot(
        values,
        kde=False,
        ax=ax,
    )

In [None]:
# Filter cells and genes; the filters are applied one after the other in the
# listed order
qc_filters = [
    (sc.pp.filter_cells, {"min_counts": 10}),
    (sc.pp.filter_cells, {"min_genes": 3}),
    (sc.pp.filter_genes, {"min_counts": 1}),
    (sc.pp.filter_genes, {"min_cells": 2}),
]
for apply_filter, threshold in qc_filters:
    apply_filter(adata, **threshold)

print(f"Step 2: Number of cells: {len(adata)}")
print(f"Step 2: Number of genes: {len(adata.var_names)}")

In [None]:
# Normalize and transform data
# Keep a copy of the unnormalized values in the 'counts' layer before
# 'adata.X' is overwritten by the two steps below
adata.layers["counts"] = adata.X.copy()
# No 'target_sum' is passed here, unlike the 'target_sum=1e4' used for the
# other datasets in this notebook
sc.pp.normalize_total(adata, inplace = True)
sc.pp.log1p(adata)

In [None]:
# Visualize manifold
# Compute a PCA, a neighbor graph, Leiden clusters and a UMAP embedding.
# Every step gets a fixed 'random_state' (presumably for reproducibility).
# The cell state names assigned further below are matched to the Leiden
# clusters by position, so a change to these parameters or seeds can
# invalidate that annotation.
sc.pp.pca(adata, n_comps = 50, random_state = 1712)
sc.pp.neighbors(adata, n_neighbors = 50, random_state = 1769, method = 'umap')
sc.tl.leiden(adata, resolution = 2, random_state = 1786)
sc.tl.umap(adata, min_dist = 0.3, spread = 2, random_state = 1789)

In [None]:
# Show the UMAP colored by the two QC metrics and by the Leiden clusters
umap_color_keys = ["total_counts", "n_genes_by_counts", "leiden"]
sc.pl.umap(adata, color=umap_color_keys, wspace=0.4, frameon=False, size=1)

In [None]:
# Characterize clusters: rank the top 100 genes of every Leiden cluster with a
# Wilcoxon test and collect the results in one table
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon', n_genes=100, use_raw=False)
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

# One column per cluster and statistic, named '<cluster>_<first letter of statistic>'
marker_columns = {}
for group in groups:
    for key in ('names', 'pvals_adj', 'logfoldchanges'):
        marker_columns[f"{group}_{key[:1]}"] = result[key][group]
markers_cells = pd.DataFrame(marker_columns)
markers_cells.head(5)

In [None]:
# Show the Leiden cluster labels; the cell state names assigned in the next
# cell are matched to these categories by position
adata.obs['leiden'].cat.categories

In [None]:
# Cell state names, one per Leiden cluster, in the order of the Leiden categories
cell_state_names = [
    'Epi_ABCC11+', 'EC_CLEC14A+', 'adipo_FB', 'tcm_CD4+T', 'CD161+FOXP3+T',
    'eff_CD8+T1', 'ADIPOQ+Mast', 'Epi_FOXA1+', 'GJB2+iKC-FB', 'EMT-Epi1_CEACAM6+',
    'DC1', 'M2MØ', 'Epi_AGR3+', 'Epi_CENPF+', 'mgEpi_KRT14+',
    'DERL3+B', 'EMT-Epi2_CEACAM6+', 'EMT-Epi_SERPINA3+', 'BANK1+B', 'EMT-Epi_KRT23+',
    'MMP12+miMØ', 'eff_CD8+T2', 'B', 'Epi_KRT14+', 'NK/T',
    'EC_CAVIN2+',
]

# Derive the cell states from the Leiden clusters by renaming the categories
adata.obs['cell_states'] = adata.obs['leiden'].cat.rename_categories(cell_state_names)
sc.pl.umap(adata, color=['cell_states'], size=0.8, legend_fontsize=6, legend_loc='on data', frameon=False)

In [None]:
# Split the combined object back into its two replicates
replicate_labels = adata.obs['replicates']
adata_rep1 = adata[replicate_labels == 'Rep_1']
adata_rep2 = adata[replicate_labels == 'Rep_2']

In [None]:
# Store the two replicates as batch 1 and batch 2 of this dataset
adata_rep1.write(f"{st_data_gold_folder_path}/{dataset}_batch1.h5ad")
adata_rep2.write(f"{st_data_gold_folder_path}/{dataset}_batch2.h5ad")

#### 2.6.2 Explore Data

In [None]:
# Report the size of both replicates and plot their cell state annotations
batch_indeces = [1, 2]
for batch_idx in batch_indeces:
    gold_file_path = f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad"
    adata = sc.read_h5ad(gold_file_path)
    n_cells, n_genes = adata.X.shape

    print(f"Exploring dataset {dataset}_batch{batch_idx}.")
    print(f"Number of nodes (cells): {n_cells}")
    print(f"Number of node features (genes): {n_genes}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata, shape=None, color=cell_type_key, figsize=(12, 12))

### 2.7 Spatial ATAC-RNA-Seq Mouse Embryo & Brain

- **Publication:** Zhang, D. et al. Spatial epigenome–transcriptome co-profiling of mammalian tissues. Nature 1–10 (2023): https://www.nature.com/articles/s41586-023-05795-1.
- **Data Access:** 
    - RNA raw counts & spatial coordinates: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE205055
    - Cell type labels & ATAC peaks raw counts: https://brain-spatial-omics.cells.ucsc.edu
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze
         │  └── spatial_atac_rna_seq_mouse_embryo
         │    ├── GSM6799937_ME13_50um_matrix_merge.tsv.gz (E13 RNA-seq raw counts)    
         │    ├── tissue_positions_list_e13.csv (E13 spatial coordinates)
         │    ├── meta_e13.tsv (E13 metadata)
         │  └── spatial_atac_rna_seq_mouse_brain
         │    ├── GSM6204636_MouseBrain_20um_matrix.tsv.gz (P21 RNA-seq raw counts)
         │    ├── GSM6753043_MouseBrain_20um_100barcodes_ATAC_matrix.tsv.gz (P22 RNA-seq raw counts)
         │    ├── tissue_positions_list_p21.csv (P21 spatial coordinates)
         │    ├── tissue_positions_list_p22.csv (P22 spatial coordinates)
         │    ├── meta_p21.tsv (P21 metadata)
         │    ├── meta_p22.tsv (P22 metadata)
         ├── ../datasets/st_data/silver
         │  ├── spatial_atac_rna_seq_mouse_embryo_e13.h5ad (E13 ATAC peaks raw counts)
         │  ├── spatial_atac_rna_seq_mouse_brain_p22.h5ad (P22 ATAC peaks raw counts)
      ```
- **Summary:**
    - Embryonic day 13 mouse embryo (E13; pixel size 50 μm), mouse postnatal day 21 & 22 brains (P21 & P22; pixel size 20 μm)
    - We use the preprocessed ATAC peaks counts. Preprocessing has been done here: https://github.com/di-0579/Spatial_epigenome-transcriptome_co-sequencing/blob/main/Data_visualization/01_joint_P21C_RNA.R. The preprocessed Seurat objects are converted to adata with the script located at ```nichecompass-reproducibility/scripts/data_preparation/spatial_atac_rna_seq_mouse_embryo_and_brain_data_preparation.R```.
    - Also available: histone modification modality for mouse tissues; adult human brain hippocampus tissue (pixel size: 50 μm)

In [None]:
# Dataset identifiers of the embryo and the brain data; used below to build
# the input and output file paths
dataset_embryo = "spatial_atac_rna_seq_mouse_embryo"
dataset_brain = "spatial_atac_rna_seq_mouse_brain"

#### 2.7.1 Load & Preprocess Raw Data

In [None]:
# Write brain dataset adata to disk separated by batches
batch_indeces = list(np.arange(2) + 1)
batches = ["p21",
           "p22"]
rna_file_names = ["GSM6204636_MouseBrain_20um_matrix.tsv.gz",
                  "GSM6753043_MouseBrain_20um_100barcodes_ATAC_matrix.tsv.gz"]

for batch_idx, batch, rna_file_name in zip(batch_indeces,
                                           batches,
                                           rna_file_names):
    # Load spatial coordinates. The file is read without a header row, so its
    # columns are labeled 0..5; column 0 is used as index.
    df_spatial = pd.read_csv(
        f"{st_data_bronze_folder_path}/{dataset_brain}/tissue_positions_list_{batch}.csv",
        header=None)
    df_spatial.index = df_spatial[0]

    # Load metadata with cell labels
    df_meta = pd.read_csv(
        f"{st_data_bronze_folder_path}/{dataset_brain}/meta_{batch}.tsv",
        sep="\t",
        index_col=0)

    # Load and format RNA-seq data. The matrix is transposed, so the rows of
    # the text file become the variables and its columns the observations.
    adata_rna_tmp = sc.read_text(
        f"{st_data_bronze_folder_path}/{dataset_brain}/{rna_file_name}")
    adata_rna = sc.AnnData(
        sp.csr_matrix(adata_rna_tmp.X.T, dtype=np.float32))
    adata_rna.var = adata_rna_tmp.obs
    adata_rna.obs = adata_rna_tmp.var
    del adata_rna_tmp
    adata_rna.obs = adata_rna.obs.join(df_spatial)
    # Columns 4 and 5 of the positions file are used as spatial coordinates
    adata_rna.obsm["spatial"] = adata_rna.obs[[4, 5]].values
    adata_rna.obs = adata_rna.obs.drop([0, 1, 2, 3, 4, 5], axis=1)
    adata_rna.obs = adata_rna.obs.join(df_meta)

    # Store gene expression counts in 'counts' layer
    adata_rna.layers["counts"] = adata_rna.X

    # Add batch
    adata_rna.obs["batch"] = batch

    print(f"Step 1: Number of cells: {len(adata_rna)}")
    print(f"Step 1: Number of genes: {len(adata_rna.var_names)}")

    adata_rna.write(f"{st_data_gold_folder_path}/{dataset_brain}_batch{batch_idx}.h5ad")

    # Load and format ATAC preprocessed data.
    # Only a missing file is treated as "no ATAC data for this batch"; errors
    # while processing an existing file are raised instead of being reported
    # as a missing file.
    atac_file_path = f"{st_data_silver_folder_path}/{dataset_brain}_{batch}.h5ad"
    if os.path.isfile(atac_file_path):
        adata_atac = sc.read_h5ad(atac_file_path)
        adata_atac.X = adata_atac.X.astype(np.float32)
        # Discard all existing obs / var annotations and the UMAP embedding
        adata_atac.obs = adata_atac.obs.drop(adata_atac.obs.columns, axis=1)
        adata_atac.var = adata_atac.var.drop(adata_atac.var.columns, axis=1)
        del adata_atac.obsm["X_umap"]
        adata_atac.obs = adata_atac.obs.join(df_spatial)
        adata_atac.obsm["spatial"] = adata_atac.obs[[4, 5]].values
        adata_atac.obs = adata_atac.obs.drop([0, 1, 2, 3, 4, 5], axis=1)
        adata_atac.obs = adata_atac.obs.join(df_meta)
        # Replace the first '-' of every feature name by ':'
        adata_atac.var.index = adata_atac.var.index.str.replace("-", ":", 1)
        # Rename the '_index' column of the raw var table
        # NOTE(review): presumably needed because '_index' cannot be written to h5ad - confirm
        adata_atac.__dict__['_raw'].__dict__['_var'] = adata_atac.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})

        # Store chromatin accessibility counts in 'counts' layer
        adata_atac.layers["counts"] = adata_atac.X

        # Add batch
        adata_atac.obs["batch"] = batch

        # Bring 'adata_atac' in same order as 'adata_rna'
        adata_atac = adata_atac[adata_rna.obs.index, :]

        print(f"Step 1: Number of cells: {len(adata_atac)}")
        print(f"Step 1: Number of genes: {len(adata_atac.var_names)}")

        adata_atac.write(f"{st_data_gold_folder_path}/{dataset_brain}_batch{batch_idx}_atac.h5ad")
        del adata_atac
    else:
        print(f"Could not find preprocessed ATAC data for batch {batch}.")

    # Free the per-batch objects before the next batch is loaded
    del df_spatial
    del df_meta
    del adata_rna

    gc.collect()

#### 2.7.2 Explore Data

##### 2.7.2.1 Spatial Multi-omics Data (RNA)

In [None]:
# Report the size of the embryo RNA data and plot its RNA clusters
embryo_rna_file_path = f"{st_data_gold_folder_path}/{dataset_embryo}_rna.h5ad"
adata_rna = sc.read_h5ad(embryo_rna_file_path)
n_cells, n_genes = adata_rna.X.shape

print(f"Exploring dataset {dataset_embryo}_rna.")
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Visualize cell-level annotated data in physical space
sq.pl.spatial_scatter(adata_rna, shape=None, color="RNA_clusters")

In [None]:
# Report the size of the brain RNA data of every batch and plot its RNA clusters.
# The RNA data is read from '<dataset>_batch<i>.h5ad', which is the file name
# under which the preprocessing cell of this notebook stores it.
for batch_idx in batch_indeces:
    adata_rna = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset_brain}_batch{batch_idx}.h5ad")

    print(f"Exploring dataset {dataset_brain}_batch{batch_idx}_rna.")
    print(f"Number of nodes (cells): {adata_rna.X.shape[0]}")
    print(f"Number of node features (genes): {adata_rna.X.shape[1]}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata_rna, color="RNA_clusters", shape=None)

##### 2.7.2.2 Spatial Multi-omics Data (ATAC)

In [None]:
# Report the size of the embryo ATAC data and plot its ATAC clusters
adata_atac = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset_embryo}_atac.h5ad")

# The embryo data is not split into batches, so the label carries no batch index
print(f"Exploring dataset {dataset_embryo}_atac.")
print(f"Number of nodes (cells): {adata_atac.X.shape[0]}")
print(f"Number of node features (genes): {adata_atac.X.shape[1]}")

# Visualize cell-level annotated data in physical space
sq.pl.spatial_scatter(adata_atac, color="ATAC_clusters", shape=None)

In [None]:
# Report the size of the brain ATAC data of every batch and plot its ATAC
# clusters. Batches without a preprocessed ATAC file are skipped with a
# message; any other error is raised instead of being reported as a missing file.
for batch_idx in batch_indeces:
    atac_file_path = f"{st_data_gold_folder_path}/{dataset_brain}_batch{batch_idx}_atac.h5ad"
    if not os.path.isfile(atac_file_path):
        print(f"Could not find preprocessed ATAC data for batch {batch_idx}.")
        continue

    adata_atac = sc.read_h5ad(atac_file_path)

    print(f"Exploring dataset {dataset_brain}_batch{batch_idx}_atac.")
    print(f"Number of nodes (cells): {adata_atac.X.shape[0]}")
    print(f"Number of node features (genes): {adata_atac.X.shape[1]}")

    # Visualize cell-level annotated data in physical space
    sq.pl.spatial_scatter(adata_atac, color="ATAC_clusters", shape=None)

### 2.8 MERFISH Mouse Brain

- **Publication:** Zhang, M. et al. Molecularly defined and spatially resolved cell atlas of the whole mouse brain. Nature 624, 343–354 (2023)
- **Data Access:** https://cellxgene.cziscience.com/collections/0cca8620-8dee-45d0-aef5-23f032a5cf09
- **Data Structure**:
    - ```
         ├── ../datasets/st_data/bronze/merfish_mouse_brain
         │  ├── WB_MERFISH_animal1_coronal.h5ad
         │  ├── WB_MERFISH_animal2_coronal.h5ad
         │  ├── WB_MERFISH_animal3_sagittal.h5ad
         │  ├── WB_MERFISH_animal4_sagittal.h5ad
      ```
- **Summary:**
    - 239 coronal and sagittal sections from four mice
    - 8.4 million filtered cells
    - 1,122 probed genes

In [None]:
# Dataset identifier; used below to build the bronze input and gold output file paths
dataset = "merfish_mouse_brain"

#### 2.8.1 Load & Preprocess Raw Data

In [None]:
# Split every input file into one output file per brain section. Batch numbers
# start at 1 and continue across the input files via 'batch_idx_increment'.
batch_idx_increment = 1

merfish_files = ["animal1_coronal",
                 "animal2_coronal",
                 "animal3_sagittal",
                 "animal4_sagittal"]

for file in merfish_files:
    adata = sc.read_h5ad(f"{st_data_bronze_folder_path}/{dataset}/WB_MERFISH_{file}.h5ad")

    print(f"Step 1 Rep 1: Number of cells: {len(adata)}")
    print(f"Step 1 Rep 1: Number of genes: {len(adata.var_names)}")

    label_encoder = LabelEncoder()

    # Index the genes by 'gene_name' and keep the previous index as 'gene_id'
    adata.var["gene_id"] = adata.var.index
    adata.var.index = adata.var["gene_name"]
    adata.var = adata.var[["gene_id"]]

    # Store the coordinates under 'spatial' instead of 'X_spatial'
    adata.obsm["spatial"] = adata.obsm["X_spatial"]
    del adata.obsm["X_spatial"]

    # Take the 'counts' layer from the raw attribute
    adata.layers["counts"] = adata.raw.X

    # Keep four obs columns and derive the batch from the brain section label:
    # the sections of this file are encoded as 0, 1, ... and offset by the
    # number of batches assigned to the previous files
    adata.obs = adata.obs[["donor_id", "brain_section_label", "tissue", "cell_type"]]
    section_codes = label_encoder.fit_transform(adata.obs['brain_section_label'])
    adata.obs["batch"] = [f"batch{code + batch_idx_increment}" for code in section_codes]

    # Write one file per batch; the file name carries the batch number only
    for batch in adata.obs["batch"].unique():
        in_batch = adata.obs["batch"] == batch
        adata_batch = adata[in_batch]
        batch_number = batch[len("batch"):]
        adata_batch.write(f"{st_data_gold_folder_path}/{dataset}_batch{batch_number}.h5ad")

    batch_idx_increment += adata.obs["batch"].nunique()

### 2.9 SDMBench Data

Download from http://sdmbench.drai.cn/:
- STARmap Mouse MPFC: Data ID 31
- StereoSeq Mouse Embryo: Data ID 13

In [None]:
# Dataset identifier; used by the next cell to locate the silver input file
# and to name the gold output file.
dataset = "starmap_mouse_mpfc"

In [None]:
adata = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}.h5ad")

# Store the expression matrix in CSR format, both as `counts` layer and as X
adata.layers["counts"] = sp.csr_matrix(adata.X)
adata.X = sp.csr_matrix(adata.X)

# Harmonize annotation column names: `ct` -> `cell_type`,
# `Region` -> `niche_type`; the lowercase `region` column is dropped
adata.obs["cell_type"] = adata.obs.pop("ct")
adata.obs["niche_type"] = adata.obs.pop("Region")
adata.obs.drop(columns=["region"], inplace=True)

# Single-sample dataset: every cell belongs to the same batch
adata.obs["batch"] = "batch1"

print(f"Step 1: Number of cells: {len(adata)}")
print(f"Step 1: Number of genes: {len(adata.var_names)}")

adata.write(f"{st_data_gold_folder_path}/{dataset}.h5ad")

In [None]:
# Dataset identifier; used by the next cell to locate the silver input file
# and to name the gold output file.
dataset = "stereoseq_mouse_embryo"

In [None]:
adata = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}.h5ad")

# Convert to CSR once and keep an independent copy of the counts.
# Storing `adata.X` itself in the layer and re-wrapping it with
# `sp.csr_matrix(...)` can leave the layer and X sharing the same data buffer
# (when the loaded X is already CSR), in which case the in-place
# normalization / log1p below would silently overwrite the stored counts.
adata.X = sp.csr_matrix(adata.X)
adata.layers["counts"] = adata.X.copy()

# Determine cell annotation as per squidpy vignette
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.48)

# Put the untouched counts back into X; the transformed matrix was only
# needed to compute the embedding and clustering above
adata.X = adata.layers["counts"]

# `ground_truth` becomes the niche label; the other annotation columns are
# dropped
adata.obs["niche_type"] = adata.obs["ground_truth"]
del adata.obs["ground_truth"]
del adata.obs["annotation"]
del adata.obs["Region"]

# Single-sample dataset: every cell belongs to the same batch
adata.obs["batch"] = "batch1"

print(f"Step 1 Rep 1: Number of cells: {len(adata)}")
print(f"Step 1 Rep 1: Number of genes: {len(adata.var_names)}")

adata.write(f"{st_data_gold_folder_path}/{dataset}.h5ad")

In [None]:
# Load a benchmarking result file for the STARmap dataset and display its
# summary.
# NOTE(review): this path is relative to `../artifacts`, unlike the
# `../../datasets` paths used elsewhere in this notebook — confirm it resolves
# from the notebook's working directory.
adata = sc.read_h5ad("../artifacts/single_sample_method_benchmarking/starmap_mouse_mpfc_nichecompass_gatv2conv.h5ad")
adata

In [None]:
# Drop the stored leiden colors before plotting. A default is passed to
# `pop` so the cell can be re-run (or run on a file without stored colors)
# without raising a KeyError.
adata.uns.pop('leiden_colors', None)

# Visualize the leiden clusters in physical space
sq.pl.spatial_scatter(adata,
                      color="leiden", # "niche_type",
                      shape=None,
                      figsize=(12, 12))

#### 2.9.1 Load & Preprocess Raw Data

## 3. Additional Data

This data is not used as part of the publication.

### 3.1 nanoString CosMx SMI Human Liver

- **Publication:** nanoString CosMx SMI Human Liver FFPE Dataset
- **Data Access:** https://nanostring.com/products/cosmx-spatial-molecular-imager/human-liver-rna-ffpe-dataset/
- **Data Structure**:
    - ```
         ├── ../datasets/st_data/bronze/nanostring_cosmx_human_liver
         │  ├── LiverDataRelease 
      ```
- **Preprocessing Vignette**: https://nanostring.com/wp-content/uploads/2023/01/LiverPublicDataRelease.html
- **Summary:**
    - 2 samples: normal human liver (healthy sample) & human hepatocellular carcinoma (disease sample)
    - 332,877 observations (healthy sample) & 460,441 observations (disease sample) on cell-level with cell-type annotations
    - 1000 genes

In [None]:
# Dataset identifier used to name the gold output files below
dataset = "nanostring_cosmx_human_liver"
# `obs` column used to color cells in the exploration plots below
cell_type_key = "cellType"

#### 3.1.1 Load & Preprocess Raw Data

In [None]:
config = tiledb.Config()
ctx = tiledb.Ctx(config)

# Read in SOMACollection
pySoma = tiledbsoma.SOMACollection(f"{st_data_bronze_folder_path}/nanostring_cosmx_human_liver/LiverDataRelease", ctx=ctx)
pySoma.keys()

# Raw counts
counts = pySoma['RNA'].X['counts'].csr()
counts

# Normalized counts
norm = pySoma['RNA_normalized'].X['data'].csr()
norm

# Cell metadata
obs = pySoma['RNA'].obs.df()
obs.head()

# Target transcript coordinates (loaded for inspection only; not used below)
transcriptCoords = tiledb.open_dataframe(pySoma['RNA'].obsm["transcriptCoords"].uri, ctx=ctx)
transcriptCoords

# Convert to adata: normalized values in X, raw counts in the `counts`
# layer, slide coordinates as spatial coordinates
coordinates = np.array(obs[["x_slide_mm", "y_slide_mm"]])
adata = ad.AnnData(norm, obs = obs, obsm={"spatial": coordinates}, dtype = "float32")
adata.layers["counts"] = counts
adata.obs = adata.obs[["fov", "cellType", "niche"]]
adata.var_names = pySoma["RNA"].var_names

adata.X = adata.X.astype(np.float32)
adata.layers["counts"] = adata.layers["counts"].astype(np.float32)

# Split into the two samples based on the cell ID prefix. The subsets are
# copied so that adding the `batch` column below modifies real objects
# instead of views. These objects were previously named `adata_batch1/2`
# while the code below used `adata_sample1/2`, which raised a NameError.
adata_sample1 = adata[adata.obs.index.str.startswith("c_1")].copy()
adata_sample2 = adata[adata.obs.index.str.startswith("c_2")].copy()

adata_sample1.obs["batch"] = "sample1"
adata_sample2.obs["batch"] = "sample2"

adata_sample1.write(f"{st_data_gold_folder_path}/{dataset}_batch1.h5ad")
adata_sample2.write(f"{st_data_gold_folder_path}/{dataset}_batch2.h5ad")

#### 3.1.2 Explore Data

In [None]:
# Reload both preprocessed samples from the gold folder
adata_sample1 = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_batch1.h5ad")
adata_sample2 = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_batch2.h5ad")

In [None]:
print(f"Exploring dataset {dataset}_sample1.")

# Report the dimensions of the raw count matrix
n_cells, n_genes = adata_sample1.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the cells at their spatial coordinates, colored by cell type
sq.pl.spatial_scatter(adata_sample1,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

In [None]:
print(f"Exploring dataset {dataset}_sample2.")

# Report the dimensions of the raw count matrix
n_cells, n_genes = adata_sample2.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the cells at their spatial coordinates, colored by cell type
sq.pl.spatial_scatter(adata_sample2,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

### 3.2 Vizgen MERFISH Mouse Brain Receptor

- **Publication:** Vizgen Data Release V1.0. May 2021
- **Data Access:** https://info.vizgen.com/mouse-brain-map?submissionGuid=a66ccb7f-87cf-4c55-83b9-5a2b6c0c12b9
    - Slice 1 replicate 1
        - cell_by_gene_S1R1.csv
        - cell_metadata_S1R1.csv
- **Preprocessing Vignette:** https://squidpy.readthedocs.io/en/latest/external_tutorials/tutorial_vizgen.html
- **Summary:**
    - Coronal section of mouse brain receptor map
    - 78,329 observations on cell-level without annotations
    - 649 probed genes

In [None]:
# Dataset identifier used to name the silver and gold output files below
dataset = "vizgen_merfish_mouse_brain"
# `obs` column used to color cells in the exploration plot; the leiden
# clustering is computed in the preprocessing cell below
cell_type_key = "leiden"

#### 3.2.1 Load & Preprocess Raw Data

In [None]:
print(f"Loading and preprocessing dataset '{dataset}'.")

# Read adata from files
merfish_dir = f"{st_data_bronze_folder_path}/merfish/"
adata = sq.read.vizgen(path=merfish_dir,
                       counts_file="datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_replicate1_cell_by_gene_S1R1.csv",
                       meta_file="datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_replicate1_cell_metadata_S1R1.csv")
# Preprocess as per squidpy vignette
# Make gene names unique in place. The previous `adata.var_names.unique()`
# only returned a new index that was discarded, leaving duplicates untouched.
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=(50, 100, 200, 300), inplace=True)

# QC histograms: total counts and detected genes per cell, each shown in
# full and restricted to the lower range
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata.obs["total_counts"],
             kde=False,
             ax=axs[0])
sns.histplot(adata.obs["total_counts"][adata.obs["total_counts"] < 10000],
             kde=False,
             bins=40,
             ax=axs[1])
sns.histplot(adata.obs["n_genes_by_counts"],
             kde=False,
             bins=60,
             ax=axs[2])
sns.histplot(adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 4000],
             kde=False,
             bins=60,
             ax=axs[3])

# Remove cells with fewer than 10 counts and genes seen in fewer than 10 cells
sc.pp.filter_cells(adata, min_counts=10)
sc.pp.filter_genes(adata, min_cells=10)

# Store raw counts in `layers`
adata.layers["counts"] = adata.X.copy()

# Store data to disk
adata.write(f"{st_data_silver_folder_path}/{dataset}.h5ad")

# Determine cell annotation as per squidpy vignette
# (highly variable genes are selected on the raw counts, before normalization)
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=4000)
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata)
sc.pl.umap(adata,
           color=["total_counts", "n_genes_by_counts", "leiden"])

# Store data to disk
adata.write(f"{st_data_gold_folder_path}/{dataset}.h5ad")

#### 3.2.2 Explore Data

In [None]:
print(f"Exploring dataset {dataset}.")

# Report the dimensions of the raw count matrix
n_cells, n_genes = adata.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the cells at their spatial coordinates, colored by cluster
sq.pl.spatial_scatter(adata,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

### 3.3 Vizgen MERFISH Human Liver Cancer

- **Publication:** Vizgen MERFISH FFPE Human Immuno-oncology Data Set,  May 2022
- **Data Access:** https://info.vizgen.com/ffpe-showcase?submissionGuid=bbe0d6ca-92ad-4257-bdcd-cbc6dee8219b
    - Liver cancer 1
        - cell_by_gene.csv
        - cell_metadata.csv
- **Summary:**
    - FFPE tissue section of human liver cancer
    - 480,592 observations on cell-level without annotations
    - 500 probed genes

In [None]:
# Dataset identifier used to name the silver and gold output files below
dataset = "vizgen_merfish_human_liver_cancer"
# `obs` column used to color cells in the exploration plot; the leiden
# clustering is computed in the preprocessing cell below
cell_type_key = "leiden"

#### 3.3.1 Load & Preprocess Raw Data

In [None]:
print(f"Loading and preprocessing dataset '{dataset}'.")

# Load the cell-by-gene table
vizgen_dir = f"{st_data_bronze_folder_path}/merfish/"
adata = ad.read_text(vizgen_dir + "HumanLiverCancerPatient1_cell_by_gene.csv",
                     delimiter=",",
                     first_column_names=True)

# Move the "Blank" control columns out of X into `obsm` and keep only the
# remaining genes, stored in sparse format
blank_genes = np.array(["Blank" in gene_name for gene_name in adata.var_names])
blank_values = adata[:, blank_genes].X.copy()
adata.obsm["blank_genes"] = pd.DataFrame(blank_values,
                                         columns=adata.var_names[blank_genes],
                                         index=adata.obs_names)
adata = adata[:, ~blank_genes].copy()
adata.X = sp.csr_matrix(adata.X)

# Load the cell metadata; the index is cast to string so that it can be
# joined with `adata.obs`
metadata = pd.read_csv(vizgen_dir + "HumanLiverCancerPatient1_cell_metadata.csv",
                       header=0,
                       index_col=0)
metadata = metadata.sort_index()
metadata.index = metadata.index.astype("str")

# Attach the metadata to the cells and move the cell centers to `obsm`
adata.obs = pd.merge(adata.obs,
                     metadata,
                     how="left",
                     left_index=True,
                     right_index=True)
adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].to_numpy()
adata.obs = adata.obs.drop(columns=["center_x", "center_y"])

# QC metrics and filtering of low-count cells and rarely detected genes
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata,
                           qc_vars=["mt"],
                           percent_top=(50, 100, 200, 300),
                           inplace=True)
sc.pp.filter_cells(adata, min_counts=50)
sc.pp.filter_genes(adata, min_cells=10)

# Keep the filtered counts in `layers` before X is transformed
adata.layers["counts"] = adata.X.copy()

# Save the filtered counts (silver)
adata.write(f"{st_data_silver_folder_path}/{dataset}.h5ad")

# Cluster the cells to obtain a leiden annotation
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata)
sc.pl.umap(adata,
           color=["total_counts", "n_genes_by_counts", "leiden"])

# Save the annotated data (gold)
adata.write(f"{st_data_gold_folder_path}/{dataset}.h5ad")

#### 3.3.2 Explore Data

In [None]:
print(f"Exploring dataset {dataset}.")

# Report the dimensions of the raw count matrix
n_cells, n_genes = adata.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the cells at their spatial coordinates, colored by cluster
sq.pl.spatial_scatter(adata,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

### 3.4 Vizgen MERFISH Human Colon Cancer

- **Publication:** Vizgen MERFISH FFPE Human Immuno-oncology Data Set,  May 2022
- **Data Access:** https://info.vizgen.com/ffpe-showcase?submissionGuid=bbe0d6ca-92ad-4257-bdcd-cbc6dee8219b
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze/vizgen_merfish_human_colon_cancer
         │  ├── HumanColonCancerPatient1_cell_by_gene.csv
         │  ├── HumanColonCancerPatient1_cell_metadata.csv            
         │  ├── HumanColonCancerPatient2_cell_by_gene.csv
         │  ├── HumanColonCancerPatient2_cell_metadata.csv
      ```
- **Summary:**
    - 2 FFPE tissue slices of human colon cancer from 2 patients
    - 1,495,039 observations at cellular resolution without annotations
        - Sample 1: P1, 677,451 observations
        - Sample 2: P2, 817,588 observations
    - 500 probed genes

In [None]:
# Dataset identifier; used below as bronze subfolder name and as prefix of
# the gold output files.
dataset = "vizgen_merfish_human_colon_cancer"

#### 3.4.1 Load & Preprocess Raw Data

##### 3.4.1.1 ST Data

In [None]:
# One gold file is written per patient
batches = ["Patient1", "Patient2"]

for batch_idx, batch in enumerate(batches):
    # Load the cell-by-gene table and the cell metadata of this patient
    gene_expr_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/HumanColonCancer{batch}_cell_by_gene.csv")
    metadata_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/HumanColonCancer{batch}_cell_metadata.csv")
    metadata_df.rename(columns={"Unnamed: 0": "cell"}, inplace=True)

    # Build the AnnData object from all columns except the cell ID and the
    # "Blank" control columns
    blank_cols = [col for col in gene_expr_df.columns if "Blank" in col]
    cols_to_be_removed = ["cell"] + blank_cols
    gene_cols = gene_expr_df.columns.difference(cols_to_be_removed)
    adata = ad.AnnData(gene_expr_df[gene_cols],
                       obs=gene_expr_df[["cell"]],
                       dtype="float32")

    # Store gene expression in sparse format
    adata.X = sp.csr_matrix(adata.X)

    # QC histograms: total counts and detected genes per cell, each shown in
    # full and restricted to the lower range
    sc.pp.calculate_qc_metrics(adata, percent_top=(50, 100, 200, 300), inplace=True)
    fig, axs = plt.subplots(1, 4, figsize=(15, 4))
    total_counts = adata.obs["total_counts"]
    n_genes_by_counts = adata.obs["n_genes_by_counts"]
    hist_specs = [
        (total_counts, {}),
        (total_counts[total_counts < 500], {"bins": 40}),
        (n_genes_by_counts, {"bins": 60}),
        (n_genes_by_counts[n_genes_by_counts < 50], {"bins": 60}),
    ]
    for ax, (values, hist_kwargs) in zip(axs, hist_specs):
        sns.histplot(values, kde=False, ax=ax, **hist_kwargs)

    # Discard cells with fewer than 50 counts
    sc.pp.filter_cells(adata, min_counts=50)

    # Look up the metadata of the remaining cells via their cell ID
    adata.obs = pd.merge(adata.obs, metadata_df, on=["cell"], how="left")

    # Keep only the cell centers (as spatial coordinates); all other `obs`
    # and `var` columns are removed
    adata.obsm["spatial"] = adata.obs[["center_x", "center_y"]].to_numpy()
    adata.obs = adata.obs.drop(columns=adata.obs.columns)
    adata.var = adata.var.drop(columns=adata.var.columns)

    adata.layers["counts"] = adata.X
    adata.obs["batch"] = batch.lower()
    adata.write(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx + 1}.h5ad")

### 3.5 EEL FISH Mouse Brain

- **Publication:** Borm, L. E. et al. Scalable in situ single-cell profiling by electrophoretic capture of mRNA using EEL FISH. Nat. Biotechnol. (2022) [doi:10.1038/s41587-022-01455-3](https://doi.org/10.1038/s41587-022-01455-3)
- **Data Access:** https://figshare.com/articles/dataset/EEL_Mouse_440_genes_single_cell_data/20310771
    - LBEXP20210718_EEL_Mouse_448_2_20220512.h5ad
- **Summary:**
    - Sagittal mouse brain section
    - 127,591 observations on cell-level with cluster annotations 
    - 440 genes

In [None]:
# Dataset identifier used to name the gold output file below
dataset = "eel_fish_mouse_brain"
# `obs` column used to color cells in the exploration plot below
cell_type_key = "Clusters"

#### 3.5.1 Load & Preprocess Raw Data

In [None]:
print(f"Loading and preprocessing dataset '{dataset}'.")

# Load the published AnnData object
adata = sc.read_h5ad(f"{st_data_bronze_folder_path}/eel_fish_mouse_brain/LBEXP20210718_EEL_Mouse_448_2_20220512.h5ad")

# Keep a copy of X as `counts` layer
adata.layers["counts"] = adata.X.copy()

# Use the `X`/`Y` columns as spatial coordinates (smaller scale for plotting
# with squidpy) and remove all coordinate columns from `obs`
adata.obsm["spatial"] = adata.obs[["X", "Y"]].to_numpy()
adata.obs = adata.obs.drop(columns=["X", "Y", "X_um", "Y_um"])

# Single-sample dataset: every cell gets the same batch label
adata.obs["batch"] = "sample1"

# Save the result (gold)
adata.write(f"{st_data_gold_folder_path}/{dataset}.h5ad")

#### 3.5.2 Explore Data

In [None]:
print(f"Exploring dataset {dataset}.")

# Report the dimensions of the count matrix
n_cells, n_genes = adata.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the cells at their spatial coordinates, colored by cluster
sq.pl.spatial_scatter(adata,
                      color=cell_type_key,
                      shape=None,
                      figsize=(12, 12))

### 3.6 Visium Human Heart

- **Publication:**
- **Data Access:**
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/silver
         │  ├── HCAHeartST8795933_D5LV_stereoscope_210202.h5ad (donor 5 left ventricle gene expression raw counts)
         │  ├── HCAHeartST9383352_D6LV_stereoscope_210202.h5ad (donor 6 sample 1 left ventricle gene expression raw counts)
         │  ├── HCAHeartST9383354_D6LV_stereoscope_210202.h5ad (donor 6 sample 2 left ventricle gene expression raw counts)
      ```
- **Summary:**

In [None]:
# Dataset identifier used as prefix of the gold output files below
dataset = "visium_human_heart"
# Names of the `obs` columns written by the preprocessing cells below
donor_key = "donor"
batch_key = "batch"
cell_type_key = "majority_cell_type"
# `obs` columns that are selected per cell type below; the column with the
# largest value in a row becomes that row's `majority_cell_type`.
# (presumably stereoscope cell type proportions, judging by the input file
# names — confirm)
cell_types = ["Adip1", "Adip2", "Adip3", "Adip4", "B",
   "B_cells", "B_follicular", "B_memory", "B_plasma", "CD4+T_cytox",
   "CD4+T_tem", "CD4T", "CD4T_Tfh", "CD4T_Th1", "CD4T_naive", "CD4T_reg",
   "CD8+T_cytox", "CD8+T_tem", "CD8T", "CD14+Mo", "CD16+Mo", "DC",
   "DOCK4+MØ1", "DOCK4+MØ2", "EC1_cap", "EC2_cap", "EC3_cap", "EC4_immune",
   "EC5_art", "EC6_ven", "EC7_atria", "EC8_ln", "FB1", "FB2", "FB3", "FB4",
   "FB5", "IL17RA+Mo", "ILC", "LYVE1+MØ1", "LYVE1+MØ2", "LYVE1+MØ3",
   "MAIT", "Mast", "Meso", "Mo_pi", "MØ_AgP", "MØ_mod", "NC1", "NC2",
   "NC3", "NC4", "NC5", "NC6", "NK", "NKT", "NK_ITGAD", "NØ", "PC1_vent",
   "PC2_atria", "PC3_str", "SMC1_basic", "SMC2_art", "gdT", "vCM1", "vCM2",
   "vCM3", "vCM4", "vCM5"]

#### 3.6.1 Load & Preprocess Raw Data

In [None]:
# Donor 5 sample: label every observation with the cell type column that has
# the largest value, then add donor and batch labels
adata_batch1 = sc.read_h5ad(f"{st_data_silver_folder_path}/HCAHeartST8795933_D5LV_stereoscope_210202.h5ad")
cell_prop_df_batch1 = adata_batch1.obs[cell_types]
new_obs_columns = {
    cell_type_key: cell_prop_df_batch1.idxmax(axis=1),
    donor_key: "donor5",
    batch_key: "batch1",
}
for obs_key, obs_value in new_obs_columns.items():
    adata_batch1.obs[obs_key] = obs_value
adata_batch1.write(f"{st_data_gold_folder_path}/{dataset}_batch1.h5ad")

In [None]:
# Donor 6, first sample: label every observation with the cell type column
# that has the largest value, then add donor and batch labels
adata_batch2 = sc.read_h5ad(f"{st_data_silver_folder_path}/HCAHeartST9383352_D6LV_stereoscope_210202.h5ad")
cell_prop_df_batch2 = adata_batch2.obs[cell_types]
new_obs_columns = {
    cell_type_key: cell_prop_df_batch2.idxmax(axis=1),
    donor_key: "donor6",
    batch_key: "batch2",
}
for obs_key, obs_value in new_obs_columns.items():
    adata_batch2.obs[obs_key] = obs_value
adata_batch2.write(f"{st_data_gold_folder_path}/{dataset}_batch2.h5ad")

In [None]:
# Donor 6, second sample: label every observation with the cell type column
# that has the largest value, then add donor and batch labels
adata_batch3 = sc.read_h5ad(f"{st_data_silver_folder_path}/HCAHeartST9383354_D6LV_stereoscope_210202.h5ad")
cell_prop_df_batch3 = adata_batch3.obs[cell_types]
new_obs_columns = {
    cell_type_key: cell_prop_df_batch3.idxmax(axis=1),
    donor_key: "donor6",
    batch_key: "batch3",
}
for obs_key, obs_value in new_obs_columns.items():
    adata_batch3.obs[obs_key] = obs_value
adata_batch3.write(f"{st_data_gold_folder_path}/{dataset}_batch3.h5ad")

#### 3.6.2 Explore Data

In [None]:
print(f"Exploring dataset {dataset}_batch1.")

# Report the dimensions of the `counts` layer
n_cells, n_genes = adata_batch1.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the observations at their spatial coordinates, colored by majority
# cell type
sq.pl.spatial_scatter(
    adata_batch1,
    #groups="FB5",
    color=cell_type_key,
    shape=None,
    figsize=(8, 8),
)

In [None]:
print(f"Exploring dataset {dataset}_batch2.")

# Report the dimensions of the `counts` layer
n_cells, n_genes = adata_batch2.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the observations at their spatial coordinates, colored by majority
# cell type
sq.pl.spatial_scatter(
    adata_batch2,
    #groups="CD8+T_cytox",
    color=cell_type_key,
    shape=None,
    figsize=(10, 10),
)

In [None]:
print(f"Exploring dataset {dataset}_batch3.")

# Report the dimensions of the `counts` layer
n_cells, n_genes = adata_batch3.layers['counts'].shape
print(f"Number of nodes (cells): {n_cells}")
print(f"Number of node features (genes): {n_genes}")

# Plot the observations at their spatial coordinates, colored by majority
# cell type
sq.pl.spatial_scatter(
    adata_batch3,
    #groups="Adip1",
    color=cell_type_key,
    shape=None,
    figsize=(10, 10),
)

### 3.7 Vizgen MERFISH Human Ovarian Cancer

- **Publication:** Vizgen MERFISH FFPE Human Immuno-oncology Data Set,  May 2022
- **Data Access:** https://info.vizgen.com/ffpe-showcase?submissionGuid=bbe0d6ca-92ad-4257-bdcd-cbc6dee8219b
- **Data Structure:**
    - ```
         ├── ../datasets/st_data/bronze/vizgen_merfish_human_ovarian_cancer
         │  ├── HumanOvarianCancerPatient1_cell_by_gene.csv
         │  ├── HumanOvarianCancerPatient1_cell_metadata.csv            
         │  ├── HumanOvarianCancerPatient2Slice1_cell_by_gene.csv
         │  ├── HumanOvarianCancerPatient2Slice1_cell_metadata.csv
         │  ├── HumanOvarianCancerPatient2Slice2_cell_by_gene.csv
         │  ├── HumanOvarianCancerPatient2Slice2_cell_metadata.csv
         │  ├── HumanOvarianCancerPatient2Slice3_cell_by_gene.csv
         │  ├── HumanOvarianCancerPatient2Slice3_cell_metadata.csv
      ```
- **Summary:**
    - 4 FFPE tissue slices of human ovarian cancer from 2 patients
    - 896,638 observations at cellular resolution without annotations
        - Sample 1: P1, 358,485 observations
        - Sample 2: P2, 254,347 observations
        - Sample 3: P2, 71,381 observations
        - Sample 4: P2, 212,425 observations
    - 500 probed genes

In [None]:
# Dataset identifier; used below as bronze subfolder name and as prefix of
# the silver / gold file names.
dataset = "vizgen_merfish_human_ovarian_cancer"

#### 3.7.1 Load & Preprocess Raw Data

##### 3.7.1.1 ST Data

In [None]:
# One gold file is written per tissue slice
batches = ["Patient1",
           "Patient2Slice1",
           "Patient2Slice2",
           "Patient2Slice3"]

for batch_idx, batch in enumerate(batches):
    # Load the cell-by-gene table and the cell metadata of this slice
    gene_expr_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/HumanOvarianCancer{batch}_cell_by_gene.csv")
    metadata_df = pd.read_csv(f"{st_data_bronze_folder_path}/{dataset}/HumanOvarianCancer{batch}_cell_metadata.csv")
    metadata_df.rename(columns={"Unnamed: 0": "cell"}, inplace=True)
    
    # Build the AnnData object from all columns except the cell ID and the
    # "Blank" control columns
    cols_to_be_removed = ["cell"] + [col for col in gene_expr_df.columns if "Blank" in col]
    adata = ad.AnnData(gene_expr_df[gene_expr_df.columns.difference(cols_to_be_removed)],
                       obs=gene_expr_df[["cell"]],
                       dtype="float32")
    
    # Store gene expression in sparse format
    adata.X = sp.csr_matrix(adata.X)
    
    # QC histograms: total counts and detected genes per cell, each shown in
    # full and restricted to the lower range
    sc.pp.calculate_qc_metrics(adata, percent_top=(50, 100, 200, 300), inplace=True)
    fig, axs = plt.subplots(1, 4, figsize=(15, 4))
    sns.histplot(adata.obs["total_counts"],
                 kde=False,
                 ax=axs[0])
    sns.histplot(adata.obs["total_counts"][adata.obs["total_counts"] < 500],
                 kde=False,
                 bins=40,
                 ax=axs[1])
    sns.histplot(adata.obs["n_genes_by_counts"],
                 kde=False,
                 bins=60,
                 ax=axs[2])
    sns.histplot(adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 50],
                 kde=False,
                 bins=60,
                 ax=axs[3])

    # Discard cells with fewer than 50 counts
    sc.pp.filter_cells(adata, min_counts=50)
    
    # Add metadata
    adata.obs = pd.merge(adata.obs, metadata_df, on=["cell"], how="left")
    
    # Keep only the cell centers (as spatial coordinates); all other `obs`
    # and `var` columns are removed before the labels are added
    adata.obsm["spatial"] = np.array(adata.obs[["center_x", "center_y"]])
    adata.obs = adata.obs.drop(adata.obs.columns, axis=1)
    adata.var = adata.var.drop(adata.var.columns, axis=1)
    adata.layers["counts"] = adata.X
    # The first eight characters of the batch name are the patient ID
    adata.obs["patient"] = batch[:8].lower()
    adata.obs["batch"] = batch.lower()
    adata.write(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx + 1}.h5ad")
    
# Add annotations
# Copy the `cell`, `fov` and `cell_type` columns from a combined annotated
# file onto the per-batch silver files. Values are transferred positionally
# (`.values`), so each subset must have the same cells in the same order as
# the corresponding per-batch file.
# NOTE(review): this section reads silver files and filters on batch labels
# "sample1".."sample4", whereas the loop above writes gold files labelled
# "patient1", "patient2slice1", ... — presumably the silver files come from a
# separate annotation step; confirm.
# (The closing quote of the f-string below was missing, which made the whole
# cell fail with a SyntaxError.)
adata = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}.h5ad")
adata_batch1 = adata[adata.obs["batch"] == "sample1"]
adata_batch2 = adata[adata.obs["batch"] == "sample2"]
adata_batch3 = adata[adata.obs["batch"] == "sample3"]
adata_batch4 = adata[adata.obs["batch"] == "sample4"]

adata_batch1_original = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}_batch1.h5ad")
adata_batch2_original = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}_batch2.h5ad")
adata_batch3_original = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}_batch3.h5ad")
adata_batch4_original = sc.read_h5ad(f"{st_data_silver_folder_path}/{dataset}_batch4.h5ad")

adata_batch1_original.obs["cell"] = adata_batch1.obs["cell"].values
adata_batch1_original.obs["fov"] = adata_batch1.obs["fov"].values
adata_batch1_original.obs["cell_type"] = adata_batch1.obs["cell_type"].values

adata_batch2_original.obs["cell"] = adata_batch2.obs["cell"].values
adata_batch2_original.obs["fov"] = adata_batch2.obs["fov"].values
adata_batch2_original.obs["cell_type"] = adata_batch2.obs["cell_type"].values

adata_batch3_original.obs["cell"] = adata_batch3.obs["cell"].values
adata_batch3_original.obs["fov"] = adata_batch3.obs["fov"].values
adata_batch3_original.obs["cell_type"] = adata_batch3.obs["cell_type"].values

adata_batch4_original.obs["cell"] = adata_batch4.obs["cell"].values
adata_batch4_original.obs["fov"] = adata_batch4.obs["fov"].values
adata_batch4_original.obs["cell_type"] = adata_batch4.obs["cell_type"].values

# Overwrite the per-batch silver files with the annotated versions
adata_batch1_original.write(f"{st_data_silver_folder_path}/{dataset}_batch1.h5ad")
adata_batch2_original.write(f"{st_data_silver_folder_path}/{dataset}_batch2.h5ad")
adata_batch3_original.write(f"{st_data_silver_folder_path}/{dataset}_batch3.h5ad")
adata_batch4_original.write(f"{st_data_silver_folder_path}/{dataset}_batch4.h5ad")

#### 3.7.2 Explore Data

##### 3.7.2.1 ST Data

In [None]:
# Batches are numbered 1 to 4
batch_indeces = list(range(1, 5))
for batch_idx in batch_indeces:
    adata = sc.read_h5ad(f"{st_data_gold_folder_path}/{dataset}_batch{batch_idx}.h5ad")

    print(f"Exploring dataset {dataset}_batch{batch_idx}.")

    # Report the dimensions of the expression matrix
    n_cells, n_genes = adata.X.shape
    print(f"Number of nodes (cells): {n_cells}")
    print(f"Number of node features (genes): {n_genes}")

    # Plot the cells at their spatial coordinates, colored by batch label
    sq.pl.spatial_scatter(adata,
                          color="batch",
                          shape=None,
                          figsize=(12, 12))