In [1]:
import random
from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
import SpaGCN as spg
import torch
from PIL import Image

# Setting MAX_IMAGE_PIXELS to None switches off Pillow's decompression-bomb
# size check, which silences DecompressionBombWarning (and the related error)
# when very large images are opened. Only do this for trusted image files.
Image.MAX_IMAGE_PIXELS = None

In [2]:
# Input location of the samples and root folder for per-sample results,
# both relative to the notebook's working directory.
data_dir = Path("data") / "LIBD_DLPFC"
result_dir = Path("results") / "LIBD_DLPFC"

# Platform the data were generated with.
technology = "Visium"

# Single seed shared by every random number generator used below.
seed = 42

# SpaGCN settings: alpha and beta are forwarded to spg.calculate_adj_matrix,
# p is forwarded to spg.search_l (see the SpaGCN docs for their meaning).
alpha, beta = 1, 49
p = 0.5

In [3]:
# Sample sheet: keep only the sample folder name and the requested number of
# clusters for each sample.
metadata = pd.read_table(data_dir / "samples.tsv")[["directory", "n_clusters"]]

In [4]:
# Seed Python's, PyTorch's and NumPy's global generators (in that order) with
# the shared seed.
for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    seed_fn(seed)

In [5]:
def get_anndata(path):
    """Assemble an AnnData object from the files of one sample directory.

    Parameters
    ----------
    path : pathlib.Path
        Directory containing ``counts.mtx``, ``observations.tsv``,
        ``features.tsv``, ``coordinates.tsv`` and ``H_E.tiff``.

    Returns
    -------
    anndata.AnnData
        ``X`` holds the count matrix in CSR format, ``obs`` / ``var`` hold the
        observation and feature tables, ``obsm["spatial_pixel"]`` holds the
        coordinate table reordered to match ``obs``, and ``uns["image"]``
        holds the image as a NumPy array.
    """
    # The matrix is used directly as X, so it must be observations x features.
    X = sp.io.mmread(path / "counts.mtx").tocsr()

    observations = pd.read_table(path / "observations.tsv", index_col=0)
    features = pd.read_table(path / "features.tsv", index_col=0)

    # Align the coordinate rows with the observation order before dropping
    # the index; a missing observation raises a KeyError here.
    coordinates = (
        pd.read_table(path / "coordinates.tsv", index_col=0)
        .loc[observations.index, :]
        .to_numpy()
    )

    adata = ad.AnnData(
        X=X, obs=observations, var=features, obsm={"spatial_pixel": coordinates}
    )

    # Open the image in a context manager so the underlying file handle is
    # released as soon as the pixels have been copied into the array, instead
    # of lingering until garbage collection (e.g. for multi-frame TIFFs).
    with Image.open(path / "H_E.tiff") as image:
        adata.uns["image"] = np.array(image)

    return adata

In [7]:
# Run SpaGCN on every sample listed in the sample sheet and write the refined
# cluster labels to <result_dir>/<sample>/SpaGCN.tsv.
for sample in metadata.itertuples(index=False):
    print("Processing " + sample.directory)

    sample_dir = data_dir / sample.directory
    out_dir = result_dir / sample.directory
    adata = get_anndata(sample_dir)

    # Adjacency built from array coordinates, pixel coordinates and the image.
    adj = spg.calculate_adj_matrix(
        adata.obs["col"],
        adata.obs["row"],
        adata.obsm["spatial_pixel"][:, 0],
        adata.obsm["spatial_pixel"][:, 1],
        image=adata.uns["image"],
        alpha=alpha,
        beta=beta,
        histology=True,
    )

    # Gene filtering and expression normalisation.
    # NOTE(review): normalize_per_cell can drop observations with zero counts
    # (its min_counts default); that would desynchronise adata from adj, which
    # was computed above -- confirm no spot is empty after gene filtering.
    spg.prefilter_genes(adata, min_cells=3)
    spg.prefilter_specialgenes(adata)
    adata.X = adata.X.astype("float64")
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    clf = spg.SpaGCN()

    # Find the l value given p
    l_value = spg.search_l(p, adj)
    clf.set_l(l_value)

    # Search for a resolution that yields the requested number of clusters
    res = spg.search_res(
        adata, adj, l_value, sample.n_clusters, r_seed=seed, t_seed=seed, n_seed=seed
    )

    # Re-seed right before the final fit (as the SpaGCN tutorial does) so the
    # training run starts from a known RNG state instead of whatever state the
    # resolution search left behind.
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    clf.train(adata, adj, res=res)

    y_pred, prob = clf.predict()

    adata.obs["cluster"] = pd.Series(y_pred, index=adata.obs_names, dtype="category")

    # Optional cluster refinement, using an adjacency based on the array
    # coordinates only (no histology).
    adj_2d = spg.calculate_adj_matrix(
        adata.obs["col"], adata.obs["row"], histology=False
    )

    refined_pred = spg.refine(
        sample_id=adata.obs_names.tolist(),
        pred=adata.obs["cluster"].tolist(),
        dis=adj_2d,
        shape="hexagon",
    )

    adata.obs["refined_cluster"] = pd.Series(
        refined_pred, index=adata.obs_names, dtype="category"
    )

    # Only the refined labels are exported, under a single "label" column.
    label_df_ref = adata.obs[["refined_cluster"]].rename(
        columns={"refined_cluster": "label"}
    )

    # Write output
    out_dir.mkdir(parents=True, exist_ok=True)
    label_df_ref.to_csv(out_dir / "SpaGCN.tsv", sep="\t", index_label="")

Processing Br5292_151507
Calculateing adj matrix using histology image...
Var of c0,c1,c2 =  98.7653707607211 347.04188339945216 67.83401448585366
Var of x,y,z =  1123.1580554335508 435.69138627328755 1123.1580554335505
Run 1: l [0.01, 1000], p [0.0, 4213.694327626796]
Run 2: l [0.01, 500.005], p [0.0, 4180.11669921875]
Run 3: l [0.01, 250.0075], p [0.0, 4050.718017578125]
Run 4: l [0.01, 125.00874999999999], p [0.0, 3601.731689453125]
Run 5: l [0.01, 62.509375], p [0.0, 2493.169921875]
Run 6: l [0.01, 31.2596875], p [0.0, 1146.388916015625]
Run 7: l [0.01, 15.63484375], p [0.0, 349.5950012207031]
Run 8: l [0.01, 7.822421875], p [0.0, 77.59295654296875]
Run 9: l [0.01, 3.9162109375], p [0.0, 13.964275360107422]
Run 10: l [0.01, 1.9631054687499998], p [0.0, 2.123582363128662]
Run 11: l [0.9865527343749999, 1.9631054687499998], p [0.23047912120819092, 2.123582363128662]
Run 12: l [0.9865527343749999, 1.4748291015625], p [0.23047912120819092, 0.9038530588150024]
recommended l =  1.2306909