In [None]:
# Cell 0  ──────────────────────────────────────────────────────────────
import numpy as np
import torch
import dgl

# 👇 import the encoder implementation that came with SpaFormer
#    (the file you uploaded as edcoder.py)
import importlib.util, pathlib, sys
# Load Data/edcoder.py by file path so it is importable as the module "edcoder".
spec = importlib.util.spec_from_file_location(
    "edcoder",
    pathlib.Path("Data", "edcoder.py"),
)
edcoder = importlib.util.module_from_spec(spec)
# Register the module object before executing it, so that any `import edcoder`
# triggered while the file runs resolves to this same object.
sys.modules[spec.name] = edcoder
spec.loader.exec_module(edcoder)

# --- convenience wrapper --------------------------------------------------
def build_graph(edge_index, num_nodes):
    """
    Build a DGL graph with exactly one self-loop per node from a COO edge list.

    edge_index: np.ndarray with shape (2, E) – COO (src, dst)
    num_nodes : number of nodes in the graph (isolated nodes are kept)
    returns    : DGLGraph on CPU
    raises     : ValueError if edge_index is not laid out as (2, E)
    """
    # A transposed (E, 2) array would otherwise be accepted silently and its
    # first two *edges* interpreted as the src/dst lists.
    shape = np.shape(edge_index)
    if len(shape) != 2 or shape[0] != 2:
        raise ValueError(
            f"edge_index must have shape (2, E); got {tuple(shape)}"
        )

    g = dgl.graph((edge_index[0], edge_index[1]), num_nodes=num_nodes)
    # add_self_loop appends a loop to every node unconditionally, so drop any
    # loops already present in the edge list first to avoid duplicates.
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)          # SpaFormer expects self-loops
    return g

In [None]:
# Cell 1  ──────────────────────────────────────────────────────────────
# paths
PP_DIR = pathlib.Path("Data/spaformer_prepared")

# (1) expression matrix  (cells x genes), expected float32
X_np     = np.load(PP_DIR / "X.npy")          # (N, G)
# (2) spatial coordinates (cells x 2), expected float32, normalised to 0-1
C_np     = np.load(PP_DIR / "C.npy")          # (N, 2)
# (3) graph edges in COO layout (2 x E), expected int32, K-NN and undirected
edges_np = np.load(PP_DIR / "edges.npy")      # (2, E)

N, G = X_np.shape

# Fail fast if the files do not match the layout the later cells rely on:
# the model is built with pos_dim=2, the graph builder reads rows 0 and 1 of
# the edge array, and the graph is created with exactly N nodes.
assert C_np.shape == (N, 2), \
    f"C.npy has shape {C_np.shape}, expected ({N}, 2)"
assert edges_np.ndim == 2 and edges_np.shape[0] == 2, \
    f"edges.npy has shape {edges_np.shape}, expected (2, E)"
assert edges_np.size == 0 or (edges_np.min() >= 0 and edges_np.max() < N), \
    f"edges.npy references node ids outside [0, {N})"

print(f"cells: {N}   genes: {G}   edges: {edges_np.shape[1]}")

In [None]:
# Cell 2  ──────────────────────────────────────────────────────────────
# Wrap the NumPy arrays as CPU tensors; no copy is made, the tensors share
# memory with X_np / C_np.
X_t = torch.as_tensor(X_np)         # (N, G)
C_t = torch.as_tensor(C_np)         # (N, 2)

# Graph over the N cells, built from the loaded edge list.
g = build_graph(edges_np, N)

# Quick visual sanity check of what will be fed to the model.
print(g)
print("X_t", X_t.shape, X_t.dtype)
print("C_t", C_t.shape)

In [None]:
# Cell 3  ──────────────────────────────────────────────────────────────
# ⚙️  hyper-params ------  (tweak as you like)
EMBED_DIM    = 256   # latent dimension per token (cell)
DEPTH        = 6     # number of transformer layers
NUM_HEADS    = 8
SEED         = 0     # fixes the random weight initialisation below

# NOTE(review): no checkpoint is loaded anywhere in this notebook, so the
# encoder runs with freshly initialised weights. Seeding makes the resulting
# embedding (and everything saved from it) repeatable between runs; load
# trained weights here if a checkpoint exists.
torch.manual_seed(SEED)

# Build the encoder-only model.
model = edcoder.EDcoder(
            gene_dim     = G,        # input feature size
            pos_dim      = 2,        # (x,y)
            embed_dim    = EMBED_DIM,
            depth        = DEPTH,
            num_heads    = NUM_HEADS,
            decoder      = False,    # turn off the decoder
        )

print(f"Encoder params: {sum(p.numel() for p in model.parameters())/1e6:.2f} M")
# eval() only switches layers such as dropout / batch-norm to inference
# behaviour; it does NOT disable gradient tracking. Wrap the forward pass in
# torch.no_grad() for that.
model.eval()

In [None]:
# Cell 4  ──────────────────────────────────────────────────────────────
# Single forward pass over the whole graph to get one embedding per cell.
# Gradient tracking is disabled for the duration of the call.
with torch.no_grad():                         # inference
    # NOTE(review): the (g, X, pos) argument order is a guess; the forward
    #    signature in edcoder.py may differ (e.g. coordinates concatenated
    #    into X internally). Confirm against edcoder.py and adjust if needed.
    latent_t = model(g, X_t, C_t)             # expected (N, EMBED_DIM) — TODO confirm

print("latent_t:", latent_t.shape)

In [None]:
# Cell 5  ──────────────────────────────────────────────────────────────
import pandas as pd                 # required for the obs / var frames below
import scanpy as sc, anndata as ad

# Convert the embedding to a NumPy array once and reuse it for both outputs.
latent_np = latent_t.detach().cpu().numpy()
n_cells, n_latent = latent_np.shape

# Raw embedding as a plain .npy file.
LATENT_PATH = PP_DIR / "latent.npy"
np.save(LATENT_PATH, latent_np)
print("Saved →", LATENT_PATH)

# (optional) small AnnData wrapper for Scanpy/UMAP downstream.
# Index lengths are taken from the array itself so they always match X.
adata_latent = ad.AnnData(
        X            = latent_np,
        obs          = pd.DataFrame(index=[f"cell_{i}" for i in range(n_cells)]),
        var          = pd.DataFrame(index=[f"z{i}"     for i in range(n_latent)]),
    )
adata_latent.obsm["spatial"] = C_np          # keep coords
adata_latent.write(PP_DIR / "latent.h5ad")
print("AnnData written →", PP_DIR / "latent.h5ad")

In [None]:
# Cell 6  ──────────────────────────────────────────────────────────────
import scanpy as sc

# Neighbour graph computed directly on the latent vectors (use_rep="X"),
# followed by a UMAP embedding and Leiden clustering.
sc.pp.neighbors(adata_latent, use_rep="X", n_neighbors=15)
sc.tl.umap(adata_latent)
sc.tl.leiden(adata_latent, resolution=0.6)

# UMAP scatter coloured by the Leiden cluster label.
sc.pl.umap(adata_latent, color=["leiden"], size=20)