In [None]:
import tensorflow as tf
import scanpy as sc
import os
from scipy.spatial import KDTree
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from tqdm.notebook import tqdm
from core_functions.neighborhood_decomposition import *
import glob

We will use the Epithelial and Stromal classes defined by GeneFormer to perform a spatial decomposition on Epithelial and Stromal cells, which gives us a feature set for crypt-villus axis prediction.

In [None]:
# Destination folder for the cleaned analysis outputs.
output_folder = "uninfected/analysis/cleaned"

In [None]:
# One segmentation folder per uninfected small-intestine sample.
# `glob.glob` returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the sample order reproducible across machines and runs.
input_folders = sorted(glob.glob("uninfected/segmentation_SI*"))

Read in the uninfected AnnData objects

In [None]:
# Load the annotated AnnData of every uninfected sample, dropping the variables
# whose `gene` annotation is missing.
adatas = []
for input_file in input_folders:
    adata_path = os.path.join(
        input_file, "adatas", "08_full_celltypes_and_leiden.h5ad"
    )
    ada = sc.read(adata_path)
    has_gene = ada.var["gene"].notna().values
    ada = ada[:, has_gene]
    adatas.append(ada)
# Number of samples that were loaded.
ct = len(adatas)

Path to the timecourse dataset

In [None]:
# Location of the timecourse AnnData file that contains the reference batch.
reference_path = "timecourse.h5ad"

Add the reference adata to the uninfected data

In [None]:
# Keep only the reference batch of the timecourse data (the dataset with the
# best morphology, which was treated as the reference for the first set of
# replicates).
reference_adata = sc.read(reference_path)
is_reference_batch = reference_adata.obs["batch"] == "day8_SI_Ctrl"
reference_adata = reference_adata[is_reference_batch]

# NOTE: `adatas` is extended in place, so re-running this cell without first
# re-running the loading cell appends the reference a second time.
adatas.append(reference_adata)
combined_adata = sc.concat(adatas)

Train Decomposition model

In [None]:
# Cell classes used to build the neighbourhood features.
unchanging_type_keys = ["Epithelial", "Stromal"]

# Restrict the combined data to those classes and list the batches that remain.
is_unchanging_type = combined_adata.obs["Class"].isin(unchanging_type_keys)
combined_adata_no_immune = combined_adata[is_unchanging_type]
unique_batches = np.unique(combined_adata_no_immune.obs["batch"].values)

In [None]:
# Build the NMF training matrix: for every Epithelial/Stromal cell, sum the
# expression of its `nneighbors` spatially nearest cells (the query point is a
# member of the tree, so the cell itself is included) within the same batch.
nneighbors = 10
dfs = []
for batch in unique_batches:
    adata = combined_adata_no_immune[combined_adata_no_immune.obs["batch"] == batch]

    # Densify the expression matrix. `toarray()` replaces the `.A` shorthand,
    # which is no longer available on SciPy sparse matrices; an `X` that is
    # already dense is used as-is.
    adata_arr = (
        adata.X.toarray() if hasattr(adata.X, "toarray") else np.asarray(adata.X)
    )

    # First two columns of the spatial embedding are the x/y coordinates.
    spatial_points = np.asarray(adata.obsm["X_spatial"])[:, :2]
    tree = KDTree(spatial_points)

    # Never request more neighbours than there are cells: KDTree.query pads the
    # missing ones with an out-of-range index, which would break the row lookup.
    k = min(nneighbors, len(spatial_points))

    # One batched query for all cells instead of one query per cell.
    _, neighbors = tree.query(spatial_points, k=k)
    # With k == 1 the result is 1-D; normalise to (n_cells, k).
    neighbors = neighbors.reshape(len(spatial_points), -1)

    list_of_arrays = [adata_arr[idx].sum(axis=0) for idx in tqdm(neighbors)]

    neighborhood_expression = pd.DataFrame(np.array(list_of_arrays))
    dfs.append(neighborhood_expression)

In [None]:
# Stack the per-batch neighbourhood matrices row-wise into one training table.
X_arr = pd.concat(dfs, axis=0)

In [None]:
# Number of NMF components ("neighbourhood" topics) to learn.
num_neighborhoods = 15

# Hand the training table over to `X` and drop the old name.
X = X_arr
del X_arr

# n = number of rows (cells), f = number of columns (features).
n, f = X.shape

# Fit the decomposition: W holds the per-row component weights, H the
# per-component feature loadings.
model = NMF(n_components=num_neighborhoods, random_state=0)
W = model.fit_transform(X)
H = model.components_

Apply decomposition model

In [None]:
# Apply the fitted NMF model to every uninfected sample: score each cell by the
# summed expression of its nearest Epithelial/Stromal neighbours, store the
# z-scored topic weights and the dominant topic in `obs`, save a spatial plot of
# the dominant topic and write the annotated AnnData back to disk.
nneighbors = 10
sc.set_figure_params(dpi=300)

# Genes the NMF model was fitted on, in the column order used for training.
# Selecting them by name keeps the features passed to `model.transform` aligned
# with the fitted components.
model_genes = combined_adata_no_immune.var_names


def zscore(column):
    """Return `column` centred on its mean and divided by its standard deviation."""
    return (column - column.mean()) / column.std()


for input_file in input_folders:
    adata = sc.read(
        os.path.join(input_file, "adatas", "08_full_celltypes_and_leiden.h5ad")
    )

    # Neighbourhoods are drawn from Epithelial/Stromal cells only, but every
    # cell in the sample (whatever its class) is scored against them.
    is_unchanging_type = adata.obs["Class"].isin(unchanging_type_keys).values
    adata_epi = adata[is_unchanging_type, model_genes]

    # `toarray()` replaces the `.A` shorthand, which is no longer available on
    # SciPy sparse matrices; an `X` that is already dense is used as-is.
    adata_epi_arr = (
        adata_epi.X.toarray()
        if hasattr(adata_epi.X, "toarray")
        else np.asarray(adata_epi.X)
    )

    # First two columns of the spatial embedding are the x/y coordinates.
    spatial_points_epi = np.asarray(adata_epi.obsm["X_spatial"])[:, :2]
    spatial_points = np.asarray(adata.obsm["X_spatial"])[:, :2]

    tree = KDTree(spatial_points_epi)
    # Never request more neighbours than the tree holds: KDTree.query pads the
    # missing ones with an out-of-range index, which would break the row lookup.
    k = min(nneighbors, len(spatial_points_epi))
    # One batched query for all cells instead of one query per cell.
    _, neighbors = tree.query(spatial_points, k=k)
    # With k == 1 the result is 1-D; normalise to (n_cells, k).
    neighbors = neighbors.reshape(len(spatial_points), -1)

    list_of_arrays = [adata_epi_arr[idx].sum(axis=0) for idx in tqdm(neighbors)]

    X = pd.DataFrame(np.array(list_of_arrays)).astype(H.dtype)
    W = model.transform(X)

    topics_frame = pd.DataFrame(
        W,
        index=adata.obs.index.tolist(),
        columns=["Topic " + str(i + 1) for i in range(W.shape[1])],
    )

    # Standardise each topic across the cells of this sample, then label every
    # cell with the (1-based) topic that has the highest z-score.
    topics_frame = topics_frame.apply(zscore)
    adata.obs = adata.obs.merge(topics_frame, left_index=True, right_index=True)
    adata.obs["topic"] = pd.Categorical(
        (np.argmax(topics_frame.values, axis=1) + 1).astype(str)
    )

    figure = sc.pl.embedding(
        adata,
        basis="spatial",
        color="topic",
        vmax=1,
        cmap="Blues",
        title="Neighborhood",
        size=2,
        show=False,
        return_fig=True,
    )

    # Create the output folder (and any missing parents). An existing folder is
    # fine; any other failure is raised instead of being silently swallowed.
    figure_dir = os.path.join(input_file, "figures", "neighborhoods")
    os.makedirs(figure_dir, exist_ok=True)

    figure.tight_layout()
    plt.axis("equal")
    figure.savefig(os.path.join(figure_dir, "neighborhoods.png"))
    # Close this figure explicitly so figures do not accumulate across samples.
    plt.close(figure)

    adata.write(
        os.path.join(input_file, "adatas", "09_before_decomposition_model.h5ad")
    )