In [19]:
import tensorflow as tf
import scanpy as sc
import os
from scipy.spatial import KDTree
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from tqdm.notebook import tqdm
from core_functions.neighborhood_decomposition import *
import glob

### Similarly to replicate 1 processing, we will use the Epithelial and Stromal classes defined by GeneFormer to perform a spatial decompostion on Epithelial and Stromal cells so that we have a feature set for crypt villus axis prediction

##### Path to replicate datasets. 

In [20]:
output_folder = r'/mnt/sata1/Analysis_Alex/timecourse_replicates/analysis/cleaned'

In [21]:
input_folders = glob.glob('/mnt/sata1/Analysis_Alex/timecourse_replicates/day*')

In [22]:
adatas = []
for input_file in input_folders:
    adatas.append(sc.read(os.path.join(input_file, 'adatas', '08_full_celltypes_and_leiden.h5ad')))



##### Path to replicate 1 final dataset. We will be using the same reference dataset for morphology prediction as we used in the replicate 1 processing, in our case - Day 7 DMSO

In [23]:
reference_path = r'/mnt/sata1/Analysis_Alex/timecourse_final/analysis/cleaned/final_celltyped_and_axes.h5ad'

In [24]:
# subset to only the reference data (the dataset with the best morphology that was treated as a reference for the first set of replicates)
reference_adata = sc.read(reference_path)
reference_adata = reference_adata[reference_adata.obs['batch'] == 'day7_SI_DMSO']

  utils.warn_names_duplicates("obs")


In [25]:
adatas.append(reference_adata)

In [26]:
combined_adata = sc.concat(adatas)

  utils.warn_names_duplicates("obs")


In [27]:
del adatas

### Train Decomposition model

In [28]:
unchanging_type_keys = ['Epithelial', 'Stromal']
combined_adata_no_immune = combined_adata[combined_adata.obs['Class'].isin(unchanging_type_keys)]
unique_batches = np.unique(combined_adata_no_immune.obs.batch.values)

In [29]:
nneighbors = 10
dfs = []
for input_file in unique_batches:
    adata = combined_adata_no_immune[combined_adata_no_immune.obs['batch'] == input_file]
    adata_arr = np.array(adata.X)
    celltype_cluster = adata.obs.index.values
    list_of_arrays = []
    spatial_points = np.array([adata.obsm['X_spatial'][:,0], adata.obsm['X_spatial'][:,1]]).T
    tree = KDTree(spatial_points)
    for i_bac in tqdm(range(len(celltype_cluster))):
        current_cell = celltype_cluster[i_bac]
        distances, neighbors = tree.query(spatial_points[i_bac], k=nneighbors)
        neighbors = np.array(list(neighbors))
        gene_array = np.array(np.sum(adata_arr[neighbors, :], axis=0)).squeeze()
        list_of_arrays.append(gene_array)
    
    X = pd.DataFrame(np.array(list_of_arrays))
    dfs.append(X)

In [32]:
del combined_adata
del combined_adata_no_immune

In [33]:
X_arr = pd.concat(dfs)

In [34]:
del dfs

In [35]:
num_neighborhoods = 15
X = X_arr
del X_arr
f = len(X.columns)
n = len(X.index.tolist())

model = NMF(n_components=num_neighborhoods, random_state=0)
W = model.fit_transform(X)
H = model.components_



### Apply decomposition model

In [36]:
for input_file in input_folders:
    adata = sc.read(os.path.join(input_file, 'adatas', '08_full_celltypes_and_leiden.h5ad'))
    
    superclusters = adata.obs['Class'].values
    celltype_cluster = adata.obs.index.values

    base_dictionary = {}
    for i in np.unique(celltype_cluster):
        base_dictionary[i] = 0

    nneighbors = 10
    list_of_arrays = []
    adata_epi = adata[adata.obs['Class'].isin(unchanging_type_keys)]
    spatial_points_epi = np.array([adata_epi.obsm['X_spatial'][:,0], adata_epi.obsm['X_spatial'][:,1]]).T
    spatial_points = np.array([adata.obsm['X_spatial'][:,0], adata.obsm['X_spatial'][:,1]]).T
    adata_epi_arr = np.array(adata_epi.X)
    
    tree = KDTree(spatial_points_epi)
    for i_bac in tqdm(range(len(celltype_cluster))):
        current_cell = celltype_cluster[i_bac]
        distances, neighbors = tree.query(spatial_points[i_bac], k=nneighbors)
        neighbors = np.array(list(neighbors))
        gene_array = np.array(np.sum(adata_epi_arr[neighbors, :], axis=0)).squeeze()
        list_of_arrays.append(gene_array)
    
    X = pd.DataFrame(np.array(list_of_arrays)).astype(H.dtype)
    W = model.transform(X)
    
    topics_frame = pd.DataFrame(W)
    
    topics_frame.columns = ['Topic '+str(i+1) for i in range(len(topics_frame.columns))]
    topics_frame.index = adata.obs.index.tolist()
    def zscore(column):
        return (column - column.mean()) / column.std()
    
    # Apply the z-score function to each column in the dataframe
    topics_frame = topics_frame.apply(zscore)
    adata.obs=adata.obs.merge(topics_frame, left_index=True, right_index=True)
    adata.obs['topic'] = pd.Categorical((np.argmax(topics_frame.values, axis = 1)+1).astype(str))

    sc.set_figure_params(dpi=300)
    figure = sc.pl.embedding(adata, basis='spatial', color='topic', vmax=1, cmap='Blues', title='Neighborhood', size=2, show=False, return_fig=True)
    try:
        os.mkdir(os.path.join(input_file,'figures', 'neighborhoods'))
    except:
        print('Figures/neighborhoods already made.')
    figure.tight_layout()
    plt.axis('equal')
    figure.savefig(os.path.join(input_file,'figures', 'neighborhoods', 'neighborhoods.png'))
    plt.close()
    adata.write(os.path.join(input_file, 'adatas', '09_before_decomposition_model.h5ad'))



  0%|          | 0/274464 [00:00<?, ?it/s]

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


Figures/neighborhoods already made.




  0%|          | 0/258269 [00:00<?, ?it/s]

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


Figures/neighborhoods already made.




  0%|          | 0/241781 [00:00<?, ?it/s]

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


Figures/neighborhoods already made.




  0%|          | 0/198887 [00:00<?, ?it/s]

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


Figures/neighborhoods already made.


### Adding topic enrichment vectors to  the reference day 7 DMSO data

In [37]:
xenium_path = '/mnt/sata1/Analysis_Alex/timecourse_final/day7_SI_DMSO'

In [38]:
reference_prep = sc.read(os.path.join(xenium_path, 'adatas', '07_axes_defined_reference.h5ad'))



In [40]:
reference_prep.obs = reference_prep.obs.drop(reference_prep.obs.columns[reference_prep.obs.columns.str.contains('Topic')], axis=1)

In [42]:
superclusters = reference_prep.obs['Class'].values
celltype_cluster = reference_prep.obs.index.values

base_dictionary = {}
for i in np.unique(celltype_cluster):
    base_dictionary[i] = 0

nneighbors = 10
list_of_arrays = []
reference_prep_epi = reference_prep[reference_prep.obs['Class'].isin(unchanging_type_keys)]
spatial_points_epi = np.array([reference_prep_epi.obsm['X_spatial'][:,0], reference_prep_epi.obsm['X_spatial'][:,1]]).T
spatial_points = np.array([reference_prep.obsm['X_spatial'][:,0], reference_prep.obsm['X_spatial'][:,1]]).T
reference_prep_epi_arr = np.array(reference_prep_epi.X)

tree = KDTree(spatial_points_epi)
for i_bac in tqdm(range(len(celltype_cluster))):
    current_cell = celltype_cluster[i_bac]
    distances, neighbors = tree.query(spatial_points[i_bac], k=nneighbors)
    neighbors = np.array(list(neighbors))
    gene_array = np.array(np.sum(reference_prep_epi_arr[neighbors, :], axis=0)).squeeze()
    list_of_arrays.append(gene_array)

X = pd.DataFrame(np.array(list_of_arrays)).astype(H.dtype)
W = model.transform(X)

topics_frame = pd.DataFrame(W)

topics_frame.columns = ['Topic '+str(i+1) for i in range(len(topics_frame.columns))]
topics_frame.index = reference_prep.obs.index.tolist()
def zscore(column):
    return (column - column.mean()) / column.std()

# Apply the z-score function to each column in the dataframe
topics_frame = topics_frame.apply(zscore)
reference_prep.obs=reference_prep.obs.merge(topics_frame, left_index=True, right_index=True)
reference_prep.obs['topic'] = pd.Categorical((np.argmax(topics_frame.values, axis = 1)+1).astype(str))

  0%|          | 0/225671 [00:00<?, ?it/s]

In [43]:
reference_prep.write(os.path.join('/mnt/sata1/Analysis_Alex/timecourse_replicates/unrolling_meta', 'reference_prep_decomposition_model.h5ad'))