##### This notebook calculates spatial neighborhood decomposition across all of the MERSCOPE experiments. 
The purpose of this is to obtain several vectors that parameterize the niche of each cell for crypt-villus axis prediction. By applying non-negative matrix factorization (NMF) to neighborhoods of epithelial and stromal cells, we capture recurrent neighborhood patterns that are conserved across space and time.

In [2]:
import tensorflow as tf
import scanpy as sc
import os
from scipy.spatial import KDTree
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from tqdm.notebook import tqdm
from core_functions.neighborhood_decomposition import *
import glob

### Load the cell-typed and clustered adata and collect the paths to the folders holding the individual SI adatas

In [3]:
# Folder from which the combined, cell-typed AnnData is read.
output_folder = '/projects/2023_Spatial_Paper/Analysis_Alex/merscope_final/analysis/final'

In [4]:
# One folder per SI experiment. glob returns matches in arbitrary (filesystem-dependent)
# order, so sort them: the order of this list determines the row order of the NMF
# training matrix and the order in which samples are processed.
input_folders = sorted(glob.glob('/projects/2023_Spatial_Paper/Analysis_Alex/merscope_final/SI*'))

In [6]:
# Read the combined AnnData (all experiments) from the output folder.
combined_adata_path = os.path.join(output_folder, 'full_celltypes_and_leiden.h5ad')
combined_adata = sc.read(combined_adata_path)



### Fit an NMF model on neighborhoods of gene expression within epithelial and stromal cells.

In [7]:
# Cell classes whose neighborhoods are used to fit the NMF model.
unchanging_type_keys = ['Epithelial', 'Stromal']

# Keep only the cells whose 'Class' annotation is one of the classes above.
is_unchanging_class = combined_adata.obs['Class'].isin(unchanging_type_keys)
combined_adata_no_immune = combined_adata[is_unchanging_class]

In [None]:
# Build the NMF training matrix: for every epithelial/stromal cell, sum the expression of
# its `nneighbors` nearest epithelial/stromal cells within the same batch. Each cell is
# itself a point in the KD-tree it queries, so its own expression is part of the sum.

# Number of neighbors in each training neighborhood.
nneighbors = 10
# Number of query cells gathered per step when summing neighbor expression; bounds the
# temporary (cells x neighbors x genes) array.
chunk_size = 10_000
dfs = []

# Not used below in this cell.
unique_batches = np.unique(combined_adata_no_immune.obs.new_batch.values)
for input_file in tqdm(input_folders):
    for roll in ['roll1', 'roll2']:
        batch_name = os.path.basename(input_file) + f'_roll_{roll}'
        adata = combined_adata_no_immune[combined_adata_no_immune.obs['batch'] == batch_name]
        n_cells = adata.n_obs
        if n_cells == 0:
            # Nothing to index or sum for this batch.
            print(f'No cells found for batch {batch_name}; skipping.')
            continue

        # np.array() would wrap a scipy sparse matrix in a 0-d object array, so densify
        # sparse input explicitly.
        expression = adata.X
        adata_arr = expression.toarray() if hasattr(expression, 'toarray') else np.array(expression)

        # First two spatial coordinates of every cell, shape (cells, 2).
        spatial_points = np.asarray(adata.obsm['X_spatial'])[:, :2]
        tree = KDTree(spatial_points)

        # KDTree.query marks missing neighbors with an out-of-range index, so never ask
        # for more neighbors than there are cells in the tree.
        k = min(nneighbors, n_cells)
        # One batched query for all cells instead of one query per cell.
        _, neighbors = tree.query(spatial_points, k=k)
        # query() squeezes the neighbor axis when k == 1; restore it.
        neighbors = np.asarray(neighbors).reshape(n_cells, k)

        # Sum expression over each cell's neighbors, a chunk of cells at a time.
        neighborhood_sums = [
            adata_arr[neighbors[start:start + chunk_size]].sum(axis=1)
            for start in range(0, n_cells, chunk_size)
        ]

        X = pd.DataFrame(np.concatenate(neighborhood_sums, axis=0))
        dfs.append(X)

In [19]:
# Stack the per-batch neighborhood matrices into one training table
# (one row per neighborhood, one column per feature).
X_arr = pd.concat(dfs)

# Number of NMF components to learn.
num_neighborhoods = 15
X = X_arr
n, f = X.shape

# Factorize X ~ W @ H with a fixed random seed.
model = NMF(n_components=num_neighborhoods, random_state=0)
W = model.fit_transform(X)
H = model.components_

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Use the NMF model to calculate neighborhood enrichment vectors across all of the MERSCOPE experiments.

In [None]:
# For every sample: attach the cell-type annotations from the combined AnnData, sum the
# expression of each cell's nearest epithelial/stromal neighbors, project those sums onto
# the fitted NMF model, store z-scored topic weights in `obs`, plot the dominant topic in
# space, and write the annotated AnnData back to disk.

# Neighborhood size used when projecting cells onto the fitted NMF model
# (the same value the training matrix was built with).
nneighbors = 10
# Number of query cells gathered per step when summing neighbor expression; bounds the
# temporary (cells x neighbors x genes) array.
chunk_size = 10_000
# Annotation columns copied from the combined AnnData onto each per-sample AnnData.
annotation_columns = ['leiden', 'Sub_leiden', 'Class', 'Type', 'Subtype', 'Immunocentric_Type']


def zscore(column):
    """Center a column on its mean and scale it by its standard deviation."""
    return (column - column.mean()) / column.std()


for input_file in tqdm(input_folders):
    for roll in ['roll1', 'roll2']:
        adata = sc.read(os.path.join(input_file, 'adatas', f'05_unrolled_roll_{roll}.h5ad'))

        # Look up this sample's annotations once (rows of the combined obs table whose
        # 'batch' matches this folder/roll) and left-join them on the 'cell' column.
        batch_name = os.path.basename(input_file) + f'_roll_{roll}'
        batch_obs = combined_adata.obs[combined_adata.obs['batch'] == batch_name]
        cluster_observations = batch_obs[annotation_columns]
        adata.obs = adata.obs.merge(cluster_observations, left_on='cell', right_index=True, how='left')

        # Only epithelial/stromal cells are indexed in the tree, but every cell is queried
        # against it.
        adata_epi = adata[adata.obs['Class'].isin(unchanging_type_keys)]
        n_reference = adata_epi.n_obs
        if n_reference == 0:
            # No reference cells means no neighborhood can be formed; this sample gets no
            # figure and no output file.
            print(f'No {unchanging_type_keys} cells found for {batch_name}; skipping.')
            continue

        spatial_points_epi = np.asarray(adata_epi.obsm['X_spatial'])[:, :2]
        spatial_points = np.asarray(adata.obsm['X_spatial'])[:, :2]
        n_cells = len(spatial_points)

        # np.array() would wrap a scipy sparse matrix in a 0-d object array, so densify
        # sparse input explicitly.
        expression = adata_epi.X
        adata_epi_arr = expression.toarray() if hasattr(expression, 'toarray') else np.array(expression)

        tree = KDTree(spatial_points_epi)
        # KDTree.query marks missing neighbors with an out-of-range index, so never ask
        # for more neighbors than there are reference cells.
        k = min(nneighbors, n_reference)
        # One batched query for all cells instead of one query per cell.
        _, neighbors = tree.query(spatial_points, k=k)
        # query() squeezes the neighbor axis when k == 1; restore it.
        neighbors = np.asarray(neighbors).reshape(n_cells, k)

        # Sum reference-cell expression over each cell's neighbors, a chunk at a time.
        neighborhood_sums = [
            adata_epi_arr[neighbors[start:start + chunk_size]].sum(axis=1)
            for start in range(0, n_cells, chunk_size)
        ]

        # Match the dtype of the fitted components before projecting onto the model.
        X = pd.DataFrame(np.concatenate(neighborhood_sums, axis=0)).astype(H.dtype)
        W = model.transform(X)

        topics_frame = pd.DataFrame(W)
        topics_frame.columns = ['Topic ' + str(i + 1) for i in range(len(topics_frame.columns))]
        topics_frame.index = adata.obs.index.tolist()

        # Apply the z-score function to each column in the dataframe
        topics_frame = topics_frame.apply(zscore)
        adata.obs = adata.obs.merge(topics_frame, left_index=True, right_index=True)
        # Dominant topic per cell: 1-based index of the largest z-scored weight.
        adata.obs['topic'] = pd.Categorical((np.argmax(topics_frame.values, axis=1) + 1).astype(str))

        sc.set_figure_params(dpi=300)
        figure = sc.pl.embedding(adata, basis='spatial', color='topic', vmax=1, cmap='Blues', title='Neighborhood', size=2, show=False, return_fig=True)

        # Create the output directory (and any missing parents); an existing directory is
        # fine, while real failures such as permission errors now surface instead of being
        # swallowed.
        neighborhoods_dir = os.path.join(input_file, 'figures', 'neighborhoods')
        os.makedirs(neighborhoods_dir, exist_ok=True)

        figure.tight_layout()
        plt.axis('equal')
        figure.savefig(os.path.join(neighborhoods_dir, f'neighborhoods_{roll}.png'))
        # Close this specific figure so figures do not accumulate across samples.
        plt.close(figure)
        adata.write(os.path.join(input_file, 'adatas', f'06_before_decomposition_model_{roll}.h5ad'))