In [30]:
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as stats
import seaborn as sns
import shapely
import tensorflow as tf
from PIL import Image, ImageDraw
from scipy.spatial import cKDTree
from scipy.spatial import KDTree
from sklearn.decomposition import NMF
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tqdm.notebook import tqdm

# Project star-imports stay last so that any names they export keep taking
# precedence over the explicit imports above, exactly as before.
from core_functions.unrolling import *
from core_functions.initial_neighborhoods import *

##### This script performs the calculations needed to obtain the crypt-villus axis: calculating neighborhoods, parsing image labels on the human data to create training data, and making crypt-villus axis predictions.

In [62]:

def recalculate_crypt_villi_axis(adata, data_dir):
    """Derive reference crypt-villus labels for every batch from annotation JSON files.

    For each value of ``adata.obs['batch']`` the function reads
    ``<data_dir>/<batch>/label_img.json`` and rescales the annotated shapes by
    the ``unrolling_downsize`` factor stored in ``.uns`` of
    ``<data_dir>/<batch>/adatas/03_intial_neighborhoods.h5ad``. Three shape
    labels are used:

    * ``bottom_keypoint`` - polygon whose boundary vertices are the candidate
      base points of the villi (if several exist, the last one wins),
    * ``villus`` / ``villlus`` - one polygon per villus,
    * ``peyers`` - polygons whose cells are flagged in the ``peyers`` column.

    Three columns are written to ``adata.obs``:

    * ``reference_crypt_villi`` - for cells inside a villus polygon, the
      distance to that villus' base point divided by the largest such distance
      in the villus; 0 elsewhere,
    * ``villi_number`` - ``1 + index + 1000 * batch_position`` for cells inside
      a villus polygon, 0 elsewhere,
    * ``peyers`` - 1.0 for cells inside any ``peyers`` polygon, else 0.0.

    A spatial embedding of the three columns is plotted per batch.

    Changes relative to the previous implementation:

    * villus ids now start at 1, so the first villus of the first batch is no
      longer indistinguishable from the background value 0,
    * a missing ``bottom_keypoint`` raises ``ValueError`` instead of a
      ``NameError`` (or silently reusing the previous batch's points),
    * villus polygons that contain no cells are skipped instead of crashing,
    * point-in-polygon tests are vectorised.

    Parameters
    ----------
    adata : AnnData
        Must provide ``obs['batch']`` and ``obsm['X_spatial']``.
    data_dir : str
        Directory that contains one sub-folder per batch.

    Returns
    -------
    AnnData
        ``adata`` with the three columns added.

    Raises
    ------
    ValueError
        If a batch's annotation file has no ``bottom_keypoint`` shape.
    """

    def _scaled_polygon(shape, scale):
        """Turn one annotated shape into a polygon in full-resolution coordinates."""
        return shapely.Polygon(np.array(shape['points']) * scale)

    def _indices_within(polygon, coords):
        """Return the row positions of ``coords`` that lie inside ``polygon``."""
        if len(coords) == 0:
            return []
        inside = shapely.contains_xy(polygon, coords[:, 0], coords[:, 1])
        return np.flatnonzero(inside).tolist()

    unique_categories = np.unique(adata.obs['batch'])

    n_cells = len(adata.obs)
    new_crypt_villus = np.zeros(n_cells)
    new_villus_number = np.zeros(n_cells)
    new_peyers = np.zeros(n_cells)

    for batch_ctr, input_file in enumerate(unique_categories):
        json_file_path = os.path.join(data_dir, input_file, 'label_img.json')
        downsized_adata = sc.read(os.path.join(data_dir, input_file, 'adatas', '03_intial_neighborhoods.h5ad'))
        scale = downsized_adata.uns['unrolling_downsize']

        batch_rows = np.flatnonzero((adata.obs['batch'] == input_file).values)
        # Explicit copy: new obs columns are added below, which must not happen on a view.
        batch_adata = adata[batch_rows].copy()
        all_spatial = np.asarray(batch_adata.obsm['X_spatial'])

        print(json_file_path)
        with open(json_file_path, 'r') as json_file:
            shapes = json.load(json_file)['shapes']

        # Boundary vertices of the bottom line are the candidate villus bases.
        bottom_points = None
        for shape in shapes:
            if shape['label'] == 'bottom_keypoint':
                boundary_x, boundary_y = _scaled_polygon(shape, scale).boundary.xy
                bottom_points = np.array([np.asarray(boundary_x), np.asarray(boundary_y)]).T
        if bottom_points is None:
            raise ValueError(f"No 'bottom_keypoint' shape found in {json_file_path}")
        bottom_tree = cKDTree(bottom_points)

        # Collect the member cells of every villus and the peyers polygons.
        villis = []
        peyers_mask = np.zeros(len(batch_adata.obs.index))
        for shape in tqdm(shapes):
            label = shape['label']
            if label in ('villlus', 'villus'):
                members = _indices_within(_scaled_polygon(shape, scale), all_spatial)
                if members:  # a polygon without cells carries no information
                    villis.append(members)
            elif label == 'peyers':
                peyers_mask[_indices_within(_scaled_polygon(shape, scale), all_spatial)] = 1
        batch_adata.obs['peyers'] = peyers_mask

        normalized_crypt_villi = np.zeros(len(batch_adata.obs.index))
        villi_number = np.zeros(len(batch_adata.obs.index))
        for villus_idx, members in enumerate(villis):
            member_xy = all_spatial[members]

            # Base of the villus: the bottom vertex closest to any member cell.
            nearest_dist, nearest_idx = bottom_tree.query(member_xy)
            base_point = bottom_points[nearest_idx[np.argmin(nearest_dist)]]

            dist_to_base = np.linalg.norm(member_xy - base_point, axis=1)
            farthest = dist_to_base.max()
            if farthest > 0:  # avoid 0/0 when every member sits on the base point
                normalized_crypt_villi[members] = dist_to_base / farthest

            # Offset by one so that 0 always means "not in a villus".
            villi_number[members] = villus_idx + 1 + (1000 * batch_ctr)

        batch_adata.obs['reference_crypt_villi'] = normalized_crypt_villi
        batch_adata.obs['villi_number'] = villi_number

        sc.pl.embedding(batch_adata, basis='spatial', color=['villi_number', 'reference_crypt_villi', 'peyers'])

        new_crypt_villus[batch_rows] = normalized_crypt_villi
        new_villus_number[batch_rows] = villi_number
        new_peyers[batch_rows] = peyers_mask

    adata.obs['reference_crypt_villi'] = new_crypt_villus
    adata.obs['villi_number'] = new_villus_number
    adata.obs['peyers'] = new_peyers
    return adata

def train_model(adata_in_villi, unchanging_type_keys, n_neighborhoods = 6):
    """Fit an NMF model on summed neighbourhood expression profiles.

    Reference cells are those whose ``obs['Class']`` is in
    ``unchanging_type_keys`` and whose ``obs['peyers']`` is 0. Within each
    batch, every cell of ``adata_in_villi`` is described by the summed
    expression (``.X``) of its (up to) 30 nearest reference cells in
    ``obsm['X_spatial']``. The per-batch matrices are stacked and used to fit
    an NMF with ``n_neighborhoods`` components.

    Changes relative to the previous implementation:

    * the NMF is fitted on the matrix stacked over *all* batches; previously
      the stacked matrix was built but only the last batch's matrix was fitted,
    * query cells are restricted to the batch whose reference tree is being
      searched, matching ``create_test_data`` / ``create_training_data``;
      previously cells of every batch were looked up in each batch's tree,
    * the neighbour count is capped at the number of reference cells.

    NOTE(review): ``.X`` is converted with ``np.array``; this assumes a dense
    matrix - confirm for the objects passed in.

    Parameters
    ----------
    adata_in_villi : AnnData
        Needs ``obs['Class']``, ``obs['peyers']``, ``obs['batch']`` and
        ``obsm['X_spatial']``.
    unchanging_type_keys : list
        ``Class`` values that define the reference cells.
    n_neighborhoods : int, default 6
        Number of NMF components.

    Returns
    -------
    tuple
        ``(model, unique_batches, H)`` - the fitted NMF, the batches that
        contain reference cells, and ``model.components_``.
    """
    nneighbors = 30

    reference_mask = (
        adata_in_villi.obs['Class'].isin(unchanging_type_keys)
        & (adata_in_villi.obs['peyers'].values.astype(int) == 0)
    )
    combined_adata_no_immune = adata_in_villi[reference_mask]
    unique_batches = np.unique(combined_adata_no_immune.obs.batch.values)

    dfs = []
    for input_file in unique_batches:
        reference = combined_adata_no_immune[combined_adata_no_immune.obs['batch'] == input_file]
        queries = adata_in_villi[adata_in_villi.obs['batch'] == input_file]

        reference_expr = np.array(reference.X)
        reference_xy = np.asarray(reference.obsm['X_spatial'])[:, :2]
        query_xy = np.asarray(queries.obsm['X_spatial'])[:, :2]

        # Asking for more neighbours than there are reference cells makes the
        # tree return an out-of-range index, so cap k.
        k = min(nneighbors, len(reference_xy))
        _, neighbors = KDTree(reference_xy).query(query_xy, k=k)
        neighbors = neighbors.reshape(len(query_xy), k)  # keeps k == 1 two-dimensional

        summed = [
            np.array(np.sum(reference_expr[row, :], axis=0)).squeeze()
            for row in tqdm(neighbors)
        ]
        dfs.append(pd.DataFrame(np.array(summed)))

    X_arr = pd.concat(dfs, ignore_index=True)

    model = NMF(n_components=n_neighborhoods, random_state=0)
    model.fit(X_arr)
    H = model.components_

    return model, unique_batches, H

def create_test_data(original_adata, unique_batches, model, unchanging_type_keys, input_folders, H):
    """Project every cell of each batch onto the NMF topics and save the result.

    For each batch in ``unique_batches``:

    1. every cell is described by the summed expression of its (up to) 30
       nearest reference cells (``Class`` in ``unchanging_type_keys`` and
       ``peyers == 0``),
    2. the matrix is cast to ``H.dtype`` and passed to ``model.transform``,
    3. the resulting topic weights are z-scored per topic and stored in
       ``obs`` as ``Topic 1`` ... ``Topic k``; ``obs['topic']`` holds the
       1-based index of the strongest topic as a categorical string,
    4. a spatial plot is saved to
       ``<root>/<batch>/figures/neighborhoods/neighborhoods.png`` and the
       object is written to
       ``<root>/<batch>/adatas/05_before_decomposition_model.h5ad``,
       where ``<root>`` is the parent directory of ``input_folders[0]``.

    Changes relative to the previous implementation: the batch subset is an
    explicit copy before ``obs`` is modified, the figure directory is created
    with ``os.makedirs(..., exist_ok=True)`` instead of a bare ``except``
    around ``os.mkdir``, the neighbour count is capped at the number of
    reference cells, and only the figure created here is closed.

    Parameters
    ----------
    original_adata : AnnData
        Needs ``obs['batch']``, ``obs['Class']``, ``obs['peyers']`` and
        ``obsm['X_spatial']``.
    unique_batches : iterable
        Batch names to process.
    model : fitted decomposition model exposing ``transform``.
    unchanging_type_keys : list
        ``Class`` values that define the reference cells.
    input_folders : list of str
        Only the parent directory of the first entry is used.
    H : numpy.ndarray
        Only its ``dtype`` is used.

    Raises
    ------
    ValueError
        If a batch has no reference cells.
    """
    nneighbors = 30

    def zscore(column):
        """Standardise one topic column."""
        return (column - column.mean()) / column.std()

    for batch in unique_batches:
        root_dir = os.path.dirname(input_folders[0])
        # Copy: obs is replaced below, which must not be done on a view.
        adata = original_adata[original_adata.obs['batch'] == batch].copy()

        reference_mask = (
            adata.obs['Class'].isin(unchanging_type_keys)
            & (adata.obs['peyers'].values.astype(int) == 0)
        )
        adata_epi = adata[reference_mask]
        spatial_points_epi = np.asarray(adata_epi.obsm['X_spatial'])[:, :2]
        spatial_points = np.asarray(adata.obsm['X_spatial'])[:, :2]
        adata_epi_arr = np.array(adata_epi.X)

        if len(spatial_points_epi) == 0:
            raise ValueError(f"Batch {batch!r} has no reference cells to build neighborhoods from")
        # Requesting more neighbours than exist yields out-of-range indices.
        k = min(nneighbors, len(spatial_points_epi))
        _, neighbors = KDTree(spatial_points_epi).query(spatial_points, k=k)
        neighbors = neighbors.reshape(len(spatial_points), k)

        list_of_arrays = [
            np.array(np.sum(adata_epi_arr[row, :], axis=0)).squeeze()
            for row in neighbors
        ]

        X = pd.DataFrame(np.array(list_of_arrays)).astype(H.dtype)
        W = model.transform(X)

        topics_frame = pd.DataFrame(W)
        topics_frame.columns = ['Topic '+str(i+1) for i in range(len(topics_frame.columns))]
        topics_frame.index = adata.obs.index.tolist()
        topics_frame = topics_frame.apply(zscore)

        # Drop topic columns left over from an earlier run before merging the new ones.
        adata.obs = adata.obs.drop(adata.obs.columns[adata.obs.columns.str.contains('Topic')], axis=1)
        adata.obs = adata.obs.merge(topics_frame, left_index=True, right_index=True)
        adata.obs['topic'] = pd.Categorical((np.argmax(topics_frame.values, axis = 1)+1).astype(str))

        sc.set_figure_params(dpi=300)
        figure = sc.pl.embedding(adata, basis='spatial', color='topic', vmax=1, cmap='Blues', title='Neighborhood', size=2, show=False, return_fig=True)

        figure_dir = os.path.join(root_dir, batch, 'figures', 'neighborhoods')
        os.makedirs(figure_dir, exist_ok=True)
        figure.tight_layout()
        plt.axis('equal')
        figure.savefig(os.path.join(figure_dir, 'neighborhoods.png'))
        plt.close(figure)

        adata.write(os.path.join(root_dir, batch, 'adatas', '05_before_decomposition_model.h5ad'))


def create_training_data(adata_subset, model, H, unchanging_type_keys):
    """Build the (features, labels) pair used to train the axis regressor.

    For each batch of ``adata_subset``, every cell is described by the summed
    expression of its (up to) 30 nearest reference cells (``Class`` in
    ``unchanging_type_keys`` and ``peyers == 0``). That matrix is cast to
    ``H.dtype``, passed through ``model.transform`` and z-scored per topic
    within the batch. Labels are ``obs['reference_crypt_villi']``.

    Changes relative to the previous implementation: the neighbour count is
    capped at the number of reference cells (an uncapped query returns
    out-of-range indices), the tree is queried once per batch instead of once
    per cell, and the per-batch results are stacked with numpy instead of
    element-by-element loops.

    Parameters
    ----------
    adata_subset : AnnData
        Needs ``obs['batch']``, ``obs['Class']``, ``obs['peyers']``,
        ``obs['reference_crypt_villi']`` and ``obsm['X_spatial']``.
    model : fitted decomposition model exposing ``transform``.
    H : numpy.ndarray
        Only its ``dtype`` is used.
    unchanging_type_keys : list
        ``Class`` values that define the reference cells.

    Returns
    -------
    tuple of numpy.ndarray
        ``(training_data, training_labels)``; rows follow batch order, then
        cell order within the batch. Both are empty arrays if there is no batch.

    Raises
    ------
    ValueError
        If a batch has no reference cells.
    """
    nneighbors = 30

    def zscore(column):
        """Standardise one topic column."""
        return (column - column.mean()) / column.std()

    training_datas = []
    all_train_labels = []
    for uniq in np.unique(adata_subset.obs['batch']):
        adata_sub_batch = adata_subset[adata_subset.obs['batch'] == uniq]

        reference_mask = (
            adata_sub_batch.obs['Class'].isin(unchanging_type_keys)
            & (adata_sub_batch.obs['peyers'] == 0)
        )
        adata_epi = adata_sub_batch[reference_mask]
        spatial_points_epi = np.asarray(adata_epi.obsm['X_spatial'])[:, :2]
        spatial_points = np.asarray(adata_sub_batch.obsm['X_spatial'])[:, :2]
        adata_epi_arr = np.array(adata_epi.X)

        if len(spatial_points_epi) == 0:
            raise ValueError(f"Batch {uniq!r} has no reference cells to build neighborhoods from")
        k = min(nneighbors, len(spatial_points_epi))
        _, neighbors = KDTree(spatial_points_epi).query(spatial_points, k=k)
        neighbors = neighbors.reshape(len(spatial_points), k)  # keeps k == 1 two-dimensional

        list_of_arrays = [
            np.array(np.sum(adata_epi_arr[row, :], axis=0)).squeeze()
            for row in tqdm(neighbors)
        ]

        X = pd.DataFrame(np.array(list_of_arrays)).astype(H.dtype)
        W = model.transform(X)

        topics_frame = pd.DataFrame(W)
        topics_frame.columns = ['Topic '+str(i+1) for i in range(len(topics_frame.columns))]
        topics_frame.index = adata_sub_batch.obs.index.tolist()
        topics_frame = topics_frame.apply(zscore)

        training_datas.append(topics_frame.values)
        all_train_labels.append(adata_sub_batch.obs['reference_crypt_villi'].values)

    if not training_datas:
        return np.array([]), np.array([])

    training_data = np.vstack(training_datas)
    training_labels = np.concatenate(all_train_labels)

    return training_data, training_labels

def train_neural_network(training_data, training_labels, epoch_num = 15):
    """Train a small dense regressor mapping topic features to the axis label.

    The network is Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    compiled with the Adam optimizer and mean-squared-error loss, and fitted
    with a batch size of 32.

    Parameters
    ----------
    training_data : numpy.ndarray
        2-D feature matrix; the second dimension sets the input width.
    training_labels : numpy.ndarray
        One target value per row of ``training_data``.
    epoch_num : int, default 15
        Number of training epochs.

    Returns
    -------
    keras.Sequential
        The fitted model.
    """
    n_features = training_data.shape[1]

    layer_stack = [
        keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ]
    neural = keras.Sequential(layer_stack)

    neural.compile(loss='mean_squared_error', optimizer='adam')
    neural.fit(
        x=training_data,
        y=training_labels,
        batch_size=32,
        epochs=epoch_num,
        verbose=2,
    )
    return neural

def calculate_crypt_villi_axis(input_folders, neural):
    """Predict the crypt-villus axis for every folder and save object and figure.

    For each folder, ``adatas/05_before_decomposition_model.h5ad`` is loaded,
    merge-suffixed columns are dropped (those containing ``_x`` if any exist,
    otherwise those containing ``_y``), and the columns containing ``Topic``
    are fed to ``neural.predict``. The predictions are stored in
    ``obs['crypt_villi_axis']``, the object is written to
    ``adatas/06_axes_defined.h5ad`` and a spatial plot of cells with
    ``peyers == 0`` is saved to ``figures/axes/spatial_crypt_villi.png``.

    Changes relative to the previous implementation: the figure directory is
    created if it is missing, the predictions are flattened to one dimension
    before being stored, and only the figure created here is closed.

    Parameters
    ----------
    input_folders : iterable of str
        Per-sample folders containing ``adatas/`` and ``figures/``.
    neural : model exposing ``predict``.
    """
    for input_file in input_folders:
        adata = sc.read(os.path.join(input_file, 'adatas', '05_before_decomposition_model.h5ad'))

        columns = adata.obs.columns
        suffixed_x = columns[columns.str.contains('_x')]
        stale_columns = suffixed_x if len(suffixed_x) > 0 else columns[columns.str.contains('_y')]
        adata.obs = adata.obs.drop(stale_columns, axis=1)

        testing_data = adata.obs[adata.obs.columns[adata.obs.columns.str.contains('Topic')]].values
        predictions = neural.predict(testing_data)
        # The network has a single output unit; store it as a plain 1-D column.
        adata.obs['crypt_villi_axis'] = np.asarray(predictions).reshape(-1)
        adata.write(os.path.join(input_file, 'adatas', '06_axes_defined.h5ad'))

        fig = sc.pl.embedding(adata[adata.obs.peyers==0], basis = 'spatial', color='crypt_villi_axis', return_fig=True, show=False, vmax=1, cmap='viridis', size=4)
        fig.tight_layout()
        plt.axis('equal')

        axes_dir = os.path.join(input_file, 'figures', 'axes')
        os.makedirs(axes_dir, exist_ok=True)
        fig.savefig(os.path.join(axes_dir, 'spatial_crypt_villi.png'))
        plt.close(fig)

def calculate_epithelial_axis(input_folders):
    """Score every cell by its relative distance to epithelial cells.

    For each folder, ``adatas/06_axes_defined.h5ad`` is loaded. The mean
    distance of each cell to its 5 nearest cells of ``Class == 'Epithelial'``
    is divided by the mean distance to its 5 nearest cells of any class, and
    the ratio is scaled by its 99th percentile. The score is stored in
    ``obs['epithelial_distance']``, plotted to
    ``figures/axes/spatial_epithelial.png`` and the object is written to
    ``adatas/07_final_object.h5ad``.

    Parameters
    ----------
    input_folders : iterable of str
        Per-sample folders containing ``adatas/`` and ``figures/``.
    """
    sc.set_figure_params(dpi=1000, dpi_save=1000)

    for folder in input_folders:
        ad = sc.read(os.path.join(folder, 'adatas', '06_axes_defined.h5ad'))

        epithelial_only = ad[ad.obs.Class.isin(['Epithelial'])]
        points_epi = epithelial_only.obsm['X_spatial']

        all_tree = KDTree(ad.obsm['X_spatial'])
        epi_tree = KDTree(points_epi)

        # Only the distances are needed; the neighbour indices are discarded.
        dist_to_any, _ = all_tree.query(ad.obsm['X_spatial'], k=5)
        dist_to_epi, _ = epi_tree.query(ad.obsm['X_spatial'], k=5)

        relative_distance = np.mean(dist_to_epi, axis=1) / np.mean(dist_to_any, axis=1)
        scale = np.percentile(relative_distance, 99)
        ad.obs['epithelial_distance'] = relative_distance / scale

        fig = sc.pl.embedding(
            ad,
            basis='spatial',
            color='epithelial_distance',
            return_fig=True,
            show=False,
            vmax=1,
            cmap='viridis',
            size=4,
        )
        fig.tight_layout()
        plt.axis('equal')
        fig.savefig(os.path.join(folder, 'figures', 'axes', 'spatial_epithelial.png'))
        plt.close()

        ad.write(os.path.join(folder, 'adatas', '07_final_object.h5ad'))

##### Add the path to the preliminary cell-typed human data

In [None]:
# --- Configuration (defined once, before the loop, so later cells can rely on it) ---
output_folder = r'/mnt/sata1/Analysis_Alex/human_r1/analysis/cleaned'
data_dir = '/mnt/sata1/Analysis_Alex/human_r1'
unchanging_type_keys = ['Epithelial']
# Each entry lists the batches (replicates) that are processed together.
subjects = [['human_09_r1', 'human_09_r2'], ['human_05_r1', 'human_05_r2']]

adata_whole = sc.read(os.path.join(output_folder, 'celltyped_do_not_touch.h5ad'))

for sub in subjects:
    # Copy the subset: recalculate_crypt_villi_axis adds obs columns to it.
    adata = adata_whole[adata_whole.obs['batch'].isin(sub)].copy()

    # 'human_09_r1' -> 'human_09': matches every replicate folder of the subject.
    subject_prefix = '_'.join(sub[0].split('_')[:2])
    input_folders = sorted(glob.glob(os.path.join(data_dir, subject_prefix + '*')))
    if not input_folders:
        raise FileNotFoundError(f"No folders matching {subject_prefix}* under {data_dir}")

    adata = recalculate_crypt_villi_axis(adata, data_dir)

    original_adata = adata.copy()

    # Cells with a non-zero villus id are the ones inside an annotated villus.
    adata_subset = adata[adata.obs['villi_number'] > 0]

    print(np.shape(adata_subset.obs))

    model, unique_batches, H = train_model(adata_subset, unchanging_type_keys)

    create_test_data(original_adata, unique_batches, model, unchanging_type_keys, input_folders, H)

    training_data, training_labels = create_training_data(adata_subset, model, H, unchanging_type_keys)

    neural = train_neural_network(training_data, training_labels)

    calculate_crypt_villi_axis(input_folders, neural)

    calculate_epithelial_axis(input_folders=input_folders)

### Concatenating all experiments

In [64]:
# glob returns paths in arbitrary order; sort so the concatenation order is reproducible.
# NOTE: relies on `data_dir` defined in the processing cell above.
input_folders = sorted(glob.glob(os.path.join(data_dir, 'human_0*')))

In [65]:
# Load the final per-sample object from every matched folder, in folder order.
ads_concat = [
    sc.read(os.path.join(folder, 'adatas', '07_final_object.h5ad'))
    for folder in input_folders
]



In [66]:
# Stack all per-sample objects into one AnnData; `uns_merge='first'` is passed
# through to `sc.concat` to control how the `.uns` entries are combined.
total_adata = sc.concat(ads_concat, uns_merge='first')

In [69]:
# Persist the concatenated object in `output_folder` (the folder the cell-typed input was read from).
total_adata.write(os.path.join(output_folder, 'final_human_adata.h5ad'))