Load the required libraries. The CellPie module (`cellpie_main`, imported below) must be in the same directory as this notebook; if it is not, its location can be added to Python's path.

In [None]:
import anndata as ad
import scanpy as sc
import numpy as np
import scipy
import sys
sys.path.append("../../CellPie/") # go to parent dir
import cellpie_main as cp
from cellpie_main import intNMF
import anndata as ad
from matplotlib import pyplot as plt
import scipy.io
import pandas as pd
import squidpy as sq
from PIL import Image

In [None]:
sc.set_figure_params(scanpy=True, dpi=250, dpi_save=150)

# CellPie

In [None]:
histo_2 = pd.read_csv('Data/Histology_Visium_FFPE_Human_Prostate_Cancer_cloupe.csv').dropna()

In [None]:
# Read the spatial transcriptomics data (counts plus tissue images).
adata = sc.read_visium(
    path='Data/',
    count_file='Visium_FFPE_Human_Prostate_Cancer_filtered_feature_bc_matrix.h5',
    library_id='A1_spot',
    load_images=True,
)
adata.var_names_make_unique()
adata.var['SYMBOL'] = adata.var_names
sc.pp.calculate_qc_metrics(adata, inplace=True)
# This is a human sample, whose mitochondrial gene symbols are upper-case
# ("MT-..."). Match case-insensitively so the flag is not silently all-False.
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
# Fraction of each spot's counts coming from mitochondrial genes.
# np.asarray(...).ravel() flattens the row sums whether X is sparse or dense.
adata.obs['mt_frac'] = (
    np.asarray(adata[:, adata.var['mt'].values].X.sum(axis=1)).ravel()
    / adata.obs['total_counts']
)

In [None]:
img = sq.im.ImageContainer(
adata.uns["spatial"]['A1_spot']["images"]["hires"],
scale=adata.uns["spatial"]['A1_spot']["scalefactors"]["tissue_hires_scalef"])

In [None]:
img.show(channelwise=True)

In [None]:
# the image contains one extra channel that looks empty, so we want to remove this before extracting the features
adata.uns["spatial"]['A1_spot']["images"]["hires"]=adata.uns["spatial"]['A1_spot']["images"]["hires"][:,:,0:3]

In [None]:
sc.pp.normalize_total(adata)

In [None]:
# Keep only spots that have a histology annotation. Take an explicit copy:
# later cells write into .obs/.obsm, and doing that on an AnnData view
# triggers an implicit copy (with a warning) at an arbitrary later point.
adata = adata[adata.obs_names.isin(histo_2['Barcode']), :].copy()

In [None]:
cp.extract_image_features(adata,
    scale_img = 0.2,
    spot_scale = 6)

In [None]:
adata.obsm['features']

In [None]:
nmf_model = intNMF(adata,9,epochs = 50, init = 'NNDSVD',mod1_skew=1)
nmf_model.fit(adata)

In [None]:
cp.plot_topic_proportions(adata,9)

### extract gene loading matrix

In [None]:
l=cp.get_genes_topic(adata,nmf_model.phi_expr)

In [None]:
l.T.to_csv('Results/marker_genes_prostate_reproduce.csv')

In [None]:
adata.write('Results/prostate_reproduce.h5ad')

In [None]:
adata_l=sc.AnnData(l)

In [None]:
adata_l.write('Results/genes_prostate_reproduce.h5ad')

# kmeans 

In [None]:
sc.pp.pca(adata)

In [None]:
from sklearn.cluster import KMeans

# Baseline: k-means (5 clusters, fixed seed) on the PCA embedding.
X = adata.obsm['X_pca']
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=500,
    random_state=2,
)
kmeans.fit(X)
# Store the labels as strings in .obs.
adata.obs['kmeans'] = kmeans.labels_.astype(str)

In [None]:
sc.pl.spatial(adata,color='kmeans',size=1.5)

In [None]:
histo_2.index = histo_2['Barcode']

# CellPie clustering for equal weights 

In [None]:
from sklearn.cluster import KMeans

# k-means (5 clusters, fixed seed) on nine .obs columns selected by position.
# NOTE(review): columns 12..20 are presumably the 9 CellPie factor proportions
# written by the fit above; the slice breaks if .obs gains or loses columns.
X = adata.obs.iloc[:, 12:21].values
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=500,
    random_state=2,
)
kmeans.fit(X)
adata.obs['kmeans_CellPie_int'] = kmeans.labels_.astype(str)

# CellPie clustering for single NMF (weight=2)

In [None]:
adata_0 = adata.copy()

In [None]:
nmf_model = intNMF(adata_0,9,epochs = 50, init = 'NNDSVD',mod1_skew=2)
nmf_model.fit(adata_0)

In [None]:
from sklearn.cluster import KMeans

# Same clustering recipe, but on the factors of the refit stored in `adata_0`.
# NOTE(review): positional slice, presumably the 9 factor columns - confirm.
X = adata_0.obs.iloc[:, 12:21].values
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=500,
    random_state=2,
)
kmeans.fit(X)
# The labels are stored on the main `adata` object (not on `adata_0`).
adata.obs['kmeans_CellPie_0'] = kmeans.labels_.astype(str)

In [None]:
adata.uns['kmeans_CellPie_int_colors'] = ['#1f77b4','#d62728','#e377c2','#d4ce6c','#279e68']

In [None]:
adata.uns['kmeans_CellPie_0_colors'] = ['#279e68','#1f77b4','#d62728','#e377c2','#d4ce6c',]

In [None]:
sc.pl.spatial(adata,color='kmeans_CellPie_0',size=1.5)

In [None]:
sc.pl.spatial(adata,color='kmeans_CellPie_int',size=1.5)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Look the pathologist label up by barcode so every spot is paired with its
# own annotation, independent of the row order of the histology table.
histology_by_barcode = pd.Series(
    histo_2['Histology'].values, index=histo_2['Barcode'].values
).loc[adata.obs_names]

# Row-normalised contingency table: CellPie cluster (rows) vs histology (columns).
plt.figure(figsize=(10, 8))
sns.heatmap(
    pd.crosstab(
        adata.obs['kmeans_CellPie_int'].values,
        histology_by_barcode.values,
        colnames=['Histology'],
        normalize='index',
    ),
    cmap=sns.color_palette("crest", as_cmap=True),
    annot=True,
)
plt.show()

In [None]:
from sklearn import metrics
from sklearn.cluster import KMeans

# Pathologist labels ordered like `adata`, paired with spots by barcode
# rather than by row position in the histology table.
histology_labels = pd.Series(
    histo_2['Histology'].values, index=histo_2['Barcode'].values
).loc[adata.obs_names].values

# Sweep on a scratch copy. The loop reads the factors back from .obs after
# each fit, so refitting on `adata` itself would presumably replace the
# results of the earlier fit stored there (confirm against intNMF.fit).
adata_sweep = adata.copy()

# One row per weight: (weight, Fowlkes-Mallows, adjusted Rand, adjusted MI).
res = []
for i in np.arange(0, 2.01, 0.1):
    nmf_model = intNMF(adata_sweep, 9, epochs=50, init='NNDSVD', mod1_skew=i)
    nmf_model.fit(adata_sweep)
    # NOTE(review): positional slice, presumably the 9 factor columns - confirm.
    X = adata_sweep.obs.iloc[:, 12:21].values
    kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=500, random_state=2).fit(X)
    adata_sweep.obs['kmeans_t'] = kmeans.labels_.astype(str)
    fm_score = metrics.fowlkes_mallows_score(adata_sweep.obs['kmeans_t'], histology_labels)
    adj_rand = metrics.adjusted_rand_score(adata_sweep.obs['kmeans_t'], histology_labels)
    adj_mut_info = metrics.adjusted_mutual_info_score(adata_sweep.obs['kmeans_t'], histology_labels)
    print(i, fm_score, adj_rand, adj_mut_info)
    res.append((i, fm_score, adj_rand, adj_mut_info))

# Keep the column the original loop left behind on `adata` (labels of the last weight).
adata.obs['kmeans_t'] = adata_sweep.obs['kmeans_t'].values

In [None]:
score = pd.DataFrame(res)   

In [None]:
# Plot each score (columns 1-3 of `score`) against the weight (column 0).
for column, colour, legend_label in (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
):
    plt.plot(score[0], score[column], color=colour, label=legend_label)
plt.xlabel("Weight")
plt.ylabel("Score")
plt.legend(prop={'size': 9})

In [None]:
# Attach the pathologist annotation to each spot, matched by barcode rather
# than by row position, so the labels stay correct whatever the row order of
# the histology table.
adata.obs['path_anot'] = pd.Series(
    histo_2['Histology'].values, index=histo_2['Barcode'].values
).loc[adata.obs_names].values

In [None]:
sc.pl.spatial(adata,color='path_anot',size=1.5)

# SpaGCN

In [None]:
import SpaGCN as spg
import cv2

In [None]:
# Read in gene expression and spatial location
adata_spg = sc.read_visium(
    path='Data/',
    count_file='Visium_FFPE_Human_Prostate_Cancer_filtered_feature_bc_matrix.h5',
    library_id='A1_spot',
    load_images=True,
)
adata_spg.var_names_make_unique()
adata_spg.var['SYMBOL'] = adata_spg.var_names

# Keep only spots with a histology annotation. Copy so the column
# assignments below act on a real AnnData instead of a view.
adata_spg = adata_spg[adata_spg.obs_names.isin(histo_2['Barcode']), :].copy()

# Headerless spot-position table indexed by barcode; columns 1-5 are copied
# into .obs as x1..x5 (the Series assignment aligns on the barcode index).
# NOTE(review): presumably in_tissue, array row/col, pixel row/col - confirm.
spatial = pd.read_csv(
    "Data/spatial/tissue_positions_list.csv",
    sep=",",
    header=None,
    na_filter=False,
    index_col=0,
)
for col in range(1, 6):
    adata_spg.obs[f"x{col}"] = spatial[col]
adata_spg.obs["x_array"] = adata_spg.obs["x2"]
adata_spg.obs["y_array"] = adata_spg.obs["x3"]
adata_spg.obs["x_pixel"] = adata_spg.obs["x4"]
adata_spg.obs["y_pixel"] = adata_spg.obs["x5"]

# Select captured samples (again as a copy, since var_names is modified next)
adata_spg = adata_spg[adata_spg.obs["x1"] == 1].copy()
adata_spg.var_names = adata_spg.var_names.str.upper()
adata_spg.var["genename"] = adata_spg.var.index.astype("str")

# Round-trip through disk, as in the original workflow.
adata_spg.write_h5ad("sample_data.h5ad")
adata_spg = sc.read("sample_data.h5ad")


In [None]:
# Histology image for SpaGCN. cv2.imread does not raise when the file is
# missing or unreadable - it returns None - so fail here with a clear message
# instead of at the first use of `img`.
# Note: this rebinds `img`, which an earlier cell used for the squidpy ImageContainer.
image_path = "Data/Visium_FFPE_Human_Prostate_Cancer_image.tif"
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read histology image: {image_path}")

In [None]:
# Set coordinates
x_array = adata_spg.obs["x_array"].tolist()
y_array = adata_spg.obs["y_array"].tolist()
x_pixel = adata_spg.obs["x_pixel"].tolist()
y_pixel = adata_spg.obs["y_pixel"].tolist()

# Test coordinates on the image: black out a square of +/-20 px around every
# spot on a copy of the image and save it for visual inspection.
img_new = img.copy()
for x, y in zip(x_pixel, y_pixel):
    # Clamp at 0: a negative slice bound would be counted from the opposite
    # edge of the image and blank the wrong region for spots near the border.
    row_start, row_stop = max(int(x - 20), 0), max(int(x + 20), 0)
    col_start, col_stop = max(int(y - 20), 0), max(int(y + 20), 0)
    img_new[row_start:row_stop, col_start:col_stop, :] = 0

cv2.imwrite('ic_map.jpg', img_new)

In [None]:
#Calculate adjacent matrix
s=1
b=49
adj=spg.calculate_adj_matrix(x=x_pixel,y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel, image=img, beta=b, alpha=s, histology=True)
#If histlogy image is not available, SpaGCN can calculate the adjacent matrix using the fnction below
# adj=spg.calculate_adj_matrix(x=x_pixel,y=y_pixel, histology=False)
np.savetxt('ic_adj.csv', adj, delimiter=',')

In [None]:
# Reload the adjacency matrix saved by the previous cell.
adj = np.loadtxt('ic_adj.csv', delimiter=',')
# Gene filtering must act on the SpaGCN object. The original passed the
# CellPie `adata` here, which filtered the wrong dataset and left
# `adata_spg` unfiltered.
spg.prefilter_genes(adata_spg, min_cells=3)  # avoid genes that are zero everywhere
spg.prefilter_specialgenes(adata_spg)
# Normalize and take log for UMI
sc.pp.normalize_per_cell(adata_spg)
sc.pp.log1p(adata_spg)

In [None]:
p=0.5 
#Find the l value given p
l=spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)

In [None]:
# If the number of clusters is known, spg.search_res() can be used to search for a suitable resolution (optional).
# NOTE(review): the comment carried over with this code referred to 7 clusters (a 7-layer toy dataset); this notebook requests 5.
n_clusters=5
# Set seeds (the same value is used for all three seed arguments)
r_seed=t_seed=n_seed=100
# Search for a suitable resolution
# NOTE(review): the training cell further down assigns res = 0.25 before use, so this searched value appears to be discarded - confirm.
res=spg.search_res(adata_spg, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05, max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

In [None]:
# Train SpaGCN, predict spatial domains, refine them, and save the result.
import random, torch
clf=spg.SpaGCN()
# Resolution is fixed by hand here; this overrides any value previously bound to `res`.
res =0.25
clf.set_l(l)
# Seed Python, torch and NumPy RNGs before training
random.seed(r_seed)
torch.manual_seed(t_seed)
np.random.seed(n_seed)
# Run
clf.train(adata_spg,adj,init_spa=True,init="louvain",res=res, tol=5e-3, lr=0.05, max_epochs=200)
y_pred, prob=clf.predict()
# Store the raw predictions as a categorical .obs column
adata_spg.obs["pred"]= y_pred
adata_spg.obs["pred"]=adata_spg.obs["pred"].astype('category')
# Do cluster refinement (optional)
# shape="hexagon" for Visium data, "square" for ST data.
# The refinement uses an adjacency built from array coordinates only (histology=False).
adj_2d=spg.calculate_adj_matrix(x=x_array,y=y_array, histology=False)
refined_pred=spg.refine(sample_id=adata_spg.obs.index.tolist(), pred=adata_spg.obs["pred"].tolist(), dis=adj_2d, shape="hexagon")
adata_spg.obs["SpaGCN_clusters"]=refined_pred
adata_spg.obs["SpaGCN_clusters"]=adata_spg.obs["SpaGCN_clusters"].astype('category')
# Save results
adata_spg.write_h5ad("ic_spagcn_results.h5ad")

In [None]:
# Reload the saved SpaGCN results and draw one scatter plot per label set.
adata_spg = sc.read("ic_spagcn_results.h5ad")

# Palette; only the first `num_celltype` colours are used per plot.
plot_color = [
    "#F56867", "#FEB915", "#C798EE", "#59BE86", "#7495D3",
    "#D1D1D1", "#6D1A9C", "#15821E", "#3A84E6", "#997273",
    "#787878", "#DB4C6C", "#9E7A7A", "#554236", "#AF5F3C",
    "#93796C", "#F9BD3F", "#DAB370", "#877F6C", "#268785",
]

# First the raw predictions, then the refined spatial domains.
for domains, out_png in (
    ("pred", "ic_spagcn_pred.png"),
    ("SpaGCN_clusters", "ic_spagcn_refined_pred.png"),
):
    num_celltype = len(adata_spg.obs[domains].unique())
    adata_spg.uns[domains + "_colors"] = list(plot_color[:num_celltype])
    ax = sc.pl.scatter(
        adata_spg,
        alpha=1,
        x="y_pixel",
        y="x_pixel",
        color=domains,
        title=domains,
        color_map=plot_color,
        show=False,
        size=100000 / adata_spg.shape[0],
    )
    ax.set_aspect('equal', 'box')
    ax.axes.invert_yaxis()
    plt.savefig(out_png, dpi=600)
    plt.close()

In [None]:
adata_spg.uns['SpaGCN_clusters_colors'] = ['#359c62','#d32929','#1f77b4','#e377c2','#d4ce6c']

In [None]:
sc.pl.spatial(adata_spg,color='SpaGCN_clusters',size=1.5)

In [None]:
# Score SpaGCN clusters against the pathologist labels. The labels are looked
# up by barcode so they follow the row order (and subset) of `adata_spg`
# instead of relying on the histology table having the same rows and order.
histology_spg = pd.Series(
    histo_2['Histology'].values, index=histo_2['Barcode'].values
).loc[adata_spg.obs_names].values
# `mut_info` holds the Fowlkes-Mallows score (name kept from the original).
mut_info = metrics.fowlkes_mallows_score(adata_spg.obs['SpaGCN_clusters'], histology_spg)
adj_rand = metrics.adjusted_rand_score(adata_spg.obs['SpaGCN_clusters'], histology_spg)
adj_mut_info = metrics.adjusted_mutual_info_score(adata_spg.obs['SpaGCN_clusters'], histology_spg)
print(mut_info, adj_rand, adj_mut_info)

# stLearn

In [None]:
import stlearn as st
from pathlib import Path

In [None]:
# Input directory holding the data.
BASE_PATH = Path("Data/")
# Spot tiles are the intermediate result of image pre-processing.
TILE_PATH = Path("/tmp/tiles")
# Output path.
# NOTE(review): this is the filesystem root and is not referenced again in the
# visible cells - confirm it is intended.
OUT_PATH = Path("/")

# Make sure both directories exist.
for directory in (TILE_PATH, OUT_PATH):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
# load data
data = st.Read10X(BASE_PATH)

In [None]:
# Keep only annotated spots. Copy so the in-place preprocessing in the
# following cells operates on a real AnnData object rather than a view.
data = data[data.obs_names.isin(histo_2['Barcode']), :].copy()

In [None]:
# pre-processing for gene count table
st.pp.filter_genes(data,min_cells=1)
st.pp.normalize_total(data)
st.pp.log1p(data)

In [None]:
# pre-processing for spot image
st.pp.tiling(data, TILE_PATH)

# this step uses deep learning model to extract high-level features from tile images
# may need few minutes to be completed
st.pp.extract_feature(data)

In [None]:
# run PCA for gene expression data
st.em.run_pca(data,n_comps=50)

In [None]:
data_SME = data.copy()
# apply stSME to normalise log transformed data
st.spatial.SME.SME_normalize(data_SME, use_data="raw")
data_SME.X = data_SME.obsm['raw_SME_normalized']
st.pp.scale(data_SME)
st.em.run_pca(data_SME,n_comps=50)

In [None]:
# K-means clustering on stSME normalised PCA
st.tl.clustering.kmeans(data_SME,n_clusters=5, use_data="X_pca", key_added="kmeans_stlearn")
st.pl.cluster_plot(data_SME, use_label="kmeans_stlearn")

In [None]:
# Score stLearn clusters against the pathologist labels, paired by barcode
# rather than by row position in the histology table.
histology_stlearn = pd.Series(
    histo_2['Histology'].values, index=histo_2['Barcode'].values
).loc[data_SME.obs_names].values
# `mut_info` holds the Fowlkes-Mallows score (name kept from the original).
mut_info = metrics.fowlkes_mallows_score(data_SME.obs['kmeans_stlearn'], histology_stlearn)
adj_rand = metrics.adjusted_rand_score(data_SME.obs['kmeans_stlearn'], histology_stlearn)
adj_mut_info = metrics.adjusted_mutual_info_score(data_SME.obs['kmeans_stlearn'], histology_stlearn)
print(mut_info, adj_rand, adj_mut_info)

In [None]:
# Score the plain PCA k-means clusters against the pathologist labels, paired
# by barcode rather than by row position in the histology table.
histology_kmeans = pd.Series(
    histo_2['Histology'].values, index=histo_2['Barcode'].values
).loc[adata.obs_names].values
# `mut_info` holds the Fowlkes-Mallows score (name kept from the original).
mut_info = metrics.fowlkes_mallows_score(adata.obs['kmeans'], histology_kmeans)
adj_rand = metrics.adjusted_rand_score(adata.obs['kmeans'], histology_kmeans)
adj_mut_info = metrics.adjusted_mutual_info_score(adata.obs['kmeans'], histology_kmeans)
print(mut_info, adj_rand, adj_mut_info)

In [None]:
sc.pl.spatial(data_SME,color=['kmeans_stlearn'],size=1.5)

In [None]:
# Adjusted Rand index per method on the full annotation (hard-coded values).
f = {
    'CellPie_int': [0.3430071350730446],
    'CellPie_0': [0.25486035301123516],
    'kmeans': [0.2667040151343367],
    'SpaGCN': [0.2616348436322739],
    'stLearn': [0.39077121051876296],
}

In [None]:
score_comp = pd.DataFrame(data=f)

In [None]:
score_comp.index = ['ARI']

In [None]:
score_comp

In [None]:
import seaborn as sns
plot=sns.barplot(data=score_comp,orient='h')
plot.set_xlabel( "ARI")

In [None]:
list1 = ['Gleason 3','Gleason 4']
histo_2_new = histo_2[histo_2['Histology'].isin(list1)]

In [None]:
histo_2_new

In [None]:
# Restrict to the selected histology classes. Copy so the .obs/.uns
# assignments in later cells act on an independent object, not a view of `adata`.
adata_new = adata[adata.obs_names.isin(histo_2_new['Barcode']), :].copy()

In [None]:
data_SME_new = data_SME[data_SME.obs_names.isin(histo_2_new['Barcode']),:]
adata_spg_new = adata_spg[adata_spg.obs_names.isin(histo_2_new['Barcode']),:]

In [None]:
adj_rand_stlearn=metrics.adjusted_rand_score(data_SME_new.obs['kmeans_stlearn'],adata_new.obs['path_anot'])
print(adj_rand_stlearn)

In [None]:
adata_new.obs['SpaGCN_clusters']=adata_spg_new.obs['SpaGCN_clusters'].values
adata_new.obs['stLearn_clusters']=data_SME_new.obs['kmeans_stlearn'].values

In [None]:
adj_rand_spag=metrics.adjusted_rand_score(adata_spg_new.obs['SpaGCN_clusters'],adata_new.obs['path_anot'])
print(adj_rand_spag)

In [None]:
# mut_info=metrics.fowlkes_mallows_score(adata_new.obs['kmeans_CellPie_0'],adata_new.obs['path_anot'])
adj_rand_0=metrics.adjusted_rand_score(adata_new.obs['kmeans_CellPie_0'],adata_new.obs['path_anot'])
# adj_mut_info=metrics.adjusted_mutual_info_score(adata_new.obs['kmeans_CellPie_0'],adata_new.obs['path_anot'])
print(adj_rand_0)

In [None]:
# Adjusted Rand index of each clustering against the pathology annotation
# on the subset held in `adata_new`.
truth = adata_new.obs['path_anot']

adj_rand_int = metrics.adjusted_rand_score(adata_new.obs['kmeans_CellPie_int'], truth)
print(adj_rand_int)

# Scores the same column ('kmeans_CellPie_0') as `adj_rand_0` in the previous cell.
adj_rand_tr = metrics.adjusted_rand_score(adata_new.obs['kmeans_CellPie_0'], truth)
print(adj_rand_tr)

adj_rand_kmeans = metrics.adjusted_rand_score(adata_new.obs['kmeans'], truth)
print(adj_rand_kmeans)

In [None]:
# Adjusted Rand index per method on the histology subset (hard-coded values).
gl = {
    'CellPie_int': [0.16703789839046385],
    'CellPie_0': [0.16336514905292712],
    'kmeans': [-0.016692173989926708],
    'SpaGCN': [0.02691286576524496],
    'stLearn': [-0.026170964381168647],
}

In [None]:
adata_new.uns['SpaGCN_clusters_colors'] = ['#279e68','#d62728', '#1f77b4', '#ff7f0e']

In [None]:
adata_new.uns['stLearn_clusters_colors'] = ['#1f77b4','#279e68','#ff7f0e','#d62728']

In [None]:
# NOTE(review): no .obs column named 'kmeans_CellPie_tr' is created in the visible cells
# (the plotted column is 'kmeans_CellPie_0'), so this palette may have no effect; it also
# lists '#279e68' twice - confirm the intended key and colours.
adata_new.uns['kmeans_CellPie_tr_colors'] = [ '#279e68', '#d62728','#279e68', '#aa40fc']

In [None]:
sc.pl.spatial(adata_new,
              color=['path_anot','kmeans_CellPie_int','kmeans','kmeans_CellPie_0','SpaGCN_clusters','stLearn_clusters'],
              size=1.5,ncols=3)


In [None]:
score_comp_g = pd.DataFrame(data=gl)
score_comp_g.index = ['ARI']

In [None]:
plot=sns.barplot(data=score_comp_g,orient='h')
plot.set_xlabel( "ARI")

In [None]:
adata_new.obs.to_csv('adata_prostate_reprod.csv')

In [None]:
adata.write('adata_prostate_repro.h5ad')