In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from math import ceil
import squidpy as sq
import scanpy as sc
from scanpy import read_h5ad
import sys
sys.path.append("/Users/user/CellPie/CellPie/") # go to parent dir
import cellpie_main as cp
from cellpie_main import intNMF

In [None]:
# Single seed handed to every stochastic step below (intNMF, KMeans, FactorAnalysis).
random_state = 123

In [None]:
# Tab-separated annotation table; later cells read its 'Row.names', 'x', 'y' and 'label' columns.
histo_2 = pd.read_csv(
    'H1_labeled_coordinates.tsv',
    sep='\t',
)
# Preview only: set_index returns a new frame, so histo_2 itself is not modified here.
histo_2.set_index('Row.names')

In [None]:
# Load the AnnData object for this section (read_h5ad is the same function imported from scanpy above).
adata = read_h5ad('her2_h1.h5ad')

In [None]:
# Re-key the annotation table with "<x>x<y>" strings built from the rounded spot
# coordinates, then attach the labels to adata.obs (pandas aligns on that index).
# NOTE(review): presumably adata.obs_names use the same "<x>x<y>" convention — confirm.
#
# Round/cast each coordinate column once instead of once per row; the previous
# per-row loop redid the whole-column conversion on every iteration.
spot_x = histo_2['x'].round().astype(np.int64)
spot_y = histo_2['y'].round().astype(np.int64)
new_index_values = ["{0}x{1}".format(x, y) for x, y in zip(spot_x, spot_y)]

histo_2.index = new_index_values
adata.obs['path_labels'] = histo_2['label']

In [None]:
# Keep only the spots that have an annotation row. Slicing an AnnData returns a view;
# .copy() materialises it so that the later writes to adata.obs / adata.uns act on a
# real object rather than forcing an implicit copy of the view.
adata = adata[adata.obs_names.isin(histo_2.index), :].copy()

In [None]:
# Spot-scale grid passed to extract_features: starts at 0.1, step 1, stops before 4.
scale_range = np.arange(start=0.1, stop=4, step=1)

In [None]:
from sklearn import metrics
from sklearn.cluster import KMeans

# Score accumulator; the scoring cell further down re-initialises it before use.
res = list()

In [None]:
from feature_extr import extract_features

# Image features from the H&E image, one pass per spot scale in scale_range.
# NOTE(review): the return value is discarded, so this presumably updates adata in place — confirm.
extract_features(
    adata,
    img_path='HE_BT24044_D2.jpg',
    spot_scale=scale_range,
)

In [None]:
from cp_utils import preprocess_data

# NOTE(review): return value is discarded, so this presumably filters/normalises adata
# in place (min_cells=100) — confirm against cp_utils.
preprocess_data(adata, min_cells=100)

In [None]:
# Same annotation under a display-friendly column name; the plotting cells use 'Histology'.
adata.obs['Histology'] = adata.obs['path_labels']

In [None]:
from cp_utils import model_selection_alpha

# Candidate numbers of factors: 1 through 29.
k = np.arange(1, 30)
mod_sel = model_selection_alpha(
    adata,
    k,
    random_state=random_state,
)

In [None]:
# Sweep mod1_skew from 0 to 2 in steps of 0.1. For each value: fit a 5-factor intNMF,
# cluster its theta matrix into 6 k-means groups, and score that partition against the
# pathologist labels. Each tuple in `mod` is
# (skew, Fowlkes-Mallows, adjusted Rand, adjusted mutual information).
mod = []

for skew in np.arange(0, 2.01, 0.1):
    nmf_model = intNMF(
        adata, 5, epochs=50, init='nndsvd',
        random_state=random_state, mod1_skew=skew,
    )
    nmf_model.fit(adata)
    kmeans_cellpie = KMeans(
        n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
    ).fit(nmf_model.theta[:, :])
    adata.obs['CellPie'] = kmeans_cellpie.labels_.astype(str)

    cluster_labels = adata.obs['CellPie']
    path_labels = adata.obs['path_labels']
    mod.append((
        skew,
        metrics.fowlkes_mallows_score(cluster_labels, path_labels),
        metrics.adjusted_rand_score(cluster_labels, path_labels),
        metrics.adjusted_mutual_info_score(cluster_labels, path_labels),
    ))

In [None]:
# Columns of score_mod: 0 = modality weight, 1..3 = the three agreement scores.
score_mod = pd.DataFrame(mod)
weights = score_mod[0]
for column, colour, legend_name in (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
):
    plt.plot(weights, score_mod[column], color=colour, label=legend_name)
plt.xlabel("Modality Weight")
plt.ylabel("Score")
plt.legend(prop={'size': 9})

In [None]:
# Final CellPie model at a fixed modality weight of 1.9, followed by 6-cluster k-means on theta.
# NOTE(review): 1.9 is presumably the best value from the sweep above — confirm.
nmf_model = intNMF(
    adata, 5, epochs=50, init='nndsvd',
    random_state=random_state, mod1_skew=1.9,
)
nmf_model.fit(adata)
kmeans_cellpie = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(nmf_model.theta[:, :])
adata.obs['CellPie'] = kmeans_cellpie.labels_.astype(str)

In [None]:
from sklearn.decomposition import FactorAnalysis

# Baseline: 4-component factor analysis of adata.X, then 6-cluster k-means on the scores.
transformer = FactorAnalysis(n_components=4, random_state=random_state)
X_transformed = transformer.fit_transform(adata.X)
kmeans = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(X_transformed)
adata.obs['FA'] = kmeans.labels_.astype(str)

In [None]:
# Precomputed MEFISTO factors, re-ordered to the spot order of adata before clustering.
# reindex inserts NaN rows for spots absent from the CSV.
mef = pd.read_csv('../range_benchmark/factors_mefisto_her2_4_bench.csv', index_col=0)
mef = mef.reindex(adata.obs.index)

kmeans = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(mef.iloc[:, :])
adata.obs['MEFISTO'] = kmeans.labels_.astype(str)


In [None]:
# Precomputed NSF factors, aligned to adata's spot order, then 6-cluster k-means.
nsf = pd.read_csv('../range_benchmark/factors_nsf_her2_poi_4_bench.csv', index_col=0)
nsf = nsf.reindex(adata.obs.index)
kmeans = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(nsf.iloc[:, :])
adata.obs['NSF'] = kmeans.labels_.astype(str)

In [None]:
# Precomputed NSFH factors: non-spatial columns first, spatial columns after, joined
# column-wise and aligned to adata's spot order before 6-cluster k-means.
# NOTE(review): the non-spatial file name contains "her2_prostate" — confirm it is the
# intended file for this HER2 sample.
nsfh_s = pd.read_csv('../range_benchmark/spatialfactors_nsfh_her2_poi_4_bench.csv', index_col=0)
nsfh_ns = pd.read_csv('../range_benchmark/nonspatialfactors_her2_prostate_poi_4_bench.csv', index_col=0)
nsfh = pd.concat([nsfh_ns, nsfh_s], axis=1).reindex(adata.obs.index)
kmeans = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(nsfh.iloc[:, :])
adata.obs['NSFH'] = kmeans.labels_.astype(str)


In [None]:
# Precomputed PNMF factors, aligned to adata's spot order, then 6-cluster k-means.
pnmf = pd.read_csv('../range_benchmark/factors_pnmf_her2_poi_6_bench.csv', index_col=0)
pnmf = pnmf.reindex(adata.obs.index)

kmeans = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(pnmf.iloc[:, :])
adata.obs['PNMF'] = kmeans.labels_.astype(str)

In [None]:
# Second CellPie fit with mod1_skew=2 (the upper end of the sweep), clustered the same way
# and stored under 'CellPie_0' for comparison.
nmf_model_0 = intNMF(
    adata, 5, epochs=50, init='nndsvd',
    random_state=random_state, mod1_skew=2,
)
nmf_model_0.fit(adata)
kmeans_cp_0 = KMeans(
    n_clusters=6, init='k-means++', max_iter=500, random_state=random_state,
).fit(nmf_model_0.theta[:, :])
adata.obs['CellPie_0'] = kmeans_cp_0.labels_.astype(str)

In [None]:
# Per-method cluster palettes read by scanpy from adata.uns['<key>_colors']. Every list is
# a permutation of the same six colours (NSFH swaps one for cyan) so that visually
# corresponding clusters share a colour across panels.
adata.uns['CellPie_colors'] = ['#d62728','#9467bd','#2ca02c','#1f77b4','#ff7f0e','#8c564b']

# The NSF palette previously listed '#ff7f0e' twice, which made two clusters
# indistinguishable; the repeated entry now uses the one colour that was missing.
# TODO(review): confirm which of the two orange clusters should be brown.
adata.uns['NSF_colors'] = ['#1f77b4','#2ca02c','#ff7f0e','#d62728','#8c564b','#9467bd']

adata.uns['NSFH_colors'] = ['#1f77b4','#9467bd','#ff7f0e','#8c564b','#00e6e6','#2ca02c']

adata.uns['MEFISTO_colors'] = ['#1f77b4','#2ca02c','#ff7f0e','#9467bd','#8c564b','#d62728']

adata.uns['FA_colors'] = ['#d62728','#2ca02c','#1f77b4','#8c564b','#ff7f0e','#9467bd']

adata.uns['PNMF_colors'] = ['#d62728','#2ca02c','#8c564b','#1f77b4','#ff7f0e','#9467bd']

adata.uns['CellPie_0_colors'] = ['#d62728','#2ca02c','#ff7f0e','#9467bd','#8c564b','#1f77b4']

In [None]:
# Histology annotation followed by each method's k-means clusters, four panels per row.
cluster_keys = ['Histology', 'CellPie', 'NSF', 'NSFH', 'MEFISTO', 'FA', 'PNMF', 'CellPie_0']
sc.pl.spatial(adata, color=cluster_keys, size=4, ncols=4)
# Add save='her2_kmeans_revision.png' to the call above to write the figure to disk.

In [None]:
# TLS score table; the first CSV column becomes the index.
tls = pd.read_csv(
    'Benchmark/tls_score_H1.csv',
    index_col=0,
)

In [None]:
# Column '0' holds the score. Assignment aligns on the index, so spots missing from
# `tls` end up as NaN in adata.obs['tls'].
adata.obs['tls'] = tls.loc[:, '0']

In [None]:
import matplotlib as mpl

# Temporary rc overrides (figure size, black axes background) for this one figure.
tls_rc = {'figure.figsize': (5, 6), 'axes.facecolor': 'black'}
with mpl.rc_context(tls_rc):
    sc.pl.spatial(adata, color='tls', size=4, alpha_img=0.2, save='tls.png')

In [None]:
def _pearson_row(reference, columns):
    """Return a (1, len(columns)) array of Pearson r between `reference` and each column."""
    row = np.zeros((1, len(columns)))
    for col_idx, column in enumerate(columns):
        # corrcoef returns the 2x2 correlation matrix; [1][0] is the off-diagonal entry.
        row[0, col_idx] = np.corrcoef(reference, column)[1][0]
    return row


# Despite the `mi_` prefix these hold Pearson correlations between the TLS score and the
# leading factors of each method (5 for CellPie, 6 for PNMF, 4 for the rest).
# NOTE(review): only the first 4 columns of `nsfh` are used — confirm that is all of them.
mi_scores = _pearson_row(adata.obs['tls'], [nmf_model.theta[:, j] for j in range(5)])
mi_scores_nsf = _pearson_row(adata.obs['tls'], [nsf.iloc[:, j] for j in range(4)])
mi_scores_nsfh = _pearson_row(adata.obs['tls'], [nsfh.iloc[:, j] for j in range(4)])
mi_scores_mef = _pearson_row(adata.obs['tls'], [mef.iloc[:, j] for j in range(4)])
mi_scores_fa = _pearson_row(adata.obs['tls'], [X_transformed[:, j] for j in range(4)])
mi_scores_pnmf = _pearson_row(adata.obs['tls'], [pnmf.iloc[:, j] for j in range(6)])

In [None]:
# Tick labels 'Factor_1' .. 'Factor_5' for the five CellPie factors.
factors = ['Factor_' + str(n) for n in range(1, 6)]

In [None]:
import seaborn as sns

# One-row heatmap of the TLS/CellPie-factor correlations.
plt.figure(figsize=(33, 1))
plt.rcParams["axes.grid"] = False
heatmap_kwargs = dict(
    annot=True,
    cmap='viridis',
    xticklabels=factors,
    annot_kws={"fontsize": 35},
)
sns.heatmap(mi_scores, **heatmap_kwargs)
# sns.set is called after the heatmap is drawn, so the font scale applies to the
# title/axis labels set below and to later figures, not to the heatmap ticks above.
sns.set(font_scale=2.5)
plt.title('Pearson Correlation Between TLS score and CellPie Factors')
plt.xlabel('Factors')
plt.ylabel('TLS')
plt.show()

In [None]:
# One-row heatmap of the TLS/NSF-factor correlations.
# Tick labels are derived from the matrix width: the shared 5-entry `factors` list does
# not match this 4-column matrix and produced a spurious 'Factor_5' tick.
nsf_factor_labels = ['Factor_' + str(j + 1) for j in range(mi_scores_nsf.shape[1])]
plt.figure(figsize=(33, 1))
plt.rcParams["axes.grid"] = False
sns.heatmap(mi_scores_nsf, annot=True, cmap='viridis',
            xticklabels=nsf_factor_labels, annot_kws={"fontsize": 35})
sns.set(font_scale=2.5)
plt.title('Pearson Correlation Between TLS score and NSF Factors')
plt.xlabel('Factors')
plt.ylabel('TLS')
plt.show()

In [None]:
# One-row heatmap of the TLS/NSFH-factor correlations.
# Tick labels are derived from the matrix width: the shared 5-entry `factors` list does
# not match this 4-column matrix and produced a spurious 'Factor_5' tick.
nsfh_factor_labels = ['Factor_' + str(j + 1) for j in range(mi_scores_nsfh.shape[1])]
plt.figure(figsize=(33, 1))
plt.rcParams["axes.grid"] = False
sns.heatmap(mi_scores_nsfh, annot=True, cmap='viridis',
            xticklabels=nsfh_factor_labels, annot_kws={"fontsize": 35})
sns.set(font_scale=2.5)
plt.title('Pearson Correlation Between TLS score and NSFH Factors')
plt.xlabel('Factors')
plt.ylabel('TLS')
plt.show()

In [None]:
# One-row heatmap of the TLS/FA-factor correlations.
# Tick labels are derived from the matrix width: the shared 5-entry `factors` list does
# not match this 4-column matrix and produced a spurious 'Factor_5' tick.
fa_factor_labels = ['Factor_' + str(j + 1) for j in range(mi_scores_fa.shape[1])]
plt.figure(figsize=(33, 1))
plt.rcParams["axes.grid"] = False
sns.heatmap(mi_scores_fa, annot=True, cmap='viridis',
            xticklabels=fa_factor_labels, annot_kws={"fontsize": 35})
sns.set(font_scale=2.5)
plt.title('Pearson Correlation Between TLS score and FA Factors')
plt.xlabel('Factors')
plt.ylabel('TLS')
plt.show()

In [None]:
# One-row heatmap of the TLS/PNMF-factor correlations.
# Tick labels are derived from the matrix width: the shared 5-entry `factors` list left
# the sixth PNMF column without a label.
pnmf_factor_labels = ['Factor_' + str(j + 1) for j in range(mi_scores_pnmf.shape[1])]
plt.figure(figsize=(33, 1))
plt.rcParams["axes.grid"] = False
sns.heatmap(mi_scores_pnmf, annot=True, cmap='viridis',
            xticklabels=pnmf_factor_labels, annot_kws={"fontsize": 35})
sns.set(font_scale=2.5)
plt.title('Pearson Correlation Between TLS score and PNMF Factors')
plt.xlabel('Factors')
plt.ylabel('TLS')
plt.show()

In [None]:
# One-row heatmap of the TLS/MEFISTO-factor correlations.
# Tick labels are derived from the matrix width: the shared 5-entry `factors` list does
# not match this 4-column matrix and produced a spurious 'Factor_5' tick.
mef_factor_labels = ['Factor_' + str(j + 1) for j in range(mi_scores_mef.shape[1])]
plt.figure(figsize=(33, 1))
plt.rcParams["axes.grid"] = False
sns.heatmap(mi_scores_mef, annot=True, cmap='viridis',
            xticklabels=mef_factor_labels, annot_kws={"fontsize": 35})
sns.set(font_scale=2.5)
plt.title('Pearson Correlation Between TLS score and MEFISTO Factors')
plt.xlabel('Factors')
plt.ylabel('TLS')
plt.show()

In [None]:
from sklearn import metrics

# Score every method's k-means partition against the pathologist labels.
l_t = ['CellPie','MEFISTO','PNMF','NSF','NSFH','FA', 'CellPie_0']
res = []
# Use a separate loop variable: `for l_t in l_t` rebound the list name to its last element.
for method in l_t:
    fm_score = metrics.fowlkes_mallows_score(adata.obs[method], adata.obs['path_labels'])
    adj_rand = metrics.adjusted_rand_score(adata.obs[method], adata.obs['path_labels'])
    adj_mut_info = metrics.adjusted_mutual_info_score(adata.obs[method], adata.obs['path_labels'])
    res.append((method, fm_score, adj_rand, adj_mut_info))
# The second column is the Fowlkes-Mallows score; it was previously mislabelled 'Mutual Info'.
res_df = pd.DataFrame(
    res,
    columns=['Method', 'Fowlkes Mallows Score', 'Adjusted Rand Index', 'Adjusted Mutual Info'],
)

In [None]:
# Display the per-method score table built in the previous cell.
res_df

In [None]:
# Repeated display of the same score table.
res_df

In [None]:
import seaborn as sns

# Adjusted Rand index per method; bars are coloured by their own value (hue == y).
plot = sns.barplot(data=res_df, x='Method', y='Adjusted Rand Index',
                   hue='Adjusted Rand Index', width=0.6, legend=False)
# Resize the tick labels via tick_params instead of re-setting the label objects, which
# avoids matplotlib's "set_ticklabels() should only be used with a fixed number of ticks" warning.
plot.tick_params(axis='x', labelsize=10)
plt.title('ARI-kmeans Clustering')
plt.grid(False)

In [None]:
# `k` is rebound here from the model-selection range to the number of factors to plot.
k = 5
from re import sub
import matplotlib as mpl

# obs columns 'Factor_1' .. 'Factor_k', drawn one per row on a black background.
# NOTE(review): these columns are not created in this notebook; presumably intNMF.fit
# writes them to adata.obs. If so they reflect the most recent fit (nmf_model_0,
# mod1_skew=2), not `nmf_model` used for the correlations and enrichment — confirm.
sel_clust = ['Factor_' + str(idx) for idx in range(1, k + 1)]
factor_rc = {'figure.figsize': (5, 6), 'axes.facecolor': 'black'}
with mpl.rc_context(factor_rc):
    sc.pl.spatial(
        adata,
        color=sel_clust,
        cmap='magma',
        ncols=1,
        size=4,
        img_key='hires',
        alpha_img=0,
    )

In [None]:
import gseapy as gp
def gene_er(topic):
    """Run an Enrichr query for one gene list.

    Parameters
    ----------
    topic : list of str
        Gene symbols passed to gseapy as ``gene_list``.

    Returns
    -------
    The object returned by ``gp.enrichr`` for the 'GO_Biological_Process_2023'
    gene set, with organism 'human', cutoff 0.5 and ``outdir=None``.
    """
    return gp.enrichr(
        gene_list=topic,
        gene_sets=['GO_Biological_Process_2023'],
        cutoff=0.5,
        organism='human',
        outdir=None,
    )

In [None]:
# Gene table from the fitted model's expression loadings.
# NOTE(review): presumably genes are rows and factors are columns, since l.T[i] is sorted per factor — confirm.
l = cp.get_genes_topic(adata, nmf_model.phi_expr)


def _top_genes(factor_idx, n_top=150):
    """Return the `n_top` genes with the largest values for factor `factor_idx`."""
    ranked = l.T[factor_idx].sort_values(ascending=False)
    return ranked.index[0:n_top].to_list()


# One Enrichr query per factor, in factor order 0..4.
enr0, enr1, enr2, enr3, enr4 = (gene_er(_top_genes(f)) for f in range(5))

In [None]:
# Enrichment bar plot for factor 3 (index 2).
gp.barplot(enr2.res2d, title='GO_Biological_Process_2023 Factor 3', color=['darkred'])
# grid(None) toggles the grid, so the outcome depended on the current rc state;
# grid(False) always switches it off.
plt.grid(False)

In [None]:
# Enrichment bar plot for factor 2 (index 1).
gp.barplot(enr1.res2d, title='GO_Biological_Process_2023 Factor 2', color=['darkred'])
# grid(None) toggles the grid, so the outcome depended on the current rc state;
# grid(False) always switches it off.
plt.grid(False)

In [None]:
methods = ['CellPie', 'NSF','NSFH','PNMF','FA','MEFISTO']

# One heatmap per method: histology label (rows) against cluster id (columns), with each
# row normalised to sum to 1.
for method in methods:
    plt.figure(figsize=(12, 8))
    contingency = pd.crosstab(
        adata.obs['Histology'].values,
        adata.obs[method].values,
        normalize='index',
    )
    sns.heatmap(
        contingency,
        cmap=sns.color_palette("crest", as_cmap=True),
        annot=True,
    )
    plt.title(f'Contingency Table {method}')
    # grid(None) toggles the grid rather than disabling it; grid(False) is explicit.
    plt.grid(False)
    plt.show()