In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os import path
from math import ceil
from scanpy import read_h5ad
from tensorflow_probability import math as tm
tfk = tm.psd_kernels
from sklearn.cluster import KMeans
from sklearn import metrics

In [None]:
from models import cf,sf,sfh
from models.mefisto import MEFISTO
from utils import preprocess,training,misc,visualize,postprocess

In [None]:
import squidpy as sq
import scanpy as sc

In [None]:
# Shared seed handed to the KMeans and FactorAnalysis calls in the evaluation cells.
random_state=123

In [None]:
# Directory that receives the per-model factor CSVs written by the training loops.
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
folder_name = "range_benchmark"
os.makedirs(folder_name, exist_ok=True)

In [None]:
# Load the labelled spot-coordinate table (tab-separated).
histo_2 = pd.read_csv('../H1_labeled_coordinates.tsv',sep='\t')
# set_index returns a new frame and the result is not assigned, so histo_2 is
# unchanged; as the last expression of the cell it only renders a preview.
histo_2.set_index('Row.names')

In [None]:
# Read the AnnData object for this sample from an .h5ad file one directory up.
adata=sc.read_h5ad('../her2_h1.h5ad')

In [None]:
# Re-key the annotation table by "<x>x<y>" identifiers built from the rounded
# integer coordinates, then attach the labels to adata.obs.
# The columns are rounded and cast once (the previous loop redid that work for
# every row), and the intermediate 'Row.names' index is dropped because it was
# overwritten immediately afterwards.
x_int = histo_2['x'].round().astype(np.int64)
y_int = histo_2['y'].round().astype(np.int64)
new_index_values = ["{0}x{1}".format(x, y) for x, y in zip(x_int, y_int)]

histo_2.index = new_index_values
# Index-aligned assignment: spots of adata without a matching identifier get NaN.
# Assumes adata.obs_names use the same "<x>x<y>" format -- TODO confirm.
adata.obs['path_labels'] = histo_2['label']

In [None]:
# Floating-point dtype name.
dtp = "float32"
# change this to your path
# pth: location of the preprocessed .h5ad files; mpth: base directory handed to
# generate_pickle_path for model pickles. Both point to the same directory.
pth = mpth = '/Users/user/her2st_H/'

In [None]:
import random

# NOTE: `ad` is an alias of `adata`, not a copy, so the in-place scanpy calls
# below also modify `adata`.
ad = adata

# %% filtering and normalization
sc.pp.filter_genes(ad, min_cells=1)
sc.pp.filter_cells(ad, min_counts=100)
ad.layers = {"counts":ad.X.copy()} #store raw counts before normalization changes ad.X
sc.pp.normalize_total(ad, inplace=True, layers=None, key_added="sizefactor")
sc.pp.log1p(ad)

# %% feature selection: order genes by decreasing Poisson deviance
ad.var['deviance_poisson'] = preprocess.deviancePoisson(ad.layers["counts"])
o = np.argsort(-ad.var['deviance_poisson'])

# Shuffle the spot order with a dedicated, seeded generator so that the written
# files (and everything fitted from them) are reproducible across kernel restarts.
# The previous unseeded random.shuffle gave a different order on every run.
idx = list(range(ad.shape[0]))
random.Random(random_state).shuffle(idx)
ad = ad[idx,o]

# Persist the full gene set and the 2000 highest-deviance genes under `pth`.
ad.write_h5ad(path.join(pth,"her2_nsf.h5ad"),compression="gzip")
ad2 = ad[:,:2000]
ad2.write_h5ad(path.join(pth,"her2_nsf_J2000.h5ad"),compression="gzip")

In [None]:
# Reload the J-gene subset written by the preprocessing cell and build the data
# dictionaries used by the model-fitting loops.
J = 2000
ad = read_h5ad(path.join(pth,"her2_nsf_J{}.h5ad".format(J)))#[:,:J]
# train_frac=1: presumably every spot lands in the training split -- TODO confirm in preprocess.
Dtr,Dval = preprocess.anndata_to_train_val(ad,train_frac=1,layer="counts",sz="scanpy")
Dtr_n,Dval_n = preprocess.anndata_to_train_val(ad,train_frac=1) #normalized data
fmeans,Dtr_c,Dval_c = preprocess.center_data(Dtr_n,Dval_n) #centered features
Xtr = Dtr["X"] #note this should be identical to Dtr_n["X"]
Ntr = Xtr.shape[0]
# NOTE(review): Dtf, Dtf_n and Dtf_c are built here but not referenced by any
# later cell visible in this notebook.
Dtf = preprocess.prepare_datasets_tf(Dtr,Dval=Dval,shuffle=False)
Dtf_n = preprocess.prepare_datasets_tf(Dtr_n,Dval=Dval_n,shuffle=False)
Dtf_c = preprocess.prepare_datasets_tf(Dtr_c,Dval=Dval_c,shuffle=False)
# Heat map over the coordinates Xtr, coloured by the first column of the count matrix.
visualize.heatmap(Xtr,Dtr["Y"][:,0],marker="D",s=15)

In [None]:
#%% Visualize raw data
# log1p of the count matrix, restricted to its first 50 rows and 100 columns.
# numpy is already imported in the first cell, so the redundant re-import that
# used to sit here was removed.
plt.imshow(np.log1p(Dtr["Y"])[:50,:100],cmap="Blues")

In [None]:
#%% Visualize inducing points
# Request 500 inducing points from the k-means helper and overlay them (red)
# on the spot coordinates (diamonds).
Z = misc.kmeans_inducing_pts(Xtr, 500)
fig, ax = plt.subplots(figsize=(12, 10))
spot_x, spot_y = Xtr[:, 0], Xtr[:, 1]
ax.scatter(spot_x, spot_y, marker="D", s=50)
inducing_x, inducing_y = Z[:, 0], Z[:, 1]
ax.scatter(inducing_x, inducing_y, c="red", s=30)

In [None]:
# %% initialize inducing points and tuning parameters
# Z is recomputed here with the same arguments as in the plotting cell above.
Z = misc.kmeans_inducing_pts(Xtr, 500)
# NOTE(review): M and S are not referenced by any later cell visible here
# (the MEFISTO cell passes the literal 500, the interpret_* calls pass their own S).
M = Z.shape[0]
# Kernel class handed to the spatial models through psd_kernel=ker.
ker = tfk.MaternThreeHalves
S = 3 #samples for elbo approximation

In [None]:
# Factor counts to benchmark: every integer from 4 up to and including 60.
L = np.arange(4, 61, 1)
L

In [None]:
# NSF: Spatial only with non-negative factors.
# One model is built per factor count in L; the top-loading gene of each factor
# is plotted and the factor matrix is written to range_benchmark/.

# Heat-map styling does not depend on the factor count, so it is defined once.
hmkw = {"figsize":(4,4), "s":0.3, "marker":"D", "subplot_space":0,
        "spinecolor":"white"}

for col in L:
    fit = sf.SpatialFactorization(J,col,Z,psd_kernel=ker,nonneg=True,lik="poi")
    fit.elbo_avg(Xtr,Dtr["Y"],sz=Dtr["sz"])
    fit.init_loadings(Dtr["Y"],X=Xtr,sz=Dtr["sz"])
    fit.elbo_avg(Xtr,Dtr["Y"],sz=Dtr["sz"])
    pp = fit.generate_pickle_path("scanpy",base=mpth)
    tro = training.ModelTrainer(fit,pickle_path=pp)
    # NOTE(review): `tro` is never used and no training call is visible in this
    # cell, so interpret_nsf below sees a model that was only initialised via
    # init_loadings. Confirm whether a training step (upstream NSF demos call
    # `tro.train_model(*Dtf)`, possibly behind a stripped `%time` magic) is missing.

    insf = postprocess.interpret_nsf(fit,Xtr,S=10,lda_mode=False)

    # For every factor pick the gene with the largest loading and plot its
    # size-factor-scaled counts.
    W = insf["loadings"]
    topgenes = W.argmax(axis=0).tolist()
    tgnames = ad.var.index[topgenes]
    Ytg = Dtr["Y"][:,topgenes]/Dtr["sz"]
    # NOTE(review): the panel grid is fixed at (4,3) although `col` runs up to 60;
    # verify how multiheatmap handles more columns than panels.
    fig,axes=visualize.multiheatmap(Xtr, np.sqrt(Ytg), (4,3), **hmkw)

    # Loadings rescaled by the per-gene totals; kept for interactive inspection
    # (not written to disk). The duplicate computation of this frame was removed.
    Wdf=pd.DataFrame(W*insf["totals"][:,None], index=ad.var.index, columns=range(1,col+1))

    # NOTE(review): NSF factors are square-root transformed before saving, whereas
    # the PNMF and NSFH cells save their factors untransformed -- confirm this
    # asymmetry is intended for the clustering benchmark.
    pd.DataFrame(np.sqrt(insf["factors"]),index = ad.obs_names).to_csv(f'range_benchmark/factors_nsf_her2_poi_{col}_bench.csv')

In [None]:
#%% PNMF: Non-spatial, nonnegative
# One Poisson count-factorization model per factor count in L; the factors
# returned by interpret_pnmf are written to range_benchmark/.
# Unused per-iteration locals (plot title, heat-map kwargs, factor names) and the
# commented-out plotting code were removed.
for col in L:
    fit = cf.CountFactorization(Ntr, J, col, lik="poi", nonneg=True)
    fit.elbo_avg(Dtr["Y"],sz=Dtr["sz"],idx=Dtr["idx"])
    fit.init_loadings(Dtr["Y"],sz=Dtr["sz"])
    pp = fit.generate_pickle_path("scanpy",base=mpth)
    tro = training.ModelTrainer(fit,pickle_path=pp)
    # NOTE(review): `tro` is never used and no training call is visible here, so
    # the factors below come from a model that was only initialised. Confirm
    # whether a training step is missing.

    #%% Postprocess
    ipnmf = postprocess.interpret_pnmf(fit,S=8,lda_mode=False)
    pd.DataFrame(ipnmf["factors"],index = ad.obs_names).to_csv(f'range_benchmark/factors_pnmf_her2_poi_{col}_bench.csv')

In [None]:
#%% NSF Hybrid object
# One hybrid (spatial + non-spatial) model per factor count in L; both factor
# matrices are written to range_benchmark/.
# The unused plot title and heat-map kwargs were removed.
for col in L:
    fit = sfh.SpatialFactorizationHybrid(Ntr, J, col, Z, lik="poi", nonneg=True,
                                       psd_kernel=ker)
    fit.elbo_avg(Dtr["X"],Dtr["Y"],Dtr["idx"])
    fit.init_loadings(Dtr["Y"],X=Dtr["X"])
    pp = fit.generate_pickle_path("scanpy",base=mpth)
    tro = training.ModelTrainer(fit,pickle_path=pp)
    # NOTE(review): `tro` is never used and no training call is visible here, so
    # the factors below come from a model that was only initialised. Confirm
    # whether a training step is missing.

    insfh = postprocess.interpret_nsfh(fit,Xtr,S=10,lda_mode=False)

    # Loadings rescaled by the per-gene totals; kept for interactive inspection
    # only (neither frame is written to disk).
    Ws = insfh['spatial']["loadings"]
    Wdfs=pd.DataFrame(Ws*insfh["totals"][:,None], index=ad.var.index)

    Wns = insfh['nonspatial']["loadings"]
    Wdfns=pd.DataFrame(Wns*insfh["totals"][:,None], index=ad.var.index)

    # The file names are read back verbatim by the NSFH scoring cell.
    pd.DataFrame(insfh['nonspatial']["factors"],index = ad.obs_names).to_csv(f'range_benchmark/nonspatialfactors_her2_prostate_poi_{col}_bench.csv')
    pd.DataFrame(insfh['spatial']["factors"],index = ad.obs_names).to_csv(f'range_benchmark/spatialfactors_nsfh_her2_poi_{col}_bench.csv')


In [None]:
# %% MEFISTO- Gaussian
# One MEFISTO model per factor count in L, fitted on the normalized data.
for col in L:
    # Give every model its own pickle location. Previously `pp` was whatever the
    # last NSFH iteration left behind, so all MEFISTO fits shared (and depended
    # on) another model's path.
    # Assumes MEFISTO treats pickle_path as a directory, like the paths produced
    # by generate_pickle_path for the other models -- TODO confirm.
    pp = path.join(mpth, "L{}".format(col), "mefisto")
    os.makedirs(pp, exist_ok=True)
    mef = MEFISTO(Dtr_n, col, inducing_pts=500, pickle_path=pp)

    #mef = MEFISTO.from_pickle(pp)
    ttl = "MEFISTO"
    dev_mef = visualize.gof(mef,Dtr,Dval=Dval,title=ttl)
    # Expected value of the "Z" node of the fitted model, one row per spot.
    pd.DataFrame(mef.ent.model.nodes["Z"].getExpectations()["E"],index = ad.obs_names).to_csv(f'range_benchmark/factors_mefisto_her2_{col}_bench.csv')

In [None]:
# Reload the labelled spot-coordinate table for the evaluation part of the notebook.
# NOTE(review): absolute, machine-specific path; an earlier cell reads a file with
# the same name from '../' -- confirm both refer to the same table.
histo_2 = pd.read_csv('/Users/user/CellPie/Paper_Notebooks/Paper_Notebookds/HER2/H1_labeled_coordinates.tsv',sep='\t')
# Not assigned: only renders a preview indexed by 'Row.names'; histo_2 is unchanged.
histo_2.set_index('Row.names')

In [None]:
# read ST data
# Fresh load from disk: the earlier preprocessing cell filtered and normalized the
# previous `adata` object in place through its alias `ad`.
adata=sc.read_h5ad('/Users/user/CellPie/Paper_Notebooks/Paper_Notebookds/HER2/her2_h1.h5ad')

In [None]:
# Re-key the annotation table by "<x>x<y>" identifiers built from the rounded
# integer coordinates, then attach the labels to adata.obs.
# The columns are rounded and cast once (the previous loop redid that work for
# every row), and the intermediate 'Row.names' index is dropped because it was
# overwritten immediately afterwards.
x_int = histo_2['x'].round().astype(np.int64)
y_int = histo_2['y'].round().astype(np.int64)
new_index_values = ["{0}x{1}".format(x, y) for x, y in zip(x_int, y_int)]

histo_2.index = new_index_values
# Index-aligned assignment: spots of adata without a matching identifier get NaN.
# Assumes adata.obs_names use the same "<x>x<y>" format -- TODO confirm.
adata.obs['path_labels'] = histo_2['label']

In [None]:
# Keep only the spots that appear in the annotation table. `.copy()` materialises
# the subset so that the later writes to adata.obs / adata.obsm act on a real
# AnnData object rather than on a view of the full data set.
adata = adata[adata.obs_names.isin(histo_2.index),:].copy()

In [None]:
# Empty accumulator (the per-method loops below fill their own res_* lists).
res = list()

In [None]:
# Expose the annotation labels under the column name used by the scoring cells.
adata.obs['Histology']=adata.obs['path_labels'] 

In [None]:
# Factor counts to evaluate: every integer from 4 up to and including 60.
L = np.arange(4, 61, 1)

In [None]:
# Cluster the saved NSF factors for every factor count and score the clustering
# against the histology labels.
# Each result row is (n_factors, Fowlkes-Mallows, adjusted Rand, adjusted MI).
res_nsf = []
for col in L:
    nsf = pd.read_csv(f'range_benchmark/factors_nsf_her2_poi_{col}_bench.csv',index_col=0)
    # Put the rows in adata's spot order; spots absent from the CSV become NaN rows.
    nsf_aligned = nsf.reindex(adata.obs.index)
    adata.obsm['nsf'] = nsf_aligned
    # NOTE(review): 6 clusters -- presumably the number of histology classes; confirm.
    kmeans_nsf = KMeans(n_clusters=6, init='k-means++',max_iter=500,random_state=random_state).fit(nsf_aligned)
    adata.obs['NSF']=kmeans_nsf.labels_.astype(str)

    # Score against the labels in adata.obs directly. The previous version appended
    # a 'Histology' column to the factor frame that had just been stored in
    # adata.obsm; nsf_aligned shares adata.obs' index, so the values are the same.
    histology = adata.obs['Histology']
    fowlkes_mallows=metrics.fowlkes_mallows_score(adata.obs['NSF'], histology)
    adj_rand=metrics.adjusted_rand_score(adata.obs['NSF'], histology)
    adj_mut_info=metrics.adjusted_mutual_info_score(adata.obs['NSF'], histology)
    res_nsf.append((col,fowlkes_mallows,adj_rand,adj_mut_info))

In [None]:
# Columns of score_nsf: 0 = number of factors, 1 = Fowlkes-Mallows,
# 2 = adjusted Rand index, 3 = adjusted mutual information.
score_nsf = pd.DataFrame(res_nsf)
nsf_curves = (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Index Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
)
for score_col, line_color, line_label in nsf_curves:
    plt.plot(score_nsf[0], score_nsf[score_col], color=line_color, label=line_label)
plt.xlabel("Number of Factors")
plt.ylabel("Score")
plt.title("NSF")
plt.legend(prop={'size': 9})

In [None]:
# Best adjusted Rand index over all factor counts (column 2 holds adj_rand).
score_nsf[2].max()

In [None]:
# Cluster the saved NSFH factors (non-spatial and spatial, side by side) for every
# factor count and score the clustering against the histology labels.
# Each result row is (n_factors, Fowlkes-Mallows, adjusted Rand, adjusted MI).
res_nsfh = []
# NOTE(review): this range stops at 58 although the NSFH training loop and all
# other methods cover 4..60 -- confirm whether 59 and 60 are excluded on purpose.
Lnsfh = np.arange(4,59)
for col in Lnsfh:
    nsfh_s = pd.read_csv(f'range_benchmark/spatialfactors_nsfh_her2_poi_{col}_bench.csv',index_col=0)
    nsfh_ns = pd.read_csv(f'range_benchmark/nonspatialfactors_her2_prostate_poi_{col}_bench.csv',index_col=0)
    # Non-spatial columns first, then spatial.
    nsfh = pd.concat([nsfh_ns,nsfh_s], axis=1)

    # Put the rows in adata's spot order (one reindex is enough; it used to be
    # applied twice). Spots absent from the CSVs become NaN rows.
    nsfh_aligned = nsfh.reindex(adata.obs.index)
    adata.obsm['nsfh'] = nsfh_aligned
    # NOTE(review): 6 clusters -- presumably the number of histology classes; confirm.
    kmeans_nsfh = KMeans(n_clusters=6, init='k-means++',max_iter=500,random_state=random_state).fit(nsfh_aligned)
    adata.obs['NSFH']=kmeans_nsfh.labels_.astype(str)

    # Score against the labels in adata.obs directly. The previous version appended
    # a 'Histology' column to the factor frame that had just been stored in
    # adata.obsm; nsfh_aligned shares adata.obs' index, so the values are the same.
    histology = adata.obs['Histology']
    fowlkes_mallows=metrics.fowlkes_mallows_score(adata.obs['NSFH'], histology)
    adj_rand=metrics.adjusted_rand_score(adata.obs['NSFH'], histology)
    adj_mut_info=metrics.adjusted_mutual_info_score(adata.obs['NSFH'], histology)
    res_nsfh.append((col,fowlkes_mallows,adj_rand,adj_mut_info))
    
    

In [None]:
# Columns of score_nsfh: 0 = number of factors, 1 = Fowlkes-Mallows,
# 2 = adjusted Rand index, 3 = adjusted mutual information.
score_nsfh = pd.DataFrame(res_nsfh)
nsfh_curves = (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Index Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
)
for score_col, line_color, line_label in nsfh_curves:
    plt.plot(score_nsfh[0], score_nsfh[score_col], color=line_color, label=line_label)
plt.xlabel("Number of Factors")
plt.ylabel("Score")
plt.title("NSFH")
plt.legend(prop={'size': 9})

In [None]:
# NOTE(review): this cell repeats the previous NSFH plotting cell statement for
# statement; it rebuilds score_nsfh from res_nsfh and draws the same three curves.
# Columns: 0 = number of factors, 1 = Fowlkes-Mallows, 2 = adjusted Rand index,
# 3 = adjusted mutual information.
score_nsfh = pd.DataFrame(res_nsfh)   
plt.plot(score_nsfh[0],score_nsfh[1],color="green",label='Fowlkes Mallows Score')
plt.plot(score_nsfh[0],score_nsfh[2],color="red",label='Adjusted Rand Index Score')
plt.plot(score_nsfh[0],score_nsfh[3],color="blue",label='Adjusted Mutual Info Score')
plt.xlabel("Number of Factors")
plt.ylabel("Score")
plt.title("NSFH")
plt.legend(prop={'size': 9})

In [None]:
# Best adjusted Rand index over all factor counts (column 2 holds adj_rand).
score_nsfh[2].max()

In [None]:
# Cluster the saved PNMF factors for every factor count and score the clustering
# against the histology labels.
# Each result row is (n_factors, Fowlkes-Mallows, adjusted Rand, adjusted MI).
res_pnmf = []
for col in L:
    pnmf = pd.read_csv(f'range_benchmark/factors_pnmf_her2_poi_{col}_bench.csv',index_col=0)

    # Put the rows in adata's spot order; spots absent from the CSV become NaN rows.
    pnmf_aligned = pnmf.reindex(adata.obs.index)
    adata.obsm['pnmf'] = pnmf_aligned
    # NOTE(review): 6 clusters -- presumably the number of histology classes; confirm.
    kmeans_pnmf = KMeans(n_clusters=6, init='k-means++',max_iter=500,random_state=random_state).fit(pnmf_aligned)
    adata.obs['PNMF']=kmeans_pnmf.labels_.astype(str)

    # Score against the labels in adata.obs directly. The previous version appended
    # a 'Histology' column to the factor frame that had just been stored in
    # adata.obsm; pnmf_aligned shares adata.obs' index, so the values are the same.
    histology = adata.obs['Histology']
    fowlkes_mallows=metrics.fowlkes_mallows_score(adata.obs['PNMF'],histology)
    adj_rand=metrics.adjusted_rand_score(adata.obs['PNMF'],histology)
    adj_mut_info=metrics.adjusted_mutual_info_score(adata.obs['PNMF'],histology)
    res_pnmf.append((col,fowlkes_mallows,adj_rand,adj_mut_info))

In [None]:
# Columns of score_pnmf: 0 = number of factors, 1 = Fowlkes-Mallows,
# 2 = adjusted Rand index, 3 = adjusted mutual information.
score_pnmf = pd.DataFrame(res_pnmf)
pnmf_curves = (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Index Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
)
for score_col, line_color, line_label in pnmf_curves:
    plt.plot(score_pnmf[0], score_pnmf[score_col], color=line_color, label=line_label)
plt.xlabel("Number of Factors")
plt.ylabel("Score")
plt.title("PNMF")
plt.legend(prop={'size': 9})

In [None]:
# Best adjusted Rand index over all factor counts (column 2 holds adj_rand).
score_pnmf[2].max()

In [None]:
# Cluster the saved MEFISTO factors for every factor count and score the
# clustering against the histology labels.
# Each result row is (n_factors, Fowlkes-Mallows, adjusted Rand, adjusted MI).
res_mefisto = []
for col in L:
    # Named mef_factors so the DataFrame does not shadow `mef`, which the training
    # cell uses for the fitted MEFISTO model object.
    mef_factors = pd.read_csv(f'range_benchmark/factors_mefisto_her2_{col}_bench.csv',index_col=0)

    # Put the rows in adata's spot order; spots absent from the CSV become NaN rows.
    mef_aligned = mef_factors.reindex(adata.obs.index)
    adata.obsm['mefisto'] = mef_aligned
    # NOTE(review): 6 clusters -- presumably the number of histology classes; confirm.
    kmeans_mef = KMeans(n_clusters=6, init='k-means++',max_iter=500,random_state=random_state).fit(mef_aligned)
    adata.obs['MEFISTO']=kmeans_mef.labels_.astype(str)

    # Score against the labels in adata.obs directly. The previous version appended
    # a 'Histology' column to the factor frame that had just been stored in
    # adata.obsm; mef_aligned shares adata.obs' index, so the values are the same.
    histology = adata.obs['Histology']
    fowlkes_mallows=metrics.fowlkes_mallows_score(adata.obs['MEFISTO'],histology)
    adj_rand=metrics.adjusted_rand_score(adata.obs['MEFISTO'],histology)
    adj_mut_info=metrics.adjusted_mutual_info_score(adata.obs['MEFISTO'],histology)
    res_mefisto.append((col,fowlkes_mallows,adj_rand,adj_mut_info))

In [None]:
# Columns of score_mef: 0 = number of factors, 1 = Fowlkes-Mallows,
# 2 = adjusted Rand index, 3 = adjusted mutual information.
score_mef = pd.DataFrame(res_mefisto)
mef_curves = (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Index Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
)
for score_col, line_color, line_label in mef_curves:
    plt.plot(score_mef[0], score_mef[score_col], color=line_color, label=line_label)
plt.xlabel("Number of Factors")
plt.ylabel("Score")
plt.title("MEFISTO")
plt.legend(prop={'size': 9})

In [None]:
# Best adjusted Rand index over all factor counts (column 2 holds adj_rand).
score_mef[2].max()

In [None]:
from sklearn.decomposition import FactorAnalysis

# Baseline: fit a factor-analysis model on adata.X for every factor count,
# cluster the factor scores and score the clustering against the histology labels.
# Each result row is (n_factors, Fowlkes-Mallows, adjusted Rand, adjusted MI).
# NOTE(review): adata was re-loaded from disk and only subset to annotated spots,
# so FA sees adata.X as stored in the file (all genes, none of the normalization
# applied in the preprocessing cell), unlike the models fitted on the top-2000
# gene subset -- confirm this is the intended comparison.

# FactorAnalysis does not accept sparse matrices; densify once, outside the loop,
# in case adata.X is stored sparse.
X_dense = adata.X.toarray() if hasattr(adata.X, "toarray") else np.asarray(adata.X)

res_fa = []
for col in L:
    transformer = FactorAnalysis(n_components=col, random_state=random_state)
    X_transformed = transformer.fit_transform(X_dense)
    adata.obsm['fa'] = X_transformed
    # NOTE(review): 6 clusters -- presumably the number of histology classes; confirm.
    kmeans_fa = KMeans(n_clusters=6, init='k-means++',max_iter=500,random_state=random_state).fit(X_transformed)
    adata.obs['FA']=kmeans_fa.labels_.astype(str)

    fowlkes_mallows=metrics.fowlkes_mallows_score(adata.obs['FA'],adata.obs['Histology'])
    adj_rand=metrics.adjusted_rand_score(adata.obs['FA'],adata.obs['Histology'])
    adj_mut_info=metrics.adjusted_mutual_info_score(adata.obs['FA'],adata.obs['Histology'])
    res_fa.append((col,fowlkes_mallows,adj_rand,adj_mut_info))

In [None]:
# Columns of score_fa: 0 = number of factors, 1 = Fowlkes-Mallows,
# 2 = adjusted Rand index, 3 = adjusted mutual information.
score_fa = pd.DataFrame(res_fa)
fa_curves = (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Index Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
)
for score_col, line_color, line_label in fa_curves:
    plt.plot(score_fa[0], score_fa[score_col], color=line_color, label=line_label)
plt.xlabel("Number of Factors")
plt.ylabel("Score")
plt.title("FA")
plt.legend(prop={'size': 9})


In [None]:
# Best adjusted Rand index over all factor counts (column 2 holds adj_rand).
score_fa[2].max()