In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from math import ceil
from scanpy import read_h5ad
import squidpy as sq
import scanpy as sc
from sklearn import metrics
import sys
import seaborn as sns
sys.path.append("/Users/user/CellPie/CellPie/") # go to parent dir
import cellpie_main as cp
from cellpie_main import intNMF

In [None]:
# Apply scanpy's figure defaults with 250 dpi on screen and 300 dpi for saved figures.
sc.set_figure_params(scanpy=True, dpi=250, dpi_save=300)

In [None]:
# Shared seed: passed as `random_state` to most NMF fits, neighbour graphs and Leiden runs below.
random_state=123

In [None]:
# Histology annotation table (presumably a Loupe Browser export — confirm); rows with any
# missing value are discarded.
histo_2 = pd.read_csv('Histology_Visium_FFPE_Human_Prostate_Cancer_cloupe.csv')
histo_2 = histo_2.dropna()
# Index rows by barcode while keeping the 'Barcode' column, which later cells still read.
histo_2 = histo_2.set_index('Barcode', drop=False)

Download Visium Data

In [None]:
%%sh
# Fetch the 10x Genomics Visium FFPE human prostate cancer files (image, filtered count
# matrix, spatial archive) into invasive_prostate_visium/ and unpack the spatial archive.
#
# `cd` replaces pushd/popd: those are bash built-ins that a strict POSIX /bin/sh (e.g. dash)
# does not provide. The cell runs in its own shell process, so the directory change does not
# affect the notebook kernel and no popd is needed.
set -e  # stop at the first failing step so nothing is downloaded or unpacked in the wrong place
mkdir -p invasive_prostate_visium
cd invasive_prostate_visium/
wget https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_Cancer/Visium_FFPE_Human_Prostate_Cancer_image.tif
wget https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_Cancer/Visium_FFPE_Human_Prostate_Cancer_filtered_feature_bc_matrix.h5
wget https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_Cancer/Visium_FFPE_Human_Prostate_Cancer_spatial.tar.gz
tar -xzf Visium_FFPE_Human_Prostate_Cancer_spatial.tar.gz
# the archive is no longer needed once extracted
rm Visium_FFPE_Human_Prostate_Cancer_spatial.tar.gz

In [None]:
# Read the spatial transcriptomics sample: filtered count matrix plus the tissue images.
adata = sc.read_visium(
    'invasive_prostate_visium/',
    count_file='Visium_FFPE_Human_Prostate_Cancer_filtered_feature_bc_matrix.h5',
    load_images=True,
)
# Make gene names unique before using them as an identifier column.
adata.var_names_make_unique()
# Keep a copy of the (now unique) gene names in var['SYMBOL'].
adata.var['SYMBOL'] = adata.var_names

In [None]:
# Keep only the spots whose barcode appears in the histology table.
# Slicing an AnnData returns a view; `.copy()` materialises it so the later writes to
# adata.obs / adata.obsm act on a real object instead of triggering an implicit copy
# (and the accompanying ImplicitModificationWarning).
adata = adata[adata.obs_names.isin(histo_2['Barcode']),:].copy()

In [None]:
# Values handed to extract_features as `spot_scale`: 0.1, 1.1 and 2.1 (the stop value 3.1 is excluded).
scale_range = np.arange(0.1,3.1,1)

In [None]:
from feature_extr import extract_features
# Project helper; presumably computes image features per spot at each value of `spot_scale` — TODO confirm.
# NOTE(review): the download cell saves the TIFF inside invasive_prostate_visium/, while this
# path is relative to the working directory — confirm the image is actually found here.
features = extract_features(adata,img_path='Visium_FFPE_Human_Prostate_Cancer_image.tif',
                           scale=1,spot_scale=scale_range,bins=100)

In [None]:
# Attach the histology label to each spot; pandas aligns on the index, and histo_2 is indexed by barcode.
adata.obs['Histology'] = histo_2['Histology']

In [None]:
from cp_utils import preprocess_data_visium
# Project preprocessing helper. The return value is not captured, so it presumably modifies
# `adata` in place — TODO confirm. `min_cells=100` is presumably a gene filter threshold — verify in cp_utils.
preprocess_data_visium(adata,min_cells=100)

In [None]:
from sklearn import metrics
# NOTE(review): `res` is never read in the visible part of the notebook — likely a leftover.
res = []

In [None]:
import scanpy as sc

def find_leiden_resolution(
    adata,
    target_clusters=6,
    start_resolution=0.1,
    tolerance=0.01,
    max_iters=100,
    random_state=123,
    neighbors_key='cellpie',
    key_added='CellPie',
    use_rep='cellpie',
    n_neighbors=90
):
    """Scan Leiden resolutions until the clustering has the requested number of clusters.

    The neighbour graph is built once with ``sc.pp.neighbors`` and stored under
    ``neighbors_key``. Leiden is then run for resolutions ``start_resolution``,
    ``start_resolution + 0.01``, ... up to about 1.0, stopping at the first resolution
    whose cluster count is within ``tolerance`` of ``target_clusters``.

    Parameters
    ----------
    adata : object passed to ``sc.pp.neighbors`` / ``sc.tl.leiden``; modified in place.
    target_clusters : desired number of clusters.
    start_resolution : first resolution tried.
    tolerance : allowed absolute difference between the obtained and the target cluster
        count. Counts are integers, so the default 0.01 demands an exact match.
    max_iters : upper bound on the number of resolutions tried. The default grid has
        fewer than 100 points, so the default does not shorten the scan.
    random_state : seed forwarded to both scanpy calls.
    neighbors_key : key under which the neighbour graph is stored and read back.
    key_added : ``adata.obs`` column receiving the cluster labels.
    use_rep : representation forwarded to ``sc.pp.neighbors``.
    n_neighbors : neighbourhood size forwarded to ``sc.pp.neighbors``.

    Returns
    -------
    float or None
        The first matching resolution, or ``None`` when no scanned resolution matched.
        In both cases ``adata.obs[key_added]`` holds the last clustering that was run.
    """
    step = 0.01
    # Same grid as np.arange(start, 1.01, step), truncated to at most max_iters trials.
    candidates = np.arange(start_resolution, 1.01, step)[:max(max_iters, 0)]
    cluster_counts = []

    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state=random_state,
                    use_rep=use_rep, key_added=neighbors_key)

    for resolution in candidates:
        sc.tl.leiden(adata, resolution=resolution, random_state=random_state,
                     neighbors_key=neighbors_key, key_added=key_added)

        n_clusters = adata.obs[key_added].nunique()
        # Record the resolution actually tried (not the constant starting value).
        cluster_counts.append((float(resolution), n_clusters))

        if abs(n_clusters - target_clusters) <= tolerance:
            print(f"Found resolution: {resolution} with {n_clusters} clusters")
            return float(resolution)

    # Make the failure visible: callers would otherwise score a clustering with the wrong
    # number of clusters without noticing.
    seen = sorted({count for _, count in cluster_counts})
    print(f"No resolution from {start_resolution} to 1.0 gave {target_clusters} clusters "
          f"(cluster counts seen: {seen})")
    return None

In [None]:
from cp_utils import model_selection_alpha
# Candidate numbers of factors: every integer from 8 to 50.
k = np.arange(start=8, stop=51)
# Project model-selection helper evaluated over all candidates in `k`.
mod_sel = model_selection_alpha(
    adata,
    k,
    random_state=random_state,
    epochs=20,
    init='random',
    mod1_skew=1,
)

In [None]:
# Sweep the modality weight (mod1_skew) from 0.1 to 2.0 in steps of 0.1. For each weight:
# fit a 26-factor model, cluster its factors, and score the clusters against histology.
res_1 = []
for i in np.arange(0.1, 2.01, 0.1):
    nmf_model = intNMF(adata, 26, epochs=20, init='random', random_state=random_state, mod1_skew=i)
    nmf_model.fit(adata)
    adata.obsm['cellpie'] = nmf_model.theta

    # Writes the cluster labels to adata.obs['CellPie'].
    find_leiden_resolution(adata, target_clusters=6, neighbors_key='cellpie',
                           key_added='CellPie', use_rep='cellpie')

    clusters, histology = adata.obs['CellPie'], adata.obs['Histology']
    # Order matters for the plot below: Fowlkes-Mallows, ARI, AMI.
    scorers = (
        metrics.fowlkes_mallows_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    res_1.append((i, *(scorer(clusters, histology) for scorer in scorers)))

In [None]:
# Column 0 holds the modality weight; columns 1-3 hold the scores in the order they were appended.
score_1 = pd.DataFrame(res_1)
curves = (
    (1, "green", 'Fowlkes Mallows Score'),
    (2, "red", 'Adjusted Rand Index Score'),
    (3, "blue", 'Adjusted Mutual Info Score'),
)
for column, colour, legend_label in curves:
    plt.plot(score_1[0], score_1[column], color=colour, label=legend_label)
plt.xlabel("Modality weight")
plt.ylabel("Score")
plt.title("CellPie")
plt.legend(prop={'size': 9})
plt.show()

In [None]:
# Model used for the rest of the analysis: 26 factors, modality weight (mod1_skew) 1.
nmf_model = intNMF(adata,26,epochs=20,init = 'random',random_state=random_state,mod1_skew=1)
nmf_model.fit(adata)
# Store the model's `theta` as the 'cellpie' representation used for neighbours/clustering below.
adata.obsm['cellpie'] = nmf_model.theta

In [None]:
from re import sub
import matplotlib as mpl
k = 26
# Names Factor_1 ... Factor_26; these must already be plottable keys on `adata`
# (presumably written by the intNMF fit — TODO confirm).
sel_clust = [f'Factor_{number}' for number in range(1, k + 1)]
# Black panel background, with the tissue image fully transparent (alpha_img=0).
rc_overrides = {'figure.figsize': (10, 9), 'axes.facecolor': 'black'}
with mpl.rc_context(rc_overrides):
    sc.pl.spatial(
        adata,
        color=sel_clust,
        cmap='magma',
        ncols=4,
        size=1,
        img_key='hires',
        alpha_img=0,
    )

In [None]:
# Neighbour graph on the 'cellpie' representation, stored under the 'cellpie' key.
# The seed comes from the notebook-wide `random_state` (same value, 123) rather than a
# second hard-coded literal, so changing the seed in one place affects every step.
sc.pp.neighbors(adata, n_neighbors=90, random_state=random_state, use_rep='cellpie', key_added='cellpie')

In [None]:
# Run Leiden clustering on the 'cellpie' neighbour graph; labels go to adata.obs['CellPie'].
# NOTE(review): the resolution literal looks like a value printed by find_leiden_resolution — confirm.
sc.tl.leiden(adata, resolution=0.32999999999999985, random_state=random_state, neighbors_key='cellpie', key_added='CellPie')
n_clusters = adata.obs['CellPie'].nunique()
# Shown as the cell output to check the number of clusters obtained.
n_clusters

In [None]:
from sklearn.decomposition import FactorAnalysis
# Baseline: scikit-learn Factor Analysis with 21 components on the expression matrix.
transformer = FactorAnalysis(n_components=21,random_state=random_state)
# `.toarray()` requires adata.X to be a sparse matrix at this point.
X_transformed = transformer.fit_transform(adata.X.toarray())
adata.obsm['FA'] = X_transformed

In [None]:
# Cluster the Factor Analysis embedding: neighbour graph under key 'fa', labels in adata.obs['fa'].
sc.pp.neighbors(adata, n_neighbors= 90, random_state=random_state, use_rep='FA', key_added='fa')
# NOTE(review): resolution presumably tuned to give the same cluster count as the other methods — confirm.
sc.tl.leiden(adata, resolution= 0.4299999999999998, random_state=random_state, neighbors_key='fa', key_added='fa')

In [None]:
# MEFISTO factors precomputed elsewhere and loaded from the Benchmark folder.
mef = pd.read_csv('Benchmark/new_factors_mefisto_prostate_35_bench.csv', index_col=0)
# Put rows in adata's spot order; barcodes missing from the CSV would become NaN rows.
mef_aligned = mef.reindex(adata.obs.index)
adata.obsm['mefisto'] = mef_aligned
# Neighbour graph under key 'mefisto', cluster labels in adata.obs['MEFISTO'].
sc.pp.neighbors(
    adata, use_rep='mefisto', key_added='mefisto',
    n_neighbors=90, random_state=random_state,
)
sc.tl.leiden(
    adata, neighbors_key='mefisto', key_added='MEFISTO',
    resolution=0.3999999999999998, random_state=random_state,
)

In [None]:
# NSF factors precomputed elsewhere and loaded from the Benchmark folder.
nsf = pd.read_csv('Benchmark/factors_nsf_prostate_poi_38_bench.csv', index_col=0)
# Put rows in adata's spot order; barcodes missing from the CSV would become NaN rows.
nsf_aligned = nsf.reindex(adata.obs.index)
adata.obsm['nsf'] = nsf_aligned
# Neighbour graph under key 'nsf', cluster labels in adata.obs['nsf'].
sc.pp.neighbors(
    adata, use_rep='nsf', key_added='nsf',
    n_neighbors=90, random_state=random_state,
)
sc.tl.leiden(
    adata, neighbors_key='nsf', key_added='nsf',
    resolution=0.3199999999999999, random_state=random_state,
)

In [None]:
# NSFH output comes as two tables: spatial and non-spatial factors.
nsfh_s = pd.read_csv('Benchmark/spatialfactors_nsfh_prostate_poi_23_bench.csv', index_col=0)
nsfh_ns = pd.read_csv('Benchmark/nonspatialfactors_nsfh_prostate_poi_23_bench.csv', index_col=0)
# Join them column-wise, non-spatial factors first.
nsfh = pd.concat([nsfh_ns, nsfh_s], axis='columns')

# Put rows in adata's spot order; barcodes missing from the CSVs would become NaN rows.
nsfh_aligned = nsfh.reindex(adata.obs.index)
adata.obsm['nsfh'] = nsfh_aligned
# Neighbour graph under key 'nsfh', cluster labels in adata.obs['NSFH'].
sc.pp.neighbors(
    adata, use_rep='nsfh', key_added='nsfh',
    n_neighbors=90, random_state=random_state,
)
sc.tl.leiden(
    adata, neighbors_key='nsfh', key_added='NSFH',
    resolution=0.32999999999999985, random_state=random_state,
)

In [None]:
# PNMF factors precomputed elsewhere and loaded from the Benchmark folder.
pnmf = pd.read_csv('Benchmark/factors_pnmf_prostate_poi_14_bench.csv', index_col=0)
# Put rows in adata's spot order; barcodes missing from the CSV would become NaN rows.
pnmf_aligned = pnmf.reindex(adata.obs.index)
adata.obsm['pnmf'] = pnmf_aligned
# Neighbour graph under key 'pnmf', cluster labels in adata.obs['PNMF'].
sc.pp.neighbors(
    adata, use_rep='pnmf', key_added='pnmf',
    n_neighbors=90, random_state=random_state,
)
sc.tl.leiden(
    adata, neighbors_key='pnmf', key_added='PNMF',
    resolution=0.2799999999999999, random_state=random_state,
)

In [None]:
# Second CellPie fit for comparison: same settings as the main model but with mod1_skew=2.
nmf_model_0 = intNMF(adata,26,epochs=20,init = 'random',random_state=random_state,mod1_skew=2)
nmf_model_0.fit(adata)
# Kept under a separate key so the main 'cellpie' representation is not overwritten.
adata.obsm['cellpie_0'] = nmf_model_0.theta

In [None]:
# Cluster the mod1_skew=2 fit: neighbour graph under 'cellpie_0', labels in adata.obs['CellPie_0'].
# The seed comes from the notebook-wide `random_state` (same value, 123) rather than a
# second hard-coded literal, so both calls in this cell stay in sync if the seed changes.
sc.pp.neighbors(adata, n_neighbors=90, random_state=random_state, use_rep='cellpie_0', key_added='cellpie_0')
sc.tl.leiden(adata, resolution=0.389999999, random_state=random_state, neighbors_key='cellpie_0', key_added='CellPie_0')
n_clusters = adata.obs['CellPie_0'].nunique()
# Shown as the cell output to check the number of clusters obtained.
n_clusters

In [None]:
# Per-method colour lists stored under '<obs column>_colors' in adata.uns.
# Each method gets its own ordering of a similar set of colours, presumably so that
# corresponding clusters share a colour across methods — TODO confirm.
cluster_palettes = {
    'CellPie': ['#1f77b4','#ff7f0e','#00e6e6','#2ca02c','#8c564b','#9467bd'],
    'nsf': ['#2ca02c', '#ff7f0e','#1f77b4', '#8c564b', '#cc9900','#9467bd'],
    'NSFH': ['#ff7f0e','#2ca02c', '#8c564b','#1f77b4','#cc9900','#9467bd'],
    'MEFISTO': ['#1f77b4','#ff7f0e','#00e6e6','#8c564b','#2ca02c','#9467bd'],
    'fa': ['#1f77b4','#ff7f0e', '#2ca02c','#8c564b','#00e6e6','#9467bd'],
    'PNMF': ['#1f77b4','#ff7f0e','#00e6e6','#8c564b','#a93226','#9467bd'],
    'CellPie_0': ['#ff7f0e','#1f77b4','#00e6e6', '#8c564b', '#2ca02c','#9467bd'],
}
for obs_column, palette in cluster_palettes.items():
    adata.uns[f'{obs_column}_colors'] = palette

In [None]:
# One spatial panel per labelling, stacked in a single column: histology first, then each method.
sc.pl.spatial(adata,color=['Histology','CellPie','nsf','NSFH','MEFISTO','fa','PNMF', 'CellPie_0'],
              size=1.4, ncols = 1
             )

In [None]:
from sklearn import metrics
# Score every clustering against the histology annotation.
l_t = ['CellPie','nsf','NSFH','MEFISTO','fa','PNMF','CellPie_0']
res_all = []
# A separate loop variable is used so the list `l_t` is not overwritten by its own elements.
for method in l_t:
    fowlkes_mallows=metrics.fowlkes_mallows_score(adata.obs[method],adata.obs['Histology'])
    adj_rand=metrics.adjusted_rand_score(adata.obs[method],adata.obs['Histology'])
    adj_mut_info=metrics.adjusted_mutual_info_score(adata.obs[method],adata.obs['Histology'])
    res_all.append((method,fowlkes_mallows,adj_rand,adj_mut_info))
# The first score is the Fowlkes-Mallows index, so the column is labelled accordingly
# (it was previously labelled 'Mutual Info').
res_df = pd.DataFrame(res_all, columns = ['Method','Fowlkes Mallows','Adjusted Rand Index','Adjusted Mutual Info'])

In [None]:
# Display the per-method score table.
res_df

In [None]:
import seaborn as sns
# Bar chart of the Adjusted Rand Index per method; bars are coloured by their own ARI value.
plot = sns.barplot(
    data=res_df,
    x='Method',
    y='Adjusted Rand Index',
    hue='Adjusted Rand Index',
    width=0.6,
    legend=False,
)
# Re-apply the existing tick labels with a smaller font.
plot.set_xticklabels(plot.get_xticklabels(), fontsize=10)
plot.set_title('ARI-Leiden Clustering')
plot.grid(False)

In [None]:
methods = ['CellPie', 'nsf','NSFH','PNMF','fa','MEFISTO', 'CellPie_0']

# One heatmap per method: histology labels (rows) against cluster labels (columns).
# normalize='columns' makes every cluster column sum to 1.
crest_cmap = sns.color_palette("crest", as_cmap=True)
for i in methods:
    contingency = pd.crosstab(
        adata.obs['Histology'].values,
        adata.obs[i].values,
        normalize='columns',
    )
    plt.figure(figsize=(12, 8))
    sns.heatmap(contingency, annot=True, annot_kws={'size': 25}, cmap=crest_cmap)
    plt.title(f'Contingency Table {i}')
    plt.grid(False)
    plt.show()

In [None]:
# Refit the main model (26 factors, mod1_skew=1) so `nmf_model` again refers to it for the
# gene-level analysis below.
# NOTE(review): these three lines are identical to the earlier final-fit cell; with the same
# seed the refit is presumably redundant — confirm before removing.
nmf_model = intNMF(adata,26,epochs=20,init = 'random',random_state=random_state,mod1_skew=1)
nmf_model.fit(adata)
adata.obsm['cellpie'] = nmf_model.theta

In [None]:
# Gene-by-factor table built from the model's expression loadings (phi_expr).
# Presumably a DataFrame with genes as columns before transposition — it is read as l.T[i] below; TODO confirm.
l=cp.get_genes_topic(adata,nmf_model.phi_expr)

In [None]:
import gseapy as gp
def gene_er(topic):
    """Run an Enrichr query for one gene list against GO Biological Process 2023.

    Parameters
    ----------
    topic : gene list forwarded to ``gp.enrichr`` as ``gene_list``.

    Returns
    -------
    Whatever ``gp.enrichr`` returns for a human-organism query; ``outdir=None`` is
    passed, so no output directory is requested.
    """
    query = {
        'gene_list': topic,
        'gene_sets': ['GO_Biological_Process_2023'],
        'organism': 'human',
        'outdir': None,
    }
    return gp.enrichr(**query)

In [None]:

# Zero-based indices of the factors to run enrichment on: 0-13, 19 and 25.
selected_indices = [*range(14), 19, 25]

# For each selected factor, query enrichment with its 150 highest-weighted genes.
enrichment_results = {}
for i in selected_indices:
    ranked_genes = l.T[i].sort_values(ascending=False)
    enrichment_results[i] = gene_er(ranked_genes.index[:150].to_list())

In [None]:
# One enrichment bar plot per selected factor; titles use 1-based factor numbers.
for i in enrichment_results:
    gp.barplot(
        enrichment_results[i].res2d,
        title=f'GO_Biological_Process_2023 Factor {i+1}',
        color=['darkred']
    )
    # Explicitly switch the grid off. `plt.grid(None)` with no other arguments toggles the
    # grid, so the result depended on the current style; False matches the other plots.
    plt.grid(False)
    plt.show()