### Notebook for the `NicheCompass` exploratory analysis of 10X Genomics Xenium for BRCA  

- **Developed by**: Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology (WüSI) & Julius-Maximilian-Universität Würzburg**
- **Created**: 230620
- **Last modified**: 230621

### Load required packages

In [None]:
import numpy as np
import scanpy as sc
import pandas as pd
import squidpy as sq
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

### Set up working environment

In [None]:
# Set scanpy's logging verbosity to level 3 and print the package versions in use.
sc.settings.verbosity = 3
sc.logging.print_versions()
# Figure defaults: on-screen/saved resolution, colour map, output format and font size.
sc.settings.set_figure_params(
    dpi=160,
    dpi_save=300,
    format='svg',
    vector_friendly=True,
    color_map='magma',
    fontsize=8,
)

In [None]:
# 26 colours drawn from seaborn's "Paired" palette; passed as `color=` to the stacked bar plots below.
color_palette = sns.color_palette(palette="Paired", n_colors=26)

### Read in Xenium dataset

In [None]:
# Load the Xenium breast cancer AnnData object from the h5ad file.
adata = sc.read("../../datasets/xenium/xenium_human_breast_cancer.h5ad")
# Bare expression: the notebook displays the object's summary.
adata

In [None]:
# Spatial maps coloured by cell state, latent Leiden cluster and four "*_GP"
# columns, arranged three panels per row.
sq.pl.spatial_scatter(
    adata,
    color=[
        "cell_states",
        "latent_leiden_0.2",
        "NODAL_ligand_receptor_target_gene_GP",
        "LAMA2_ligand_receptor_target_gene_GP",
        "VEGFC_ligand_receptor_target_gene_GP",
        "NELL2_ligand_receptor_target_gene_GP",
    ],
    library_id="spatial",
    shape=None,
    cmap='PuOr_r',
    ncols=3,
    figsize=(7, 7),
    legend_fontsize=6,
    frameon=False,
)

### Investigate active GPs

In [None]:
# Shape of the array stored under the active gene-programme names key.
adata.uns['nichecompass_active_gp_names'].shape

In [None]:
# Gene panel kept under the name LAMA2, shown as stacked violins per cell state.
LAMA2 = [
    'IGSF6', 'FCER1A', 'CLCA2', 'MMP12', 'VWF',
    'TAC1', 'LEP', 'POSTN', 'ITGAX', 'GATA3',
]
sc.pl.stacked_violin(adata, var_names=LAMA2, groupby='cell_states', cmap='magma')

In [None]:
# Gene panel kept under the name VEGFC, shown as stacked violins per cell state.
VEGFC = [
    'ESM1', 'KDR', 'CLDN5', 'RAMP2', 'CCL8', 'ANGPT2',
    'CD19', 'CCL20', 'LTB', 'TOP2A', 'PDK4', 'KLF5',
    'CYP1A1', 'APOBEC3B', 'FASN', 'TRIB1', 'STC1',
]
sc.pl.stacked_violin(adata, var_names=VEGFC, groupby='cell_states', cmap='magma')

In [None]:
# Gene panel kept under the name CD80, shown as stacked violins per cell state.
CD80 = [
    'CD274', 'CCR7', 'CTLA4', 'ELF3', 'PRF1', 'AR', 'SCD', 'CD69',
    'EDN1', 'MMP2', 'IL7R', 'LIF', 'PPARG', 'TRIB1', 'CCL20', 'MMP1',
    'CAV1', 'PRDM1', 'DUSP5', 'KLF5', 'CCND1', 'CCL5', 'CXCR4', 'PIM1',
    'KRT15', 'ESR1', 'FASN', 'CDH1', 'IL2RA', 'CYP1A1', 'GATA3', 'MDM2',
    'EGFR',
]
sc.pl.stacked_violin(adata, var_names=CD80, groupby='cell_states', cmap='magma')

### Create a plot for Niche composition

In [None]:
# Display the object's summary to check which annotations are available.
adata

In [None]:
# List the cell-state categories that the mapping in the next cell has to cover.
adata.obs['cell_states'].cat.categories

In [None]:
# Collapse fine-grained cell states into coarse cell types:
# every state listed in trans_from[i] is relabelled as trans_to[i].
trans_from = [
    ['Epi_ABCC11+', 'Epi_FOXA1+', 'Epi_AGR3+', 'Epi_CENPF+', 'mgEpi_KRT14+', 'Epi_KRT14+'],
    ['EC_CLEC14A+', 'EC_CAVIN2+'],
    ['adipo_FB', 'GJB2+iKC-FB'],
    ['EMT-Epi1_CEACAM6+', 'EMT-Epi2_CEACAM6+', 'EMT-Epi_SERPINA3+', 'EMT-Epi_KRT23+'],
    ['DERL3+B', 'BANK1+B', 'B'],
    ['eff_CD8+T1', 'eff_CD8+T2'],
    ['tcm_CD4+T', 'CD161+FOXP3+T'],
    ['NK/T'],
    ['ADIPOQ+Mast'],
    ['M2MØ', 'MMP12+miMØ'],
    ['DC1'],
]
trans_to = ['Epithelial', 'Endothelial', 'Fibroblast', 'EMT-Epi', 'B_cells', 'CD8+T', 'CD4+T', 'NK/T', 'Mast', 'MØ', 'DC']

# Flatten the two parallel lists into a single state -> type lookup.
cell_state_to_type = {
    cell_state: coarse_type
    for cell_state_group, coarse_type in zip(trans_from, trans_to)
    for cell_state in cell_state_group
}

# Map in one vectorised step instead of chained indexing
# (`obs[col][mask] = value`), which triggers SettingWithCopyWarning and does
# not write through to the DataFrame under pandas Copy-on-Write.
# States missing from the lookup keep their original label.
state_names = adata.obs['cell_states'].astype(str)
adata.obs['cell_type'] = state_names.map(cell_state_to_type).fillna(state_names)

In [None]:
# Store the coarse labels as a categorical column, then list the resulting categories.
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')
adata.obs['cell_type'].cat.categories

In [None]:
# obs column holding the niche labels (the latent Leiden clustering that is
# also used for the spatial plots and the Sankey mapping in this notebook).
latent_cluster_key = 'latent_leiden_0.2'
cell_type_key = 'cell_type'

# Number of cells per (niche, cell type): one row per niche, one column per cell type.
# observed=False keeps every category combination (the historical pandas default);
# combinations without cells are filled with 0 rather than NaN.
df_counts = (
    adata.obs
    .groupby([latent_cluster_key, cell_type_key], observed=False)
    .size()
    .unstack(fill_value=0)
)
df_counts.plot(kind = "bar", stacked = True, figsize = (10,10), color = color_palette)  # Assign the created color palette
legend = plt.legend(bbox_to_anchor = (1, 1), loc = "upper left", prop ={'size': 10})
legend.set_title("Cell Type Annotations", prop = {'size': 10})
plt.title("Cell Type Composition of Niches")
plt.xlabel("Niche")
plt.ylabel("Cell Type Counts")

In [None]:
# Row-normalise the count table so that each niche's bar adds up to 100 %.
df_perc = df_counts.div(df_counts.sum(axis=1), axis=0) * 100

ax = df_perc.plot(kind="bar", stacked=True, width=0.9, figsize=(12, 7), color=color_palette)
legend = ax.legend(bbox_to_anchor=(1, 1), loc="upper left", prop={'size': 10})
legend.set_title("Cell Types", prop={'size': 10})
ax.set_title("Cell Type Composition of Niches (Percentage)")
ax.set_xlabel("Niche")
ax.set_ylabel("Cell Type Percentage")

# No background grid behind the bars.
ax.grid(False)


def label_formatter(x):
    """Return ``x`` as a one-decimal percentage string, or '' unless ``x`` exceeds 1.1."""
    if x > 1.1:
        return f'{x:.1f}%'
    return ''


# Write the percentage in the middle of every bar segment.
for bar_group in ax.containers:
    segment_labels = [label_formatter(value) for value in bar_group.datavalues]
    ax.bar_label(bar_group, label_type='center', labels=segment_labels, fontsize=7)

plt.show()

- Show active GPs

In [None]:
# Display the object's summary to locate the gene-programme entries.
adata

In [None]:
# Key in adata.uns under which the names of the active gene programmes are stored.
gp_names_key = 'nichecompass_active_gp_names'
# This key holds the *active* programmes, so the count is reported as such
# (the message previously called it the total).
print(f"Number of active gene programs: {len(adata.uns[gp_names_key])}.")

In [None]:
# Display the stored gene-programme names.
adata.uns[gp_names_key]

In [None]:
# Display the stored differential gene-programme test results.
adata.uns["nichecompass_differential_gp_test_results"]

In [None]:
# Build a (cell type, spatial cluster, gene programme) table and draw it as a
# Sankey diagram: "<cell type>_<cluster>" nodes on one side, gene programmes on the other.

# NOTE(review): cell i is paired with gene programme i purely by position, so
# only the first len(gp_names) cells are used. Nothing in this notebook shows
# that the two orderings are related -- confirm this is the intended assignment.
gp_names = list(adata.uns[gp_names_key])
obs_head = adata.obs.iloc[:len(gp_names)]  # positional slice replaces per-cell .loc lookups
cell_gp_mapping = list(zip(
    obs_head['cell_type'].tolist(),
    obs_head['latent_leiden_0.2'].tolist(),
    gp_names,  # zip stops at the shorter input, so there is no out-of-bounds access
))

# Convert the mapping to a dataframe
df = pd.DataFrame(cell_gp_mapping, columns=['Cell_Type', 'Spatial_Cluster', 'Gene_Programme'])

# Count how often each combination appears. groupby().size() is used because
# pivot_table(aggfunc=len) has no value column left to aggregate once all three
# columns serve as keys.
df_pivot = (
    df.groupby(['Cell_Type', 'Spatial_Cluster', 'Gene_Programme'])
    .size()
    .unstack(fill_value=0)
)

# Flatten the table into one link per non-empty (source, target) pair.
sources = []
targets = []
counts = []
for (cell_type, spatial_cluster), row in df_pivot.iterrows():
    for gene_programme, count in row.items():
        if count == 0:
            continue  # zero-width links add nothing to the diagram
        sources.append(f"{cell_type}_{spatial_cluster}")
        targets.append(gene_programme)
        counts.append(int(count))

# Node list: unique sources first, then unique targets, both in first-seen order.
# Link endpoints must be positions in this exact list, so the positions are
# looked up from it rather than taken from the (duplicated) link lists.
source_labels = list(dict.fromkeys(sources))
target_labels = list(dict.fromkeys(targets))
node_labels = source_labels + target_labels
source_position = {label: pos for pos, label in enumerate(source_labels)}
target_position = {label: len(source_labels) + pos for pos, label in enumerate(target_labels)}

# Now, we can create the Sankey diagram using plotly
fig = go.Figure(data=[go.Sankey(
    node=dict(
        label=node_labels,
        color='blue'  # Set the color (optional)
    ),
    link=dict(
        source=[source_position[s] for s in sources],  # The source indices
        target=[target_position[t] for t in targets],  # The target indices
        value=counts  # The counts
    )
)])

fig.show()

In [None]:

# Wide canvas for the heatmap; adjust figsize as needed.
f, ax = plt.subplots(figsize=(16, 6))

# One cell per entry of the pivot table, drawn on the axes created above.
sns.heatmap(data=df_pivot, ax=ax)

# Render the figure.
plt.show()

### Normalise and transform data

In [None]:
# Keep a copy of the current expression matrix in a "counts" layer before X is modified.
# NOTE(review): this is raw counts only if adata.X is still unnormalised at this point -- confirm.
adata.layers["counts"] = adata.X.copy()
# Normalise per-cell totals in place, then apply a log1p transform.
sc.pp.normalize_total(adata, inplace = True)
sc.pp.log1p(adata)

### Visualise manifold

In [None]:
# Embedding pipeline; each step is given a fixed random_state.
# PCA with 50 components.
sc.pp.pca(adata, n_comps = 50, random_state = 1712)
# Neighbour graph with 50 neighbours per cell; presumably consumed by the two calls below.
sc.pp.neighbors(adata, n_neighbors = 50, random_state = 1769, method = 'umap')
# Leiden clustering at resolution 2; the 'leiden' obs column is used further down.
sc.tl.leiden(adata, resolution = 2, random_state = 1786)
# UMAP embedding for visualisation.
sc.tl.umap(adata, min_dist = 0.3, spread = 2, random_state = 1789)

In [None]:
# UMAP coloured by the two QC metrics and by Leiden cluster.
sc.pl.umap(
    adata,
    color=["total_counts", "n_genes_by_counts", "leiden"],
    size=1,
    frameon=False,
    wspace=0.4,
)

### Characterise clusters

In [None]:
# Wilcoxon marker test per Leiden cluster, keeping 100 genes per cluster.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon', n_genes=100, use_raw=False)
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Wide table with three columns per cluster: "<cluster>_n" (names),
# "<cluster>_p" (pvals_adj) and "<cluster>_l" (logfoldchanges).
markers_cells = pd.DataFrame({
    f"{group}_{key[0]}": result[key][group]
    for group in groups
    for key in ('names', 'pvals_adj', 'logfoldchanges')
})
markers_cells.head(5)

In [None]:
# Save the marker table as a comma-separated file in the working directory, without the row index.
markers_cells.to_csv('BRCA_Xenium_ALL_WilcoxRST_top100.csv', sep = ',', index = False)

In [None]:
# List the Leiden cluster categories; the renaming in the next cell needs one label per category.
adata.obs['leiden'].cat.categories

In [None]:
# One cell-state label per Leiden cluster, given in the order of the cluster categories.
cell_state_labels = [
    'Epi_ABCC11+', 'EC_CLEC14A+', 'adipo_FB', 'tcm_CD4+T', 'CD161+FOXP3+T',
    'eff_CD8+T1', 'ADIPOQ+Mast', 'Epi_FOXA1+', 'GJB2+iKC-FB', 'EMT-Epi1_CEACAM6+',
    'DC1', 'M2MØ', 'Epi_AGR3+', 'Epi_CENPF+', 'mgEpi_KRT14+',
    'DERL3+B', 'EMT-Epi2_CEACAM6+', 'EMT-Epi_SERPINA3+', 'BANK1+B', 'EMT-Epi_KRT23+',
    'MMP12+miMØ', 'eff_CD8+T2', 'B', 'Epi_KRT14+', 'NK/T',
    'EC_CAVIN2+',
]
# rename_categories returns a new column, so 'leiden' itself is left untouched.
adata.obs['cell_states'] = adata.obs['leiden'].cat.rename_categories(cell_state_labels)
sc.pl.umap(
    adata,
    color=['cell_states'],
    size=0.8,
    legend_loc='on data',
    legend_fontsize=6,
    frameon=False,
)

### Split objects

In [None]:
# Keep only the cells whose 'replicates' label is Rep_1 (all genes retained).
adata_1_ann = adata[adata.obs['replicates'].isin(['Rep_1']), :]
adata_1_ann

In [None]:
# Spatial map of replicate 1 coloured by cell state.
sq.pl.spatial_scatter(
    adata_1_ann,
    color=["cell_states"],
    library_id="spatial",
    shape=None,
    figsize=(7, 7),
    legend_fontsize=6,
    frameon=False,
    wspace=0.4,
)

In [None]:
# Keep only the cells whose 'replicates' label is Rep_2 (all genes retained).
adata_2_ann = adata[adata.obs['replicates'].isin(['Rep_2']), :]
adata_2_ann

In [None]:
# Spatial map of replicate 2 coloured by cell state.
sq.pl.spatial_scatter(
    adata_2_ann,
    color=["cell_states"],
    library_id="spatial",
    shape=None,
    figsize=(7, 7),
    legend_fontsize=6,
    frameon=False,
    wspace=0.4,
)

### Export object

In [None]:
# Final look at the full object before the per-replicate subsets are written out.
adata

In [None]:
# Write each annotated replicate subset to its own h5ad file.
adata_1_ann.write('../../datasets/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep_1_annotated_ctl230619.h5ad')
adata_2_ann.write('../../datasets/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep_2_annotated_ctl230619.h5ad')