#### Notebook for visualisation of gene programmes from `NicheCompass`
- **Developed by:** Anna Maguza
- **Modified by**: Carlos Talavera-López
- **Faculty of Medicine, University of Würzburg**
- **Creation Date:** 5th of July 2024
- **Last modified**: 7th of October 2024

### Load required modules

In [1]:
import ast
import mygene
import numpy as np
import scanpy as sc
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from biomart import BiomartServer
from collections import defaultdict

### Set up working environment

In [2]:
# Scanpy logging verbosity level used throughout the notebook.
sc.settings.verbosity = 3

# Print the versions of the loaded packages so the run can be reproduced.
sc.logging.print_versions()

# Global figure defaults: on-screen and saved resolution, colour map and
# SVG output with vector-friendly rendering.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='RdPu',
    vector_friendly=True,
    format='svg',
)

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
appnope             0.1.4
asttokens           NA
biomart             NA
biothings_client    0.3.1
certifi             2024.08.30
charset_normalizer  3.3.2
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.1
decorator           5.1.1
executing           2.0.1
h5py                3.11.0
idna                3.10
igraph              0.11.6
ipykernel           6.29.5
ipywidgets          8.1.3
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.42.0
matplotlib          3.8.4
mpl_toolkits        NA
mygene              3.2.2
natsort             8.4.0
numba               0.59.1
numpy               1.26.4
packaging           24.0
pandas              2.2.2
parso               0.8.4
platformdirs        4.2.1
plotly    

In [3]:
# Relative path of the directory intended for exported figures.
# NOTE(review): `fig_dir` is not referenced by any of the cells visible in this
# notebook (the sunburst is only shown, not written to disk) — confirm whether
# it is still needed.
fig_dir = '../figures/'

In [4]:
# Endpoint and dataset used for gene annotation look-ups.
biomart_url = "http://www.ensembl.org/biomart"
biomart_dataset = 'hsapiens_gene_ensembl'

# Open the BioMart connection and select the dataset that is queried later
# through `mart.search(...)`.
server = BiomartServer(biomart_url)
mart = server.datasets[biomart_dataset]

### Read in `anndata` object

In [5]:
# Location of the analysed Xenium breast cancer object (relative to this notebook).
adata_path = '../data/xenium_human_breast_cancer_analysis.h5ad'

# Load the AnnData object and display its summary as the cell output.
adata = sc.read_h5ad(adata_path)
adata

AnnData object with n_obs × n_vars = 282363 × 313
    obs: 'cell_id', 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'replicates', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_10_genes', 'pct_counts_in_top_20_genes', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_150_genes', 'n_counts', 'n_genes', 'leiden', 'cell_states', 'batch', 'CXCL12_ligand_receptor_GP', 'CD8A_ligand_receptor_GP', 'CD80_ligand_receptor_GP', 'CD8B_ligand_receptor_GP', 'Abca1_ligand_receptor_target_gene_GP', 'Ace2_ligand_receptor_target_gene_GP', 'Adam17_ligand_receptor_target_gene_GP', 'Adam2_ligand_receptor_target_gene_GP', 'Adgrb1_ligand_receptor_target_gene_GP', 'Adm2_ligand_receptor_target_gene_GP', 'Ahsg_ligand_receptor_target_gene_GP', 'Ang_ligand_receptor_target_gene_GP', 'Ang2_ligand_receptor_target_gene_GP', 'Ang4_ligand_receptor_target_gene_GP', 'Ang5_ligand_recep

### Extract genes and gene weights from programme of interest

In [6]:
# Gene programme (GP) summary table stored in `adata.uns`. As the preview below
# shows, it has one row per GP with its name, activity flag and the source/target
# genes together with their weights and importances.
# NOTE(review): the gene and weight columns are later parsed with
# `ast.literal_eval`, i.e. they are treated as string representations of lists.
# NOTE(review): the name `df` is reassigned by the classification cell further
# down, so this cell must be re-run before re-running the GP selection cell.
df = adata.uns['nichecompass_gp_summary']
df.head()

Unnamed: 0,gp_name,all_gp_idx,gp_active,active_gp_idx,n_source_genes,n_non_zero_source_genes,n_target_genes,n_non_zero_target_genes,gp_source_genes,gp_target_genes,gp_source_genes_weights,gp_target_genes_weights,gp_source_genes_importances,gp_target_genes_importances
0,CXCL12_ligand_receptor_GP,0,True,0,1,1,1,1,['CXCL12'],['AVPR1A'],[-0.0461],[0.9449],[0.0465],[0.9535]
1,CD8A_ligand_receptor_GP,1,True,1,1,1,1,1,['CD8A'],['PRF1'],[-0.4468],[-0.3082],[0.5917],[0.4083]
2,CD80_ligand_receptor_GP,2,True,2,1,1,1,1,['CD80'],['CD8B'],[-0.2762],[0.8064],[0.2551],[0.7449]
3,CD8B_ligand_receptor_GP,3,True,3,1,1,1,1,['CD8B'],['PRF1'],[0.737],[0.215],[0.7741],[0.2259]
4,Abca1_ligand_receptor_target_gene_GP,4,True,4,0,0,2,1,[],"['CAV1', 'CXCR4']",[],"[0.2084, -0.0]",[],"[1.0, 0.0]"


In [7]:
# Keep only the summary row(s) of the gene programme of interest.
is_gp37 = df['gp_name'].eq('Add-on_37_GP')
df_gp37 = df.loc[is_gp37]

# Source genes of that programme, taken from the first matching row; the value
# is the stringified list shown in the output below.
gp37_genes = df_gp37['gp_source_genes'].iloc[0]
gp37_genes

"['KRT16', 'KRT14', 'KRT5', 'KRT6B', 'KRT15', 'C5orf46', 'CLCA2', 'KRT23', 'SERPINA3', 'AVPR1A', 'TACSTD2', 'DSP', 'TAC1', 'PIGR', 'MYLK', 'CEACAM6', 'TCF7', 'OPRPN', 'CEACAM8', 'CLDN4', 'AGR3', 'BASP1', 'KIT', 'JUP', 'NOSTRIN', 'CXCR4', 'SEC11C', 'PTRHD1', 'ERN1', 'ABCC11', 'TUBB2B', 'PDGFRA', 'OXTR', 'IL2RA', 'KRT7', 'RAPGEF3', 'MYH11', 'KLF5', 'S100A14', 'CAV1', 'SH3YL1', 'EGFR', 'C15orf48', 'FOXC2', 'FOXP3', 'C6orf132', 'C2orf42', 'GNLY', 'TPD52', 'PDGFRB', 'SCGB2A1', 'CD14', 'EGFL7', 'USP53', 'LYPD3', 'SVIL', 'ANKRD29', 'GLIPR1', 'CX3CR1', 'LGALSL', 'GJB2', 'RUNX1', 'KRT8', 'PPARG', 'TCEAL7', 'RTKN2', 'CDC42EP1', 'POLR2J3', 'MYO5B', 'KLRB1', 'ITGAX', 'PCLAF', 'HMGA1', 'LAG3', 'ANKRD28', 'CD68', 'KARS', 'MZB1', 'KDR', 'AKR1C1', 'TENT5C', 'AQP1', 'AQP3', 'HOXD8', 'SLC25A37', 'THAP2', 'SMAP2', 'ITM2C', 'ELF5', 'LARS', 'DUSP2', 'FSTL3', 'ESR1', 'EIF4EBP1', 'IL7R', 'GPR183', 'MDM2', 'CRISPLD2', 'CCDC80', 'MKI67', 'CD93', 'MMP1', 'HOXD9', 'DMKN', 'ACTG2', 'NPM3', 'CD4', 'C1QC', 'TFAP2A'

In [8]:
# `gp37_genes` is a string holding a Python list literal; `ast.literal_eval`
# turns it into a real list without evaluating arbitrary code.
genes_list = ast.literal_eval(gp37_genes)
# Keep the first 20 genes; the weights cell uses the same cut-off so that genes
# and weights can be paired positionally.
# NOTE(review): the saved output below lists 60 genes, so it was presumably
# produced with a different slice than the one in this code — re-run to refresh.
genes = genes_list[:20]
genes

['KRT16',
 'KRT14',
 'KRT5',
 'KRT6B',
 'KRT15',
 'C5orf46',
 'CLCA2',
 'KRT23',
 'SERPINA3',
 'AVPR1A',
 'TACSTD2',
 'DSP',
 'TAC1',
 'PIGR',
 'MYLK',
 'CEACAM6',
 'TCF7',
 'OPRPN',
 'CEACAM8',
 'CLDN4',
 'AGR3',
 'BASP1',
 'KIT',
 'JUP',
 'NOSTRIN',
 'CXCR4',
 'SEC11C',
 'PTRHD1',
 'ERN1',
 'ABCC11',
 'TUBB2B',
 'PDGFRA',
 'OXTR',
 'IL2RA',
 'KRT7',
 'RAPGEF3',
 'MYH11',
 'KLF5',
 'S100A14',
 'CAV1',
 'SH3YL1',
 'EGFR',
 'C15orf48',
 'FOXC2',
 'FOXP3',
 'C6orf132',
 'C2orf42',
 'GNLY',
 'TPD52',
 'PDGFRB',
 'SCGB2A1',
 'CD14',
 'EGFL7',
 'USP53',
 'LYPD3',
 'SVIL',
 'ANKRD29',
 'GLIPR1',
 'CX3CR1',
 'LGALSL']

In [9]:
# Source-gene weights of the selected programme, stored as a stringified list
# in the first matching summary row.
gp37_weights = df_gp37['gp_source_genes_weights'].iloc[0]

# Parse the list literal and keep the first 20 entries, mirroring the slice
# applied to the gene names.
weights_list = ast.literal_eval(gp37_weights)
weights = weights_list[:20]
weights

[0.0641,
 0.0633,
 0.0628,
 0.0614,
 0.0422,
 0.0367,
 0.0297,
 0.0292,
 0.0276,
 -0.0234,
 0.0206,
 0.0201,
 -0.0198,
 0.0189,
 0.0183,
 0.0174,
 0.0165,
 0.0154,
 0.0138,
 0.0114]

### Classify genes into GO slim categories ("pathways") and InterPro families

In [10]:
# Query BioMart for annotation of the selected genes. Each returned row is a
# tab-separated record with the five attributes requested below, in this order.
response = mart.search({
    'filters': {
        'external_gene_name': genes
    },
    'attributes': [
        'external_gene_name',
        'gene_biotype',
        'interpro_description',
        'go_id',
        'goslim_goa_description'
    ]
})

# Nested mapping: GO slim description -> InterPro description -> [gene names].
classification = {}
# Genes that have already been placed; only the first row returned for a gene
# is used, so every gene appears exactly once in the hierarchy.
seen_genes = set()

for raw_line in response.iter_lines():
    line = raw_line.decode('utf-8')

    # Skip blank lines instead of failing on them.
    if not line.strip():
        continue

    fields = line.split('\t')
    if len(fields) != 5:
        # Anything else is not an annotation row (e.g. an error message from the
        # server); fail loudly with the offending content.
        raise ValueError(
            f"Unexpected BioMart row (expected 5 tab-separated fields): {line!r}"
        )
    gene_name, gene_biotype, interpro_description, go_id, go_slim_description = fields

    if gene_name in seen_genes:
        continue

    # Placeholders for genes lacking an annotation.
    if not interpro_description:
        interpro_description = 'Unknown Family'
    if not go_slim_description:
        go_slim_description = 'Unknown Pathway'

    # Append rather than assign: several genes can share the same
    # (pathway, family) pair, and assigning a fresh one-element list would
    # silently drop every gene but the last one.
    families_in_pathway = classification.setdefault(go_slim_description, {})
    families_in_pathway.setdefault(interpro_description, []).append(gene_name)
    seen_genes.add(gene_name)

# Pair each gene with its weight by position; genes without a weight map to
# None and are removed by the `dropna` below.
gene_weights = dict(zip(genes, weights))

# Flatten the hierarchy into one record per gene. The loop variable is named
# `family_genes` so it does not overwrite the notebook-level `genes_list`.
data = [
    [pathway, family, gene, gene_weights.get(gene)]
    for pathway, families in classification.items()
    for family, family_genes in families.items()
    for gene in family_genes
]

# NOTE(review): this reuses the name `df`, replacing the gene programme summary
# table loaded earlier; re-run that cell before re-running the GP selection.
df = pd.DataFrame(data, columns=['Pathway', 'Family', 'Gene', 'Gene Weight'])

df_filtered = df.dropna(subset=['Gene Weight'])
df_filtered.head()

                                  Pathway  \
0                               organelle   
1                               organelle   
2                               organelle   
3                               organelle   
4                               organelle   
5                               organelle   
6                               organelle   
7                               organelle   
8                               organelle   
9             DNA-templated transcription   
10                              signaling   
11                              signaling   
12                              signaling   
13                              signaling   
14           protein modification process   
15                   extracellular region   
16                   extracellular region   
17                   extracellular region   
18                   extracellular region   
19                   extracellular region   
20                   extracellular region   
21        

### Visualise genes and gene weights as a sunburst plot for easier interpretability

In [15]:
# Weights used for colouring; their extremes label the two colour-bar ticks.
gene_weight_values = df_filtered['Gene Weight']

# Sunburst hierarchy from the centre outwards: pathway -> family -> gene,
# coloured by gene weight.
fig = px.sunburst(
    df_filtered,
    path=['Pathway', 'Family', 'Gene'],
    color='Gene Weight',
    color_continuous_scale='RdYlBu_r',
)

# Figure size and colour-bar configuration in a single layout update.
fig.update_layout(
    coloraxis_colorbar=dict(
        title="Gene Importance Weight",
        tickvals=[gene_weight_values.min(), gene_weight_values.max()],
        tickmode='array',
    ),
    width=1000,
    height=1000,
)

# Show only the label in each sector, with a larger font inside the sectors.
fig.update_traces(textinfo="label", insidetextfont=dict(size=15))

fig.show()