#### Notebook for creating a sunburst plot of the genes in de-novo gene programs identified with `NicheCompass`
- **Developed by:** Anna Maguza
- **Modified by:** Carlos Talavera-López 
- **Würzburg Institute of Systems Immunology (WüSI) - Faculty of Medicine - University of Würzburg**
- **Creation Date:** 5th of July 2024
- **Last modified:** 240729

### Import required packages

In [2]:
import ast
import numpy as np
import scanpy as sc
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### Read in NicheCompass processed anndata object data

In [3]:
# Load the NicheCompass-processed AnnData object from an .h5ad file.
# NOTE(review): the path begins with '/../', which resolves to the filesystem
# root; the other paths in this notebook are relative ('../...') — confirm the
# intended location before running.
adata = sc.read_h5ad('/../NicheCompass_paper/xenium_human_breast_cancer_analysis.h5ad')

- Extract the gene program summary table, which holds the genes and gene weights of each program

In [26]:
# Gene program summary table stored in the AnnData `uns` slot; the cells below
# read its 'gp_name', 'gp_source_genes' and 'gp_source_genes_weights' columns.
df = adata.uns['nichecompass_gp_summary']

In [2]:
# Relative path of the directory the sunburst PNGs are written to.
# The directory is not created anywhere in this notebook — presumably it
# already exists; verify before exporting.
fig_dir = '../figures/SunburstGPplot'

In [5]:
# Pull out the summary rows of the two de-novo (add-on) gene programs of
# interest and save a CSV copy of each subset under ../data.
is_gp37 = df['gp_name'].eq('Add-on_37_GP')
df_gp37 = df.loc[is_gp37]
df_gp37.to_csv('../data/Add-on_37_GP.csv')

is_gp86 = df['gp_name'].eq('Add-on_86_GP')
df_gp86 = df.loc[is_gp86]
df_gp86.to_csv('../data/Add-on_86_GP.csv')

### Add-on program 37

In [26]:
# Source genes of Add-on program 37, taken from the first row of the subset
# (raises IndexError if the subset is empty). The value is parsed with
# ast.literal_eval in the next cell.
gp37_genes = df_gp37['gp_source_genes'].values[0]

#### Extract genes of interest
- For simplicity and to make the figure easier to read, we will select the top 60 genes

In [27]:
# Parse the stored list literal into a Python list, then keep its first 60 entries.
# NOTE(review): "top 60" assumes the stored list is already ordered by
# importance — confirm against the NicheCompass summary.
genes_list = ast.literal_eval(gp37_genes)
genes = genes_list[:60]

In [28]:
# Display the selected genes of Add-on program 37.
genes

['KRT16',
 'KRT14',
 'KRT5',
 'KRT6B',
 'KRT15',
 'C5orf46',
 'CLCA2',
 'KRT23',
 'SERPINA3',
 'AVPR1A',
 'TACSTD2',
 'DSP',
 'TAC1',
 'PIGR',
 'MYLK',
 'CEACAM6',
 'TCF7',
 'OPRPN',
 'CEACAM8',
 'CLDN4',
 'AGR3',
 'BASP1',
 'KIT',
 'JUP',
 'NOSTRIN',
 'CXCR4',
 'SEC11C',
 'PTRHD1',
 'ERN1',
 'ABCC11',
 'TUBB2B',
 'PDGFRA',
 'OXTR',
 'IL2RA',
 'KRT7',
 'RAPGEF3',
 'MYH11',
 'KLF5',
 'S100A14',
 'CAV1',
 'SH3YL1',
 'EGFR',
 'C15orf48',
 'FOXC2',
 'FOXP3',
 'C6orf132',
 'C2orf42',
 'GNLY',
 'TPD52',
 'PDGFRB',
 'SCGB2A1',
 'CD14',
 'EGFL7',
 'USP53',
 'LYPD3',
 'SVIL',
 'ANKRD29',
 'GLIPR1',
 'CX3CR1',
 'LGALSL']

#### Extract gene weights 

In [29]:
# Weights of the source genes of Add-on program 37, taken from the first row of
# the subset; parsed with ast.literal_eval in the next cell.
gp37_weights = df_gp37['gp_source_genes_weights'].values[0]

In [30]:
# Parse the stored list literal and keep the first 60 weights, so that
# weights[i] lines up by position with genes[i].
weights_list = ast.literal_eval(gp37_weights)
weights = weights_list[:60]

In [31]:
# Display the selected weights of Add-on program 37.
weights

[0.0641,
 0.0633,
 0.0628,
 0.0614,
 0.0422,
 0.0367,
 0.0297,
 0.0292,
 0.0276,
 -0.0234,
 0.0206,
 0.0201,
 -0.0198,
 0.0189,
 0.0183,
 0.0174,
 0.0165,
 0.0154,
 0.0138,
 0.0114,
 0.0105,
 -0.0104,
 0.0103,
 0.0102,
 0.0095,
 -0.0087,
 -0.0083,
 -0.0082,
 -0.0081,
 -0.0081,
 0.0078,
 0.0077,
 0.0075,
 -0.0073,
 0.0072,
 -0.0071,
 0.007,
 0.007,
 0.0069,
 0.0067,
 -0.0064,
 0.0062,
 0.0062,
 0.006,
 -0.0056,
 0.0054,
 -0.0054,
 -0.0053,
 0.0052,
 -0.0052,
 0.0051,
 0.0051,
 -0.0051,
 -0.005,
 0.0049,
 0.0048,
 -0.0048,
 0.0046,
 0.0046,
 -0.0046]

### Add gene enrichment information
- This can be done with your favourite gene enrichment analysis tool or a large language model. Here we used ToppFun; ChatGPT concurred with the resulting classification of the programme's genes.

In [35]:
# Manual grouping of the Add-on program 37 genes used for the sunburst rings:
# {category: {subcategory: [gene, ...]}}.
# Only genes listed here are plotted. Four of the 60 selected genes
# (CLCA2, OPRPN, BASP1, TPD52) are not assigned to any group and are
# therefore absent from the figure.
classification = {
    'Cytoskeletal Proteins': {
        'Keratin Family': ['KRT16', 'KRT14', 'KRT5', 'KRT6B', 'KRT15', 'KRT23', 'KRT7'],
        'Tubulin Family': ['TUBB2B'],
        'Regulation of Actin': ['RAPGEF3'],
        'Myosins': ['MYH11']
    },
    'Cell Adhesion and Junctions': {
        'Desmosomes and Junctions': ['DSP', 'JUP', 'CLDN4', 'CEACAM6', 'CEACAM8'],
        'Tight Junctions and Adhesion': ['TACSTD2', 'AGR3', 'LYPD3']
    },
    'Signaling Receptors': {
        'Growth Factors': ['KIT', 'PDGFRA', 'PDGFRB', 'EGFR'],
        'Other': ['AVPR1A', 'OXTR', 'IL2RA']
    },
    'Gene Regulation': {
        'Forkhead Box Family': ['FOXC2', 'FOXP3'],
        'Transcription Factors': ['TCF7', 'KLF5'],
        'Open Reading Frames': ['C15orf48', 'C6orf132', 'C2orf42', 'C5orf46']
    },
    'Enzymes and Metabolic Proteins': {
        'Kinases and Phosphatases': ['MYLK', 'ERN1'],
        'Transporters': ['ABCC11'],
        'Peptidases and Hydrolases': ['SEC11C', 'USP53', 'PTRHD1', 'SH3YL1'],
        'Other': ['LGALSL']
    },
    'Immune System Genes': {
        'Chemokine Receptors': ['CXCR4', 'CX3CR1'],
        'Cell Surface Markers': ['CD14', 'GNLY', 'PIGR'],
        'Inflammatory Mediators': ['S100A14', 'EGFL7'],
        'Immune Regulation': ['TAC1']
    },
    'Secreted Proteins': {
        'Secreted Proteins': ['SERPINA3', 'SCGB2A1'],
        'ECM Proteins': ['CAV1']
    },
    'Cell Organization': {
        'Transport Proteins': ['NOSTRIN'],
        'Structural Proteins': ['SVIL', 'ANKRD29', 'GLIPR1']
    }
}


In [None]:
# Flatten the nested classification into one row per gene:
# (Category, Subcategory, Gene, Gene Weight).

# Map each selected gene to its weight once instead of scanning `genes` for
# every lookup. setdefault keeps the first occurrence, as list.index() did.
weight_by_gene = {}
for gene, weight in zip(genes, weights):
    weight_by_gene.setdefault(gene, weight)

data = []
for category, subcategories in classification.items():
    # `members` (not `genes_list`) is the loop variable so that the parsed
    # gene list created earlier in the notebook is not overwritten.
    for subcategory, members in subcategories.items():
        for gene in members:
            if gene not in weight_by_gene:
                # Same exception type list.index() raised, but naming the gene.
                raise ValueError(
                    f"Gene {gene!r} ({category} / {subcategory}) is not in the selected gene list"
                )
            data.append([category, subcategory, gene, weight_by_gene[gene]])

# NOTE: `df` replaces the gene program summary table loaded earlier; the name
# is kept because the plotting cell below reads `df`.
df = pd.DataFrame(data, columns=['Category', 'Subcategory', 'Gene', 'Gene Weight'])

In [36]:
# Sunburst of Add-on program 37: rings are Category -> Subcategory -> Gene,
# and sectors are coloured by gene weight.
weight_column = df['Gene Weight']

fig = px.sunburst(
    df,
    path=['Category', 'Subcategory', 'Gene'],
    color='Gene Weight',
    color_continuous_scale='RdYlBu_r',
)

# 1000 x 1000 canvas; the colour bar is ticked only at the two extreme weights.
fig.update_layout(
    coloraxis_colorbar={
        'title': "Gene Importance Weight",
        'tickvals': [min(weight_column), max(weight_column)],
    },
    width=1000,
    height=1000,
)

# Show only the label inside each sector, at a fixed font size.
fig.update_traces(textinfo="label", insidetextfont={'size': 15})

fig.show()

# Export a PNG copy of the figure.
fig.write_image(f'{fig_dir}/RdYlBu_r_add_on_37sunburst_plot.png', scale=3)

### Add-on program 86

In [37]:
# Source genes of Add-on program 86, taken from the first row of the subset
# (raises IndexError if the subset is empty). The value is parsed with
# ast.literal_eval in the next cell.
gp86_genes = df_gp86['gp_source_genes'].values[0]

#### Extract genes of interest
- As for program 37, we select the first 60 genes

In [38]:
# Parse the stored list literal into a Python list, then keep its first 60 entries.
# This rebinds `genes_list` and `genes`, which previously held program 37.
genes_list = ast.literal_eval(gp86_genes)
genes = genes_list[:60]

In [39]:
# Display the selected genes of Add-on program 86.
genes

['MLPH',
 'EPCAM',
 'FOXA1',
 'ELF3',
 'KRT8',
 'KRT7',
 'FASN',
 'ABCC11',
 'MYO5B',
 'SERHL2',
 'TACSTD2',
 'LYPD3',
 'SCD',
 'ANKRD30A',
 'S100A14',
 'GATA3',
 'DSP',
 'AR',
 'C6orf132',
 'TFAP2A',
 'CEACAM8',
 'TRAF4',
 'DMKN',
 'CLDN4',
 'KLF5',
 'PCLAF',
 'ERBB2',
 'CENPF',
 'CCND1',
 'PTGDS',
 'SLC5A6',
 'TOP2A',
 'TPD52',
 'FBLN1',
 'SH3YL1',
 'AGR3',
 'CEACAM6',
 'SQLE',
 'CCDC80',
 'CTTN',
 'JUP',
 'USP53',
 'CCDC6',
 'NARS',
 'RHOH',
 'RTKN2',
 'OCIAD2',
 'STC1',
 'TCIM',
 'HOOK2',
 'ZNF562',
 'CD79A',
 'PDGFRA',
 'SMS',
 'DPT',
 'MEDAG',
 'ZEB2',
 'IL7R',
 'ESR1',
 'MS4A1']

#### Extract gene weights

In [40]:
# Weights of the source genes of Add-on program 86, taken from the first row of
# the subset; parsed with ast.literal_eval in the next cell.
gp86_weights = df_gp86['gp_source_genes_weights'].values[0]

In [41]:
# Parse the stored list literal and keep the first 60 weights, so that
# weights[i] lines up by position with genes[i]. This rebinds `weights_list`
# and `weights`, which previously held program 37.
weights_list = ast.literal_eval(gp86_weights)
weights = weights_list[:60]

In [42]:
# Display the selected weights of Add-on program 86.
weights

[0.0552,
 0.0545,
 0.0535,
 0.053,
 0.0527,
 0.0525,
 0.0524,
 0.0521,
 0.0499,
 0.0494,
 0.0489,
 0.0483,
 0.0472,
 0.0472,
 0.0454,
 0.0453,
 0.0452,
 0.0452,
 0.043,
 0.0429,
 0.0427,
 0.0396,
 0.0383,
 0.0382,
 0.0369,
 0.0368,
 0.0364,
 0.036,
 0.0345,
 -0.034,
 0.0339,
 0.0335,
 0.033,
 -0.0319,
 0.031,
 0.0301,
 0.03,
 0.0299,
 -0.0298,
 0.0296,
 0.0295,
 0.0293,
 0.0288,
 0.0286,
 0.0282,
 0.0276,
 0.0275,
 0.027,
 0.0268,
 0.0268,
 0.0266,
 -0.0266,
 -0.0263,
 0.0262,
 -0.0257,
 -0.0256,
 -0.0255,
 -0.0255,
 0.0253,
 -0.0252]

In [43]:
# Manual grouping of the Add-on program 86 genes used for the sunburst rings:
# {category: {subcategory: [gene, ...]}}.
# Only genes listed here are plotted; selected genes that are not assigned to
# any group are absent from the figure.
# Each gene is listed once per subcategory: a repeated entry adds a second row
# to the table, and px.sunburst (called without `values` in the plotting cell)
# sizes sectors by row count, so a duplicate would be drawn double-width.
# NOTE(review): TRAF4 is listed under both 'Immune Regulation' and 'TNF', so it
# is drawn in two branches — confirm this is intended.
classification = {
    'Cytoskeletal Proteins': {
        'Keratin Family': ['KRT8', 'KRT7'],
        'Myosins': ['MYO5B'],
        'Actin-binding': ['CTTN']
    },
    'Cell Adhesion and Junctions': {
        'Cell Adhesion Molecules': ['EPCAM', 'TACSTD2', 'LYPD3', 'CEACAM8', 'CLDN4'],
        'Desmosomes and Junctions': ['DSP', 'JUP'],
        'ECM Proteins': ['DPT', 'FBLN1', 'MEDAG', 'STC1']
    },
    'Gene Regulation': {
        'Forkhead Box Family': ['FOXA1'],
        'Transcription Factors': ['ELF3', 'GATA3', 'KLF5', 'TFAP2A', 'ZEB2'],
        'Open Reading Frames': ['C6orf132']
    },
    'Metabolism': {
        'Fatty Acid Synthesis': ['FASN', 'SCD'],
        'Transporters': ['ABCC11', 'SLC5A6'],
        'Other Enzymes': ['USP53', 'SH3YL1', 'SMS', 'SQLE', 'NARS', 'SERHL2']
    },
    'Immune System Genes': {
        'Cytokines': ['IL7R'],
        'Cell Surface Markers': ['CD79A', 'RHOH', 'RTKN2'],
        'Inflammatory Mediators': ['S100A14'],
        'Immune Regulation': ['TRAF4']
    },
    'Signaling': {
        'Growth Factors': ['PDGFRA', 'ERBB2'],
        'Hormones': ['ESR1', 'AR'],
        'TNF': ['TRAF4'],
        'JAK-STAT': ['OCIAD2'],
        'Wnt/β-catenin': ['TCIM']
    },
    'Cell Cycle and Proliferation': {
        'Cell Cycle Regulators': ['CCND1', 'CENPF', 'TOP2A'],
        'Proliferation Markers': ['PCLAF', 'TPD52']
    }
}

# Create the DataFrame: one row per (Category, Subcategory, Gene, Gene Weight).

# Map each selected gene to its weight once instead of scanning `genes` for
# every lookup. setdefault keeps the first occurrence, as list.index() did.
weight_by_gene = {}
for gene, weight in zip(genes, weights):
    weight_by_gene.setdefault(gene, weight)

data = []
for category, subcategories in classification.items():
    # `members` (not `genes_list`) is the loop variable so that the parsed
    # gene list created earlier in the notebook is not overwritten.
    for subcategory, members in subcategories.items():
        for gene in members:
            if gene not in weight_by_gene:
                # Same exception type list.index() raised, but naming the gene.
                raise ValueError(
                    f"Gene {gene!r} ({category} / {subcategory}) is not in the selected gene list"
                )
            data.append([category, subcategory, gene, weight_by_gene[gene]])

# `df` is the name the plotting cell below reads.
df = pd.DataFrame(data, columns=['Category', 'Subcategory', 'Gene', 'Gene Weight'])

In [44]:
# Sunburst of Add-on program 86: rings are Category -> Subcategory -> Gene,
# and sectors are coloured by gene weight.
weight_column = df['Gene Weight']

fig = px.sunburst(
    df,
    path=['Category', 'Subcategory', 'Gene'],
    color='Gene Weight',
    color_continuous_scale='RdYlBu_r',
)

# 1000 x 1000 canvas; the colour bar is ticked only at the two extreme weights.
fig.update_layout(
    coloraxis_colorbar={
        'title': "Gene Importance Weight",
        'tickvals': [min(weight_column), max(weight_column)],
    },
    width=1000,
    height=1000,
)

# Show only the label inside each sector, at a fixed font size.
fig.update_traces(textinfo="label", insidetextfont={'size': 15})

fig.show()

# Export a PNG copy of the figure.
fig.write_image(f'{fig_dir}/RdYlBu_r_add_on_86sunburst_plot.png', scale=3)

In [45]:
# Print every installed distribution as `name==version` to record the
# environment this notebook was run in.
# importlib.metadata (standard library) replaces the deprecated pkg_resources
# API; the order of the listing may differ from pkg_resources.working_set.
from importlib import metadata

for dist in metadata.distributions():
    print(f"{dist.metadata['Name']}=={dist.version}")

scikit-misc==0.3.1
Babel==2.14.0
Brotli==1.1.0
MarkupSafe==2.1.5
PyQt5==5.15.10
PyQt5-Qt5==5.15.2
PyQt5-sip==12.13.0
PySocks==1.7.1
PyYAML==6.0.1
Send2Trash==1.8.3
anndata==0.10.5.post1
anyio==4.3.0
archspec==0.2.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array-api-compat==1.6
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
beautifulsoup4==4.12.3
bleach==6.1.0
boltons==24.0.0
cached-property==1.5.2
certifi==2024.6.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
comm==0.2.2
conda==24.5.0
conda-build==24.5.1
conda-index==0.5.0
conda-libmamba-solver==24.1.0
conda-package-handling==2.3.0
conda-package-streaming==0.10.0
contourpy==1.2.1
cycler==0.12.1
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
distro==1.9.0
entrypoints==0.4
ete3==3.1.3
exceptiongroup==1.2.0
executing==2.0.1
faiss-gpu==1.7.2
fastjsonschema==2.19.1
filelock==3.15.1
fonttools==4.51.0
fqdn==1.5.1
frozendict==2.4.4
get-annotations==0.1.2
h11==0.14.0
h2==4.1.0
h5py=