In [7]:
#%%appyter init
# Appyter bootstrap cell. The magic marker above is kept as a plain comment in SOURCE.
# NOTE(review): presumably `magic.init` registers the `%%appyter` cell magics used by the cells below - confirm against appyter docs.
from appyter import magic
# `magic.init` receives a zero-argument callable; the default-argument trick binds the
# built-in `globals` so that calling the lambda returns this notebook's global namespace.
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code

{% do SectionField(
    name='primary',
    title='Upload Tumor Expression',
    img='upload.png'
) %}

{% set data_type = TabField(
    name='data_type',
    label='Data Type',
    default='scRNA-seq',
    description='Start with either scRNA-seq or bulk RNA-seq data. If bulk RNA-seq data is selected we will utilize pre-curated reference matrices to identify cell type-specific expression vectors.',
    required=True,
    section='primary',
    choices={
        'scRNA-seq': [
            TabField(
            name='sc_data_type_ctrl',
            label='Data Format (Control Profile)',
            default='Plain Text',
            description='Choose a format for scRNA-seq data',
            required=True,
            choices= {
                'Plain Text': [
                        CustomFileField(
                        name='sc_rna_file_ctrl',
                        label='control scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/GSE171417_rbk_control.tsv.gz',
                        required=False,
                        examples={
                            'GSE171417_rbk_control.tsv.gz': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_rbk_control.tsv.gz',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file_ctrl',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/GSE171417_metadata_control.tsv',
                        required=False,
                        examples={
                            'GSE171417_metadata_control.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_metadata_control.tsv',
                        },
                    ),
                    TextField(
                        name='cell_type_col_ctrl',
                        label='Cell Type Column (Optional)',
                        description='Name of cell type column in uploaded metadata (if not selected, cell type identification will be performed)',
                        default='',
                        hint='cell_type',
                        required=False
                    )
                    ],
                '.mtx': [
                        CustomFileField(
                        name='sc_rna_file',
                        label='scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file_pert',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    )]
            }
        ),
        TabField(
            name='sc_data_type',
            label='Data format',
            default='Plain Text',
            description='Choose a format for scRNA-seq data',
            required=True,
            choices= {
                'Plain Text': [
                        CustomFileField(
                        name='sc_rna_file_pert',
                        label='scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/GSE171417_rbk_experimental.tsv.gz',
                        required=False,
                        examples={
                            'GSE171417_rbk_experimental.tsv.gz': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_rbk_experimental.tsv.gz',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file_pert',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/GSE171417_metadata_experimental.tsv',
                        required=False,
                        examples={
                            'GSE171417_metadata_experimental.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_metadata_experimental.tsv',
                        },
                    ),
                    TextField(
                        name='condition_ctrl',
                        label='Cell Type Column (Optional)',
                        description='Name of cell type column in uploaded metadata (if not selected, cell type identification will be performed)',
                        default='',
                        hint='cell_type',
                        required=False
                    ),
                    ],
                '.mtx': [
                        CustomFileField(
                        name='sc_rna_file',
                        label='scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    )]
            }
        )],
        'bulk RNA-seq':[
            CustomFileField(
                name='bulk_expr',
                label='RNA-seq expression',
                description='''
                File should be a tsv/csv of the form:

                    Patient 1 Tumor    Patient 2 Tumor  ...
                ---------------------------------------
                Gene/Protein 1    0                   200       ...
                ---------------------------------------
                Gene/Protein 2    5                   180       ...
                ---------------------------------------
                ...                       ...                    ...        ...
                ''',
                default='data/GSE49155_expression.tsv',
                required=False,
                examples={
                    'GSE49155_expression.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE49155_expression.tsv',
                },
            ),
            CustomFileField(
                        name='bulk_meta_file',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/GSE49155_metadata.tsv',
                        required=False,
                        examples={
                            'GSE49155_metadata.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE49155_metadata.tsv',
                        },
                    ),
                    TextField(
                        name='condition_col',
                        label='Condition Column',
                        description='Name of the condition column in uploaded metadata',
                        default='condition',
                        required=True
                    ),
                    TextField(
                        name='ctrl_condition',
                        label='Control Condition Name',
                        description='Name of the control condition in the condition column in uploaded metadata',
                        default='normal',
                        required=True
                    ),
                    ChoiceField(
                        name='reference',
                        label='Single Cell Reference',
                        description='Choose a single cell reference to use for cell type deconvolution',
                        default='Kidney - Tabula Sapiens (Human)',
                        required=True,
                        choices={
                            "Pancreas - Tabula Sapiens (Human)": "tabula_sapiens_pancreas",
                            "Adipose Tissue - Tabula Sapiens (Human)": "tabula_sapiens_adipose",
                            "Adipose Tissue - Tabula Muris (Mouse)": "tabula_muris_adipose",
                            "Liver - Tabula Sapiens (Human)": "tabula_sapiens_liver",
                            "Blood - Tabula Sapiens (Human)": "tabula_sapiens_blood",
                            "Kidney - Tabula Sapiens (Human)": "tabula_sapiens_kidney",
                            "Kidney - Lake et al. 2023 (Human)": "human_kidney_lake_et_al_2023",
                            "Brain (non-myeloid) - Tabula Muris (Mouse)": "tabula_muris_brain-non-myeloid",
                            "Heart - Tabula Sapiens (Human)": "tabula_sapiens_heart",
                            "Heart - Tabula Muris (Mouse)": "tabula_muris_heart",
                            "Muscle - Tabula Sapiens (Human)": "tabula_sapiens_muscle",
                            "Muscle - Tabula Muris (Mouse)": "tabula_muris_muscle",
                            "Skin - Tabula Sapiens (Human)": "tabula_sapiens_skin",
                            "Lung - Tabula Sapiens (Human)": "tabula_sapiens_lung",
                            "Lung - Tabula Muris (Mouse)": "tabula_muris_lung",
                            "Spleen - Tabula Sapiens (Human)": "tabula_sapiens_spleen",
                            "Spleen - Tabula Muris (Mouse)": "tabula_muris_spleen",
                            "Prostate - Tabula Sapiens (Human)": "tabula_sapiens_prostate",
                            "Lung Cancer Atlas (LuCA) - Salcher et al. 2022 (Human)": "salcher-et-al-lung-cancer",
                            }
                    )
         ]
        }
) %}

#"Pancreas - Tritschler et al. (Mouse)": "https://cellxgene.cziscience.com/collections/0a77d4c0-d5d0-40f0-aa1a-5e1429bcbd7e",
#"Adipose Tissue - Emont et al. (Mouse)": "https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd",
#"Liver - MacParland et al. 2018 (Human)": "https://cellxgene.cziscience.com/collections/bd5230f4-cd76-4d35-9ee5-89b3e7475659",
#"Skeletal Muscle - Domínguez Conde et al. 2022 (Human)": "https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3",
#"Heart - Litviňuková et al. 2020 (Mouse)": "https://cellxgene.cziscience.com/collections/b52eb423-5d0d-4645-b217-e1c6d38b2e72",
#"Retina - Li et al. 2023 (Human)": "https://cellxgene.cziscience.com/collections/4c6eaf5c-6d57-4c76-b1e9-60df8c655f1e",
#"Retina - Cowan et al. 2020 (Mouse)": "https://cellxgene.cziscience.com/collections/2f4c738f-e2f3-4553-9db2-0582a38ea4dc",
#"Breast - Reed et al. 2024 (Human)": "https://cellxgene.cziscience.com/collections/48259aa8-f168-4bf5-b797-af8e88da6637",


# Form field: neighborhood size; later cells pass it to sc.pp.neighbors(n_neighbors=...).
{% set n_neighbors = IntField(
    name='n_neighbors',
    label='N neighbors', 
    default=15, 
    min=2,
    max=100,
    step=1,
    description='The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation.', 
    section='primary'
)
%}

# Form field: UMAP minimum distance; later cells pass it to sc.tl.umap(min_dist=...).
{% set min_dist = FloatField(
    name='min_dist',
    label='Min Distance', 
    default=0.01, 
    min=0.00001,
    max=0.9,
    step=0.00001,
    description='The effective minimum distance between embedded points.', 
    section='primary'
)
%}

# Form field: clustering resolution; later cells pass it to sc.tl.leiden(resolution=...).
{% set resolution = FloatField(
    name='resolution',
    label='Resolution', 
    default=1, 
    min=0.1,
    max=2,
    step=0.1,
    description='A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters.', 
    section='primary'
)
%}

# Form field: species; later cells use it to pick the NCBI gene table and the PanglaoDB marker columns.
{% set species = ChoiceField(
    name='species',
    label='Species', 
    default='mouse',
    choices=['human', 'mouse'],
    description='Choose a species', 
    required=True,
    section='primary'
)
%}


{% set membrane_screener_list = 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/surfaceome.csv' %}


In [9]:
from helpers import *
import os
import re
import random
import qnorm
import shutil
from string import ascii_uppercase
from tqdm import tqdm
import numpy as np
import pandas as pd
import scanpy as sc
import decoupler as dc

from matplotlib import cm
from matplotlib.patches import Rectangle
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
%matplotlib inline

import networkx as nx

from IPython.display import HTML, display, Markdown, FileLink
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import ttest_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
from maayanlab_bioinformatics.api import enrichr_link_from_genes
from statsmodels.stats.multitest import multipletests

import warnings

# Keep third-party warnings out of the rendered report.
warnings.filterwarnings("ignore")

# Scanpy defaults for every plot in this notebook: 300-dpi frameless figures, silent logging.
sc.settings.set_figure_params(dpi=300, frameon=False)
sc.settings.verbosity = 0

# Output folders that later cells write tables and images into.
os.makedirs('figures', exist_ok=True)
os.makedirs('results', exist_ok=True)

# Running counters for captions. Figures start at 2 because "Fig. 1" is the
# workflow diagram, whose caption is hard-coded in a later cell.
fig_counter, table_counter, letter_counter = 2, 1, 0
discussion_results = {}

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest=os.devnull):
    ''' Temporarily redirect ``sys.stdout`` and/or ``sys.stderr`` to a file.

    Usage:
    with suppress_output():
        print('hi')

    Parameters:
        stdout: when true, ``sys.stdout`` is redirected for the duration of the block.
        stderr: when true, ``sys.stderr`` is redirected for the duration of the block.
        dest:   path of the file that receives the redirected text; it is opened in
                append mode (defaults to ``os.devnull``, i.e. the text is discarded).

    The original streams are restored when the block exits, including when it
    exits through an exception, and the destination file is closed afterwards
    (previously the handle was left open).
    '''
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    # The `with` guarantees the sink is flushed and closed once the streams are restored.
    with open(dest, 'a') as sink:
        if stdout:
            sys.stdout = sink
        if stderr:
            sys.stderr = sink
        try:
            yield
        finally:
            # Restore before the sink is closed so nothing ever writes to a closed file.
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

In [None]:
# Render the report's abstract. The text is user-facing prose; line breaks inside the
# string are only soft breaks in Markdown.
display(Markdown('## Abstract'))
abstract_text = '''Single cell RNA-seq data enables the profiling of gene expression in individual cells. In disease states, cell type 
proportions and functions are often altered. By utilizing two single cell profiles from a control and experimental condition,
TargetRanger can be used to identify highly expressed cell-surface proteins on diseased cell types that are lowly expressed on the healthy cell type and in healthy cells and tissues.
Additionally, we can use L1000 data to identify compounds most likely to push diseased cell types to a healthy phenotype. SC2Targets also enables this analysis on bulk RNA-seq data, 
identifying cell-type specific expression for each bulk sample using a single cell reference and the deconvolution algorithm, BayesPrism. Overall, SC2Targets enables the 
identification of cell-type specific targets for targeted cell removal and reverser compounds to treat disease. '''

display(Markdown(abstract_text))

In [11]:
#load the buttons

## Methods

In [12]:
## TODO

In [None]:
%%appyter markdown
<img src='https://minio.dev.maayanlab.cloud/sc2targets/scRNAseq2TargetsWorkflow.png' alt="scRNAseq2Targets Workflow Diagram">

In [None]:
# Caption for the workflow diagram embedded by the preceding markdown cell. It is
# hard-coded as "Fig. 1"; the running fig_counter used for later captions starts at 2.
display(Markdown("__Fig. 1__ Workflow of the scRNAseq2Targets pipeline. The pipeline starts with two single cell profiles from a control and experimental condition or a bulk RNA-seq dataset with control and perturbation conditions along with a tissue-specific single cell reference. Then it utilizes L1000 data to identify compounds most likely to push diseased cell types to a healthy phenotype as well as identifying cell-type specific membrane targets for targeted cell removal."))

In [None]:
%%appyter code_exec
#load the data
{% if data_type.raw_value == 'scRNA-seq' %}

{% if data_type.value[0].raw_value == 'Plain Text' %}
sc_data_ctrl_file = {{ data_type.value[0].value[0] }}
sc_metadata_ctrl_file = {{ data_type.value[0].value[1] }}
sc_data_ctrl = read_sc_data(sc_data_ctrl_file, sc_metadata_ctrl_file, 'plain')
cell_type_col = {{ data_type.value[1].value[2] }}
{% elif data_type.value[1].raw_value == '.mtx' %}
## TODO IMPLEMENT .MTX FILE LOADING
{% endif %}

{% if data_type.value[1].raw_value == 'Plain Text' %}
sc_data_pert_file = {{ data_type.value[1].value[0] }}
sc_metadata_pert_file = {{ data_type.value[1].value[1] }}
sc_data_pert = read_sc_data(sc_data_pert_file, sc_metadata_pert_file, 'plain')

cell_type_col = {{ data_type.value[1].value[2] }}
{% elif data_type.value[1].raw_value == '.mtx' %}
## TODO IMPLEMENT .MTX FILE LOADING
{% endif %}

{% elif data_type.raw_value == 'bulk RNA-seq' %}

bulk_expr_df = read_bulk_data('{{ data_type.value[0].value }}')
metadata_df = read_bulk_data('{{ data_type.value[1].value }}')
condition_col = '{{ data_type.value[2].value }}'
control_cond = '{{ data_type.value[3].value }}'
sc_ref = '{{ data_type.value[4].value }}'

{% endif %}

In [None]:
%%appyter code_exec
# Materialize the form's analysis parameters as plain Python variables for the cells below.
n_neighbors = {{ n_neighbors.raw_value }}
min_dist = {{ min_dist.raw_value }}
resolution = {{ resolution.raw_value }}
# The species value is rendered inside quotes, so it becomes a Python string.
species = '{{ species.raw_value }}'

In [None]:
%%appyter code_exec
try:
    lookup = ncbi_genes_lookup(organism='Mammalia/Homo_sapiens' if species == 'human' else 'Mammalia/Mus_musculus')
except:
    import urllib.request, json 
    with urllib.request.urlopen("https://s3.amazonaws.com/multiomics2paper/public/ncbi_genes_disambiguated.json") as url:
        ncbi_genes_disambiguated = json.load(url)
        lookup = ncbi_genes_disambiguated.get
{% if data_type.raw_value == 'bulk RNA-seq' %}
mamamla_info = pd.read_csv('Mammalia/Homo_sapiens.gene_info.tsv', sep='\t', index_col=0)
syns = list(s.split('|') for s in mamamla_info['dbXrefs'])
convert_dict = {}
for syn, symbol in zip(syns, list(mamamla_info['Symbol'])):
    if len(syn) >= 3 and syn[2].startswith('Ensembl:'):
        convert_dict[symbol] = syn[2].split(':')[1]

bulk_expr_df.index = bulk_expr_df.index.map(lambda x: lookup(x) if lookup(x) else x)
{% endif %}

In [18]:
%%appyter markdown
{% if data_type.raw_value == 'bulk RNA-seq' %}
# Visualize Bulk Samples
Utilize UMAP to visualize quantile- and log-normalized bulk expression data colored by experimental condition.
{% endif %}

In [19]:
%%appyter code_exec
{% if data_type.raw_value == 'bulk RNA-seq' %}

# Visualize the bulk samples with UMAP. An AnnData object is used only for convenience.
# The normalized matrix is transposed so that samples are observations and genes are variables.
adata = sc.AnnData(log2_normalize(qnorm.quantile_normalize(bulk_expr_df)).T.values)
adata.var['gene_names'] = bulk_expr_df.index.values
adata.obs['samples'] = bulk_expr_df.columns.values
# Look up each sample's condition in the metadata table (metadata_df is indexed by sample here).
adata.obs['condition'] = adata.obs['samples'].map(lambda x: metadata_df.loc[x, condition_col])

# Two principal components feed the neighbor graph, which UMAP and Leiden then use.
sc.pp.pca(adata, n_comps=2)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=2)
sc.tl.umap(adata, min_dist=min_dist)
# NOTE(review): the Leiden clusters are computed but not plotted in this cell.
sc.tl.leiden(adata, resolution=resolution)

# Same plot twice: the first call saves a PNG, the second saves an SVG with show=False.
sc.pl.umap(adata, color='condition', legend_fontweight='normal', legend_fontsize=12, size=100, save=f'_bulk_data.png')
sc.pl.umap(adata, color='condition', legend_fontweight='normal', legend_fontsize=12, size=100, save=f'_bulk_data.svg', show=False)

# Caption plus download links; the paths assume scanpy wrote the files as figures/umap<save suffix>.
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of quantile and log-normalized bulk data expression data colored by experimental condition.'))
display(FileLink('figures/umap_bulk_data.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_bulk_data.svg', result_html_prefix='Download SVG: '))
fig_counter += 1
{% endif %}

In [20]:
%%appyter markdown
{% if data_type.raw_value == 'bulk RNA-seq' %}
# Deconvolution with InstaPrism
InstaPrism [TODO: CITATION] is a deconvolution tool based on the BayesPrism algorithm [TODO: CITATION]. It utilizes a single cell RNA-seq cell type reference for deconvolution into cell type fractions for each bulk sample and also produces cell type specific estimated expression for each bulk sample.
{% endif %}

In [21]:
%%appyter code_exec
{% if data_type.raw_value == 'bulk RNA-seq' %}
# Perform deconvolution with InstaPrism.
# Arguments of deconvolution_insta_prism, in order:
# ref_url: str, bulk_expr: pd.DataFrame, output_dir: str, convert_dict: dict
ref_url = f'https://minio.dev.maayanlab.cloud/sc2targets/references/{sc_ref}.rds'
bulk_expr_norm = log2_normalize(qnorm.quantile_normalize(bulk_expr_df))
# stdout/stderr are silenced while the helper runs.
# NOTE(review): later cells treat estimated_frac_df as indexed by sample and cell_type_dfs
# as a dict of per-cell-type frames - confirm against the helper's definition.
with suppress_output():
    estimated_frac_df, cell_type_dfs = deconvolution_insta_prism(ref_url, bulk_expr_norm, 'results', convert_dict)
# Bundle everything currently in results/ into deconvolution_res.zip and offer it for download.
shutil.make_archive('deconvolution_res', 'zip', 'results')
display(FileLink('deconvolution_res.zip', result_html_prefix='Download deconvolution results: '))
{% endif %}

In [22]:
%%appyter code_exec
{% if data_type.raw_value == 'bulk RNA-seq' %}
# visualize cell type fractions per condition
frac_df_viz = estimated_frac_df.copy()
frac_df_viz['condition'] = frac_df_viz.index.map(lambda x: metadata_df.loc[x, condition_col])
frac_df_viz.reset_index(inplace=True)
frac_df_viz.rename(columns={'index': 'sample'}, inplace=True)
df_melted = frac_df_viz.melt(id_vars=['sample', 'condition'], var_name='cell_type', value_name='fraction')

plt.figure(figsize=(12, 8))
sns.boxplot(data=df_melted, x='condition', y='fraction', hue='cell_type', palette='Set2')
plt.xlabel('Condition')
plt.ylabel('Fraction')
plt.legend(title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('figures/cell_type_fractions.svg', dpi=300)
plt.savefig('figures/cell_type_fractions.png', dpi=300)
plt.show()

display(Markdown(f'__Fig. {fig_counter}__. Cell type fractions per condition computed using the selected cell type reference and InstaPrism.'))
display(FileLink('figures/cell_type_fractions.svg', result_html_prefix='Download SVG: '))
display(FileLink('figures/cell_type_fractions.png', result_html_prefix='Download PNG: '))
fig_counter += 1
{% endif %}

In [23]:
%%appyter code_exec
{% if data_type.raw_value == 'bulk RNA-seq' %}

pivot_df = df_melted.groupby(['condition', 'cell_type']).mean('fraction').reset_index().pivot(index='cell_type', columns='condition', values='fraction')

pert_conditions = list(set([c for c in metadata_df[condition_col].values if c != control_cond]))
# Calculate the difference
for pert in pert_conditions:
    if 'change' in pivot_df.columns:
        pivot_df['change'] = pivot_df[pert] - pivot_df[control_cond] + pivot_df['change']
    else:
        pivot_df['change'] = pivot_df[pert] - pivot_df[control_cond]

# Get the absolute change and sort
pivot_df['abs_change'] = pivot_df['change'].abs()
most_changed_df = pivot_df.sort_values(by='abs_change', ascending=False)

# Display the results
top_5 = most_changed_df.index.values[:5]
display(most_changed_df[['change', 'abs_change']])
display(Markdown(f'__Table. {table_counter}__. Cell types ranked by change across control and perturbed conditions.'))
table_counter += 1
{% endif %}

In [24]:
%%appyter code_exec
{% if data_type.raw_value == 'bulk RNA-seq' %}

to_merge = []
for ct in cell_type_dfs:
    ct_name = ct.split('_Z')[0]
    if ct_name not in top_5:
        continue
    ct_df = cell_type_dfs[ct]
    ct_df.index =  ct_df.index.map(lambda x: f"{x}_{ct_name}_{metadata_df.loc[x, condition_col]}" if x in metadata_df.index else x)
    ct_df = ct_df.T 
    ct_df.index = ct_df.index.map(lambda x: lookup(x) if lookup(x) else x)
    to_merge.append(ct_df)
bulk_ct_df = pd.concat(to_merge, axis=1)
bulk_ct_df

adata = sc.AnnData(log2_normalize(qnorm.quantile_normalize(bulk_ct_df)).T.values)
adata.var['gene_names'] = bulk_ct_df.index.values
adata.obs['samples'] = bulk_ct_df.columns.map(lambda x: x.split('_')[0])
adata.obs['cell type'] = bulk_ct_df.columns.map(lambda x: x.split('_')[1])
adata.obs['condition'] = bulk_ct_df.columns.map(lambda x: x.split('_')[2])

sc.pp.pca(adata, n_comps=2)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=2)
sc.tl.umap(adata, min_dist=min_dist)
sc.tl.leiden(adata, resolution=resolution)

sc.pl.umap(adata, color='condition', legend_fontweight='normal', legend_fontsize=12, size=100, save='_condition_cell_type.png')
sc.pl.umap(adata, color='condition', legend_fontweight='normal', legend_fontsize=12, size=100, save='_condition_cell_type.svg', show=False)
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of quantile and log-normalized bulk data expression data colored by experimental condition.'))
display(FileLink('figures/umap_condition_cell_type.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_condition_cell_type.svg', result_html_prefix='Download SVG: '))
fig_counter += 1

sc.pl.umap(adata, color='cell type', legend_fontweight='normal', legend_fontsize=12, size=100, save='_cell_type.png')
sc.pl.umap(adata, color='cell type', legend_fontweight='normal', legend_fontsize=12, size=100, save='_cell_type.svg', show=False)
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of quantile and log-normalized bulk data expression data colored by cell type.'))
display(FileLink('figures/umap_cell_type.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_cell_type.svg', result_html_prefix='Download SVG: '))
fig_counter += 1

{% endif %}

In [None]:
%%appyter markdown
{% if data_type.raw_value == 'scRNA-seq' %}
# Cell Type Identification
To identify cell types, cells in each profile are first embedded with UMAP [TODO: CITE] and clustered with the Leiden algorithm [TODO: CITE]. Canonical cell type markers are sourced from PanglaoDB [TODO: CITE] and are used for enrichment analysis and visualization of inferred cell types using decoupleR [TODO:CITE 36699385].
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
# Preprocess the control profile and plot its UMAP colored by Leiden cluster.
# NOTE(review): this runs whenever the scRNA-seq path is selected; cell_type_col is not
# consulted here, so clustering happens even when a cell type column was provided.
# NOTE(review): normalize_sc_data presumably also computes the UMAP and 'leiden' labels plotted below - confirm in helpers.
sc_data_ctrl = normalize_sc_data(sc_data_ctrl, n_neighbors, min_dist, resolution)
# Same plot twice: the first call saves a PNG, the second saves an SVG with show=False.
sc.pl.umap(sc_data_ctrl, color='leiden', title='scRNA-seq control profile UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=12, save='_leiden_control.png')
sc.pl.umap(sc_data_ctrl, color='leiden', title='scRNA-seq control profile UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=12, save='_leiden_control.svg', show=False)
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of scRNA-seq control data colored by leiden cluster.'))
display(FileLink('figures/umap_leiden_control.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_leiden_control.svg', result_html_prefix='Download SVG: '))
fig_counter += 1
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
# Preprocess the perturbation profile and plot its UMAP colored by Leiden cluster.
# NOTE(review): this runs whenever the scRNA-seq path is selected; cell_type_col is not
# consulted here, so clustering happens even when a cell type column was provided.
# NOTE(review): normalize_sc_data presumably also computes the UMAP and 'leiden' labels plotted below - confirm in helpers.
sc_data_pert = normalize_sc_data(sc_data_pert, n_neighbors, min_dist, resolution)
# Same plot twice: the first call saves a PNG, the second saves an SVG with show=False.
sc.pl.umap(sc_data_pert, color='leiden', title='scRNA-seq perturbation profile UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=12, save='_leiden_pert.png')
sc.pl.umap(sc_data_pert, color='leiden', title='scRNA-seq perturbation profile UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=12, save='_leiden_pert.svg', show=False)
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of scRNA-seq perturbation data colored by leiden cluster.'))
display(FileLink('figures/umap_leiden_pert.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_leiden_pert.svg', result_html_prefix='Download SVG: '))
fig_counter += 1
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
markers = pd.read_csv('https://minio.dev.maayanlab.cloud/sc2targets/PanglaoDB_markers_27_Mar_2020.tsv', sep='\t')
# Flag which species each marker row applies to; na=False makes a missing
# species value count as "not this species" instead of propagating NaN into the mask.
markers['human'] = markers['species'].str.contains('Hs', na=False)
markers['mouse'] = markers['species'].str.contains('Mm', na=False)
# Keep canonical markers for the selected species whose sensitivity exceeds 0.5,
# one row per (cell type, gene symbol). Built as a single chain so the result is a
# fresh frame (no SettingWithCopy warning when the symbol column is rewritten below).
keep_marker = markers[species] & (markers['canonical marker'] == 1) & (markers[f'sensitivity_{species}'] > 0.5)
markers = (
    markers[keep_marker]
    .drop_duplicates(['cell type', 'official gene symbol'])
    .rename(columns={'cell type': 'cell_type', 'official gene symbol': 'genesymbol'})
)
if species == 'mouse':
    # Rewrite symbols as first letter upper-case, remainder lower-case.
    markers['genesymbol'] = markers['genesymbol'].str.capitalize()

for adata in (sc_data_pert, sc_data_ctrl):
    # Index genes by the 'gene_names' column before enrichment. Guarded so that
    # re-running this cell does not fail once the column has already become the index.
    if 'gene_names' in adata.var.columns:
        adata.var.set_index('gene_names', inplace=True)

    # Over-representation analysis of the marker sets on this profile; the next
    # cell reads the result from obsm under 'ora_estimate'.
    dc.run_ora(
        mat=adata,
        net=markers,
        source='cell_type',
        target='genesymbol',
        min_n=3,
        verbose=True,
        use_raw=False
    )
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
# How many of the first-listed cell types per Leiden cluster are kept for the matrix plots.
n_ctypes = 3
rank_kwargs = dict(groupby='leiden', reference='rest', method='t-test_overestim_var')

# Pull the ORA estimates out of each profile, then rank cell types within every cluster against the rest.
acts_ctrl = dc.get_acts(sc_data_ctrl, obsm_key='ora_estimate')
acts_pert = dc.get_acts(sc_data_pert, obsm_key='ora_estimate')
sc_ctrl_cell_types = dc.rank_sources_groups(acts_ctrl, **rank_kwargs)
sc_pert_cell_types = dc.rank_sources_groups(acts_pert, **rank_kwargs)

# {cluster: [first n_ctypes cell type names]} for each profile.
ctypes_dict_ctrl = (
    sc_ctrl_cell_types.groupby('group').head(n_ctypes)
    .groupby('group')['names'].apply(list).to_dict()
)
ctypes_dict_pert = (
    sc_pert_cell_types.groupby('group').head(n_ctypes)
    .groupby('group')['names'].apply(list).to_dict()
)
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}

# Cluster-by-cell-type enrichment matrix for the control profile, exported as PNG and SVG.
matrixplot_kwargs = dict(dendrogram=True, standard_scale='var',
                         colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.matrixplot(acts_ctrl, ctypes_dict_ctrl, 'leiden', save='cell_types_ctrl.png', **matrixplot_kwargs)
# Only the SVG export passes show=False.
sc.pl.matrixplot(acts_ctrl, ctypes_dict_ctrl, 'leiden', save='cell_types_ctrl.svg', show=False, **matrixplot_kwargs)
display(Markdown(f'__Fig. {fig_counter}__. Matrix visualization of scRNA-seq leiden cluster cell type enrichments for the control profile.'))
display(FileLink('figures/matrixplot_cell_types_ctrl.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/matrixplot_cell_types_ctrl.svg', result_html_prefix='Download SVG: '))
fig_counter = fig_counter + 1
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}

# Cluster-by-cell-type enrichment matrix for the perturbation profile, exported as PNG and SVG
# (only the SVG export passes show=False).
sc.pl.matrixplot(acts_pert, ctypes_dict_pert, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r', save='cell_types_pert.png')
sc.pl.matrixplot(acts_pert, ctypes_dict_pert, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r', save='cell_types_pert.svg', show=False)
display(Markdown(f'__Fig. {fig_counter}__. Matrix visualization of scRNA-seq leiden cluster cell type enrichments for the perturbation profile.'))
display(FileLink('figures/matrixplot_cell_types_pert.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/matrixplot_cell_types_pert.svg', result_html_prefix='Download SVG: '))
# Advance the figure number like every other figure cell, so the next figure does not reuse this one.
fig_counter += 1

{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}

# Map every Leiden cluster to the first cell type listed for it in the ranking table
# (presumably the top-ranked one; confirm against rank_sources_groups' ordering).
annotation_dict_ctrl = sc_ctrl_cell_types.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict_pert = sc_pert_cell_types.groupby('group').head(1).set_index('group')['names'].to_dict()
# Propagate the cluster-level label to each cell.
sc_data_ctrl.obs['cell_type'] = [annotation_dict_ctrl[clust] for clust in sc_data_ctrl.obs['leiden']]
sc_data_pert.obs['cell_type'] = [annotation_dict_pert[clust] for clust in sc_data_pert.obs['leiden']]
# Cell types found in both profiles drive all downstream comparisons. sorted() fixes the
# order: iterating a set of strings can change between kernel restarts, which would
# reshuffle the sample columns and heatmap columns built from this list.
overlapping_cell_types = sorted(set(annotation_dict_ctrl.values()) & set(annotation_dict_pert.values()))
display(Markdown(f"Overlapping cell types for downstream analysis: {', '.join(overlapping_cell_types)}"))
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
# Control UMAP colored by the assigned cell type: one export per format, only the SVG one passes show=False.
for _save_suffix, _extra_kwargs in (('_cell_types_ctrl.png', {}), ('_cell_types_ctrl.svg', {'show': False})):
    sc.pl.umap(sc_data_ctrl, color='cell_type', save=_save_suffix, **_extra_kwargs)
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of scRNA-seq cell types for the control profile.'))
display(FileLink('figures/umap_cell_types_ctrl.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_cell_types_ctrl.svg', result_html_prefix='Download SVG: '))
fig_counter = fig_counter + 1

{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
# Perturbation UMAP colored by the assigned cell type: one export per format, only the SVG one passes show=False.
for _save_suffix, _extra_kwargs in (('_cell_types_pert.png', {}), ('_cell_types_pert.svg', {'show': False})):
    sc.pl.umap(sc_data_pert, color='cell_type', save=_save_suffix, **_extra_kwargs)
display(Markdown(f'__Fig. {fig_counter}__. UMAP visualization of scRNA-seq cell types for the perturbation profile.'))
display(FileLink('figures/umap_cell_types_pert.png', result_html_prefix='Download PNG: '))
display(FileLink('figures/umap_cell_types_pert.svg', result_html_prefix='Download SVG: '))
fig_counter = fig_counter + 1

{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}

def _pseudobulk_replicates(adata, cell_type, prefix, n_reps=3):
    """Sum random subsets of one cell type's cells into pseudo-bulk profiles.

    Args:
        adata: object exposing ``X`` (indexed here as cells x genes) and ``obs['cell_type']``.
        cell_type: label selecting the cells to sample from.
        prefix: sample-name prefix; names are ``f'{prefix}_{cell_type}_{k}'`` with k starting at 1.
        n_reps: number of replicates to draw.

    Returns:
        (profiles, names): a list of 1-D per-gene sums and the matching sample names.

    Each replicate sums ``n_cells // n_reps`` cells drawn without replacement (at least one
    cell whenever any exist, so small clusters do not yield all-zero profiles). Replicates
    are drawn independently, so they may share cells. Draws use the global NumPy RNG, so
    any seeding done elsewhere in the notebook still applies.
    """
    in_cell_type = (adata.obs['cell_type'] == cell_type).to_numpy()
    ct_cells = adata.X[in_cell_type]
    n_cells = ct_cells.shape[0]
    sample_size = min(n_cells, max(1, n_cells // n_reps))
    profiles, names = [], []
    for rep in range(n_reps):
        rand_idx = np.random.choice(n_cells, sample_size, replace=False)
        # np.asarray(...).ravel() flattens the 2-D result a sparse matrix sum returns,
        # and is a no-op for a dense 1-D sum.
        profiles.append(np.asarray(ct_cells[rand_idx].sum(axis=0)).ravel())
        names.append(f'{prefix}_{cell_type}_{rep + 1}')
    return profiles, names


## get three representative samples per overlapping cell type
data_pert, sample_names_pert = [], []
data_ctrl, sample_names_ctrl = [], []

for ct in overlapping_cell_types:
    # Perturbation replicates are drawn before control ones for every cell type.
    ct_profiles, ct_names = _pseudobulk_replicates(sc_data_pert, ct, 'pert')
    data_pert.extend(ct_profiles)
    sample_names_pert.extend(ct_names)

    ct_profiles, ct_names = _pseudobulk_replicates(sc_data_ctrl, ct, 'ctrl')
    data_ctrl.extend(ct_profiles)
    sample_names_ctrl.extend(ct_names)

# Genes as rows, simulated samples as columns; the merge keeps genes present in both profiles.
sim_bulk_pert = pd.DataFrame(data=data_pert, index=sample_names_pert, columns=sc_data_pert.var_names).T
sim_bulk_ctrl = pd.DataFrame(data=data_ctrl, index=sample_names_ctrl, columns=sc_data_ctrl.var_names).T
bulk_ct_df = pd.merge(sim_bulk_pert, sim_bulk_ctrl, left_index=True, right_index=True)
bulk_ct_df.to_csv('results/bulk_ct_df.csv')
display(bulk_ct_df)
display(Markdown(f'__Table. {table_counter}__. Three representative samples per overlapping cell type in the control and perturbation profiles.'))
display(FileLink('results/bulk_ct_df.csv', result_html_prefix='Download CSV: '))
table_counter += 1
{% endif %}

In [36]:
# Accumulates the targets found per cell type and background; filled in by the target-finding cells.
targets = {}
# `bgs` maps a background's display name to the statistics file the target-finding functions load;
# `lookup` is the species-specific gene lookup they apply to the background's gene identifiers.
if species != 'mouse':
    lookup = ncbi_genes_lookup()
    bgs = {
        'GTEx': 'gtex-gene-stats.tsv',
        'ARCHS4': 'archs4-gene-stats.tsv',
        'TS': 'ts_10x_cell-ontology-class_donors_tissue-labels_v1.tsv',
    }
else:
    lookup = ncbi_genes_lookup(organism='Mammalia/Mus_musculus')
    bgs = {
        'ARCHS4': 'Mammalia/Mus_musculus/archs4-gene-stats.tsv',
        'Tabula Muris': 'Mammalia/Mus_musculus/tabula-muris-gene-stats.tsv',
    }

In [None]:
%%appyter markdown
# Target Identification
To identify highly expressed membrane targets on perturbed cell types, we utilize TargetRanger [TODO: CITE], which compares expression against a collection of healthy tissue and cell type atlases. TargetRanger works by finding genes that are highly expressed in the perturbed cell types but lowly expressed across healthy cell types and tissues. Additionally, we filter out targets that are also identified for the control profile of the same cell type, so that the remaining targets are specific to the diseased cells.

In [38]:
def _normalized_expression_and_background(rna_df, bg):
    """Load background atlas ``bg`` and quantile-normalize it together with ``rna_df``.

    The statistics file named by ``bgs[bg]`` is read, its gene identifiers are passed
    through ``lookup`` (using the part before the first '.'), rows mapping to the same
    gene are collapsed by median, and the '25%', '50%' and '75%' statistics are kept.
    Input and background are restricted to their shared genes and both are
    quantile-normalized to the per-gene median of the background.

    Args:
        rna_df: expression frame with genes as the index and samples as columns.
        bg: key into the module-level ``bgs`` mapping.

    Returns:
        (df_expr_norm, df_bg_expr_norm). The background's two-level column labels
        are flattened to ``'level0, level1'`` strings.

    Raises:
        ValueError: if ``rna_df`` shares no gene with the background.
    """
    df_bg_stats = pd.read_csv(f"https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/{bgs[bg]}", sep='\t', index_col=[0,1])
    df_bg_genes = df_bg_stats.unstack().index.map(lambda idx: lookup(idx.partition('.')[0]))
    df_bg_stats = df_bg_stats.unstack().groupby(df_bg_genes, observed=True).median().stack()
    df_bg_expr = df_bg_stats.loc[(slice(None), ['25%', '50%', '75%']), :].unstack()
    common_index = list(set(rna_df.index) & set(df_bg_expr.index))
    if not common_index:
        raise ValueError(f"No genes are shared between the expression data and the '{bg}' background.")
    # Keep the first row of any repeated gene; works whether or not the index is named.
    expr_df = rna_df.loc[common_index, :]
    expr_df = expr_df[~expr_df.index.duplicated(keep='first')]
    if expr_df.index.name is None:
        expr_df = expr_df.rename_axis('gene_name')
    target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
    df_expr_norm = qnorm.quantile_normalize(expr_df.loc[common_index, :], target=target_distribution)
    df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)
    # Flatten exactly once: joining already-flattened string labels would split them into characters.
    df_bg_expr_norm.columns = df_bg_expr_norm.columns.to_flat_index().map(lambda s: ', '.join(s))
    return df_expr_norm, df_bg_expr_norm


def _pert_specific_up_genes(dge_pert, dge_ctrl, alpha=0.01):
    """Genes significantly up in the perturbed comparison but not in the control one.

    A gene counts as up when ``AdjPval < alpha`` and ``Statistic > 0``. The result is
    ordered by descending perturbed ``Statistic``.
    """
    pert_up = dge_pert[(dge_pert['AdjPval'] < alpha) & (dge_pert['Statistic'] > 0)]
    ctrl_up = dge_ctrl[(dge_ctrl['AdjPval'] < alpha) & (dge_ctrl['Statistic'] > 0)]
    pert_only = pert_up[~pert_up.index.isin(ctrl_up.index)]
    return list(pert_only.sort_values('Statistic', ascending=False).index.values)


def find_targets(rna_df, bg, targets):
    """Fill ``targets[ct][bg]`` for every overlapping cell type (scRNA-seq workflow).

    For each entry of the module-level ``overlapping_cell_types``, the columns of
    ``rna_df`` whose name contains the cell type and 'pert' (resp. 'ctrl') are compared
    against the normalized background ``bg``. Genes up versus the background in the
    perturbed samples are kept unless they are also up in the control samples, so the
    stored list is specific to the perturbed state.

    Args:
        rna_df: genes x samples expression frame.
        bg: key into the module-level ``bgs`` mapping.
        targets: dict updated in place; nothing is returned.

    NOTE(review): samples are matched by substring, so a cell type whose name is
    contained in another's (e.g. 'B cells' / 'B cells naive') also picks up the
    longer one's columns.
    """
    df_expr_norm, df_bg_expr_norm = _normalized_expression_and_background(rna_df, bg)
    for ct in tqdm(overlapping_cell_types):
        targets.setdefault(ct, {})
        ct_pert_samples = [col for col in rna_df.columns if ct in col and 'pert' in col]
        ct_ctrl_samples = [col for col in rna_df.columns if ct in col and 'ctrl' in col]
        with suppress_output():
            dge_pert = ttest_differential_expression(
                df_bg_expr_norm, df_expr_norm[ct_pert_samples],
                log2norm=False,
            )
            dge_ctrl = ttest_differential_expression(
                df_bg_expr_norm, df_expr_norm[ct_ctrl_samples],
                log2norm=False,
            )
        targets[ct][bg] = _pert_specific_up_genes(dge_pert, dge_ctrl)


def find_targets_multiple_pert(rna_df, bg, targets, control_cond, pert_conditions):
    """Fill ``targets[f"{ct}-{pert}"][bg]`` for every cell type and perturbation (bulk workflow).

    Cell types come from the module-level ``top_5``. For each one, the control
    comparison is computed once from the columns containing the cell type and
    ``control_cond``; every perturbation in ``pert_conditions`` is then compared to the
    background and filtered against that control result, as in ``find_targets``.

    Args:
        rna_df: genes x samples expression frame.
        bg: key into the module-level ``bgs`` mapping.
        targets: dict updated in place; nothing is returned.
        control_cond: substring identifying control sample columns.
        pert_conditions: iterable of substrings, one per perturbation condition.
    """
    df_expr_norm, df_bg_expr_norm = _normalized_expression_and_background(rna_df, bg)
    for ct in tqdm(top_5):
        ct_ctrl_samples = [col for col in rna_df.columns if ct in col and control_cond in col]
        with suppress_output():
            dge_ctrl = ttest_differential_expression(
                df_bg_expr_norm, df_expr_norm[ct_ctrl_samples],
                log2norm=False,
            )
            for pert in pert_conditions:
                key = f"{ct}-{pert}"
                targets.setdefault(key, {})
                ct_pert_samples = [col for col in rna_df.columns if ct in col and pert in col]
                dge_pert = ttest_differential_expression(
                    df_bg_expr_norm, df_expr_norm[ct_pert_samples],
                    log2norm=False,
                )
                targets[key][bg] = _pert_specific_up_genes(dge_pert, dge_ctrl)


In [None]:
%%appyter code_exec
# Run target discovery against each background atlas available for the selected species:
# ARCHS4, GTEx and Tabula Sapiens ('TS') for human; ARCHS4 and Tabula Muris for mouse.
# The Jinja branches are resolved when the notebook is rendered, so each Python branch ends up
# with only one call family: find_targets for scRNA-seq, find_targets_multiple_pert for bulk RNA-seq.
# Every call adds its results to the shared `targets` dict.
if species == 'human': 
{% if data_type.raw_value == 'scRNA-seq' %}
    print('Finding Targets using ARCHS4')
    find_targets(bulk_ct_df, 'ARCHS4', targets)  
    print('Finding Targets using GTEx')
    find_targets(bulk_ct_df, 'GTEx', targets)
    print('Finding Targets using Tabula Sapiens')
    find_targets(bulk_ct_df, 'TS', targets)
{% elif data_type.raw_value == 'bulk RNA-seq' %}
    print('Finding Targets using ARCHS4')
    find_targets_multiple_pert(bulk_ct_df, 'ARCHS4', targets, control_cond, pert_conditions)
    print('Finding Targets using GTEx')
    find_targets_multiple_pert(bulk_ct_df, 'GTEx', targets, control_cond, pert_conditions)
    print('Finding Targets using Tabula Sapiens')
    find_targets_multiple_pert(bulk_ct_df, 'TS', targets, control_cond, pert_conditions)
{% endif %}
elif species == 'mouse':
{% if data_type.raw_value == 'scRNA-seq' %}
    print('Finding Targets using ARCHS4')
    find_targets(bulk_ct_df, 'ARCHS4', targets)
    print('Finding Targets using Tabula Muris')
    find_targets(bulk_ct_df, 'Tabula Muris', targets)
{% elif data_type.raw_value == 'bulk RNA-seq' %}
    print('Finding Targets using ARCHS4')
    find_targets_multiple_pert(bulk_ct_df, 'ARCHS4', targets, control_cond, pert_conditions)
    print('Finding Targets using Tabula Muris')
    find_targets_multiple_pert(bulk_ct_df, 'Tabula Muris', targets, control_cond, pert_conditions)
{% endif %}

In [None]:
# Surface-protein gene list used to restrict targets to membrane proteins.
proteins = pd.read_csv('https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/surfaceome.csv')
membrane_proteins = list(proteins['genename'].dropna().values)

if species == 'human':
    sc.set_figure_params(scanpy=False)
    # Eight discrete colors: each background is a bit flag (ARCHS4=1, GTEx=2, TS=4), so the
    # sum stored in a heatmap cell (0-7) identifies the combination of backgrounds that
    # reported the gene; the legend labels below follow that encoding.
    YlGnBu = cm.get_cmap('YlGnBu_r', 8)
    cmap = {"None":YlGnBu(0), "ARCHS4": YlGnBu(1), "GTEx":YlGnBu(2), "TS": YlGnBu(4), "ARCHS4-GTEx":YlGnBu(3),  "ARCHS4-TS": YlGnBu(5), "GTEx-TS": YlGnBu(6), "All": YlGnBu(7)}

    # Collect membrane proteins among the first `top_targets_n` targets of every (cell type, background) list.
    top_targets_n = 100
    target_list = []
    for ct in targets:
        for bg in targets[ct]:
            target_list.extend(list(filter(lambda g: g in membrane_proteins, list(targets[ct][bg])[:top_targets_n])))
    # Too few distinct candidates: widen the window to the first 500 targets per list.
    # The earlier entries stay in the list; duplicates are removed by the set() below.
    if len(set(target_list)) < 100:
        top_targets_n = 500
        for cluster in targets:
            for bg in targets[cluster]:
                target_list.extend(list(filter(lambda g: g in membrane_proteins, list(targets[cluster][bg])[:top_targets_n])))
    
    data1, data2, data3 = [], [], []
    similarity = []
    target_list = list(set(target_list))

    # One row per gene, one entry per key of `targets`: the bit flag of each background whose
    # top-`top_targets_n` list contains the gene.
    for gene in target_list:
        a = [1 if gene in list(targets[c]['ARCHS4'])[:top_targets_n] else 0 for c in list(targets.keys())]
        g = [2 if gene in list(targets[c]['GTEx'])[:top_targets_n] else 0 for c in list(targets.keys())]
        l = [4 if gene in list(targets[c]['TS'])[:top_targets_n] else 0 for c in list(targets.keys())]
        data1.append(a)
        data2.append(g)
        data3.append(l)
        # NOTE(review): `similarity` is not read again in this cell.
        similarity.append(np.dot(np.dot(np.array(a), np.array(g)), np.array(l)))

    # Element-wise sum of the three flag matrices gives the 0-7 combination code per cell.
    data = np.add(np.add(data1, data2), data3)

    membrane_target_mat = pd.DataFrame(data)
    membrane_target_mat.columns = list(targets.keys())
    membrane_target_mat.index = target_list

    # Keep genes whose codes sum to at least 7 across columns (7 is the code for "All" in a
    # single column), then order by that total (descending) and by gene name.
    membrane_target_mat['count'] = membrane_target_mat.sum(axis=1)
    membrane_target_mat = membrane_target_mat[membrane_target_mat['count'] >= 7]
    membrane_target_mat = membrane_target_mat.rename_axis('Membrane Target').sort_values(by = ['count', 'Membrane Target'], ascending = [False, True]).drop('count', axis=1)

    # Row count drives the figure height and the dendrogram ratio below.
    h = membrane_target_mat.shape[0]
    
    g = sns.clustermap(membrane_target_mat, figsize=(4,0.3*h+2*(h<15)), cmap=YlGnBu, cbar_pos=None, dendrogram_ratio=0.1-(h<40)*0.01*(h-30), row_cluster=False, xticklabels=True, yticklabels=True)
    # Zero-size rectangles serve only as colored legend handles, placed on the row-dendrogram axes.
    g.ax_row_dendrogram.legend(handles=[Rectangle((0, 0), 0, 0, color=val, label=key) for key, val in cmap.items()],
                                    title='Background', loc='upper right')
    plt.grid(False)
    plt.savefig('figures/membrane_targets.svg', dpi=300)
    plt.savefig('figures/membrane_targets.png', dpi=300)
    plt.show()
    display(Markdown(f'__Fig. {fig_counter}__. Membrane Targets for perturbed cell types, filtered by genes upregulated in control cell types compared to healthy background atlases.'))
    display(FileLink('figures/membrane_targets.png', result_html_prefix='Download PNG: '))
    display(FileLink('figures/membrane_targets.svg', result_html_prefix='Download SVG: '))
    fig_counter += 1

    
elif species == 'mouse':
    YlGnBu = cm.get_cmap('YlGnBu_r', 4)
    cmap = {"None": YlGnBu(0), "ARCHS4": YlGnBu(1), "Tabula Muris":YlGnBu(2), "Both": YlGnBu(3)}

    top_targets_n = 100
    target_list = []
    for ct in targets:
        for bg in targets[ct]:
            target_list.extend(list(filter(lambda g: g.upper() in membrane_proteins, list(targets[ct][bg]))))
            #target_list.extend(list(targets[ct][bg])[:20])

    data1, data2 = [], []
    similarity = []
    target_list = list(set(target_list))

    for gene in target_list:
        a = [1 if gene in targets[c]['ARCHS4'] else 0 for c in overlapping_cell_types]
        g = [2 if gene in targets[c]['Tabula Muris'] else 0 for c in overlapping_cell_types]
        data1.append(a)
        data2.append(g)
        similarity.append(np.dot(np.array(a), np.array(g)))

    data = np.add(data1, data2)

    membrane_target_mat = pd.DataFrame(data)
    membrane_target_mat.columns = overlapping_cell_types
    membrane_target_mat.index = target_list
    
    membrane_target_mat['count'] = membrane_target_mat.sum(axis=1)
    membrane_target_mat = membrane_target_mat[membrane_target_mat['count'] >= 3]
    
    membrane_target_mat = membrane_target_mat.rename_axis('Membrane Target').sort_values(by = ['count', 'Membrane Target'], ascending = [False, True]).drop('count', axis=1)

    h = membrane_target_mat.shape[0]
    membrane_target_mat = membrane_target_mat.astype(float)
    
    g = sns.clustermap(membrane_target_mat, figsize=(4,0.3*h+2*(h<15)), cmap=YlGnBu, cbar_pos=None, dendrogram_ratio=0.1-(h<40)*0.01*(h-30), row_cluster=False, xticklabels=True, yticklabels=True)
    g.ax_row_dendrogram.legend(handles=[Rectangle((0, 0), 0, 0, color=val, label=key) for key, val in cmap.items()],
                                    title='Background', loc='upper right')
    plt.grid(False)
    plt.savefig('figures/membrane_targets.svg', dpi=300)
    plt.savefig('figures/membrane_targets.png', dpi=300)
    plt.show()
    display(Markdown(f'__Fig. {fig_counter}__. Membrane Targets for perturbed cell types, filtered by genes upregulated in control cell types compared to healthy background atlases.'))
    display(FileLink('figures/membrane_targets.png', result_html_prefix='Download PNG: '))
    display(FileLink('figures/membrane_targets.svg', result_html_prefix='Download SVG: '))
    fig_counter += 1

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
# One SigCom LINCS query per overlapping cell type, built from the control-vs-perturbation
# signature of its simulated bulk samples.
# NOTE(review): unlike the bulk RNA-seq branch, no minimum number of up/down genes is
# required before calling get_sigcom_link; confirm it accepts short or empty lists.
data = []
for ct in overlapping_cell_types:
    ct_pert_samples = [col for col in bulk_ct_df.columns if ct in col and 'pert' in col]
    ct_ctrl_samples = [col for col in bulk_ct_df.columns if ct in col and 'ctrl' in col]
    if not ct_pert_samples or not ct_ctrl_samples:
        continue
    diff_expr_ct = ttest_differential_expression(bulk_ct_df[ct_ctrl_samples], bulk_ct_df[ct_pert_samples], log2norm=False)
    significant = diff_expr_ct[diff_expr_ct['Pval'] < 0.01]
    # Strongest effects first in both directions.
    up_genes = list(significant[significant['Statistic'] > 0].sort_values('Statistic', ascending=False).index.values)
    down_genes = list(significant[significant['Statistic'] < 0].sort_values('Statistic', ascending=True).index.values)
    desc, link = get_sigcom_link({'up_entities': up_genes, 'down_entities': down_genes, 'description': ct})
    data.append([ct, link])

compound_df = pd.DataFrame(data, columns=['cell_type','link']).set_index('cell_type')
compound_df.to_csv('results/sigcom_links.csv')
# The CSV keeps raw URLs; the displayed table renders them as clickable links.
compound_df = compound_df.style.format({'link': lambda url: f'<a href="{url}" rel="noopener noreferrer" target="_blank">SigCom LINCS</a>'})
display(compound_df)
display(Markdown(f'__Table. {table_counter}__. SigCom LINCS analysis links for overlapping cell types in control and perturbation scRNA-seq profiles.'))
display(FileLink('results/sigcom_links.csv', result_html_prefix='Download CSV: '))
table_counter = table_counter + 1
{% elif data_type.raw_value == 'bulk RNA-seq' %}
# One SigCom LINCS query per (cell type, perturbation) pair, comparing the perturbed samples
# to the cell type's control samples.
# NOTE(review): the query description carries only the cell type, so links for different
# perturbations of the same cell type share a description.
data = []
for ct in tqdm(top_5):
    ct_ctrl_samples = [col for col in bulk_ct_df.columns if ct in col and control_cond in col]
    for pert in pert_conditions:
        ct_pert_samples = [col for col in bulk_ct_df.columns if ct in col and pert in col]
        diff_expr_ct = ttest_differential_expression(bulk_ct_df[ct_ctrl_samples], bulk_ct_df[ct_pert_samples], log2norm=False)
        significant = diff_expr_ct[diff_expr_ct['Pval'] < 0.01]
        # Strongest effects first in both directions.
        up_genes = list(significant[significant['Statistic'] > 0].sort_values('Statistic', ascending=False).index.values)
        down_genes = list(significant[significant['Statistic'] < 0].sort_values('Statistic', ascending=True).index.values)
        # Only query when both directions have at least five genes.
        if len(up_genes) < 5 or len(down_genes) < 5:
            continue
        desc, link = get_sigcom_link({'up_entities': up_genes, 'down_entities': down_genes, 'description': ct})
        data.append([ct, pert, link])

compound_df = pd.DataFrame(data, columns=['cell_type', 'perturbation', 'link']).set_index('cell_type')
compound_df.to_csv('results/sigcom_links.csv')
# The CSV keeps raw URLs; the displayed table renders them as clickable links.
compound_df = compound_df.style.format({'link': lambda url: f'<a href="{url}" rel="noopener noreferrer" target="_blank">SigCom LINCS</a>'})
display(compound_df)
display(Markdown(f'__Table. {table_counter}__. SigCom LINCS analysis links for top 5 changed perturbed cell types.'))
display(FileLink('results/sigcom_links.csv', result_html_prefix='Download CSV: '))
table_counter = table_counter + 1
{% endif %}

# References

In [None]:
%%appyter markdown

TODO: ADD ORDERING/ change based on species and uploaded data type


Butler A, Hoffman P, Smibert P, Papalexi E, Satija R. Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nat Biotechnol. 2018 Jun;36(5):411-420. doi: 10.1038/nbt.4096. 

Hu M, Chikina M. InstaPrism: an R package for fast implementation of BayesPrism. Bioinformatics. 2024 Jul 1;40(7):btae440.

Chu T, Wang Z, Pe'er D, Danko CG. Cell type and gene expression deconvolution with BayesPrism enables Bayesian integrative analysis across bulk and single-cell RNA sequencing in oncology. Nat Cancer. 2022 Apr;3(4):505-517.

Marino GB, Ngai M, Clarke DJB, Fleishman RH, Deng EZ, Xie Z, Ahmed N, Ma'ayan A. GeneRanger and TargetRanger: processed gene and protein expression levels across cells and tissues for target discovery. Nucleic Acids Res. 2023 Jul 5;51(W1):W213-W224.

Evangelista JE, Clarke DJB, Xie Z, Lachmann A, Jeon M, Chen K, Jagodnik KM, Jenkins SL, Kuleshov MV, Wojciechowicz ML, Schürer SC, Medvedovic M, Ma'ayan A. SigCom LINCS: data and metadata search engine for a million gene expression signatures. Nucleic Acids Res. 2022 Jul 5;50(W1):W697-W709.

Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nat Commun. 2018 Apr 10;9(1):1366.

GTEx Consortium. The Genotype-Tissue Expression (GTEx) project. Nat Genet. 2013 Jun;45(6):580-5. doi: 10.1038/ng.2653. 

Tabula Sapiens Consortium. The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans. Science. 2022 May 13;376(6594):eabl4896. doi: 10.1126/science.abl4896.

Tabula Muris Consortium. Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris. Nature. 2018 Oct;562(7727):367-372. doi: 10.1038/s41586-018-0590-4. 
