In [1]:
#%%appyter init
# Initialise appyter's notebook integration; the lambda gives appyter a callable that
# returns this notebook's globals() (needed by the %%appyter cells below).
from appyter import magic
magic.init(lambda _=globals: _())

In [115]:
%%appyter hide_code

{% do SectionField(
    name='primary',
    title='Upload Tumor Expression',
    img='upload.png'
) %}

{% set data_type = TabField(
    name='data_type',
    label='Data Type',
    default='scRNA-seq',
    description='Start with either scRNA-seq or bulk RNA-seq data. If bulk RNA-seq data is selected we will utilize pre-curated reference matrices to identify cell type-specific expression vectors.',
    required=True,
    section='primary',
    choices={
        'scRNA-seq': [
            TabField(
            name='sc_data_type_ctrl',
            label='Data Format (Control Profile)',
            default='Plain Text',
            description='Choose a format for scRNA-seq data',
            required=True,
            choices= {
                'Plain Text': [
                        CustomFileField(
                        name='sc_rna_file_ctrl',
                        label='control scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/GSE171417_rbk_control.tsv.gz',
                        required=False,
                        examples={
                            'GSE171417_rbk_control.tsv.gz': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_rbk_control.tsv.gz',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file_ctrl',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/GSE171417_metadata_control.tsv',
                        required=False,
                        examples={
                            'GSE171417_metadata_control.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_metadata_control.tsv',
                        },
                    ),
                    TextField(
                        name='cell_type_col_ctrl',
                        label='Cell Type Column (Optional)',
                        description='Name of cell type column in uploaded metadata (if not selected, cell type identification will be performed)',
                        default='',
                        hint='cell_type',
                        required=False
                    )
                    ],
                '.mtx': [
                        CustomFileField(
                        name='sc_rna_file',
                        label='scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file_pert',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    )]
            }
        ),
        TabField(
            name='sc_data_type',
            label='Data format',
            default='Plain Text',
            description='Choose a format for scRNA-seq data',
            required=True,
            choices= {
                'Plain Text': [
                        CustomFileField(
                        name='sc_rna_file_pert',
                        label='scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/GSE171417_rbk_experimental.tsv.gz',
                        required=False,
                        examples={
                            'GSE171417_rbk_experimental.tsv.gz': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_rbk_experimental.tsv.gz',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file_pert',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/GSE171417_metadata_experimental.tsv',
                        required=False,
                        examples={
                            'GSE171417_metadata_experimental.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE171417_metadata_experimental.tsv',
                        },
                    ),
                    TextField(
                        name='condition_ctrl',
                        label='Cell Type Column (Optional)',
                        description='Name of the condition',
                        default='',
                        hint='cell_type',
                        required=False
                    ),
                    ],
                '.mtx': [
                        CustomFileField(
                        name='sc_rna_file',
                        label='scRNA-seq',
                        description='''
    File should be a tsv/csv of the form:

                                        cell 1   cell 2    ...
                    ------------------------------
            Gene/Protein 1    0          2       ...
                    ------------------------------
            Gene/Protein 2    1         1        ...
                    ------------------------------
                        ...                       ... 
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    ),
                    CustomFileField(
                        name='sc_meta_file',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/sarcoma-v2-proteome-SpectrumMill-ratio-QCfilter-NArm.tsv',
                        required=False,
                        examples={
                            'BR_proteomics_tumor.tsv': 'https://s3.amazonaws.com/multiomics2paper/public/BR_proteomics_tumor.tsv',
                        },
                    )]
            }
        )],
        'bulk RNA-seq':[
            CustomFileField(
                name='bulk_expr',
                label='RNA-seq expression',
                description='''
                File should be a tsv/csv of the form:

                    Patient 1 Tumor    Patient 2 Tumor  ...
                ---------------------------------------
                Gene/Protein 1    0                   200       ...
                ---------------------------------------
                Gene/Protein 2    5                   180       ...
                ---------------------------------------
                ...                       ...                    ...        ...
                ''',
                default='data/GSE144441_Expression.tsv',
                required=False,
                examples={
                    'GSE144441_Expression.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE144441_Expression.tsv',
                },
            ),
            CustomFileField(
                        name='bulk_meta_file',
                        label='metadata',
                        description='''
                        File should be a tsv/csv of the form:

                            Patient 1 Tumor    Patient 2 Tumor  ...
                        ---------------------------------------
                        Gene/Protein 1    0                   200       ...
                        ---------------------------------------
                        Gene/Protein 2    5                   180       ...
                        ---------------------------------------
                        ...                       ...                    ...        ...
                        ''',
                        default='data/GSE144441_Metadata.tsv',
                        required=False,
                        examples={
                            'GSE144441_Metadata.tsv': 'https://minio.dev.maayanlab.cloud/sc2targets/GSE144441_Metadata.tsv',
                        },
                    ),
                    TextField(
                        name='condition_col',
                        label='Condition Column',
                        description='Name of the condition column in uploaded metadata',
                        default='Condition',
                        required=True
                    ),
                    TextField(
                        name='ctrl_condition',
                        label='Control Condition Name',
                        description='Name of the control condition in the condition column in uploaded metadata',
                        default='healthy',
                        required=True
                    ),
                    ChoiceField(
                        name='reference',
                        label='Single Cell Reference',
                        description='Choose a single cell reference to use for cell type deconvolution',
                        choices={
                            "Pancreas - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Pancreas - Tritschler et al. (Mouse)": "https://cellxgene.cziscience.com/collections/0a77d4c0-d5d0-40f0-aa1a-5e1429bcbd7e",
                            "Adipose Tissue - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Adipose Tissue - Emont et al. (Mouse)": "https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd",
                            "Adipose Tissue - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Liver - MacParland et al. 2018 (Human)": "https://cellxgene.cziscience.com/collections/bd5230f4-cd76-4d35-9ee5-89b3e7475659",
                            "Blood - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Kidney - Tabula Sapiens(Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Brain - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Skeletal Muscle - Domínguez Conde et al. 2022 (Human)": "https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3",
                            "Heart - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Heart - Litviňuková et al. 2020 (Mouse)": "https://cellxgene.cziscience.com/collections/b52eb423-5d0d-4645-b217-e1c6d38b2e72",
                            "Heart - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Muscle - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Muscle - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Skin - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Retina - Li et al. 2023 (Human)": "https://cellxgene.cziscience.com/collections/4c6eaf5c-6d57-4c76-b1e9-60df8c655f1e",
                            "Retina - Cowan et al. 2020 (Mouse)": "https://cellxgene.cziscience.com/collections/2f4c738f-e2f3-4553-9db2-0582a38ea4dc",
                            "Breast - Reed et al. 2024 (Human)": "https://cellxgene.cziscience.com/collections/48259aa8-f168-4bf5-b797-af8e88da6637",
                            "Lung - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
                            "Lung - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Uterus - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Bladder - Tabula Sapiens (Human)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Bladder - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb",
                            "Gingiva - Easter et al. 2024 (Human)": "https://cellxgene.cziscience.com/collections/71f4bccf-53d4-4c12-9e80-e73bfb89e398",
                            "Gingiva - Easter et al. 2024 (Mouse)": "https://cellxgene.cziscience.com/collections/67ba665e-0611-4b53-a522-40c2e0dc6df7",
                            "Spleen - Tabula Muris (Mouse)": "https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb"
                            }
                    )
         ]
        }
) %}


{% set n_neighbors = IntField(
    name='n_neighbors',
    label='N neighbors', 
    default=15, 
    min=2,
    max=100,
    step=1,
    description='The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation.', 
    section='primary'
)
%}

{% set min_dist = FloatField(
    name='min_dist',
    label='Min Distance', 
    default=0.01, 
    min=0.00001,
    max=0.9,
    step=0.00001,
    description='The effective minimum distance between embedded points.', 
    section='primary'
)
%}

{% set resolution = FloatField(
    name='resolution',
    label='Resolution', 
    default=1, 
    min=0.1,
    max=2,
    step=0.1,
    description='A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters.', 
    section='primary'
)
%}

{% set species = ChoiceField(
    name='species',
    label='Species', 
    default='mouse',
    choices=['human', 'mouse'],
    description='Choose a species', 
    required=True,
    section='primary'
)
%}


{% set membrane_screener_list = 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/surfaceome.csv' %}


In [102]:
from helpers import *
import os
import re
import random
import qnorm
from string import ascii_uppercase
from tqdm import tqdm
import numpy as np
import pandas as pd
import scanpy as sc
import decoupler as dc

from matplotlib import cm
from matplotlib.patches import Rectangle
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
%matplotlib inline

import networkx as nx

from IPython.display import HTML, display, Markdown, FileLink
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from scipy.stats import zscore
from scipy.stats import ttest_ind, ttest_1samp, ttest_ind_from_stats
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
from maayanlab_bioinformatics.api import enrichr_link_from_genes
from statsmodels.stats.multitest import multipletests

import warnings
warnings.filterwarnings("ignore")

# scanpy's set_figure_params resets every option it is not given back to its default on
# each call, so all figure settings are passed in a single call. Three separate calls
# would leave only the last one (figsize) in effect and silently drop dpi/frameon.
sc.set_figure_params(dpi=300, frameon=False, figsize=(6, 6))

# Output folders for tables and figures
os.makedirs('results', exist_ok=True)
os.makedirs('figures', exist_ok=True)

# Gene symbol lookup (human by default; a later cell re-binds it according to the species).
try:
    lookup = ncbi_genes_lookup(organism='Mammalia/Homo_sapiens')
except Exception:
    # Fall back to a pre-computed mapping when the NCBI-based lookup cannot be built.
    # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
    import json
    import urllib.request
    with urllib.request.urlopen("https://s3.amazonaws.com/multiomics2paper/public/ncbi_genes_disambiguated.json") as url:
        ncbi_genes_disambiguated = json.load(url)
    lookup = ncbi_genes_disambiguated.get

# Running counters used when labelling figures/tables, plus a store for discussion text
fig_counter = 2
table_counter = 1
letter_counter = 0
discussion_results = {}

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest=os.devnull):
    ''' Temporarily redirect sys.stdout and/or sys.stderr to `dest`.

    Usage:
    with suppress_output():
        print('hi')

    Parameters:
        stdout: redirect sys.stdout while the block runs.
        stderr: redirect sys.stderr while the block runs.
        dest: path of the file that receives the output (opened in append mode);
              defaults to the null device, which discards it.

    The original streams are restored and the destination file is closed when the
    block exits, including when it exits with an exception.
    '''
    # `with` guarantees the sink is closed; previously the handle was leaked on every call.
    with open(dest, 'a') as sink:
        if stdout:
            _stdout = sys.stdout
            sys.stdout = sink
        if stderr:
            _stderr = sys.stderr
            sys.stderr = sink
        try:
            yield
        finally:
            # Only restore the streams that were actually redirected.
            if stdout:
                sys.stdout = _stdout
            if stderr:
                sys.stderr = _stderr

In [None]:
# Render the abstract shown at the top of the generated report.
display(Markdown('## Abstract'))
abstract_text = '''Single cell RNA-seq data enables the profiling of gene expression in individual cells. In disease states, cell type 
proportions and functions are often altered. By utilizing two single cell profiles from a control and an experimental condition,
TargetRanger can be used to identify highly expressed cell-surface proteins on diseased cell types that are lowly expressed on the healthy cell type and in healthy cells and tissues.
Additionally, we can use L1000 data to identify compounds most likely to push diseased cell types to a healthy phenotype. SC2Targets also enables this analysis on bulk RNA-seq data, 
identifying cell-type specific expression for each bulk sample using a single cell reference and the deconvolution algorithm BayesPrism. Overall, SC2Targets enables the 
identification of cell-type specific targets and reverser compounds to treat disease. '''

display(Markdown(abstract_text))

In [73]:
#load the buttons

In [99]:
def _read_delimited_table(path: str, what: str) -> pd.DataFrame:
    '''Read a (optionally gzipped) csv/tsv/txt table, using its first column as the index.

    `what` names the file in the error message raised for unsupported extensions.
    '''
    if path.endswith(('.csv', '.csv.gz')):
        sep = ','
    elif path.endswith(('.txt', '.txt.gz', '.tsv', '.tsv.gz')):
        sep = '\t'
    else:
        raise ValueError(f'File type for {what} not supported (.csv, .tsv, .txt)')
    return pd.read_csv(path, index_col=0, sep=sep, compression='gzip' if path.endswith('.gz') else None)


def read_sc_data(sc_data_file: str, sc_metadata_file: str, type: str):
    '''Load a single cell expression matrix and its metadata into an AnnData object.

    Parameters:
        sc_data_file: genes x cells table (.csv/.tsv/.txt, optionally .gz).
        sc_metadata_file: per-cell metadata table in the same set of formats; its format
            is detected from its own extension, independently of the expression file.
        type: input format; only 'plain' (delimited text) is implemented.

    Returns:
        AnnData with cells as observations. Gene names are stored in
        `var['gene_names']` and the metadata index in `obs['samples']`.

    Raises:
        ValueError: if either file has an unsupported extension.
        NotImplementedError: if `type` is not 'plain'.

    NOTE(review): metadata rows are attached to cells by position, so they are assumed
    to be in the same order as the expression columns — confirm for user uploads.
    '''
    if type != 'plain':
        # Previously fell through and returned None, which only failed later on.
        raise NotImplementedError(f"Unsupported scRNA-seq input type: {type!r} (only 'plain' is implemented)")

    sc_data = _read_delimited_table(sc_data_file, 'scRNA-seq expression file')
    sc_metadata = _read_delimited_table(sc_metadata_file, 'scRNA-seq metadata file')

    # Input is genes x cells; AnnData expects cells x genes.
    adata = sc.AnnData(sc_data.T.values, obs=sc_metadata)
    adata.var['gene_names'] = sc_data.index.values
    adata.obs['samples'] = sc_metadata.index.values
    return adata
    
def normalize_sc_data(adata, n_neighbors, min_dist, resolution):
    '''QC-filter, normalise, embed and cluster a single cell AnnData object.

    Parameters:
        adata: AnnData of raw counts; cells/genes failing the minimum filters are
            removed from it in place.
        n_neighbors: neighbourhood size for the neighbour graph.
        min_dist: UMAP minimum distance.
        resolution: Leiden clustering resolution.

    Returns:
        A new AnnData (copy of the cells passing QC) whose X holds the log-normalised
        values, with PCA, neighbours, UMAP and `obs['leiden']` computed.
    '''
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    # Annotate the group of mitochondrial genes as 'mt'.
    # Gene symbols live in var['gene_names'] for objects built by read_sc_data (var_names
    # are positional there), so prefer that column. Match case-insensitively so both
    # human 'MT-' and mouse 'mt-' symbols are caught.
    gene_symbols = adata.var['gene_names'] if 'gene_names' in adata.var.columns else adata.var_names
    adata.var['mt'] = pd.Index(gene_symbols).astype(str).str.upper().str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # Filter cells following standard QC criteria. Copy so the in-place steps below
    # operate on a real object rather than on a view of `adata`.
    passes_qc = (adata.obs.n_genes_by_counts < 2500) & (adata.obs.pct_counts_mt < 5)
    adata = adata[passes_qc.values, :].copy()

    # Normalize the data; keep the log-normalised values in a layer before scaling
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.layers['log_norm'] = adata.X.copy()
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack')
    # Put the log-normalised values back into X (the scaled matrix was only needed for PCA)
    dc.swap_layer(adata, 'log_norm', X_layer_key=None, inplace=True)
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=40)
    sc.tl.umap(adata, min_dist=min_dist)
    sc.tl.leiden(adata, resolution=resolution)
    return adata

In [None]:
%%appyter code_exec
#load the data
{% if data_type.raw_value == 'scRNA-seq' %}

{% if data_type.value[0].raw_value == 'Plain Text' %}
sc_data_ctrl_file = {{ data_type.value[0].value[0] }}
sc_metadata_ctrl_file = {{ data_type.value[0].value[1] }}
sc_data_ctrl = read_sc_data(sc_data_ctrl_file, sc_metadata_ctrl_file, 'plain')
cell_type_col = {{ data_type.value[1].value[2] }}
{% elif data_type.value[1].raw_value == '.mtx' %}
## TODO IMPLEMENT .MTX FILE LOADING
{% endif %}

{% if data_type.value[1].raw_value == 'Plain Text' %}
sc_data_pert_file = {{ data_type.value[1].value[0] }}
sc_metadata_pert_file = {{ data_type.value[1].value[1] }}
sc_data_pert = read_sc_data(sc_data_pert_file, sc_metadata_pert_file, 'plain')

cell_type_col = {{ data_type.value[1].value[2] }}
{% elif data_type.value[1].raw_value == '.mtx' %}
## TODO IMPLEMENT .MTX FILE LOADING
{% endif %}

{% elif data_type.raw_value == 'bulk RNA-seq' %}
## TODO READ BULK DATA, COMPUTE DECONVOLUTION
{% endif %}

In [None]:
%%appyter code_exec
# Bind the form values to plain Python variables used by the cells below.
# n_neighbors / min_dist / resolution are passed to normalize_sc_data;
# species selects the marker set and the background atlases.
n_neighbors = {{ n_neighbors.raw_value }}
min_dist = {{ min_dist.raw_value }}
resolution = {{ resolution.raw_value }}
species = '{{ species.raw_value }}'

In [94]:
# Re-bind both objects to themselves with an explicit annotation; the values are unchanged.
# NOTE(review): both names must already exist, i.e. this only works on the scRNA-seq
# Plain Text path — confirm behaviour for the unimplemented branches.
sc_data_pert: sc.AnnData = sc_data_pert
sc_data_ctrl: sc.AnnData = sc_data_ctrl

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
## QC, normalise and cluster the control profile, then show its Leiden clusters on the UMAP
## NOTE(review): this runs whether or not a cell type column was supplied in the form
sc_data_ctrl = normalize_sc_data(sc_data_ctrl, n_neighbors, min_dist, resolution)
sc.pl.umap(sc_data_ctrl, color='leiden', title='scRNA-seq control profile UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
{% endif %}

In [None]:
%%appyter code_exec
{% if data_type.raw_value == 'scRNA-seq' %}
## Identify cell types if no cell type column is provided
sc_data_pert = normalize_sc_data(sc_data_pert, n_neighbors, min_dist, resolution)
sc.pl.umap(sc_data_ctrl, color='leiden', title='scRNA-seq perturbation profile UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
{% endif %}


In [None]:
markers = pd.read_csv('https://minio.dev.maayanlab.cloud/sc2targets/PanglaoDB_markers_27_Mar_2020.tsv', sep='\t')
# Keep canonical, sufficiently sensitive markers for the selected species.
# na=False: rows without a species annotation are treated as not matching.
markers['human'] = markers['species'].str.contains('Hs', na=False)
markers['mouse'] = markers['species'].str.contains('Mm', na=False)
markers = markers[markers[species] & (markers['canonical marker'] == 1) & (markers[f'sensitivity_{species}'] > 0.5)]
markers = markers[~markers.duplicated(['cell type', 'official gene symbol'])]
markers = markers.rename(columns={'cell type': 'cell_type', 'official gene symbol': 'genesymbol'})
if species == 'mouse':
    # Convert the symbols to mouse-style capitalisation (first letter upper, rest lower)
    markers['genesymbol'] = markers['genesymbol'].str.capitalize()


def _index_var_by_gene_names(adata):
    '''Use the gene symbols as var index so marker genes can be matched by name.

    Guarded so the cell can be re-run: once the column has become the index there is
    nothing left to do.
    '''
    if 'gene_names' in adata.var.columns:
        adata.var.set_index('gene_names', inplace=True)


# Both profiles need gene-symbol var names; previously only the control profile was
# re-indexed, leaving the perturbation profile without matchable gene names.
for _adata in (sc_data_pert, sc_data_ctrl):
    _index_var_by_gene_names(_adata)
    # Over-representation analysis of the marker sets, computed per cell
    dc.run_ora(
        mat=_adata,
        net=markers,
        source='cell_type',
        target='genesymbol',
        min_n=3,
        verbose=True,
        use_raw=False
    )

In [None]:
# Inspect the ORA scores of the control profile (presumably written to obsm by dc.run_ora — confirm)
sc_data_ctrl.obsm['ora_estimate']

In [180]:
# Pull the ORA scores stored under obsm['ora_estimate'] out of each profile
acts_ctrl = dc.get_acts(sc_data_ctrl, obsm_key='ora_estimate')
acts_pert = dc.get_acts(sc_data_pert, obsm_key='ora_estimate')
# Rank the cell type signatures of every Leiden cluster against all remaining clusters
sc_ctrl_cell_types = dc.rank_sources_groups(acts_ctrl, groupby='leiden', reference='rest', method='t-test_overestim_var')
sc_pert_cell_types = dc.rank_sources_groups(acts_pert, groupby='leiden', reference='rest', method='t-test_overestim_var')

In [183]:
# Number of top-ranked cell type signatures kept per cluster
n_ctypes = 3


def _top_sources_by_cluster(ranked, top_n):
    '''Map each cluster ('group') to the list of its first `top_n` signature names.'''
    leading_rows = ranked.groupby('group').head(top_n)
    names_per_cluster = leading_rows.groupby('group')['names'].apply(list)
    return names_per_cluster.to_dict()


ctypes_dict_ctrl = _top_sources_by_cluster(sc_ctrl_cell_types, n_ctypes)
ctypes_dict_pert = _top_sources_by_cluster(sc_pert_cell_types, n_ctypes)

In [None]:
# Control profile: scores of the top signatures per Leiden cluster, scaled per signature
sc.pl.matrixplot(acts_ctrl, ctypes_dict_ctrl, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
# Perturbation profile: scores of the top signatures per Leiden cluster, scaled per signature
sc.pl.matrixplot(acts_pert, ctypes_dict_pert, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [186]:
# Map every cluster to the first (top-ranked) signature name listed for it
annotation_dict_ctrl = sc_ctrl_cell_types.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict_pert = sc_pert_cell_types.groupby('group').head(1).set_index('group')['names'].to_dict()

In [None]:
# Label each cell with the cell type assigned to its Leiden cluster
# (raises KeyError if a cluster has no entry in the annotation dictionary)
sc_data_ctrl.obs['cell_type'] = [annotation_dict_ctrl[clust] for clust in sc_data_ctrl.obs['leiden']]
sc_data_pert.obs['cell_type'] = [annotation_dict_pert[clust] for clust in sc_data_pert.obs['leiden']]

# Visualize
sc.pl.umap(sc_data_ctrl, color='cell_type')
sc.pl.umap(sc_data_pert, color='cell_type')

In [None]:
# Cell types annotated in both profiles. Sorted so the order (and therefore every
# downstream loop and random draw that follows it) is the same on every run;
# plain set iteration order can differ between kernel sessions.
overlapping_cell_types = sorted(set(annotation_dict_ctrl.values()) & set(annotation_dict_pert.values()))
overlapping_cell_types

In [219]:
## get three representative samples per overlapping cell type
def _gene_labels(adata):
    '''Gene symbols of `adata`: the var['gene_names'] column when present, else var_names.'''
    if 'gene_names' in adata.var.columns:
        return adata.var['gene_names'].values
    return adata.var_names


def _simulate_bulk_replicates(adata, cell_types, prefix, rng, n_replicates=3):
    '''Build pseudo-bulk replicates for every cell type of one profile.

    For each cell type, `n_replicates` random subsets of its cells (each holding
    1/n_replicates of the cells, drawn without replacement) are summed per gene.

    Returns (profiles, names): per-replicate gene vectors and their sample names,
    named '<prefix>_<cell type>_<replicate number>'.
    '''
    genes_by_cells = adata.X.transpose()
    profiles, names = [], []
    for ct in cell_types:
        ## extract the values of the cells matching the cell type:
        ct_cells = genes_by_cells[:, (adata.obs['cell_type'] == ct).values]
        n_cells = ct_cells.shape[1]
        for i in range(n_replicates):
            rand_idx = rng.choice(n_cells, n_cells // n_replicates, replace=False)
            # asarray/ravel: always a flat vector, also when X is a sparse matrix
            profiles.append(np.asarray(ct_cells[:, rand_idx].sum(axis=1)).ravel())
            names.append(f'{prefix}_{ct}_{i + 1}')
    return profiles, names


# Fixed seed so the simulated replicates are reproducible across runs
_pseudobulk_rng = np.random.default_rng(0)

data_pert, sample_names_pert = _simulate_bulk_replicates(sc_data_pert, overlapping_cell_types, 'pert', _pseudobulk_rng)
data_ctrl, sample_names_ctrl = _simulate_bulk_replicates(sc_data_ctrl, overlapping_cell_types, 'ctrl', _pseudobulk_rng)

# genes x samples tables, joined on the genes present in both profiles
sim_bulk_pert = pd.DataFrame(data=data_pert, index=sample_names_pert, columns=_gene_labels(sc_data_pert)).T
sim_bulk_ctrl = pd.DataFrame(data=data_ctrl, index=sample_names_ctrl, columns=_gene_labels(sc_data_ctrl)).T
bulk_ct_df = pd.merge(sim_bulk_pert, sim_bulk_ctrl, left_index=True, right_index=True)
bulk_ct_df

In [None]:
%%appyter code_exec
 
# Per cell type results, filled in by find_targets
targets = {}

# Pick the gene lookup and the background expression tables matching the species
if species != 'mouse':
    lookup = ncbi_genes_lookup()
    bgs = {
        'GTEx': 'gtex-gene-stats.tsv',
        'ARCHS4': 'archs4-gene-stats.tsv',
        'TS': 'ts_10x_cell-ontology-class_donors_tissue-labels_v1.tsv',
    }
else:
    lookup = ncbi_genes_lookup(organism='Mammalia/Mus_musculus')
    bgs = {
        'ARCHS4': 'Mammalia/Mus_musculus/archs4-gene-stats.tsv',
        'Tabula Muris': 'Mammalia/Mus_musculus/tabula-muris-gene-stats.tsv',
    }
def find_targets(rna_df, bg, targets):
    """Record, per cell type, genes that are up versus a background in 'pert' but not in 'ctrl'.

    The background statistics file ``bgs[bg]`` is downloaded, its gene ids are
    mapped through ``lookup`` and collapsed by median, and its 25%/50%/75%
    columns are used as background "samples". Both the background and
    ``rna_df`` are quantile-normalised to the same target distribution, then
    each cell type's 'pert' and 'ctrl' replicates are separately compared with
    the background by ``limma_voom_differential_expression``.

    Parameters
    ----------
    rna_df : DataFrame, genes x samples. Sample columns are expected to be
        named '<pert|ctrl>_<cell type>_<replicate number>'.
    bg : key into the module-level ``bgs`` dict naming the background file.
    targets : dict updated in place. ``targets[ct][bg]`` is set to the list of
        genes with adj.P.Val < 0.01 and t > 0 in the 'pert' comparison that do
        not meet the same criteria in the 'ctrl' comparison, ordered by
        decreasing 'pert' t statistic.

    Returns
    -------
    None. Relies on the module-level names ``bgs``, ``lookup`` and
    ``overlapping_cell_types``.
    """
    df_bg_stats = pd.read_csv(f"https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/{bgs[bg]}", sep='\t', index_col=[0,1])
    # Strip any '.suffix' from the background gene ids before the symbol lookup.
    df_bg_genes = df_bg_stats.unstack().index.map(lambda idx: lookup(idx.partition('.')[0]))
    df_bg_stats = df_bg_stats.unstack().groupby(df_bg_genes, observed=True).median().stack()
    df_bg_expr = df_bg_stats.loc[(slice(None), ['25%', '50%', '75%']), :].unstack()

    common_index = list(set(rna_df.index) & set(df_bg_expr.index))
    expr_df = rna_df.loc[common_index, :]
    # Keep the first row of any duplicated gene. Working on the index directly
    # also handles an unnamed index and avoids in-place edits of a .loc slice.
    expr_df = expr_df[~expr_df.index.duplicated(keep='first')]

    target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
    df_expr_norm = qnorm.quantile_normalize(expr_df.loc[common_index, :], target=target_distribution)
    df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)
    # Flatten the column labels exactly once. Repeating this for every cell
    # type would re-join the characters of the already flat strings and make
    # the labels grow on each pass.
    df_bg_expr_norm.columns = df_bg_expr_norm.columns.to_flat_index().map(lambda s: ', '.join(s))

    def _replicate_columns(prefix, ct):
        # Exact '<prefix>_<ct>_<digits>' match: a substring test would also
        # pick up other cell types whose name merely contains `ct`.
        stem = f'{prefix}_{ct}_'
        return [
            col for col in rna_df.columns
            if col.startswith(stem) and col[len(stem):].isdigit()
        ]

    for ct in tqdm(overlapping_cell_types):
        if ct not in targets: targets[ct] = {}
        ct_pert_samples = _replicate_columns('pert', ct)
        ct_ctrl_samples = _replicate_columns('ctrl', ct)
        with suppress_output():
            dge_pert = limma_voom_differential_expression(
                df_bg_expr_norm, df_expr_norm[ct_pert_samples],
                voom_design=True,
            )
            dge_ctrl = limma_voom_differential_expression(
                df_bg_expr_norm, df_expr_norm[ct_ctrl_samples],
                voom_design=True,
            )
        pert_hits = dge_pert[(dge_pert['adj.P.Val'] < 0.01) & (dge_pert['t'] > 0)].sort_values('t', ascending=False)
        ctrl_hits = set(dge_ctrl[(dge_ctrl['adj.P.Val'] < 0.01) & (dge_ctrl['t'] > 0)].index.values)
        # Stored as an ordered list (not a set) so that later "top N" slicing
        # really takes the genes with the largest t statistic.
        targets[ct][bg] = [gene for gene in pert_hits.index.values if gene not in ctrl_hits]

In [None]:
# Backgrounds screened for each species, as (name shown in the progress message,
# key handed to find_targets). Any other species value runs nothing.
_backgrounds_by_species = {
    'human': (
        ('ARCHS4', 'ARCHS4'),
        ('GTEx', 'GTEx'),
        ('Tabula Sapiens', 'TS'),
    ),
    'mouse': (
        ('ARCHS4', 'ARCHS4'),
        ('Tabula Muris', 'Tabula Muris'),
    ),
}

for _bg_label, _bg_key in _backgrounds_by_species.get(species, ()):
    print(f'Finding Targets using {_bg_label}')
    find_targets(bulk_ct_df, _bg_key, targets)

In [None]:
# Surfaceome gene names passed through `lookup`; names that map to a missing value
# are dropped.
proteins = pd.read_csv('https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/surfaceome.csv')
membrane_proteins = list(proteins['genename'].map(lookup).dropna().values)

if species == 'human':
    # Each background contributes one bit per heatmap cell (ARCHS4=1, GTEx=2, TS=4),
    # so the summed code 0..7 says which backgrounds flagged the gene.
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in matplotlib 3.9.
    YlGnBu = plt.get_cmap('YlGnBu_r', 8)
    cmap = {"None":YlGnBu(0), "ARCHS4": YlGnBu(1), "GTEx":YlGnBu(2), "TS": YlGnBu(4), "ARCHS4-GTEx":YlGnBu(3),  "ARCHS4-TS": YlGnBu(5), "GTEx-TS": YlGnBu(6), "All": YlGnBu(7)}

    target_list = []
    # Drop genes whose symbol starts with 'PCDH' from every target collection
    # (this also turns each collection into a list so it can be sliced below).
    for cluster in targets:
        for bg in targets[cluster]:
            targets[cluster][bg] = list(filter(lambda x: not x.startswith('PCDH') ,targets[cluster][bg]))
    # Candidate rows: surfaceome genes among the first top_targets_n targets.
    for cluster in targets:
        for bg in targets[cluster]:
            target_list.extend(list(filter(lambda g: g in membrane_proteins, targets[cluster][bg][:top_targets_n])))
    if len(set(target_list)) < 100:
        # Too few candidates: widen the window to the first 500 targets.
        top_targets_n = 500
        for cluster in targets:
            for bg in targets[cluster]:
                target_list.extend(list(filter(lambda g: g in membrane_proteins, targets[cluster][bg][:top_targets_n])))
                # NOTE(review): this also adds targets that are NOT in the surfaceome
                # list, which bypasses the membrane filter - confirm this is intended.
                target_list.extend(list(targets[cluster][bg][:top_targets_n]))

    data1, data2, data3 = [], [], []
    similarity = []
    target_list = list(set(target_list))

    # `targets` is keyed by the entries of overlapping_cell_types (see find_targets),
    # so the matrix columns are built from that list.
    for gene in target_list:
        a = [1 if gene in targets[c]['ARCHS4'][:top_targets_n] else 0 for c in overlapping_cell_types]
        g = [2 if gene in targets[c]['GTEx'][:top_targets_n] else 0 for c in overlapping_cell_types]
        l = [4 if gene in targets[c]['TS'][:top_targets_n] else 0 for c in overlapping_cell_types]
        data1.append(a)
        data2.append(g)
        data3.append(l)
        similarity.append(np.dot(np.dot(np.array(a), np.array(g)), np.array(l)))

    # The reshape keeps the matrix 2-D (genes x cell types) even when no gene was selected.
    data = np.add(np.add(data1, data2), data3).reshape(len(target_list), len(overlapping_cell_types))

    membrane_target_mat = pd.DataFrame(
        data,
        index=target_list,
        columns=[f"Cluster {c}" for c in overlapping_cell_types],
    )

    # Keep genes whose codes summed over all columns reach at least 7.
    membrane_target_mat['count'] = membrane_target_mat.sum(axis=1)
    membrane_target_mat = membrane_target_mat[membrane_target_mat['count'] >= 7]
    membrane_target_mat = membrane_target_mat.rename_axis('Membrane Target').sort_values(by = ['count', 'Membrane Target'], ascending = [False, True]).drop('count', axis=1)

    h = membrane_target_mat.shape[0]

    # Per column, the comma-joined genes whose code is at least 5.
    cluster_targets = {}
    for col in membrane_target_mat.columns:
        cluster_targets[col + ' Cell Surface Target'] = ','.join(list(membrane_target_mat[col][membrane_target_mat[col] >= 5].index.values))

    if h == 0:
        # clustermap cannot draw an empty matrix.
        print('No targets passed the filter; skipping the heatmap.')
    else:
        g = sns.clustermap(
            membrane_target_mat,
            figsize=(4,0.3*h+2*(h<15)),
            cmap=YlGnBu,
            # Pin the colour scale to the full code range so that every code is drawn
            # in the colour the legend shows for it, whatever range the data spans.
            vmin=0, vmax=7,
            cbar_pos=None,
            dendrogram_ratio=0.1-(h<40)*0.01*(h-30),
            row_cluster=False,
            # Column clustering needs at least two columns.
            col_cluster=membrane_target_mat.shape[1] > 1,
            xticklabels=True,
            yticklabels=True,
        )
        g.ax_row_dendrogram.legend(handles=[Rectangle((0, 0), 0, 0, color=val, label=key) for key, val in cmap.items()],
                                        title='Background', loc='upper right')

        plt.show()
elif species == 'mouse':

    # One bit per background (ARCHS4=1, Tabula Muris=2); summed code 0..3.
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in matplotlib 3.9.
    YlGnBu = plt.get_cmap('YlGnBu_r', 4)
    cmap = {"None": YlGnBu(0), "ARCHS4": YlGnBu(1), "Tabula Muris":YlGnBu(2), "Both": YlGnBu(3)}

    top_targets_n = 100
    target_list = []
    # Candidate rows: the first 20 targets of every cell type / background pair
    # (no surfaceome filter is applied in this branch).
    for ct in targets:
        for bg in targets[ct]:
            target_list.extend(list(targets[ct][bg])[:20])

    data1, data2 = [], []
    similarity = []
    target_list = list(set(target_list))

    for gene in target_list:
        a = [1 if gene in targets[c]['ARCHS4'] else 0 for c in overlapping_cell_types]
        g = [2 if gene in targets[c]['Tabula Muris'] else 0 for c in overlapping_cell_types]
        data1.append(a)
        data2.append(g)
        similarity.append(np.dot(np.array(a), np.array(g)))

    # The reshape keeps the matrix 2-D (genes x cell types) even when no gene was selected.
    data = np.add(data1, data2).reshape(len(target_list), len(overlapping_cell_types))

    membrane_target_mat = pd.DataFrame(data, index=target_list, columns=overlapping_cell_types)

    # Keep genes whose codes summed over all cell types reach at least 3.
    membrane_target_mat['count'] = membrane_target_mat.sum(axis=1)
    membrane_target_mat = membrane_target_mat[membrane_target_mat['count'] >= 3]

    membrane_target_mat = membrane_target_mat.rename_axis('Membrane Target').sort_values(by = ['count', 'Membrane Target'], ascending = [False, True]).drop('count', axis=1)

    h = membrane_target_mat.shape[0]
    membrane_target_mat = membrane_target_mat.astype(float)

    if h == 0:
        # clustermap cannot draw an empty matrix.
        print('No targets passed the filter; skipping the heatmap.')
    else:
        g = sns.clustermap(
            membrane_target_mat,
            figsize=(4,0.3*h+2*(h<15)),
            cmap=YlGnBu,
            # Pin the colour scale to the full code range so that every code is drawn
            # in the colour the legend shows for it, whatever range the data spans.
            vmin=0, vmax=3,
            cbar_pos=None,
            dendrogram_ratio=0.1-(h<40)*0.01*(h-30),
            row_cluster=False,
            col_cluster=False,
            xticklabels=True,
            yticklabels=True,
        )
        g.ax_row_dendrogram.legend(handles=[Rectangle((0, 0), 0, 0, color=val, label=key) for key, val in cmap.items()],
                                        title='Background', loc='upper right')
        plt.grid(False)

        plt.show()

