In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code

{% do SectionField(
    name='primary',
    title='Tumor Gene Target Screener',
) %}

{% do DescriptionField(
    name='data_file_description',
    text='''
    Files should be a tsv/csv of the form:<br />
    <table class="table">
    <tr>
      <td>&nbsp;</td>
      <th>Replicate 1</th>
      <th>Replicate 2</th>
      <th>...</th>
    </tr>
    <tr>
      <th>Gene|Transcript 1</th>
      <td>0</td>
      <td>200</td>
      <td>...</td>
    </tr>
    <tr>
      <th>Gene|Transcript 2</th>
      <td>5</td>
      <td>180</td>
      <td>...</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    </table>''',
    section='primary',
) %}

{% set file = FileField(
    name='tumor_expression',
    label='Tumor RNA-seq expression vectors',
    description='Gene/Transcripts on the rows, replicates on the columns',
    default='GSE49155-lung-squamous-cell-carcinoma.tsv',
    required=True,
    examples={
        'GSE49155-lung-squamous-cell-carcinoma.tsv': 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/GSE49155-patient.tsv',
    },
    section='primary',
) %}

{% set tumor_transcript_level = BoolField(
    name='tumor_transcript_level',
    label='Whether the tumor RNA-seq expression vectors is at the level of transcripts or genes',
    default=False,
    yes_label='Transcript Level',
    no_label='Gene Level',
    section='primary',
) %}

{% set organism = ChoiceField(
    name='organism',
    label='The organism of the RNA-seq expression data',
    default='Homo sapiens',
    choices={
        'Homo sapiens': '"Mammalia/Homo_sapiens"',
        'Mus musculus': '"Mammalia/Mus_musculus"',
    },
    section='primary',
) %}

{% set background = TabField(
    name='background',
    label='Normal tissue background',
    description='Tumor expression will be contrasted against this background',
    default='Precomputed',
    choices={
        'Precomputed': [
            ChoiceField(
                name='background_dataset',
                label='Normal tissue background',
                description='Choose Gene or Transcript Background variant if your tumor vectors are at the Gene or Transcript level.',
                choices={
                    'GTEx (bulk RNA-seq) - Gene': '"s3://storage/Tumor_Gene_Target_Screener/gtex-gene-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'GTEx (bulk RNA-seq) - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/gtex-transcript-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Anatomy (bulk RNA-seq) - Gene': '"s3://storage/Tumor_Gene_Target_Screener/archs4-gene-anatomy-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Anatomy (bulk RNA-seq) - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/archs4-transcript-anatomy-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Extra (bulk RNA-seq) - Gene': '"s3://storage/Tumor_Gene_Target_Screener/archs4-gene-extra-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Extra (bulk RNA-seq) - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/archs4-transcript-extra-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 New (bulk RNA-seq) - Gene': '"s3://storage/Tumor_Gene_Target_Screener/archs4-gene-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 New (bulk RNA-seq) - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/archs4-transcript-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'Tabula Sapiens (scRNA-seq) - Gene': '"s3://storage/Tumor_Gene_Target_Screener/ts_10x_cell-ontology-class_donors_tissue-labels_v1.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'Human Cell Atlas (scRNA-seq) - Gene': '"s3://storage/Tumor_Gene_Target_Screener/hca_10x_donors_tissue-labels_v1.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'Tabula Muris (scRNA-seq) - Gene - Mouse': '"s3://storage/Tumor_Gene_Target_Screener/Mammalia/Mus_musculus/tabula-muris-gene-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 (bulk RNA-seq) - Gene - Mouse': '"s3://storage/Tumor_Gene_Target_Screener/Mammalia/Mus_musculus/archs4-gene-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 (bulk RNA-seq) - Transcript - Mouse': '"s3://storage/Tumor_Gene_Target_Screener/Mammalia/Mus_musculus/archs4-transcript-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                },
                default='GTEx (bulk RNA-seq) - Gene',
            ),
        ],
        'Custom': [
            FileField(
                name='background_upload',
                label='Normal tissue background',
                description='Given a matrix (d), genes or transcripts by samples, this matrix can be constructed with `d.T.groupby(sample_tissue_mappings).describe().T`',
                examples={
                    'GTEx': 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/gtex-gene-stats.tsv',
                },
                default=None,
                section='primary',
            ),
            BoolField(
                name='background_transcript_level',
                label='Whether this file is at the level of transcripts or genes',
                default=False,
                yes_label='Transcript Level',
                no_label='Gene Level',
            ),
            BoolField(
                name='single_cell',
                label='Whether this file is scRNA-seq or bulk RNA-seq',
                default=False,
                yes_label='scRNA-seq',
                no_label='Bulk RNA-seq',
            ),
            ChoiceField(
                name='background_organism',
                label='The organism of the background',
                default='Homo sapiens',
                choices={
                  'Homo sapiens': '"Mammalia/Homo_sapiens"',
                  'Mus musculus': '"Mammalia/Mus_musculus"',
                },
            ),
            DescriptionField(
                name='background_file_description',
                text='''
                Files should be a tsv/csv of the form:<br />
                <table class="table">
                <tr>
                  <td>&nbsp;</td>
                  <td>&nbsp;</td>
                  <th>Tissue 1</th>
                  <th>...</th>
                </tr>
                <tr>
                  <th>Gene|Transcript 1</th>
                  <th>25%</th>
                  <td>0</td>
                  <td>...</td>
                </tr>
                <tr>
                  <th>...</th>
                  <th>...</th>
                  <td>...</td>
                  <td>...</td>
                </tr>
                <tr>
                  <th>Gene|Transcript n</th>
                  <th>mean</th>
                  <td>180</td>
                  <td>...</td>
                </tr>
                </table>''',
            ),
        ],
    },
    section='primary',
) %}

{% set background_dataset = background.value[0] %}
{% set background_transcript_level = (
    '- Transcript' in background.value[0].raw_value
) if background.raw_value == 'Precomputed' else (
    background.value[1].raw_value or False
) %}

{% set single_cell = (
    'scRNA-seq' in background.value[0].raw_value
) if background.raw_value == 'Precomputed' else (
    background.value[2].raw_value or False
) %}

{# Organism of the background as a quoted string ready to paste into Python code.
   Precomputed datasets encode it in their label ("... - Mouse"); for Custom uploads it
   comes from the `background_organism` ChoiceField, the 4th entry (index 3) of the tab:
   0 = upload, 1 = transcript level, 2 = single cell, 3 = organism. #}
{% set background_organism = (
    '"Mammalia/Mus_musculus"' if 'Mouse' in background.value[0].raw_value else '"Mammalia/Homo_sapiens"'
) if background.raw_value == 'Precomputed' else (
    background.value[3].value
) %}

{% set membrane_screener_ = TabField(
    name='membrane_screener',
    label='Prioritize membrane genes',
    description='Use membranome to identify membrane. Only human currently supported.',
    default='Yes',
    section='primary',
    choices={
     'Yes': [
        ChoiceField(
                name='membrane_list',
                label='Membrane gene list',
                description='Choose the source for list of membrane genes.',
                choices={
                    'COMPARTMENTS + Human Protein Atlas': "'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/surfaceome.csv'",
                    'Membranome': "'https://lomize-group-membranome.herokuapp.com/proteins?fileFormat=csv'",
                },
                default='COMPARTMENTS + Human Protein Atlas',
            ),
    ],
    'No': [
    ]
    }
) %}

{% set membrane_screener_list = membrane_screener_.value[0] %}

{% set membrane_screener = background_organism == '"Mammalia/Homo_sapiens"' and membrane_screener_.raw_value == 'Yes' %}

{% set normalize_to_background = BoolField(
    name='normalize_to_background',
    label='Normalize to background distribution',
    default=True,
    section='primary',
) %}

{% set proteomics_vis_ = BoolField(
    name='proteomics_vis',
    label='Show protein expression profiles of gene candidates',
    description='View protein expression levels in normal tissues, from Human Proteome Map and Human Protein Atlas proteomics data. Only human currently supported.',
    default=True,
    section='primary',
) %}
{% set proteomics_vis = background_organism == '"Mammalia/Homo_sapiens"' and proteomics_vis_.raw_value %}

# Overexpressed Candidate Identification

This appyter uses RNA-seq expression data for a tumor and identifies over-expressed proteins versus a baseline dataset of normal tissues such as those in GTEx or ARCHS4. It then prioritizes candidates by significance and targetability.

In [None]:
%%appyter code_exec
import os
import qnorm
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import HTML, display, Markdown
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
{% if organism.raw_value == background_organism.raw_value %}
lookup = background_lookup = ncbi_genes_lookup(organism={{ organism }})
{% else %}
lookup = ncbi_genes_lookup(organism={{ organism }})
background_lookup = ncbi_genes_lookup(organism={{ background_organism }})
{% endif %}

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest=os.devnull):
    ''' Temporarily redirect `sys.stdout` and/or `sys.stderr` to `dest`.

    Usage:
    with suppress_output():
        print('hi')

    :param stdout: redirect `sys.stdout` while the block runs
    :param stderr: redirect `sys.stderr` while the block runs
    :param dest: path opened in append mode to receive the redirected output

    The original streams are restored when the block exits, including on error,
    and the file opened on `dest` is closed afterwards.
    '''
    # The `with` closes the sink only after the streams have been restored in `finally`,
    #  so nothing is left pointing at a closed file.
    with open(dest, 'a') as sink:
        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        if stdout:
            sys.stdout = sink
        if stderr:
            sys.stderr = sink
        try:
            yield
        finally:
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

## Load Tumor RNA-seq Expression Data

Load RNA-seq expression data for the tumor.

In [None]:
def read_table(filename):
    ''' Read a delimited table into a DataFrame, using the first column as the index.

    The parser options are picked from the file extension (optionally followed by `.gz`):
    `.tsv` is tab separated, `.csv` is comma separated, and `.gct` is tab separated
    with the first two lines skipped. Any other name falls back to the python engine
    with the separator auto-detected.
    '''
    readers_by_suffix = (
        (('.tsv', '.tsv.gz'), dict(sep='\t')),
        (('.csv', '.csv.gz'), dict(sep=',')),
        (('.gct', '.gct.gz'), dict(sep='\t', skiprows=2)),
    )
    for suffixes, options in readers_by_suffix:
        if filename.endswith(suffixes):
            return pd.read_csv(filename, index_col=0, **options)
    return pd.read_table(filename, sep=None, engine='python', index_col=0)

In [None]:
%%appyter code_eval
# Load the tumor expression matrix; the first column becomes the row index.
df_expr = read_table({{ file }})
{% if tumor_transcript_level.raw_value %}
# Keep only the part of each identifier before the first '.' (presumably a version
#  suffix -- TODO confirm), then sum rows that end up with the same identifier.
df_expr_transcripts = df_expr.index.map(lambda idx: idx.partition('.')[0])
df_expr = df_expr.groupby(df_expr_transcripts, observed=True).sum()
{% else %}
# Keep only the part of each identifier before the first '.', pass it through `lookup`,
#  and take the median of rows that resolve to the same result.
df_expr_genes = df_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
df_expr = df_expr.groupby(df_expr_genes, observed=True).median()
{% endif %}
df_expr

## Load Background Dataset

The background dataset contains expression for normal tissues across many healthy individuals.

In [None]:
%%appyter code_eval
# Background statistics table: the first two columns form a two-level row index.
df_bg_stats = pd.read_csv({{ background_dataset }}, sep='\t', index_col=[0,1])
{% if background_transcript_level %}
# Unstack the second index level into the columns, keep the part of each identifier
#  before the first '.', sum rows sharing an identifier, and stack back.
# NOTE(review): this sums every statistic row, quantiles included -- confirm that is intended.
df_bg_transcripts = df_bg_stats.unstack().index.map(lambda idx: idx.partition('.')[0])
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_transcripts, observed=True).sum().stack()
{% else %}
# Same reshaping, but identifiers are passed through `background_lookup` and rows that
#  resolve to the same result are combined with the median.
df_bg_genes = df_bg_stats.unstack().index.map(lambda idx: background_lookup(idx.partition('.')[0]))
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_genes, observed=True).median().stack()
{% endif %}
# Only the quartile rows are used as the background "expression" matrix; they are
#  unstacked so each (column, quartile) pair becomes one column.
df_bg_expr = df_bg_stats.loc[(slice(None), ['25%', '50%', '75%']), :].unstack()
df_bg_expr

In [None]:
%%appyter markdown
{% if background_transcript_level or tumor_transcript_level.raw_value %}
## Load Transcript <=> Gene Mappings
{% endif %}

In [None]:
%%appyter code_eval
{% if background_transcript_level or tumor_transcript_level.raw_value %}
df_transcript_gene_map = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/transcript-gene-map.tsv.gz", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=0, compression='gzip')
df_transcript_gene_map
{% endif %}

In [None]:
%%appyter markdown
{% if membrane_screener %}
## Load Membrane Proteins for Screening
{% if membrane_screener_list.raw_value == 'Membranome' %}
Membrane proteins are ideal targets; we can get these from [Membranome](https://membranome.org/), among other places.
{% else %}
Membrane proteins are ideal targets. This membrane/surface protein filter was created from the intersection of two data sources:
[COMPARTMENTS](https://compartments.jensenlab.org/About) knowledge predictions for human genes where a filter was applied for "Plasma membrane" and "Cell surface" subcellular localization with a confidence score greater than or equal to 3 and [Human Protein Atlas](https://www.proteinatlas.org/) membrane proteins where a filter was applied for "Evidence at protein level" and removal of genes with "Low tissue specificity."
{% endif %}

{% endif %}

In [None]:
%%appyter code_eval
{% if membrane_screener %}
# Download the selected membrane gene list.
proteins = pd.read_csv({{ membrane_screener_list }})
{% if membrane_screener_list.raw_value == 'Membranome' %}
# This source is filtered down to its 'Homo sapiens' rows.
proteins = proteins[proteins['species_name_cache'] == 'Homo sapiens']
{% endif %}
# Map gene names through `lookup` and drop the ones that come back missing.
membrane_proteins = proteins['genename'].map(lookup).dropna()
membrane_proteins
{% endif %}

In [None]:
%%appyter markdown
{% if background_transcript_level and tumor_transcript_level.raw_value %}
## Background (Transcript) - Tumor (Transcript) Gene Interoperability

Several features of this appyter operate at the gene level, thus while we will find significant transcripts, we
will use the associated gene for the other features.
{% endif %}

In [None]:
%%appyter markdown
{% if background_transcript_level and not tumor_transcript_level.raw_value %}
## Background (Transcript) - Tumor (Gene) Interoperability

As the tumor RNA-seq expression data is at the gene level, we will map the
background to genes for differential expression but later highlight significant transcripts in the background.
{% endif %}

In [None]:
%%appyter code_eval
{% if background_transcript_level and not tumor_transcript_level.raw_value %}
df_bg_expr = df_bg_expr.groupby(df_transcript_gene_map['gene_symbol']).sum().groupby(lookup).median()
df_bg_expr
{% endif %}

In [None]:
%%appyter markdown
{% if not background_transcript_level and tumor_transcript_level.raw_value %}
## Background (Gene) - Tumor (Transcript) Interoperability

Though the tumor RNA-seq expression data is at the transcript level, the background is only at the gene level.
This is not ideal, but we can still highlight individual transcripts in the tumor that are over-expressed
when compared to the average background expression across all transcripts.
{% endif %}

In [None]:
%%appyter code_eval
{% if not background_transcript_level and tumor_transcript_level.raw_value %}
# "melt" from wide format to long format with the columns: (index, type, stat, value)
df_bg_expr_melted = df_bg_expr.melt(ignore_index=False)
# merge with transcript_gene_map will transform the index from gene_symbol to ensembl_transcript_id
#  duplicating entries for each statistic accordingly
df_bg_expr_melted_mapped = pd.merge(
    left=df_bg_expr_melted, left_index=True,
    right=df_transcript_gene_map, right_on='gene_symbol',
)
# pivot back to wide format: the two melted column levels (variable_0, variable_1) become
#  the columns again, leaving one row per index entry of the merged frame
df_bg_expr = df_bg_expr_melted_mapped.pivot(columns=['variable_0', 'variable_1'], values='value')
df_bg_expr
{% endif %}

In [None]:
%%appyter markdown
{% if organism.value != background_organism %}{# compare rendered values; raw_value is the display label and never equals the quoted path #}
## Background ({{ background_organism }}) - Input ({{ organism }}) Interoperability

Given the mismatch between the input and background organisms, we will use homologs to return results
for the same organism used in the input. For this we use MGI Mouse Homology <http://www.informatics.jax.org/homology.shtml>.
{% endif %}

In [None]:
%%appyter code_exec
{% if organism.value == '"Mammalia/Homo_sapiens"' and background_organism == '"Mammalia/Mus_musculus"' %}
# Human input against a mouse background: convert the background to human identifiers.
from maayanlab_bioinformatics.harmonization.homologs import mouse_expression_to_human
df_bg_expr = mouse_expression_to_human(df_bg_expr)
{% elif organism.value == '"Mammalia/Mus_musculus"' and background_organism == '"Mammalia/Homo_sapiens"' %}
# Mouse input against a human background: convert the background to mouse identifiers.
from maayanlab_bioinformatics.harmonization.homologs import human_expression_to_mouse
df_bg_expr = human_expression_to_mouse(df_bg_expr)
{% endif %}

## Distribution matching between tumor sample(s) & the background

We show the median gene expression distribution in the tumor and in the background before and after normalization.

In [None]:
# 2x2 grid of histograms: rows are tumor / background, columns are the per-row median
#  (axis=1) and the per-column median (axis=0) of the log2-normalized values.
fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2)
distribution_rows = (
    ('Tumor', df_expr, ax11, ax12),
    ('Background', df_bg_expr, ax21, ax22),
)
for row_label, raw_values, row_median_ax, column_median_ax in distribution_rows:
    log2_normalize(raw_values).median(axis=1).hist(bins=100, ax=row_median_ax)
    log2_normalize(raw_values).median(axis=0).hist(bins=100, ax=column_median_ax)
    row_median_ax.set_ylabel(row_label)
# Column titles go on the top row only, x labels on the bottom row only.
ax11.set_title('Median Expression')
ax12.set_title('Median Sample Expression')
for bottom_ax in (ax21, ax22):
    bottom_ax.set_xlabel('$log_2(count)$')
plt.tight_layout()
plt.show()

The following Venn diagram shows the gene/transcript overlap between the tumor and the background. Ideally the overlap should be very high; otherwise, additional identifier mapping may be required. If the two are completely disjoint, you've most likely incorrectly labeled your data as gene or transcript data.

In [None]:
# Identifiers present in both the tumor and the background. They are sorted because the
#  iteration order of a set of strings differs between kernels, which would make the row
#  order of every downstream table (and tie-breaking in later sorts) irreproducible.
common_index = sorted(set(df_expr.index) & set(df_bg_expr.index))
venn2([set(df_expr.index), set(df_bg_expr.index)],
      ['Tumor  ', '  Background'])
# Render the figure explicitly so the cell does not also print the venn2 object repr.
plt.show()

In [None]:
%%appyter markdown
{% if normalize_to_background.raw_value %}
Here we use quantile normalization [1] to align the expression with the background's median distribution. The implications of this have not been fully explored, though presumably, given that we have no experimental controls to properly normalize, it should be better than nothing.

[1] B.M. Bolstad, R.A Irizarry, M. Åstrand, T.P. Speed, A comparison of normalization methods for high density oligonucleotide array data based on variance and bias, Bioinformatics, Volume 19, Issue 2, 22 January 2003, Pages 185–193, https://doi.org/10.1093/bioinformatics/19.2.185
{% endif %}

In [None]:
%%appyter code_exec
{% if normalize_to_background.raw_value %}
# Target distribution: the per-row median of the background over the shared identifiers.
target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
# Quantile-normalize both the tumor and the background against that same target.
df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)
df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

# Same 2x2 histogram grid as before normalization, now drawn from the normalized frames.
fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2)
log2_normalize(df_expr_norm).median(axis=1).hist(bins=100, ax=ax11)
ax11.set_title('Median Expression')
ax11.set_ylabel('Tumor')
log2_normalize(df_expr_norm).median(axis=0).hist(bins=100, ax=ax12)
ax12.set_title('Median Sample Expression')
log2_normalize(df_bg_expr_norm).median(axis=1).hist(bins=100, ax=ax21)
ax21.set_ylabel('Background')
log2_normalize(df_bg_expr_norm).median(axis=0).hist(bins=100, ax=ax22)
ax21.set_xlabel('$log_2(count)$')
ax22.set_xlabel('$log_2(count)$')
plt.tight_layout()
plt.show()
{% else %}
# Normalization disabled: only restrict both frames to the shared identifiers.
print('Warning: Proceeding without normalization')
df_expr_norm = df_expr.loc[common_index, :]
df_bg_expr_norm = df_bg_expr.loc[common_index, :]
{% endif %}

## Perform Differential Expression between Tumor & Background

We use voom-limma to identify significantly differentially expressed genes between the background healthy expression quantiles and the tumor expression.

In [None]:
%%appyter code_eval

with suppress_output():
    df_bg_expr_norm.columns = df_bg_expr_norm.columns.to_flat_index().map(lambda s: ', '.join(s))
    dge = limma_voom_differential_expression(
        df_bg_expr_norm, df_expr_norm,
        voom_design=True,
    )
    {% if tumor_transcript_level.raw_value %}
    dge['ensembl_transcript_id'] = dge.index
    dge['gene_symbol'] = df_transcript_gene_map.loc[dge.index, 'gene_symbol'].apply(lambda g: lookup(g) or g)
    dge['label'] = dge.apply(lambda r: f"{r['ensembl_transcript_id']} - {r['gene_symbol']}", axis=1)
    {% else %}
    dge['gene_symbol'] = dge.index
    dge['label'] = dge.index
    {% endif %}
dge

## Narrow Down Candidate Set

We identify significantly differentially expressed genes whose logFC × |t-statistic| product deviates significantly from the mean (more than three standard deviations above it), or equivalently, those points which are furthest from the volcano plot origin.

In [None]:
# y axis of the volcano plot: negative natural log of the adjusted p-value.
dge['-log(adj.P.Val)'] = -np.log(dge['adj.P.Val'])
# |t| * logFC keeps the sign of the fold change, so only positive outliers can pass the cutoff below.
prod = dge['t'].abs() * dge['logFC']
outlier_cutoff = prod.mean() + 3 * prod.std()
dge['is_deg'] = dge['adj.P.Val'] < 0.05
dge['is_significant'] = prod > outlier_cutoff
# One point per criterion met.
dge['score'] = dge['is_deg'].astype(int) + dge['is_significant'].astype(int)

# Layers are drawn in this order; each entry is (row mask, attach hover labels?, trace options).
volcano_layers = [
    (
        ~dge.is_deg,
        False,
        dict(name='Other', showlegend=False, marker=dict(color='black')),
    ),
    (
        dge.is_deg & ~dge.is_significant,
        True,
        dict(name='Differentially Expressed', marker=dict(color='rgb(255, 221, 221)')),
    ),
    (
        dge.is_significant,
        True,
        dict(name='Significantly Far from Origin', marker=dict(color='rgb(239, 85, 59)')),
    ),
]

fig = go.Figure()
for layer_mask, with_labels, trace_options in volcano_layers:
    layer = dge.loc[layer_mask]
    if with_labels:
        trace_options['text'] = layer['label']
    fig.add_trace(go.Scattergl(
        mode='markers',
        x=layer['logFC'],
        y=layer['-log(adj.P.Val)'],
        **trace_options,
    ))
fig.update_layout(
    title='Background vs Tumor Differential Expression',
    xaxis_title='Log Fold Change',
    yaxis_title='-Log[Adjusted P-Value]',
    autosize=True,
)
fig.show()

In [None]:
%%appyter code_exec
{% if membrane_screener %}
dge['is_membrane'] = np.in1d(dge['gene_symbol'], membrane_proteins)
dge['score'] = dge['score'] + dge['is_membrane'].astype(int)
#
fig = go.Figure()
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[~dge.is_membrane&~dge.is_significant, 'logFC'],
    y=dge.loc[~dge.is_membrane&~dge.is_significant, '-log(adj.P.Val)'],
    name='Other',
    showlegend=False,
    marker=dict(
        color='black',
    )
))
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[dge.is_membrane&~dge.is_significant, 'logFC'],
    y=dge.loc[dge.is_membrane&~dge.is_significant, '-log(adj.P.Val)'],
    name='Membrane Protein',
    marker=dict(
        color='grey',
    )
))
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[~dge.is_membrane&dge.is_significant, 'logFC'],
    y=dge.loc[~dge.is_membrane&dge.is_significant, '-log(adj.P.Val)'],
    text=dge.loc[~dge.is_membrane&dge.is_significant, 'label'],
    name='Significant',
    marker=dict(
        color='rgb(239, 85, 59)',
    )
))
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[dge.is_membrane&dge.is_significant, 'logFC'],
    y=dge.loc[dge.is_membrane&dge.is_significant, '-log(adj.P.Val)'],
    text=dge.loc[dge.is_membrane&dge.is_significant, 'label'],
    name='Significant Membrane Protein',
    marker=dict(
        color='rgb(99, 110, 250)',
    )
))
fig.update_layout(
    title='Background vs Tumor',
    xaxis_title='Log Fold Change',
    yaxis_title='-Log[Adjusted P-Value]',
    autosize=True,
)
fig.show()
{% endif %}

In [None]:
%%appyter code_exec
dge_final = dge[dge.score >= 1].sort_values(['score', '-log(adj.P.Val)'], ascending=False).iloc[:16]
pd.set_option('display.max_colwidth', None)
dge_final['Link'] = dge_final['gene_symbol'].map(lambda g: f"<a href=\"https://cfde-gene-pages.cloud/gene/{g}\">{g}</a>")
{% if membrane_screener %}
display(HTML(dge_final[[
    'AveExpr',
    'logFC',
    'P.Value',
    'adj.P.Val',
    'is_deg',
    'is_significant',
    'is_membrane',
    'score',
    'Link',
]].to_html(notebook=True, escape=False)))
{% else %}
display(HTML(dge_final[[
    'AveExpr',
    'logFC',
    'P.Value',
    'adj.P.Val',
    'is_deg',
    'is_significant',
    'score',
    'Link',
]].to_html(notebook=True, escape=False)))
{% endif %}

## Review Expression Levels of Selected Candidates

In [None]:
%%appyter markdown
{% if 'Tabula Sapiens' in background.value[0].raw_value %}
The background data were obtained from the [Tabula Sapiens](https://tabula-sapiens-portal.ds.czbiohub.org) atlas (The Tabula Sapiens Consortium, Science 376, eabl4896 (2022)).
This dataset contains gene expression levels of 469 cell types from 24 normal tissues across 14 donors.  For a given gene, some cell types have zero expression levels and are listed separately.  You can explore the cell types with non-zero expression levels in the interactive plot, but please note that some cell types were only seen in a single donor and thus do not display statistics.
{% elif 'Human Cell Atlas' in background.value[0].raw_value %}
The background data were obtained from 15 datasets in the [Human Cell Atlas](https://data.humancellatlas.org).
This dataset contains gene expression levels of 27 cell types from 14 normal tissues across a number of donors.  For a given gene, some cell types have zero expression levels and are listed separately.  You can explore the cell types with non-zero expression levels in the interactive plot, but please note that some cell types were only seen in a single donor and thus do not display statistics.
{% endif %}

{% if proteomics_vis %}
Proteomics data were obtained from the [Human Protein Atlas](https://www.proteinatlas.org/about/download) (HPA) with IHC-based expression profiling, the [Human Proteome Map](https://www.humanproteomemap.org/download.php) (HPM) with MS-based expression quantification, and a [GTEx proteome project](https://doi.org/10.1016/j.cell.2020.08.036) using TMT MS. 
These datasets contain protein expression levels detected in normal tissues and cell types. Not all differentially-expressed gene candidates may be present in the data from each project (see table for which proteomics data are present/absent). Plots show expression levels (HPA), average spectral counts (HPM), or a log-transformed relative abundance (GTEx) by tissue/cell-type for each gene candidate (excluding expression levels from the HPA where the [reliability score](https://www.proteinatlas.org/about/assays+annotation) was uncertain). 
{% endif %}

In [None]:
%%appyter code_exec
{% if proteomics_vis %}
# Load the three proteomics tables (HPM, HPA, GTEx proteomics) from the appyter storage bucket.
hpm = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/hpm.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=0)
hpa = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/hpa.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=1)
gtexp = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/gtex_proteomics.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0)
# Pass the 'gene.id' column through `lookup` and store the result as 'Name'.
gtexp['Name'] = gtexp['gene.id'].map(lambda idx: lookup(idx))

# Show available genes in each dataset
available = pd.DataFrame({'Gene': dge_final['gene_symbol'], 
                          'in HPM': dge_final['gene_symbol'].isin(hpm.index), 
                          'in HPA': dge_final['gene_symbol'].isin(hpa.index),
                          'in GTEx Proteomics': dge_final['gene_symbol'].isin(gtexp.Name)}).drop_duplicates(subset=['Gene'])
display(HTML(available.to_html(notebook=True, escape=False)))
{% endif %}

In [None]:
%%appyter code_exec
# create a dictionary of tissues with their respective cell types
def create_tissue_cell_type_dict(stats):
    ''' Group the column names of `stats` by the text before their first "-".

    Each column name is split once on "-"; the first part is used as the key and the
    remainder is appended to that key's list, in column order.
    '''
    cell_types_by_tissue = {}
    for column_name in stats.columns:
        name_parts = column_name.split("-", maxsplit=1)
        tissue = name_parts[0]
        if tissue not in cell_types_by_tissue:
            cell_types_by_tissue[tissue] = []
        cell_types_by_tissue[tissue].append(name_parts[1])
    return cell_types_by_tissue

# draw a category of boxplots
def display_boxplots(stats, tissue=None, log_x=False):
    transform = lambda x: np.log2(x+1.) if log_x else x
    IQR = stats.loc['75%']-stats.loc['25%']
    fig.add_trace(go.Box(
        lowerfence=transform(np.maximum(
            stats.loc['min'],
            stats.loc['25%'] - (1.5*IQR),
        )),
        q1=transform(stats.loc['25%']),
        median=transform(stats.loc['50%']),
        q3=transform(stats.loc['75%']),
        upperfence=transform(np.minimum(
            stats.loc['max'],
            stats.loc['75%'] + (1.5*IQR),
        )),
        mean=transform(stats.loc['mean']),
        # sd=None if log_x else stats.loc['std'],
        y=stats.columns,
        {% if tumor_transcript_level.raw_value and background_transcript_level %}
        name=f"{% if single_cell %}{tissue} - {% endif %}Background ({'log2 ' if log_x else ''}Transcript Expression)",
        {% else %}
        name=f"{% if single_cell %}{tissue} - {% endif %}Background ({'log2 ' if log_x else ''}Gene Expression)",
        {% endif %}
        orientation='h'
    ))

In [None]:
%%appyter code_exec
{% if proteomics_vis %}
hpa.Tissue = hpa["Tissue"] + ", " + hpa["Cell.type"]
hpa = hpa[hpa['Reliability'] != "Uncertain"] 
gtexp['tissue_specificity'] = gtexp.tissue_specificity.fillna('NA')
{% endif %}

for index, row in dge_final.iterrows():
    gene_symbol = row['gene_symbol']
    label = row['label']
    display(Markdown(f"### {label}"))
    {% if background_transcript_level and not tumor_transcript_level.raw_value %}
    # get stats for all transcripts corresponding to this gene symbol
    stats = df_bg_stats.loc[(df_transcript_gene_map[df_transcript_gene_map['gene_symbol'] == gene_symbol].index, slice(None))].unstack()
    # identify per-tissue mask based top 5 medians, the dropped level here is the stats level since we're going
    #  to apply this mask to all stats based on the median (50%)
    mask = (stats.loc[:, (slice(None), '50%')].droplevel(1, axis=1).rank(ascending=False, method='first') <= 5)
    # apply mask to stats. the resulting mask has at most the top 5 transcripts for each tissue
    #  if this results in a transcript missing from all tissues, we'll drop it
    stats = stats.stack()[mask].unstack(0).dropna(how='all', axis=1)
    # flatten column for next step, creating columns of the form: {tissue} - {transcript_id}
    stats.columns = stats.columns.to_flat_index().map(lambda col: ' - '.join(col))
    {% elif not background_transcript_level and tumor_transcript_level.raw_value %}
    stats = df_bg_stats.loc[(gene_symbol, slice(None))]
    {% else %}
    stats = df_bg_stats.loc[(index, slice(None))]
    {% endif %}
    stats.sort_values('mean', axis=1, inplace=True)
    fig = go.Figure()
    
    {% if single_cell %}
    stats_all = stats
    # store all of the cell types with non-zero expression
    stats = stats_all.loc[:, (stats_all != 0).any(axis=0)]
    # store all of the cell types with zero expression
    stats_zero_expr = stats_all.loc[:, (stats_all == 0).all(axis=0)]
    # display the number of cell types with zero expression
    zero_expr_size = len(stats_all.columns)-len(stats.columns)
    display(Markdown(f"#### {zero_expr_size} cell type{'' if zero_expr_size == 1 else 's'} with zero expression:"))

    # create a dictionary of tissues with their respective cell types (for zero expression cell types)
    stats_zero_tissues_dict = create_tissue_cell_type_dict(stats_zero_expr)

    # display cell types with zero expression
    for tissue in stats_zero_tissues_dict.keys():
        display(Markdown(f"##### **{tissue}:** {'; '.join(stats_zero_tissues_dict[tissue])}"))
    
    # display the number of cell types with non-zero expression
    nonzero_expr_size = len(stats.columns)
    display(Markdown(f"#### {nonzero_expr_size} cell type{'' if nonzero_expr_size == 1 else 's'} with non-zero expression:"))

    # create a dictionary of tissues with their respective cell types (for non-zero expression cell types).  We won't need the values for the keys as implemented, but they're nice to have.
    stats_tissues_dict = create_tissue_cell_type_dict(stats)

    # display boxplots for each tissue
    for tissue in stats_tissues_dict.keys():
        # subset the stats into just cell types that correspond to the given tissue
        display_boxplots(stats[stats.columns[stats.columns.map(lambda x: x.startswith(tissue))]], tissue, log_x=True)

    {% else %}
    display_boxplots(stats, log_x=True)
    {% endif %}

    fig.add_trace(go.Box(
        x=np.log2(df_expr_norm.loc[index]+1.),
        {% if tumor_transcript_level.raw_value %}
        name=f"Tumor (Normalized log2 Transcript Expression)",
        {% else %}
        name=f"Tumor (Normalized log2 Gene Expression)",
        {% endif %}
        orientation='h',
    ))
    
    fig.update_layout(title=label+f" ({% if single_cell %}sc{% else %}Bulk {% endif %}RNA-seq)", height= 300 if len(stats.columns) < 2 else len(stats.columns)*50)
    fig.show()
    
    {% if proteomics_vis %}
    if gene_symbol in gtexp.Name.values:
        d = gtexp[gtexp['Name'] == gene_symbol]
        fig = px.strip(d, y="tissue", x="value",  
                       orientation='h',
                       stripmode="overlay",
                       hover_data=["tissue_specificity"],
                       height=30*d['tissue'].nunique())
        fig.add_trace(go.Box(x=d['value'],
                             y=d['tissue'],
                             orientation='h',
                             marker=dict(color='#636EFA'),
                             name="n > 1"))
        fig.update_layout(title=label+" (GTEx Proteomics)",
                          autosize=True,
                          showlegend=False)
        fig.update_xaxes(title="log2(relative abundance)")
        fig.update_yaxes(title=None)
        fig.show()
       
    if gene_symbol in hpm.index:
        fig = px.scatter(hpm.loc[[gene_symbol]], 
                         y="Tissue", x="value", 
                         height=20*hpm.loc[[gene_symbol]].shape[0])
        fig.update_layout(title=label+" (HPM)", 
                          autosize=True)
        fig.update_xaxes(title="Average Spectral Counts")
        fig.update_yaxes(title=None)
        fig.show()
    
    if gene_symbol in hpa.index:
        fig = px.scatter(hpa.loc[[gene_symbol]], 
                         y="Tissue", x="Level", 
                         category_orders={"Level": ["Not detected", "Low", "Medium", "High"]}, 
                         hover_data=["Reliability"],  
                         hover_name="Tissue",
                         height=20*hpa.loc[[gene_symbol]].shape[0])
        fig.update_layout(title=label+" (HPA)", 
                          showlegend=False, 
                          autosize=True, 
                          xaxis={'tickmode':'array', 
                                 'tickvals':[0, 1, 2, 3], 
                                 'ticktext':["Not detected", "Low", "Medium", "High"]})
        fig.update_xaxes(title="Tissue Expression Level")
        fig.update_yaxes(title=None)
        fig.show()
    {% endif %}