In [None]:
#%%appyter init
from appyter import magic
# Call appyter's magic.init with a zero-argument callable that returns this notebook's globals().
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code

{# Form section named 'primary'; the fields below attach to it with section='primary'. #}
{% do SectionField(
    name='primary',
    title='Tumor Gene Target Screener',
) %}

{# Static help text: an HTML table sketching the expected upload layout
   (genes/transcripts on the rows, replicates on the columns). #}
{% do DescriptionField(
    name='data_file_description',
    text='''
    Files should be a tsv/csv of the form:<br />
    <table class="table">
    <tr>
      <td>&nbsp;</td>
      <th>Replicate 1</th>
      <th>Replicate 2</th>
      <th>...</th>
    </tr>
    <tr>
      <th>Gene|Transcript 1</th>
      <td>0</td>
      <td>200</td>
      <td>...</td>
    </tr>
    <tr>
      <th>Gene|Transcript 2</th>
      <td>5</td>
      <td>180</td>
      <td>...</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    </table>''',
    section='primary',
) %}

{# Required upload of the patient expression matrix; `file` is later rendered as the
   argument of read_table(...). An example dataset is offered under the default's name. #}
{% set file = FileField(
    name='patient_expression',
    label='Patient RNA-seq expression vectors',
    description='Gene/Transcripts on the rows, replicates on the columns',
    default='GSE49155-lung-squamous-cell-carcinoma.tsv',
    required=True,
    examples={
        'GSE49155-lung-squamous-cell-carcinoma.tsv': 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/GSE49155-patient.tsv',
    },
    section='primary',
) %}

{# Declares whether the uploaded patient matrix is indexed by transcripts (yes) or genes (no).
   Attached to the 'primary' section like every other top-level field in this form. #}
{% set tumor_transcript_level = BoolField(
    name='tumor_transcript_level',
    label='Whether the patient RNA-seq expression vectors is at the level of transcripts or genes',
    default=False,
    yes_label='Transcript Level',
    no_label='Gene Level',
    section='primary',
) %}

{# Normal-tissue background selector with two tabs:
   - 'Precomputed': a choice whose value is a ready-made argument list for pd.read_csv
     (quoted s3 URL plus storage_options);
   - 'Custom': an uploaded statistics file, a gene/transcript level switch, and help text
     describing the expected layout (index = identifier + statistic, columns = tissues). #}
{% set background = TabField(
    name='background',
    label='Normal tissue background',
    description='Patient expression will be contrasted against this background',
    default='Precomputed',
    choices={
        'Precomputed': [
            ChoiceField(
                name='background_dataset',
                label='Normal tissue background',
                description='Choose Gene or Transcript Background variant if your patient vectors are at the Gene or Transcript level.',
                choices={
                    'GTEx - Gene': '"s3://storage/Tumor_Gene_Target_Screener/gtex-gene-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'GTEx - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/gtex-transcript-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Anatomy - Gene': '"s3://storage/Tumor_Gene_Target_Screener/archs4-gene-anatomy-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Anatomy - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/archs4-transcript-anatomy-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Extra - Gene': '"s3://storage/Tumor_Gene_Target_Screener/archs4-gene-extra-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                    'ARCHS4 Extra - Transcript': '"s3://storage/Tumor_Gene_Target_Screener/archs4-transcript-extra-stats.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True)',
                },
                default='GTEx - Gene',
            ),
        ],
        'Custom': [
            FileField(
                name='background_upload',
                label='Normal tissue background',
                description='Given a matrix (d), genes or transcripts by samples, this matrix can be constructed with `d.T.groupby(sample_tissue_mappings).describe().T`',
                examples={
                    'GTEx': 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/gtex-gene-stats.tsv',
                },
                default=None,
                section='primary',
            ),
            BoolField(
                name='background_transcript_level',
                label='Whether this file is at the level of transcripts or genes',
                default=False,
                yes_label='Transcript Level',
                no_label='Gene Level',
            ),
            DescriptionField(
                name='background_file_description',
                text='''
                Files should be a tsv/csv of the form:<br />
                <table class="table">
                <tr>
                  <td>&nbsp;</td>
                  <td>&nbsp;</td>
                  <th>Tissue 1</th>
                  <th>...</th>
                </tr>
                <tr>
                  <th>Gene|Transcript 1</th>
                  <th>25%</th>
                  <td>0</td>
                  <td>...</td>
                </tr>
                <tr>
                  <th>...</th>
                  <th>...</th>
                  <td>...</td>
                  <td>...</td>
                </tr>
                <tr>
                  <th>Gene|Transcript n</th>
                  <th>mean</th>
                  <td>180</td>
                  <td>...</td>
                </tr>
                </table>''',
            ),
        ],
    },
    section='primary',
) %}

{# First child of the selected tab (presumably the ChoiceField for 'Precomputed' and the
   FileField for 'Custom'); rendered later as the argument(s) of pd.read_csv. #}
{% set background_dataset = background.value[0] %}
{# Transcript-level flag for the background: for 'Precomputed' it is inferred from the chosen
   label ending in '- Transcript'; otherwise it is read from the second child of the tab
   (the BoolField), falling back to False when that value is empty. #}
{% set background_transcript_level = (
    background.value[0].raw_value.endswith('- Transcript')
) if background.raw_value == 'Precomputed' else (
    background.value[1].raw_value or False
) %}

{# Toggle (default on) guarding the Membranome download and the membrane scoring/plot cells. #}
{% set membrane_screener = BoolField(
    name='membrane_screener',
    label='Prioritize membrane genes',
    description='Use membranome to identify membrane',
    default=True,
    section='primary',
) %}

{# Toggle (default on) selecting quantile normalization to the background before
   differential expression; when off the data are only restricted to shared IDs. #}
{% set normalize_to_background = BoolField(
    name='normalize_to_background',
    label='Normalize to background distribution',
    default=True,
    section='primary',
) %}

{# Toggle (default on) guarding the proteomics explanation, data loading and per-gene plots. #}
{% set proteomics_vis = BoolField(
    name='proteomics_vis',
    label='Show protein expression profiles of gene candidates',
    description='View protein expression levels in normal tissues, from Human Proteome Map and Human Protein Atlas proteomics data',
    default=True,
    section='primary',
) %}

# Overexpressed Candidate Identification

This appyter uses RNA-seq expression data for a patient and identifies over-expressed proteins versus a baseline dataset of normal tissues such as those in GTEx or ARCHS4. It then prioritizes candidates by significance and targetability.

In [None]:
%%appyter code_exec
import qnorm
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from IPython.display import HTML, display, Markdown
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
# Build the gene lookup callable once; later cells apply it to identifiers (e.g. `.map(lookup)`).
lookup = ncbi_genes_lookup()

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest='/dev/null'):
    ''' Temporarily redirect ``sys.stdout`` and/or ``sys.stderr`` to a file.

    Usage:
    with suppress_output():
        print('hi')

    :param stdout: when true, ``sys.stdout`` is redirected for the duration of the block
    :param stderr: when true, ``sys.stderr`` is redirected for the duration of the block
    :param dest: path opened in append mode to receive the redirected output
    '''
    # Opening the sink with `with` guarantees it is flushed and closed on exit,
    # even when the wrapped block raises (previously the handle was never closed).
    with open(dest, 'a') as sink:
        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        if stdout:
            sys.stdout = sink
        if stderr:
            sys.stderr = sink
        try:
            yield
        finally:
            # Only restore the streams that were actually replaced
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

## Load Patient RNA-seq Expression Data

Load RNA-seq expression data for the patient.

In [None]:
def read_table(filename):
    ''' Load a delimited table into a DataFrame with the first column as the index.

    The parser is chosen from the file name suffix (optionally followed by ``.gz``):
    ``.tsv`` is read tab-separated, ``.csv`` comma-separated, and ``.gct`` tab-separated
    after skipping the first two lines. Any other name is read with the delimiter
    sniffed by pandas' python engine.
    '''
    read_kwargs = dict(index_col=0)
    if filename.endswith(('.tsv', '.tsv.gz')):
        read_kwargs.update(sep='\t')
    elif filename.endswith(('.csv', '.csv.gz')):
        read_kwargs.update(sep=',')
    elif filename.endswith(('.gct', '.gct.gz')):
        read_kwargs.update(sep='\t', skiprows=2)
    else:
        # Unknown suffix: let pandas detect the separator
        return pd.read_table(filename, sep=None, engine='python', index_col=0)
    return pd.read_csv(filename, **read_kwargs)

In [None]:
%%appyter code_eval
# Read the uploaded patient expression matrix (first column becomes the index).
df_expr = read_table({{ file }})
{% if tumor_transcript_level.raw_value %}
# Transcript level: keep the part of each ID before the first '.' (presumably dropping a
# version suffix) and sum the rows that share the resulting ID.
df_expr_transcripts = df_expr.index.map(lambda idx: idx.partition('.')[0])
df_expr = df_expr.groupby(df_expr_transcripts, observed=True).sum()
{% else %}
# Gene level: keep the part of each ID before the first '.', pass it through `lookup`,
# and take the median of the rows that map to the same result.
df_expr_genes = df_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
df_expr = df_expr.groupby(df_expr_genes, observed=True).median()
{% endif %}
df_expr

## Load Background Dataset

The background dataset contains expression for normal tissues in many patients.

In [None]:
%%appyter code_eval
# Background summary statistics, indexed by the first two columns
# (presumably identifier and statistic name) with one column per tissue.
# NOTE(review): the custom-upload help text mentions tsv/csv, but this always parses with
# sep='\t' - confirm whether csv uploads are meant to be supported.
df_bg_stats = pd.read_csv({{ background_dataset }}, sep='\t', index_col=[0,1])
{% if background_transcript_level %}
# Transcript level: trim each ID at the first '.', then sum rows sharing the trimmed ID.
# The unstack/stack round trip moves the second index level into the columns and back.
df_bg_transcripts = df_bg_stats.unstack().index.map(lambda idx: idx.partition('.')[0])
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_transcripts, observed=True).sum().stack()
{% else %}
# Gene level: trim each ID at the first '.', pass it through `lookup`, and take the median
# of rows that map to the same result.
df_bg_genes = df_bg_stats.unstack().index.map(lambda idx: lookup(idx.partition('.')[0]))
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_genes, observed=True).median().stack()
{% endif %}
# Keep only the quartile rows ('25%', '50%', '75%') and move that index level into the columns.
df_bg_expr = df_bg_stats.loc[(slice(None), ['25%', '50%', '75%']), :].unstack()
df_bg_expr

In [None]:
%%appyter markdown
{% if background_transcript_level or tumor_transcript_level.raw_value %}
## Load Transcript <=> Gene Mappings

{% endif %}

In [None]:
%%appyter code_exec
{% if background_transcript_level or tumor_transcript_level.raw_value %}
# Transcript-to-gene mapping table (gzip-compressed TSV, first column as the index);
# only loaded when at least one of the two inputs is at the transcript level.
df_transcript_gene_map = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/transcript-gene-map.tsv.gz", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=0, compression='gzip')
df_transcript_gene_map
{% endif %}

In [None]:
%%appyter markdown
{% if membrane_screener.raw_value %}

## Load Membrane Proteins for Screening

Membrane proteins are ideal targets; we can get these from [Membranome](https://membranome.org/), among other places.

{% endif %}

In [None]:
%%appyter code_eval
{% if membrane_screener.raw_value %}
# Download the Membranome protein table as CSV, keep the 'Homo sapiens' rows, and pass each
# gene name through `lookup`; names for which the lookup yields a missing value are dropped.
proteins = pd.read_csv('https://lomize-group-membranome.herokuapp.com/proteins?fileFormat=csv')
proteins = proteins[proteins['species_name_cache'] == 'Homo sapiens']
membrane_proteins = proteins['genename'].map(lookup).dropna()
membrane_proteins
{% endif %}

In [None]:
%%appyter markdown
{% if background_transcript_level and tumor_transcript_level.raw_value %}
## Background (Transcript) - Tumor (Transcript) Gene Interoperability

Several features of this appyter operate at the gene level; thus, while we will find significant transcripts,
we will use the associated gene for the other features.
{% endif %}

In [None]:
%%appyter markdown
{% if background_transcript_level and not tumor_transcript_level.raw_value %}
## Background (Transcript) - Tumor (Gene) Interoperability

As the tumor RNA-seq expression data is at the gene level, we will map the
background to genes for differential expression but later highlight significant transcripts in the background.
{% endif %}

In [None]:
%%appyter code_eval
{% if background_transcript_level and not tumor_transcript_level.raw_value %}
# Collapse the transcript-level background to genes: sum rows by their mapped gene symbol,
# then regroup the resulting labels through `lookup` and take the median per group.
df_bg_expr = df_bg_expr.groupby(df_transcript_gene_map['gene_symbol']).sum().groupby(lookup).median()
df_bg_expr
{% endif %}

In [None]:
%%appyter markdown
{% if not background_transcript_level and tumor_transcript_level.raw_value %}
## Background (Gene) - Tumor (Transcript) Interoperability

Though the tumor RNA-seq expression data is at the transcript level, the background is only at the gene level.
This is not ideal, but we can still highlight individual transcripts in the tumor that are over-expressed
when compared to the average background expression across all transcripts.
{% endif %}

In [None]:
%%appyter code_eval
{% if not background_transcript_level and tumor_transcript_level.raw_value %}
# "melt" from wide format to long format with the columns: (index, type, stat, value)
df_bg_expr_melted = df_bg_expr.melt(ignore_index=False)
# merge with transcript_gene_map will transform the index from gene_symbol to ensembl_transcript_id
#  duplicating entries for each statistic accordingly
df_bg_expr_melted_mapped = pd.merge(
    left=df_bg_expr_melted, left_index=True,
    right=df_transcript_gene_map, right_on='gene_symbol',
)
# "pivot" back from long format to wide format: rows keep the merged index, columns are
#  rebuilt from the two melted column levels (variable_0, variable_1)
df_bg_expr = df_bg_expr_melted_mapped.pivot(columns=['variable_0', 'variable_1'], values='value')
df_bg_expr
{% endif %}

## Distribution matching between patient sample(s) & the background

In [None]:
# Pre-normalization distributions on a 2x2 grid: patient on the top row, background below.
# Left column: per-row medians after log2_normalize; right column: per-column medians.
fig, axes = plt.subplots(2, 2)
(ax11, ax12), (ax21, ax22) = axes

patient_row_medians = log2_normalize(df_expr).median(axis=1)
patient_row_medians.hist(bins=100, ax=ax11)
ax11.set_title('Median Expression')
ax11.set_ylabel('Patient')

patient_column_medians = df_expr.median(axis=0)
patient_column_medians.hist(bins=100, ax=ax12)
ax12.set_title('Median Sample Expression')

background_row_medians = log2_normalize(df_bg_expr).median(axis=1)
background_row_medians.hist(bins=100, ax=ax21)
ax21.set_ylabel('Background')

# Kept as the cell's final expression so the notebook output is unchanged
df_bg_expr.median(axis=0).hist(bins=100, ax=ax22)

In [None]:
# IDs present in both matrices drive every later comparison; the Venn diagram shows the overlap.
patient_ids = set(df_expr.index)
background_ids = set(df_bg_expr.index)
common_index = list(patient_ids & background_ids)
venn2([patient_ids, background_ids], ['Patients  ', '  Background'])

In [None]:
%%appyter code_exec
{% if normalize_to_background.raw_value %}
# Use the per-row median of the background (restricted to the shared IDs) as the target
# distribution, then quantile-normalize both matrices against it.
target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)
df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

# Re-draw the 2x2 distribution grid (patient on top, background below) on the normalized data.
fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2)
log2_normalize(df_expr_norm).median(axis=1).hist(bins=100, ax=ax11)
ax11.set_title('Median Expression')
ax11.set_ylabel('Patient')
df_expr_norm.median(axis=0).hist(bins=100, ax=ax12)
ax12.set_title('Median Sample Expression')
log2_normalize(df_bg_expr_norm).median(axis=1).hist(bins=100, ax=ax21)
ax21.set_ylabel('Background')
df_bg_expr_norm.median(axis=0).hist(bins=100, ax=ax22)
{% else %}
# Normalization disabled: only restrict both matrices to the shared IDs.
print('Warning: Proceeding without normalization')
df_expr_norm = df_expr.loc[common_index, :]
df_bg_expr_norm = df_bg_expr.loc[common_index, :]
{% endif %}

## Perform Differential Expression between Patient & Background

In [None]:
%%appyter code_exec

# Differential expression with limma-voom, passing the background frame first and the patient
# frame second; suppress_output redirects anything written to stdout/stderr while it runs.
with suppress_output():
    dge = limma_voom_differential_expression(
        df_bg_expr_norm, df_expr_norm,
        voom_design=True,
    )
    {% if tumor_transcript_level.raw_value %}
    # Rows are transcripts: keep the ID as a column and attach the mapped gene symbol
    # (passed through `lookup`).
    dge['ensembl_transcript_id'] = dge.index
    dge['gene_symbol'] = df_transcript_gene_map.loc[dge.index, 'gene_symbol'].apply(lookup)
    {% else %}
    # Rows are already genes: the index doubles as the gene symbol.
    dge['gene_symbol'] = dge.index
    {% endif %}
dge

## Narrow Down Candidate Set

In [None]:
# Score each row: a row is "significant" when |t| * logFC lies more than three standard
# deviations above the mean of that product; significance contributes one point to the score.
dge['-log(adj.P.Val)'] = -np.log(dge['adj.P.Val'])
prod = dge['t'].abs() * dge['logFC']
significance_cutoff = prod.mean() + 3 * prod.std()
dge['is_significant'] = prod > significance_cutoff
dge['score'] = dge['is_significant'].astype(int)

# Volcano plot: non-significant rows in black (no legend entry), significant rows in red
# with the gene symbol as hover text.
other_rows = dge.loc[~dge['is_significant']]
significant_rows = dge.loc[dge['is_significant']]
fig = go.Figure()
fig.add_trace(go.Scattergl(
    x=other_rows['logFC'],
    y=other_rows['-log(adj.P.Val)'],
    mode='markers',
    name='Other',
    showlegend=False,
    marker=dict(color='black'),
))
fig.add_trace(go.Scattergl(
    x=significant_rows['logFC'],
    y=significant_rows['-log(adj.P.Val)'],
    text=significant_rows['gene_symbol'],
    mode='markers',
    name='Significant',
    marker=dict(color='red'),
))
fig.update_layout(
    title='Background vs Patient Differential Expression',
    xaxis_title='Log Fold Change',
    yaxis_title='-Log[Adjusted P-Value]',
    autosize=True,
)
fig.show()

In [None]:
%%appyter code_exec
{% if membrane_screener.raw_value %}
dge['is_membrane'] = np.in1d(dge['gene_symbol'], membrane_proteins)
dge['score'] = dge['score'] + dge['is_membrane'].astype(int)
#
fig = go.Figure()
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[~dge.is_membrane&~dge.is_significant, 'logFC'],
    y=dge.loc[~dge.is_membrane&~dge.is_significant, '-log(adj.P.Val)'],
    name='Other',
    showlegend=False,
    marker=dict(
        color='black',
    )
))
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[~dge.is_membrane&dge.is_significant, 'logFC'],
    y=dge.loc[~dge.is_membrane&dge.is_significant, '-log(adj.P.Val)'],
    text=dge.loc[~dge.is_membrane&dge.is_significant, 'gene_symbol'],
    name='Significant Other',
    showlegend=False,
    marker=dict(
        color='red',
    )
))
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[dge.is_membrane&~dge.is_significant, 'logFC'],
    y=dge.loc[dge.is_membrane&~dge.is_significant, '-log(adj.P.Val)'],
    name='Membrane Protein',
    marker=dict(
        color='grey',
    )
))
fig.add_trace(go.Scattergl(
    mode='markers',
    x=dge.loc[dge.is_membrane&dge.is_significant, 'logFC'],
    y=dge.loc[dge.is_membrane&dge.is_significant, '-log(adj.P.Val)'],
    text=dge.loc[dge.is_membrane&dge.is_significant, 'gene_symbol'],
    name='Significant Membrane Protein',
    marker=dict(
        color='orange',
    )
))
fig.update_layout(
    title='Background vs Tumor',
    xaxis_title='Log Fold Change',
    yaxis_title='-Log[Adjusted P-Value]',
    autosize=True,
)
fig.show()
{% endif %}

In [None]:
%%appyter code_eval
# Keep rows with at least one point, rank by score and then t-statistic (both descending),
# and take the top 16. The explicit .copy() makes dge_final an independent frame, so adding
# the 'Link' column below does not trigger pandas' SettingWithCopyWarning.
dge_final = dge[dge.score >= 1].sort_values(['score', 't'], ascending=False).iloc[:16].copy()
# Do not truncate cell contents, otherwise the anchor tags below would be cut off
pd.set_option('display.max_colwidth', None)
dge_final['Link'] = dge_final['gene_symbol'].map(lambda g: f"<a href=\"https://cfde-gene-pages.cloud/gene/{g}\">Link</a>")
# escape=False so the anchors render as clickable links
HTML(dge_final.to_html(notebook=True, escape=False))

## Review Expression Levels of Selected Candidates

In [None]:
%%appyter markdown
{% if proteomics_vis.raw_value %}
Proteomics data were obtained from the [Human Protein Atlas](https://www.proteinatlas.org/about/download) (HPA) with IHC-based expression profiling, the [Human Proteome Map](https://www.humanproteomemap.org/download.php) (HPM) with MS-based expression quantification, and a [GTEx proteome project](https://doi.org/10.1016/j.cell.2020.08.036) using TMT MS. 
These datasets contain protein expression levels detected in normal tissues and cell types. Not all differentially-expressed gene candidates may be present in the data from each project (see table for which proteomics data are present/absent). Plots show expression levels (HPA), average spectral counts (HPM), or a log-transformed relative abundance (GTEx) by tissue/cell-type for each gene candidate (excluding expression levels from the HPA where the [reliability score](https://www.proteinatlas.org/about/assays+annotation) was uncertain). 
{% endif %}

In [None]:
%%appyter code_exec
{% if proteomics_vis.raw_value %}

# Load the three proteomics tables (tab-separated) from the appyter storage endpoint.
# hpm is indexed by its first column and hpa by its second; gtexp keeps a default index.
hpm = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/hpm.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=0)
hpa = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/hpa.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=1)
gtexp = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/gtex_proteomics.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0)
# Add a 'Name' column by passing each 'gene.id' through `lookup`, so it can be matched
# against the candidates' gene symbols.
gtexp['Name'] = gtexp['gene.id'].map(lambda idx: lookup(idx))

# Show available genes in each dataset
available = pd.DataFrame({'Gene': dge_final['gene_symbol'], 
                          'in HPM': dge_final['gene_symbol'].isin(hpm.index), 
                          'in HPA': dge_final['gene_symbol'].isin(hpa.index),
                          'in GTEx Proteomics': dge_final['gene_symbol'].isin(gtexp.Name)}).drop_duplicates(subset=['Gene'])
display(HTML(available.to_html(notebook=True, escape=False)))
{% endif %}

In [None]:
%%appyter code_eval
{% if proteomics_vis.raw_value %}
import plotly.express as px
# Combine tissue and cell type into a single label used on the HPA plot axis
hpa.Tissue = hpa["Tissue"] + ", " + hpa["Cell.type"]
# Drop HPA rows whose reliability is "Uncertain"
hpa = hpa[hpa['Reliability'] != "Uncertain"] 
# Replace missing tissue specificity with the string 'NA' (shown as hover data on the GTEx plot)
gtexp['tissue_specificity'] = gtexp.tissue_specificity.fillna('NA')
{% endif %}

for index, row in dge_final.iterrows():
    gene_symbol = row['gene_symbol']
    label = f"{index}" if index == gene_symbol else f"{index} - {gene_symbol}"
    display(Markdown(f"### {label}"))
    {% if background_transcript_level and not tumor_transcript_level.raw_value %}
    stats = df_bg_stats.loc[(df_transcript_gene_map[df_transcript_gene_map['gene_symbol'] == gene_symbol].index, slice(None))].unstack(0)
    stats.columns = stats.columns.to_flat_index().map(lambda col: ' - '.join(col))
    {% else %}
    stats = df_bg_stats.loc[(index, slice(None))]
    {% endif %}
    IQR = stats.loc['75%']-stats.loc['25%']
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            stats.loc['min'],
            stats.loc['25%'] - (1.5*IQR),
        ),
        q1=stats.loc['25%'],
        median=stats.loc['50%'],
        q3=stats.loc['75%'],
        upperfence=np.minimum(
            stats.loc['max'],
            stats.loc['75%'] + (1.5*IQR),
        ),
        mean=stats.loc['mean'],
        sd=stats.loc['std'],
        y=stats.columns,
        name='Background',
        orientation='h'
    ))
    fig.add_trace(go.Box(
        x=df_expr_norm.loc[index],
        name='Patient (Normalized)',
        orientation='h'
    ))
    fig.update_layout(title=label+" (RNA-seq)", height=1200)
    fig.show()
    
    {% if proteomics_vis.raw_value %}
    if gene_symbol in gtexp.Name.values:
        d = gtexp[gtexp['Name'] == gene_symbol]
        fig = px.strip(d, y="tissue", x="value",  
                       orientation='h',
                       stripmode="overlay",
                       hover_data=["tissue_specificity"],
                       height=30*d['tissue'].nunique())
        fig.add_trace(go.Box(x=d['value'],
                             y=d['tissue'],
                             orientation='h',
                             marker=dict(color='#636EFA'),
                             name="n > 1"))
        fig.update_layout(title=label+" (GTEx Proteomics)",
                          autosize=True,
                          showlegend=False)
        fig.update_xaxes(title="log2(relative abundance)")
        fig.update_yaxes(title=None)
        fig.show()
       
    if gene_symbol in hpm.index:
        fig = px.scatter(hpm.loc[[gene_symbol]], 
                         y="Tissue", x="value", 
                         height=20*hpm.loc[[gene_symbol]].shape[0])
        fig.update_layout(title=label+" (HPM)", 
                          autosize=True)
        fig.update_xaxes(title="Average Spectral Counts")
        fig.update_yaxes(title=None)
        fig.show()
    
    if gene_symbol in hpa.index:
        fig = px.scatter(hpa.loc[[gene_symbol]], 
                         y="Tissue", x="Level", 
                         category_orders={"Level": ["Not detected", "Low", "Medium", "High"]}, 
                         hover_data=["Reliability"],  
                         hover_name="Tissue",
                         height=20*hpa.loc[[gene_symbol]].shape[0])
        fig.update_layout(title=label+" (HPA)", 
                          showlegend=False, 
                          autosize=True, 
                          xaxis={'tickmode':'array', 
                                 'tickvals':[0, 1, 2, 3], 
                                 'ticktext':["Not detected", "Low", "Medium", "High"]})
        fig.update_xaxes(title="Tissue Expression Level")
        fig.update_yaxes(title=None)
        fig.show()
    {% endif %}