In [None]:
#%%appyter init
from appyter import magic
# Hand appyter a callable that returns this notebook's globals() when invoked
# (presumably this is what enables the %%appyter cell magics used below — confirm in appyter docs).
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code

{# Section that the input fields declared in this cell attach to via section='primary'. #}
{% do SectionField(
    name='primary',
    title='Tumor Gene Target Screener',
) %}

{# Static help text: example layout (genes on rows, replicates on columns) of the expected patient file. #}
{% do DescriptionField(
    name='data_file_description',
    text='''
    Files should be a tsv/csv of the form:<br />
    <table class="table">
    <tr>
      <td>&nbsp;</td>
      <th>Replicate 1</th>
      <th>Replicate 2</th>
      <th>...</th>
    </tr>
    <tr>
      <th>Gene 1</th>
      <td>0</td>
      <td>200</td>
      <td>...</td>
    </tr>
    <tr>
      <th>Gene 2</th>
      <td>5</td>
      <td>180</td>
      <td>...</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    </table>''',
    section='primary',
) %}

{# Required patient expression upload; `file` is later rendered as the argument of read_table(). #}
{% set file = FileField(
    name='patient_expression',
    label='Patient RNA-seq expression vectors',
    description='Gene/Transcripts on the rows, replicates on the columns',
    default='GSE49155-lung-squamous-cell-carcinoma.tsv',
    required=True,
    examples={
        'GSE49155-lung-squamous-cell-carcinoma.tsv': 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/GSE49155-patient.tsv',
    },
    section='primary',
) %}

{#
  Background selector with two tabs:
    - 'Precomputed': choose one of the hosted per-tissue statistics tables (choice values are quoted URLs).
    - 'Custom': upload a statistics table in the layout shown by the accompanying DescriptionField.
#}
{% set background = TabField(
    name='background',
    label='Normal tissue background',
    description='Patient expression will be contrasted against this background',
    default='Precomputed',
    choices={
        'Precomputed': [
            ChoiceField(
                name='background_dataset',
                label='Normal tissue background',
                description='Option 1 uses median vectors for each tissue while option 2 uses them for each tissue subcluster',
                choices={
                    'GTEx': '"https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/gtex-tissue-stats.tsv"',
                    'ARCHS4': '"https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/archs4-anatomy-stats.tsv"',
                    'ARCHS4 - extra': '"https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/archs4-extra-stats.tsv"',
                },
                default='GTEx',
            ),
        ],
        'Custom': [
            FileField(
                name='background_upload',
                label='Normal tissue background',
                description='Given a matrix d genes by samples, this matrix can be constructed with `d.T.groupby(sample_tissue_mappings).describe().T`',
                examples={
                    'GTEx': 'https://appyters.maayanlab.cloud/storage/Tumor_Gene_Target_Screener/gtex-tissue-stats.tsv',
                },
                default=None,
                section='primary',
            ),
            DescriptionField(
                name='background_file_description',
                text='''
                Files should be a tsv/csv of the form:<br />
                <table class="table">
                <tr>
                  <td>&nbsp;</td>
                  <td>&nbsp;</td>
                  <th>Tissue 1</th>
                  <th>...</th>
                </tr>
                <tr>
                  <th>Gene|Transcript 1</th>
                  <th>25%</th>
                  <td>0</td>
                  <td>...</td>
                </tr>
                <tr>
                  <th>...</th>
                  <th>...</th>
                  <td>...</td>
                  <td>...</td>
                </tr>
                <tr>
                  <th>Gene|Transcript n</th>
                  <th>mean</th>
                  <td>180</td>
                  <td>...</td>
                </tr>
                </table>''',
            ),
        ],
    },
    section='primary',
) %}

{# First entry of the tab's value; presumably the first field of the selected tab
   (the ChoiceField for 'Precomputed', the FileField for 'Custom') — confirm against appyter's TabField. #}
{% set background_dataset = background.value[0] %}

{# When enabled, membrane proteins are loaded and add one point to each gene's score. #}
{% set membrane_screener = BoolField(
    name='membrane_screener',
    label='Prioritize membrane genes',
    default=True,
    section='primary',
) %}

{# When enabled, identifiers are only stripped of their '.suffix' instead of being mapped through `lookup`. #}
{% set transcript_level = BoolField(
    name='transcript_level',
    label='Operate directly on transcripts as opposed to genes',
    default=False,
    yes_label='Transcript Level',
    no_label='Gene Level',
    section='primary',
) %}

{# When enabled, patient and background matrices are quantile-normalized to a shared target distribution. #}
{% set normalize_to_background = BoolField(
    name='normalize_to_background',
    label='Normalize to background distribution',
    default=True,
    section='primary',
) %}

# Overexpressed Gene Candidate Identification

This appyter takes RNA-seq expression data for a patient and identifies genes that are over-expressed versus a baseline dataset of normal tissues, such as those in GTEx or ARCHS4. It then prioritizes gene candidates by significance and targetability.

In [None]:
%%appyter code_exec
import qnorm
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
{% if membrane_screener.raw_value or not transcript_level.raw_value %}
# `lookup` is only used to map identifiers at gene level and to map membrane protein gene names,
# so it is only built when at least one of those options is enabled.
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
lookup = ncbi_genes_lookup()
{% endif %}

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest='/dev/null'):
    ''' Temporarily redirect sys.stdout and/or sys.stderr to a file.

    Usage:
    with suppress_output():
        print('hi')

    :param stdout: redirect sys.stdout while the block runs
    :param stderr: redirect sys.stderr while the block runs
    :param dest: path of the file opened in append mode to receive the output

    The original streams are restored and the destination file is closed
    when the block exits, including when it exits through an exception.
    '''
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    # The `with` guarantees the sink is flushed and closed; previously the handle leaked on every call.
    with open(dest, 'a') as sink:
        if stdout:
            sys.stdout = sink
        if stderr:
            sys.stderr = sink
        try:
            yield
        finally:
            # Restore before the sink is closed so nothing can write to a closed file.
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

## Load Patient RNA-seq Expression Data

Load RNA-seq expression data for the patient.

In [None]:
def read_table(filename):
    ''' Read an expression matrix into a DataFrame indexed by its first column.

    The parser is chosen from the file extension (optionally gzip-compressed):
    - .tsv / .tsv.gz: tab separated
    - .csv / .csv.gz: comma separated
    - .gct / .gct.gz: tab separated after the two GCT preamble lines; the textual
      `Description` column is dropped so that only numeric sample columns remain
    - anything else: the separator is sniffed by pandas' python engine

    :param filename: path or URL of the table
    :return: pandas DataFrame with the first column as index
    '''
    if filename.endswith(('.tsv', '.tsv.gz')):
        return pd.read_csv(filename, sep='\t', index_col=0)
    if filename.endswith(('.csv', '.csv.gz')):
        return pd.read_csv(filename, sep=',', index_col=0)
    if filename.endswith(('.gct', '.gct.gz')):
        gct = pd.read_csv(filename, sep='\t', index_col=0, skiprows=2)
        # GCT rows are `Name, Description, <samples...>`; keeping the description text
        # would break numeric aggregation of the returned matrix.
        return gct.drop(columns=['Description'], errors='ignore')
    return pd.read_table(filename, sep=None, engine='python', index_col=0)

In [None]:
%%appyter code_eval
df_expr = read_table({{ file }})
{% if transcript_level.raw_value %}
df_expr_transcripts = df_expr.index.map(lambda idx: idx.partition('.')[0])
df_expr = df_expr.groupby(df_expr_genes, observed=True).sum()
{% else %}
df_expr_genes = df_expr.index.map(lambda idx: lookup(idx.partition('.')[0]))
df_expr = df_expr.groupby(df_expr_genes, observed=True).sum()
{% endif %}
df_expr

## Load Background Dataset

The background dataset contains expression for normal tissues in many patients.

In [None]:
%%appyter code_eval
df_bg_stats = pd.read_csv({{ background_dataset }}, sep='\t', index_col=[0,1])
{% if transcript_level.raw_value %}
df_bg_transcripts = df_bg_stats.unstack().index.map(lambda idx: idx.partition('.')[0])
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_transcripts, observed=True).sum().stack()
{% else %}
df_bg_genes = df_bg_stats.unstack().index.map(lambda idx: lookup(idx.partition('.')[0]))
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_genes, observed=True).sum().stack()
{% endif %}
df_bg_expr = df_bg_stats.loc[(slice(None), ['25%', '50%', '75%']), :].unstack()
df_bg_expr

In [None]:
%%appyter markdown
{% if membrane_screener.raw_value %}

## Load Membrane Proteins for Screening

Membrane proteins are ideal targets; we can get these from [Membranome](https://membranome.org/), among other places.

{% endif %}

In [None]:
%%appyter code_eval
{% if membrane_screener.raw_value %}
# Download the Membranome protein table as CSV (requires network access).
proteins = pd.read_csv('https://lomize-group-membranome.herokuapp.com/proteins?fileFormat=csv')
# Keep human entries only.
proteins = proteins[proteins['species_name_cache'] == 'Homo sapiens']
# Map gene names through `lookup` and drop the ones that map to a missing value.
membrane_proteins = proteins['genename'].map(lookup).dropna()
membrane_proteins
{% endif %}

## Distribution matching between patient sample(s) & the background

In [None]:
# 2x2 overview before normalization: patient data on the top row, background on the bottom row.
# Left column: per-row median of log2-normalized values; right column: per-column median of raw values.
fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2)
log2_normalize(df_expr).median(axis=1).hist(bins=100, ax=ax11)
ax11.set_title('Median Expression')
df_expr.median(axis=0).hist(bins=100, ax=ax12)
ax12.set_title('Median Sample Expression')
log2_normalize(df_bg_expr).median(axis=1).hist(bins=100, ax=ax21)
ax21.set_title('Median Background Expression')
df_bg_expr.median(axis=0).hist(bins=100, ax=ax22)
ax22.set_title('Median Background Tissue Expression')
# Keep the panel titles from overlapping the neighbouring axes.
fig.tight_layout()

In [None]:
# Identifiers present in both matrices. An index intersection is used instead of a set
# intersection so the row order is the same on every run (set iteration order depends on
# string hash randomization), which keeps downstream tables reproducible.
common_index = list(df_expr.index.intersection(df_bg_expr.index, sort=False))
venn2([set(df_expr.index), set(df_bg_expr.index)],
      ['Patients  ', '  Background'])

In [None]:
%%appyter code_exec
{% if normalize_to_background.raw_value %}
target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)
df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2)
log2_normalize(df_expr_norm).median(axis=1).hist(bins=100, ax=ax11)
ax11.set_title('Median Expression')
df_expr_norm.median(axis=0).hist(bins=100, ax=ax12)
ax12.set_title('Median Sample Expression')
log2_normalize(df_bg_expr_norm).median(axis=1).hist(bins=100, ax=ax21)
df_bg_expr_norm.median(axis=0).hist(bins=100, ax=ax22)
{% else %}
print('Warning: Proceeding without normalization')
df_expr_norm = df_expr.loc[common_index, :]
df_bg_expr_norm = df_bg_expr.loc[common_index, :]
{% endif %}

## Perform Differential Expression between Patient & Background

In [None]:
# Differential expression with the background quartile vectors passed first and the patient
# samples second (presumably controls vs. cases — confirm against maayanlab_bioinformatics).
# suppress_output() redirects stdout/stderr for the duration of the call.
with suppress_output():
    dge = limma_voom_differential_expression(
        df_bg_expr_norm, df_expr_norm,
        voom_design=True,
    )
dge

## Narrow down Geneset

In [None]:
# Natural-log transformed adjusted p-value, used as the y axis of the volcano-style plot.
dge['-log(adj.P.Val)'] = np.negative(np.log(dge['adj.P.Val']))
# |t| * logFC is only large for up-regulated genes with a strong test statistic;
# a gene is called significant when it lies more than 3 standard deviations above the mean.
prod = dge['t'].abs() * dge['logFC']
dge['is_significant'] = prod.gt(prod.mean() + 3 * prod.std())
# The score starts as 1 for significant genes and 0 otherwise.
dge['score'] = dge['is_significant'].astype(int)
sns.scatterplot(x='logFC', y='-log(adj.P.Val)', hue='is_significant', s=4, data=dge)

In [None]:
%%appyter code_exec
{% if membrane_screener.raw_value %}
dge['is_membrane'] = np.in1d(dge.index, membrane_proteins)
dge['score'] = dge['score'] + dge['is_membrane'].astype(int)
sns.scatterplot(data=dge, x='logFC', y='-log(adj.P.Val)', hue='is_membrane', s=10)
{% endif %}

In [None]:
# A candidate must satisfy every enabled criterion. With membrane screening the maximum score
# is 2 (significant + membrane); without it the maximum is 1, so a fixed `score > 1` filter
# would always return an empty table.
required_score = 2 if 'is_membrane' in dge.columns else 1
# Keep the 16 best candidates, ordered by score and then by t statistic.
dge_final = dge[dge['score'] >= required_score].sort_values(['score', 't'], ascending=False).iloc[:16]
dge_final

## Review Selected Genes

In [None]:
# Maps each plotly Box argument to the background statistic row that supplies it.
box_stat_rows = {
    'lowerfence': 'min',
    'q1': '25%',
    'median': '50%',
    'q3': '75%',
    'upperfence': 'max',
    'mean': 'mean',
    'sd': 'std',
}

# One figure per selected gene: precomputed background boxes (one per background column)
# next to a box built from the patient's normalized values.
for gene in dge_final.index:
    background_box = go.Box(
        x=df_bg_stats.columns,
        name='Background',
        **{arg: df_bg_stats.loc[(gene, stat)] for arg, stat in box_stat_rows.items()},
    )
    patient_box = go.Box(
        y=df_expr_norm.loc[gene],
        name='Patient (Normalized)',
    )
    fig = go.Figure()
    fig.add_trace(background_box)
    fig.add_trace(patient_box)
    fig.update_layout(title=gene, autosize=True)
    fig.show()