In [None]:
#%%appyter init
from appyter import magic
# Pass appyter a zero-argument callable that returns this notebook's global namespace:
# the builtin `globals` is bound as the lambda's default argument and only called on demand.
# NOTE(review): presumably appyter uses it to run rendered code in the notebook scope - confirm in appyter docs.
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown

# Gene and Protein Expression across Human Cells and Tissues
The Appyter takes a human gene symbol as the input to produce box plots that display its expression across human cell types and tissues at the mRNA and protein levels. If the gene is not contained in one of the datasets, a plot will not be produced for that resource.

In [None]:
%%appyter hide_code

{# Declares the form section named 'primary'; the input fields of this appyter attach to it via section='primary'. #}
{% do SectionField(
    name='primary',
    title='Gene and Protein Expression across Cells and Tissues',
    img ='gene-expr.png'
) %}

{# Static explanatory text attached to the 'primary' section. #}
{% do DescriptionField(
    name='data_file_description',
    text='''
    This appyter takes a gene symbol as the input and displays its expression across cells and tissues 
    utilizing a variety of processed datasets from healthy tissues.
    If the gene is not contained in one of the datasets, a plot will not be produced for that resource.''',
    section='primary',
) %}

%%appyter code_exec
{# Tabbed input, one tab per species. Each tab lists the gene-symbol field first, followed by
   one boolean toggle per dataset. Later cells read these fields by position
   (user_input.value[0], user_input.value[1], ...), so the order of the entries inside each
   list must stay in sync with those indices. #}
{% set user_input = TabField(
    name = 'species_input',
    label = 'Select species and datasets to display',
    default = 'Human',
    description = 'Select the desired species and the available datasets',
    section = 'primary',
    choices = {
        'Human': [
            AutocompleteField(
                name = 'human_gene',
                label = 'Gene symbol of interest',
                default = 'A2M',
                description = 'Enter the gene symbol of interest (human)',
                file_path = 'https://appyters.maayanlab.cloud/storage/Gene_Expression_by_Tissue/genes.json',
            ),
            BoolField(
                name='gtex_gene',
                label='Include GTEX - gene?',
                default = True
            ),
            BoolField(
                name='archs',
                label='Include ARCHS4',
                default = True
            ),
            BoolField(
                name='ts',
                label='Include Tabula Sapiens?',
                default = True,
            ),
            BoolField(
                name='ccle',
                label='Include CCLE - Transcriptomics?',
                default = True,
            ),
            BoolField(
                name='use_hpm',
                label='Include HPM?',
                default = True,
            ),

            BoolField(
                name='use_hpa',
                label='Include HPA?',
                default = True,
            ),

            BoolField(
                name='gtex_prot',
                label='Include GTEx - Proteomics?',
                default = True,
            ),
            BoolField(
                name='ccle-prot',
                label='Include CCLE - Proteomics?',
                default = True,
            ),
            
        ],
        'Mouse': [
            AutocompleteField(
                name = 'mouse_gene',
                label = 'Gene symbol of interest',
                default = 'Mthfr',
                description = 'Enter the gene symbol of interest (mouse).',
                file_path = 'https://appyters.maayanlab.cloud/storage/Gene_Expression_by_Tissue/mm_genes.json',
            ),
            
            BoolField(
                name='archs_mm',
                label='Include ARCHS4?',
                default = True,
            ),
            
            BoolField(
                name='tm',
                label='Include Tabula Muris?',
                default = True,
            ),

        ]
    }
)%}

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from IPython.display import HTML, display, Markdown
import s3fs
import h5py

# Anonymous (credential-free) S3-style filesystem pointed at the Appyters storage endpoint;
# the HDF5 data files used below are opened through this handle.
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': 'https://appyters.maayanlab.cloud/storage'})
def get_gene_stats(gene, f, source):
    """Read the stored values for one gene from one dataset group of an open HDF5 file.

    Parameters
    ----------
    gene : str
        Key of the gene inside ``f[source]``.
    f : h5py.File or nested mapping
        Open data file, indexed as ``f[source][...]``.
    source : str
        Dataset group to read. Supported values are 'gtex', 'ts', 'tm', 'archs4',
        'gtex-prot', 'hpm', 'ccle', 'ccle-prot' and 'hpa'.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        * 'gtex', 'ts', 'tm', 'archs4': DataFrame of ``f[source][gene]['stats']`` labelled
          with the group's shared 'rows' (index) and 'columns'.
        * 'gtex-prot', 'hpm': one DataFrame with the rows of every per-tissue entry of the
          gene stacked together, plus a 'tissue' column naming the entry each row came from.
        * 'ccle': Series built from the first row of the gene's 'stats', indexed by the
          group's shared 'columns'.
        * 'ccle-prot': Series of the gene's 'stats', indexed by the gene's own 'columns'.
        * 'hpa': DataFrame of the gene's 'stats' converted to strings.

        An empty DataFrame is returned when the gene is not a key of ``f[source]``, or when
        a 'gtex-prot'/'hpm' gene has no per-tissue entries.

    Raises
    ------
    ValueError
        If the gene is present but ``source`` is not one of the supported names.
    """
    group = f[source]
    if gene not in group.keys():
        return pd.DataFrame()

    if source in ('gtex', 'ts', 'tm', 'archs4'):
        cols = np.array(group['columns']).astype('str')
        rows = np.array(group['rows']).astype('str')
        return pd.DataFrame(group[gene]['stats'], index=rows, columns=cols)

    if source in ('gtex-prot', 'hpm'):
        cols = np.array(group['columns']).astype('str')
        frames = []
        for tissue in group[gene].keys():
            vals = pd.DataFrame(data=np.array(group[gene][tissue]), columns=cols)
            vals['tissue'] = tissue
            frames.append(vals)
        # pd.concat raises on an empty list; report "nothing found" the same way as a missing gene.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)

    if source in ('ccle', 'ccle-prot'):
        if source == 'ccle':
            # Column labels are shared by the whole group; only the first row of 'stats' is used.
            cols = np.array(group['columns']).astype('str')
            vals = np.array(group[gene]['stats'])[0]
        else:
            # Column labels are stored per gene for this source.
            cols = np.array(group[gene]['columns']).astype('str')
            vals = np.array(group[gene]['stats'])
        return pd.Series(data=vals, index=cols)

    if source == 'hpa':
        cols = np.array(group['columns']).astype('str')
        return pd.DataFrame(data=np.array(group[gene]['stats']).astype(str), columns=cols)

    # Previously this fell through and returned None, which only failed later on `.empty`.
    raise ValueError("Unsupported source: " + repr(source))



In [None]:
%%appyter code_exec
# The first field of the selected species tab is the gene-symbol autocomplete field.
gene = {{ user_input.value[0] }}
{% set species = user_input["args"]["value"] %}
# Selected tab name; the template branches below compare it against 'Human' and 'Mouse'.
species = {{species|jsonify}}

In [None]:
%%appyter markdown

{% if species == 'Human' %}
## Load in normal gene expression
Utilize processed datasets containing gene expression by cell type and tissue from [GTEx](https://gtexportal.org/home/) [1], 
[ARCHS4](https://maayanlab.cloud/archs4/) [2], and the [Tabula Sapiens](https://tabula-sapiens-portal.ds.czbiohub.org/) [3] and by cell line from [CCLE](https://sites.broadinstitute.org/ccle/) [8].
{% endif %}
{% if species == 'Mouse' %}
## Load in normal gene expression
Utilize processed datasets containing gene expression by cell type and tissue from 
[ARCHS4](https://maayanlab.cloud/archs4/) [2] and the [Tabula Muris](https://tabula-muris.ds.czbiohub.org/) [7].
{% endif %}

In [None]:
%%appyter code_exec

{% if species == 'Human' %}
# Open the human RNA-seq HDF5 file from remote storage and wrap it with h5py (read-only).
s3f = s3.open("Gene_Expression_by_Tissue/RNA-Seq-GTEx-ARCHS4-TS-CCLE.h5")
f = h5py.File(s3f, 'r')
{% endif %}
{% if species == 'Mouse' %}
# Open the mouse RNA-seq HDF5 file from remote storage and wrap it with h5py (read-only).
s3f = s3.open("Gene_Expression_by_Tissue/mm_RNA-Seq-ARCHS4-TM.h5")
f = h5py.File(s3f, 'r')
{% endif %}

In [None]:
%%appyter hide_code

{# Unpack the dataset toggles of the selected tab into template variables. The indices are
   positional: index 0 is the gene field, indices 1..N follow the order in which the
   BoolFields are declared in the TabField choices. The field named 'ccle-prot' is bound to
   the template variable ccle_prot. #}
{% if species == 'Human' %}
    {% set gtex_gene = user_input.value[1].value %}
    {% set archs = user_input.value[2].value %}
    {% set ts = user_input.value[3].value %}
    {% set ccle = user_input.value[4].value %}

    {% set use_hpm = user_input.value[5].value %}
    {% set use_hpa = user_input.value[6].value %}
    {% set gtex_prot = user_input.value[7].value %}
    {% set ccle_prot = user_input.value[8].value %}
{% endif %}
{% if species == 'Mouse' %}
    {% set archs_mm = user_input.value[1].value %}
    {% set tm = user_input.value[2].value %}
{% endif %}


In [None]:
%%appyter code_exec

{% if species == 'Human' %}
# Look up the selected gene in every requested human RNA-seq dataset of the open file.
# get_gene_stats returns an empty DataFrame when the gene is not in that dataset.
{% if gtex_gene %}
df_bg_stats_gtex = get_gene_stats(gene, f, 'gtex')
{% endif %}
{% if archs %}
df_bg_stats_archs4 = get_gene_stats(gene, f, 'archs4')
{% endif %}
{% if ts %}
ts_stats = get_gene_stats(gene, f, 'ts')
{% endif %}
{% if ccle %}
ccle_stats = get_gene_stats(gene, f, 'ccle')
{% endif %}
{% endif %}

In [None]:
%%appyter code_exec

{% if species == 'Mouse' %}
{% if archs_mm %}
df_bg_stats_tm = get_gene_stats(gene, f, 'tm')
{% endif %}
{% if tm %}
df_bg_stats_archs4_mm = get_gene_stats(gene, f, 'archs4')
{% endif %}
{% endif %}

In [None]:
# Release the HDF5 handle first, then the remote file object it was opened on.
f.close()
s3f.close()

In [None]:
%%appyter code_exec
# Show available genes in each dataset


# One single-element list per table column. A flag is True when the lookup for that
# dataset returned a non-empty result, i.e. the gene was found there.
data = {'Selected Gene': [gene]}

{% if species == 'Human' %}
{% if gtex_gene %}
data['in GTEx - Gene'] = [not df_bg_stats_gtex.empty]
{% endif %}
{% if archs %}
data['in ARCHS4'] = [not df_bg_stats_archs4.empty]
{% endif %}
{% if ts %}
data['in TS'] = [not ts_stats.empty]
{% endif %}
{% if ccle %}
data['in CCLE'] = [not ccle_stats.empty]
{% endif %}
{% endif %}

{% if species == 'Mouse' %}
{% if archs_mm %}
data['in ARCHS4'] = [not df_bg_stats_archs4_mm.empty]
{% endif %}
{% if tm %}
data['in Tabula Muris'] = [not df_bg_stats_tm.empty]
{% endif %}
{% endif %}


# Transpose so that every dataset becomes one row of the displayed table.
available = pd.DataFrame(data).T
display(HTML(available.to_html(notebook=True, escape=False)))

In [None]:
%%appyter markdown

## Load plots based on the selected gene

In [None]:
%%appyter code_exec

c = gene
{% if species == 'Human' %}
{% if gtex_gene %}
if not df_bg_stats_gtex.empty:
    df_bg_stats_gtex.sort_values('mean', axis=1, inplace=True)
    display(Markdown("## RNA-Seq Expression Levels"))
    IQR = df_bg_stats_gtex.loc[('75%')]-df_bg_stats_gtex.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            df_bg_stats_gtex.loc[('min')],
            df_bg_stats_gtex.loc[('25%')] - (1.5*IQR),
        ),
        q1=df_bg_stats_gtex.loc[('25%')],
        median=df_bg_stats_gtex.loc[('50%')],
        q3=df_bg_stats_gtex.loc[('75%')],
        upperfence=np.minimum(
            df_bg_stats_gtex.loc[('max')],
            df_bg_stats_gtex.loc[('75%')] + (1.5*IQR),
        ),
        mean=df_bg_stats_gtex.loc[('mean')],
        sd=df_bg_stats_gtex.loc[('std')],
        y=df_bg_stats_gtex.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) GTEx", height=1200)
    fig.show()
{% endif %}

{% if archs %}
if not df_bg_stats_archs4.empty:
    df_bg_stats_archs4.sort_values('mean', axis=1, inplace=True)
    IQR = df_bg_stats_archs4.loc[('75%')]-df_bg_stats_archs4.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            df_bg_stats_archs4.loc[('min')],
            df_bg_stats_archs4.loc[('25%')] - (1.5*IQR),
        ),
        q1=df_bg_stats_archs4.loc[('25%')],
        median=df_bg_stats_archs4.loc[('50%')],
        q3=df_bg_stats_archs4.loc[('75%')],
        upperfence=np.minimum(
            df_bg_stats_archs4.loc[('max')],
            df_bg_stats_archs4.loc[('75%')] + (1.5*IQR),
        ),
        mean=df_bg_stats_archs4.loc[('mean')],
        sd=df_bg_stats_archs4.loc[('std')],
        y=df_bg_stats_archs4.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) ARCHS4", height=8000)
    fig.show()
{% endif %}

{% if ts %}
if not ts_stats.empty:
    ts_stats.sort_values('mean', axis=1, inplace=True)
    IQR = ts_stats.loc[('75%')]-ts_stats.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            ts_stats.loc[('min')],
            ts_stats.loc[('25%')] - (1.5*IQR),
        ),
        q1=ts_stats.loc[('25%')],
        median=ts_stats.loc[('50%')],
        q3=ts_stats.loc[('75%')],
        upperfence=np.minimum(
            ts_stats.loc[('max')],
            ts_stats.loc[('75%')] + (1.5*IQR),
        ),
        mean=ts_stats.loc[('mean')],
        sd=ts_stats.loc[('std')],
        y=ts_stats.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) Tabula Sapiens", height=8000)
    fig.show()
{% endif %}

{% if ccle %}
if not ccle_stats.empty:
    ccle_stats = ccle_stats.sort_values()[-100:]
    fig = px.scatter(y=list(ccle_stats.index.values), x=(ccle_stats.values), height=4000)
    fig.update_layout(title = c+ " CCLE Transcriptomics (Top 100 Cell Lines)", yaxis=dict(dtick = 1, tickmode='linear', tickfont = dict(size = 12)))
    fig.update_yaxes(title=None, range=[-.5,len(ccle_stats)+.5], tickvals=ccle_stats.index)
    fig.update_xaxes(title=None)
    fig.show()
{% endif %}
{% endif %}

{% if species == 'Mouse' %}
{% if archs_mm %}
if not df_bg_stats_archs4_mm.empty:
    df_bg_stats_archs4_mm.sort_values('mean', axis=1, inplace=True)
    IQR = df_bg_stats_archs4_mm.loc[('75%')]-df_bg_stats_archs4_mm.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            df_bg_stats_archs4_mm.loc[('min')],
            df_bg_stats_archs4_mm.loc[('25%')] - (1.5*IQR),
        ),
        q1=df_bg_stats_archs4_mm.loc[('25%')],
        median=df_bg_stats_archs4_mm.loc[('50%')],
        q3=df_bg_stats_archs4_mm.loc[('75%')],
        upperfence=np.minimum(
            df_bg_stats_archs4_mm.loc[('max')],
            df_bg_stats_archs4_mm.loc[('75%')] + (1.5*IQR),
        ),
        mean=df_bg_stats_archs4_mm.loc[('mean')],
        sd=df_bg_stats_archs4_mm.loc[('std')],
        y=df_bg_stats_archs4_mm.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) ARCHS4", height=8000)
    fig.show()
{% endif %}

{% if tm %}
if not df_bg_stats_tm.empty:
    df_bg_stats_tm.sort_values('mean', axis=1, inplace=True)
    IQR = df_bg_stats_tm.loc[('75%')]-df_bg_stats_tm.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            df_bg_stats_tm.loc[('min')],
            df_bg_stats_tm.loc[('25%')] - (1.5*IQR),
        ),
        q1=df_bg_stats_tm.loc[('25%')],
        median=df_bg_stats_tm.loc[('50%')],
        q3=df_bg_stats_tm.loc[('75%')],
        upperfence=np.minimum(
            df_bg_stats_tm.loc[('max')],
            df_bg_stats_tm.loc[('75%')] + (1.5*IQR),
        ),
        mean=df_bg_stats_tm.loc[('mean')],
        sd=df_bg_stats_tm.loc[('std')],
        y=df_bg_stats_tm.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) Tabula Muris", height=8000)
    fig.show()
{% endif %}
{% endif %}
    

In [None]:
%%appyter markdown

{% if species == 'Human' %}
## Proteomics Expression Levels

Proteomics data were obtained from the [Human Protein Atlas](https://www.proteinatlas.org/about/download) (HPA) [4] 
with IHC-based expression profiling, the [Human Proteome Map](https://www.humanproteomemap.org/download.php) (HPM) [5] 
with MS-based expression quantification, the [GTEx proteome project](https://doi.org/10.1016/j.cell.2020.08.036) [6] using TMT MS, and the [CCLE](https://sites.broadinstitute.org/ccle/). 
These datasets contain protein expression levels detected in normal tissues and cell types. The gene may not be present 
in the data from each project (see table for which proteomics data are present/absent). Plots show expression levels (HPA), 
average spectral counts (HPM), or a log-transformed relative abundance (GTEx) by tissue/cell-type for each gene candidate 
(excluding expression levels from the HPA where the [reliability score](https://www.proteinatlas.org/about/assays+annotation) was uncertain). 
{% endif %}

In [None]:
%%appyter code_exec
{% if species == 'Human' %}
# Open the proteomics HDF5 file from remote storage; both handles are closed at the end of this cell.
s3f = s3.open("Gene_Expression_by_Tissue/Proteomics-HPM-HPA-GTEx-CCLE.h5")
f = h5py.File(s3f, 'r')

# Look up the selected gene in every requested proteomics dataset.
# get_gene_stats returns an empty DataFrame when the gene is not in that dataset.
{% if use_hpm %}
hpm = get_gene_stats(gene, f, 'hpm')
{% endif %}
{% if use_hpa %}
hpa = get_gene_stats(gene, f, 'hpa')
{% endif %}
{% if gtex_prot %}
gtexp = get_gene_stats(gene, f, 'gtex-prot')
{% endif %}
{% if ccle_prot %}
cclep = get_gene_stats(gene, f, 'ccle-prot')
{% endif %}

f.close()
s3f.close()

{% endif %}

In [None]:
%%appyter code_exec

{% if "Human" == species %}
# Show available genes in each dataset

# One single-element list per table column. A flag is True when the lookup for that
# dataset returned a non-empty result, i.e. the gene was found there.
data = {'Selected Gene': [gene]}

{% if use_hpm %}
data['in HPM'] = [not hpm.empty]
{% endif %}

{% if use_hpa %}
data['in HPA'] =[not hpa.empty]
{% endif %}

{% if gtex_prot %}
data['in GTEx - Proteomics'] = [not gtexp.empty]
{% endif %}

{% if ccle_prot %}
data['in CCLE - Proteomics'] = [not cclep.empty]
{% endif %}

# Transpose so that every dataset becomes one row of the displayed table.
available = pd.DataFrame(data).T                  
                          
display(HTML(available.to_html(notebook=True, escape=False)))
{% endif %}

In [None]:
%%appyter code_exec

{% if "Human" == species %}
{% if gtex_prot %}
if not gtexp.empty:
    gtexp['tissue_specificity'] = gtexp.tissue_specificity.fillna('NA')
    d = gtexp
    fig = px.strip(d, y="tissue", x="value",  
                   orientation='h',
                   stripmode="overlay",
                   hover_data=["tissue_specificity"],
                   height=30*d['tissue'].nunique())
    fig.add_trace(go.Box(x=d['value'],
                         y=d['tissue'],
                         orientation='h',
                         marker=dict(color='#636EFA'),
                         name="n > 1"))
    fig.update_layout(title=c + " (GTEx Proteomics)",
                      autosize=True,
                      showlegend=False)
    fig.update_xaxes(title="log2(relative abundance)")
    fig.update_yaxes(title=None)
    fig.show()
{% endif %}   

{% if use_hpm %}
if not hpm.empty:
    fig = px.scatter(hpm, 
                     y="tissue", x="value", 
                     height=20*hpm.shape[0])
    fig.update_layout(title=c + " (HPM)", 
                      autosize=True)
    fig.update_xaxes(title="Average Spectral Counts")
    fig.update_yaxes(title=None)
    fig.show()
{% endif %}

{% if use_hpa %}
if not hpa.empty:
    hpa.Tissue = hpa["Tissue"] + ", " + hpa["Cell.type"]
    hpa = hpa[hpa['Reliability'] != "Uncertain"] 
    fig = px.scatter(hpa, 
                     y="Tissue", x="Level", 
                     category_orders={"Level": ["Not detected", "Low", "Medium", "High"]}, 
                     hover_data=["Reliability"],  
                     hover_name="Tissue",
                     height=20*hpa.shape[0])
    fig.update_layout(title=c + " (HPA)", 
                      showlegend=False, 
                      autosize=True, 
                      xaxis={'tickmode':'array', 
                             'tickvals':[0, 1, 2, 3], 
                             'ticktext':["Not detected", "Low", "Medium", "High"]})
    fig.update_xaxes(title="Tissue Expression Level")
    fig.update_yaxes(title=None)
    fig.show()
{% endif %}

{% if ccle_prot %}
if not cclep.empty:
    cclep.sort_values(inplace=True)
    fig = px.scatter(cclep, 
                     y=cclep.index, x=cclep.values, 
                     height=20*cclep.shape[0])
    fig.update_layout(title=c + " (CCLE Proteomics)", 
                      showlegend=False, 
                      autosize=True)
    fig.update_xaxes(title="Cell Line Expression Level")
    fig.update_yaxes(title=None, range=[-.5,len(cclep)+.5], tickvals= cclep.index)
    fig.show()
{% endif %}

{% endif %}

In [None]:
%%appyter markdown

# References

[1] Lonsdale, John, et al. "The genotype-tissue expression (GTEx) project." Nature genetics 45.6 (2013): 580-585. https://doi.org/10.1038/ng.2653
        
[2] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), https://doi.org/10.1038/s41467-018-03751-6

[3] Tabula Sapiens Consortium et al. “The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans.” Science (New York, N.Y.) vol. 376,6594 (2022). https://doi.org/10.1126/science.abl4896

[4] Uhlén M et al. "Tissue-based map of the human proteome." Science (New York, N.Y.) vol. 347,6220 (2015): 1260419. https://doi.org/10.1126/science.1260419

[5] Kim, Min-Sik et al. “A draft map of the human proteome.” Nature vol. 509,7502 (2014): 575-81. https://doi.org/10.1038/nature13302

[6] Jiang, Lihua et al. “A Quantitative Proteome Map of the Human Body.” Cell vol. 183,1 (2020): 269-283.e19. https://doi.org/10.1016/j.cell.2020.08.036

[7] Tabula Muris Consortium et al. “Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris.” Nature vol. 562,7727 (2018): 367-372. https://doi.org/10.1038/s41586-018-0590-4

[8] Ghandi, Mahmoud et al. “Next-generation characterization of the Cancer Cell Line Encyclopedia.” Nature vol. 569,7757 (2019): 503-508. https://doi.org/10.1038/s41586-019-1186-3
        
    