In [None]:
#%%appyter init
# Initialise Appyter's notebook magic (presumably what enables the `%%appyter ...`
# cells below -- confirm against the Appyter docs).
from appyter import magic
# The lambda stores the builtin `globals` as a default argument and calls it when
# invoked, so `magic.init` receives a callable that returns a globals dict.
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown

# Gene and Protein Expression across Human Cells and Tissues
The Appyter takes a human gene symbol as input and produces box plots that display its expression across human cell types and tissues at the mRNA and protein levels. If the gene is not contained in one of the datasets, a plot will not be produced for that resource.

In [None]:
%%appyter hide_code

{# Form sections: 'primary' holds the gene input, 'secondary' the dataset toggles. #}
{% do SectionField(
    name='primary',
    title='Gene and Protein Expression across Human Cells and Tissues',
    img='gene-expr.png'
) %}

{% do SectionField(
    name='secondary',
    title='Select Datasets',
    description='Choose which datasets should be used'
) %}

{# Introductory text displayed in the primary section. #}
{% do DescriptionField(
    name='data_file_description',
    text='''
    This appyter takes the input of a human gene and displays its expression across human cells and tissues 
    utilizing a variety of processed datasets from healthy tissues.
    If the gene is not contained in one of the datasets, a plot will not be produced for that resource.''',
    section='primary',
) %}

{# Gene selector; the autocomplete choices come from the static genes.json file. #}
{% set gene = AutocompleteField(
    name='gene',
    label='Human gene symbol',
    default='A2M',
    description='Enter the gene of interest',
    choices=load_static("genes.json"),
    section='primary'
) %}





In [None]:
%%appyter hide_code

{# RNA-seq dataset toggles (all enabled by default). #}
{% set gtex_gene = BoolField(
    name='gtex_gene',
    label='Include GTEX - gene?',
    default=True,
    section='secondary'
) %}

{% set archs = BoolField(
    name='archs',
    label='Include ARCHS4',
    default=True,
    section='secondary'
) %}

{% set ts = BoolField(
    name='ts',
    label='Include Tabula Sapiens?',
    default=True,
    section='secondary'
) %}

{# Proteomics dataset toggles (all enabled by default). #}
{% set use_hpm = BoolField(
    name='use_hpm',
    label='Include HPM?',
    default=True,
    section='secondary'
) %}

{% set use_hpa = BoolField(
    name='use_hpa',
    label='Include HPA?',
    default=True,
    section='secondary'
) %}

{% set gtex_prot = BoolField(
    name='gtex_prot',
    label='Include GTEx - Proteomics?',
    default=True,
    section='secondary'
) %}


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from IPython.display import HTML, display, Markdown
import s3fs
import h5py

# Anonymous (anon=True) S3 filesystem client bound to the custom endpoint below;
# later cells call s3.open(...) on it to read the HDF5 data files.
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': 'https://appyters.maayanlab.cloud/storage'})
def get_gene_stats(gene, f, source):
    """Look up the stored expression records for one gene in one dataset.

    Parameters
    ----------
    gene : str
        Key of the gene group under ``f[source]``.
    f : h5py.File or nested mapping
        Open file whose top-level keys are dataset names. Each dataset holds a
        ``'columns'`` entry (plus ``'rows'`` for the RNA-seq sources) and one
        group per gene.
    source : str
        One of ``'gtex'``, ``'ts'``, ``'archs4'`` (summary-statistics tables),
        ``'gtex-prot'``, ``'hpm'`` (one dataset per tissue) or ``'hpa'``.

    Returns
    -------
    pandas.DataFrame
        * RNA-seq sources: the gene's ``'stats'`` table indexed by ``'rows'``
          with ``'columns'`` as column labels.
        * ``'gtex-prot'`` / ``'hpm'``: the per-tissue tables stacked, with an
          added ``'tissue'`` column naming the tissue each row came from.
        * ``'hpa'``: the gene's ``'stats'`` table converted to strings.
        * An empty frame when the gene is not in the dataset, or when its
          group contains no per-tissue tables.

    Raises
    ------
    ValueError
        If the gene is present but ``source`` is not a supported dataset name.
    """
    group = f[source]
    if gene not in group.keys():
        return pd.DataFrame()

    if source in ('gtex', 'ts', 'archs4'):
        data = group[gene]['stats']
        cols = np.array(group['columns']).astype('str')
        rows = np.array(group['rows']).astype('str')
        return pd.DataFrame(data, index=rows, columns=cols)

    if source in ('gtex-prot', 'hpm'):
        cols = np.array(group['columns']).astype('str')
        frames = []
        for tissue in group[gene].keys():
            vals = pd.DataFrame(data=np.array(group[gene][tissue]), columns=cols)
            vals['tissue'] = [tissue] * len(vals)
            frames.append(vals)
        # pd.concat raises on an empty list; report "no data" the same way a
        # missing gene is reported so callers can keep testing `.empty`.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)

    if source == 'hpa':
        cols = np.array(group['columns']).astype('str')
        return pd.DataFrame(data=np.array(group[gene]['stats']).astype(str), columns=cols)

    # Previously fell through and returned None, which callers then hit with `.empty`.
    raise ValueError(f"Unsupported source: {source!r}")



In [None]:
%%appyter code_exec

# `gene` is filled in by the template from the 'gene' form field and is the
# lookup key passed to get_gene_stats(...) in the cells below.
gene = {{ gene }}


In [None]:
%%appyter markdown

## Load in normal gene expression
Utilize processed datasets containing gene expression by cell type and tissue from [GTEx](https://gtexportal.org/home/) [1], 
[ARCHS4](https://maayanlab.cloud/archs4/) [2], and the [Tabula Sapiens](https://tabula-sapiens-portal.ds.czbiohub.org/) [3].

In [None]:
# Open the RNA-seq HDF5 file through the S3 handle; `s3f` and `f` stay open for
# the lookup cells that follow and are closed in a later cell.
s3f = s3.open("Gene_Expression_by_Tissue/RNA-Seq-GTEx-ARCHS4-TS.h5")
f = h5py.File(s3f, 'r')

In [None]:
%%appyter code_exec

{% if gtex_gene.value %}
# GTEx lookup; the result is an empty DataFrame when the gene is not in the dataset.
df_bg_stats_gtex = get_gene_stats(gene, f, 'gtex')
{% endif %}

In [None]:
%%appyter code_exec

{% if archs.value %}
# ARCHS4 lookup; the result is an empty DataFrame when the gene is not in the dataset.
df_bg_stats_archs4 = get_gene_stats(gene, f, 'archs4')
{% endif %}

In [None]:
%%appyter code_exec

{% if ts.value%}
# Tabula Sapiens lookup; the result is an empty DataFrame when the gene is not in the dataset.
ts_stats = get_gene_stats(gene, f, 'ts')
{% endif %}

In [None]:
# Release the HDF5 file first, then the underlying S3 stream it was reading from.
f.close()
s3f.close()

In [None]:
%%appyter code_exec
# Show available genes in each dataset

data = {'Selected Gene': [gene]}

{% if gtex_gene.value %}
data['in GTEx - Gene'] = [not df_bg_stats_gtex.empty]
{% endif %}
{% if archs.value %}
data['in ARCHS4'] = [not df_bg_stats_archs4.empty]
{% endif %}
{% if ts.value %}
data['in TS'] = [not ts_stats.empty]
{% endif %}

available = pd.DataFrame(data).T
display(HTML(available.to_html(notebook=True, escape=False)))

In [None]:
%%appyter markdown

## Load plots based on the selected gene

In [None]:
%%appyter code_exec

c = gene

{% if gtex_gene.value %}
if not df_bg_stats_gtex.empty:
    display(Markdown("## RNA-Seq Expression Levels"))
    IQR = df_bg_stats_gtex.loc[('75%')]-df_bg_stats_gtex.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            df_bg_stats_gtex.loc[('min')],
            df_bg_stats_gtex.loc[('25%')] - (1.5*IQR),
        ),
        q1=df_bg_stats_gtex.loc[('25%')],
        median=df_bg_stats_gtex.loc[('50%')],
        q3=df_bg_stats_gtex.loc[('75%')],
        upperfence=np.minimum(
            df_bg_stats_gtex.loc[('max')],
            df_bg_stats_gtex.loc[('75%')] + (1.5*IQR),
        ),
        mean=df_bg_stats_gtex.loc[('mean')],
        sd=df_bg_stats_gtex.loc[('std')],
        y=df_bg_stats_gtex.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) GTEx", height=1200)
    fig.show()
{% endif %}

{% if archs.value %}
if not df_bg_stats_archs4.empty:
    IQR = df_bg_stats_archs4.loc[('75%')]-df_bg_stats_archs4.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            df_bg_stats_archs4.loc[('min')],
            df_bg_stats_archs4.loc[('25%')] - (1.5*IQR),
        ),
        q1=df_bg_stats_archs4.loc[('25%')],
        median=df_bg_stats_archs4.loc[('50%')],
        q3=df_bg_stats_archs4.loc[('75%')],
        upperfence=np.minimum(
            df_bg_stats_archs4.loc[('max')],
            df_bg_stats_archs4.loc[('75%')] + (1.5*IQR),
        ),
        mean=df_bg_stats_archs4.loc[('mean')],
        sd=df_bg_stats_archs4.loc[('std')],
        y=df_bg_stats_archs4.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) ARCHS4", height=1500)
    fig.show()
{% endif %}

    
{% if ts.value %}
if not ts_stats.empty:
    IQR = ts_stats.loc[('75%')]-ts_stats.loc[('25%')]
    fig = go.Figure()
    fig.add_trace(go.Box(
        lowerfence=np.maximum(
            ts_stats.loc[('min')],
            ts_stats.loc[('25%')] - (1.5*IQR),
        ),
        q1=ts_stats.loc[('25%')],
        median=ts_stats.loc[('50%')],
        q3=ts_stats.loc[('75%')],
        upperfence=np.minimum(
            ts_stats.loc[('max')],
            ts_stats.loc[('75%')] + (1.5*IQR),
        ),
        mean=ts_stats.loc[('mean')],
        sd=ts_stats.loc[('std')],
        y=ts_stats.columns,
        name='Background',
        orientation='h'
    ))
    fig.update_layout(title=c+ " (RNA-seq) Tabula Sapiens", height=8000)
    fig.show()
{% endif %}
    

In [None]:
%%appyter markdown

## Proteomics Expression Levels

Proteomics data were obtained from the [Human Protein Atlas](https://www.proteinatlas.org/about/download) (HPA) [4] 
with IHC-based expression profiling, the [Human Proteome Map](https://www.humanproteomemap.org/download.php) (HPM) [5] 
with MS-based expression quantification, and a [GTEx proteome project](https://doi.org/10.1016/j.cell.2020.08.036) [6] using TMT MS. 
These datasets contain protein expression levels detected in normal tissues and cell types. The gene may not be present 
in the data from each project (see table for which proteomics data are present/absent). Plots show expression levels (HPA), 
average spectral counts (HPM), or a log-transformed relative abundance (GTEx) by tissue/cell-type for each gene candidate 
(excluding expression levels from the HPA where the [reliability score](https://www.proteinatlas.org/about/assays+annotation) was uncertain). 

In [None]:
%%appyter code_exec

s3f = s3.open("Gene_Expression_by_Tissue/Proteomics-HPM-HPA-GTExProt.h5")
f = h5py.File(s3f, 'r')

{% if use_hpm.value %}
hpm = get_gene_stats(gene, f, 'hpm')
{% endif %}
{% if use_hpa.value %}
hpa = get_gene_stats(gene, f, 'hpa')
{% endif %}
{% if gtex_prot.value %}
gtexp = get_gene_stats(gene, f, 'gtex-prot')
{% endif %}

f.close()
s3f.close()

In [None]:
%%appyter code_exec
# Show available genes in each dataset
data = {'Selected Gene': [gene]}

{% if use_hpm.value %}
data['in HPM'] = [not hpm.empty]
{% endif %}

{% if use_hpa.value %}
data['in HPA'] =[not hpa.empty]
{% endif %}

{% if gtex_prot.value %}
data['in GTEx - Proteomics'] = [not gtexp.empty]
{% endif %}

available = pd.DataFrame(data).T

    
                          
                          
display(HTML(available.to_html(notebook=True, escape=False)))

In [None]:
%%appyter code_exec

{% if use_hpa.value %}
hpa.Tissue = hpa["Tissue"] + ", " + hpa["Cell.type"]
hpa = hpa[hpa['Reliability'] != "Uncertain"] 
{% endif %}
{% if gtex_prot.value %}
gtexp['tissue_specificity'] = gtexp.tissue_specificity.fillna('NA')
{% endif %}

{% if gtex_prot.value %}
if not gtexp.empty:
    d = gtexp
    fig = px.strip(d, y="tissue", x="value",  
                   orientation='h',
                   stripmode="overlay",
                   hover_data=["tissue_specificity"],
                   height=30*d['tissue'].nunique())
    fig.add_trace(go.Box(x=d['value'],
                         y=d['tissue'],
                         orientation='h',
                         marker=dict(color='#636EFA'),
                         name="n > 1"))
    fig.update_layout(title=c + " (GTEx Proteomics)",
                      autosize=True,
                      showlegend=False)
    fig.update_xaxes(title="log2(relative abundance)")
    fig.update_yaxes(title=None)
    fig.show()
{% endif %}   

{% if use_hpm.value %}
if not hpm.empty:
    fig = px.scatter(hpm, 
                     y="tissue", x="value", 
                     height=20*hpm.shape[0])
    fig.update_layout(title=c + " (HPM)", 
                      autosize=True)
    fig.update_xaxes(title="Average Spectral Counts")
    fig.update_yaxes(title=None)
    fig.show()
{% endif %}

{% if use_hpa.value %}
if not hpa.empty:
    fig = px.scatter(hpa, 
                     y="Tissue", x="Level", 
                     category_orders={"Level": ["Not detected", "Low", "Medium", "High"]}, 
                     hover_data=["Reliability"],  
                     hover_name="Tissue",
                     height=20*hpa.shape[0])
    fig.update_layout(title=c + " (HPA)", 
                      showlegend=False, 
                      autosize=True, 
                      xaxis={'tickmode':'array', 
                             'tickvals':[0, 1, 2, 3], 
                             'ticktext':["Not detected", "Low", "Medium", "High"]})
    fig.update_xaxes(title="Tissue Expression Level")
    fig.update_yaxes(title=None)
    fig.show()
{% endif %}

In [None]:
%%appyter markdown

# References

[1] Lonsdale, John, et al. "The genotype-tissue expression (GTEx) project." Nature genetics 45.6 (2013): 580-585. https://doi.org/10.1038/ng.2653
        
[2] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), https://doi.org/10.1038/s41467-018-03751-6

[3] Tabula Sapiens Consortium* et al. “The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans.” Science (New York, N.Y.) vol. 376,6594 (2022). https://doi.org/10.1126/science.abl4896

[4] Uhlén M et al. "Tissue-based map of the human proteome." Science (New York, N.Y.) vol. 347,6220 (2015): 1260419. https://doi.org/10.1126/science.1260419

[5] Kim, Min-Sik et al. “A draft map of the human proteome.” Nature vol. 509,7502 (2014): 575-81. https://doi.org/10.1038/nature13302

[6] Jiang, Lihua et al. “A Quantitative Proteome Map of the Human Body.” Cell vol. 183,1 (2020): 269-283.e19. https://doi.org/10.1016/j.cell.2020.08.036
        
    