In [None]:
#%%appyter init
from appyter import magic
# The lambda's default argument binds the built-in `globals`, so calling it
# returns this notebook's global namespace; it is handed to appyter's init.
# NOTE(review): presumably this registers the %%appyter cell magics used below — confirm.
magic.init(lambda _=globals: _())

In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import HTML, Markdown
import requests

# bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource

# display graphics
output_notebook()

In [None]:
%%appyter hide
# Declares the 'INPUT' form section; the species/gene field below attaches to it via section = 'INPUT'.
{% do SectionField(
    name = 'INPUT',
    title = 'Input Parameters',
    subtitle = 'Fill in the species and gene of interest',
    img = 'mini_plot.png'
)%}

In [None]:
%%appyter code_exec
# One tab per species, each holding a single gene autocomplete field.
# The field names are prefixed with the species ('human_gene' / 'mouse_gene');
# the next cell splits that name on '_' to recover the selected species.
{% set species_input = TabField(
    name = 'species_input',
    label = 'Species of Interest',
    default = 'Human',
    description = 'Select the species of interest.',
    section = 'INPUT',
    choices = {
        'Human': [
            AutocompleteField(
                name = 'human_gene',
                label = 'Human Gene of Interest',
                default = 'SLC2A2',
                description = 'Enter the gene symbol of interest (human).',
                file_path = 'https://appyters.maayanlab.cloud/storage/Gene_Expression_T2D_Signatures/human_genes.json'
            )
        ],
        'Mouse': [
            AutocompleteField(
                name = 'mouse_gene',
                label = 'Mouse Gene of Interest',
                default = 'Tcf7l2',
                description = 'Enter the gene symbol of interest (mouse).',
                file_path = 'https://appyters.maayanlab.cloud/storage/Gene_Expression_T2D_Signatures/mouse_genes.json'
            )
        ]
    }
)%}

In [None]:
%%appyter code_exec
# species: prefix of the selected field's name (text before the first '_'), rendered as a JSON string.
species = {{ species_input.value[0]["args"]["name"].split("_")[0]|jsonify }}
# gene may later be replaced by a resolved synonym; input_gene keeps what the user entered.
# NOTE(review): assumes rendering the field yields a quoted Python string literal — confirm.
gene = {{ species_input.value[0] }}
input_gene = {{ species_input.value[0] }}

In [None]:
# lowercase form is used as the gene_info dictionary key and in the data file names below
species = species.lower()

# My Gene's Expression in Type 2 Diabetes Transcriptomics Signatures

In [None]:
# section heading for the species selected in the input form
display(Markdown(f"## {species.capitalize()} T2D-Related Expression Studies"))

In [None]:
# store if synonym was used
has_syn = False

# import preprocessed signature data
root_path = 'https://s3.appyters.maayanlab.cloud/storage/Gene_Expression_T2D_Signatures/'

# obtain all gene symbols in signature
with requests.get(f"{root_path}{species}_sig_genes.txt") as r:
    # fail loudly on an HTTP error instead of treating an error page as a gene list
    r.raise_for_status()
    # drop blank lines and stray whitespace (e.g. a trailing newline or '\r')
    sig_genes = [g.strip() for g in r.text.split('\n') if g.strip()]

gene_map_ind = []
gene_map_val = []
# find gene synonym, if necessary
if gene not in sig_genes:
    gene_info = {
        'human': 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
        'mouse': 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz'
    }
    gene_df = pd.read_csv(gene_info[species], sep='\t', compression='gzip')[['Symbol', 'Synonyms']]
    for row in gene_df.itertuples():
        # '-' marks "no synonyms"; skip non-string cells (parsed as NaN) as well
        if isinstance(row.Synonyms, str) and row.Synonyms != '-':
            for syn in row.Synonyms.split('|'):
                # keep only the part after a 'prefix:' qualifier when one is present
                syn = syn.split(':')[1] if syn.find(':') != -1 else syn
                gene_map_ind.append(syn)
                gene_map_val.append(row.Symbol)
    gene_map = pd.DataFrame.from_records(
        zip(gene_map_ind, gene_map_val),
        columns=['synonym', 'symbol']
    ).set_index('synonym')
    # a synonym that maps to more than one symbol is ambiguous, so discard it
    syn_counts = gene_map.index.value_counts()
    gene_map = gene_map.drop(syn_counts[syn_counts > 1].index)

    # KeyError is what the plain .loc lookup raised before; now it carries an explanation
    if input_gene not in gene_map.index:
        raise KeyError(
            f"{input_gene} is not in the {species} signature data and has no unambiguous gene synonym."
        )
    resolved_gene = gene_map.loc[input_gene, 'symbol']
    if resolved_gene not in sig_genes:
        raise KeyError(
            f"{input_gene} (synonym of {resolved_gene}) is not in the {species} signature data."
        )

    gene = resolved_gene
    has_syn = True
    display(HTML(f"<div style='font-size:1rem;padding:1rem 0;'>{input_gene} not in signature data, using gene synonym <b>{gene}</b>.</div>"))

In [None]:
# RNA-seq p-values (one row per signature, only the selected gene's column is read)
pval_rna_df = pd.read_feather(f'{root_path}all_{species}_pval.f', columns=['index', gene]).set_index('index')
# RNA-seq fold change
fc_rna_df = pd.read_feather(f'{root_path}all_{species}_fc.f', columns=['index', gene]).set_index('index')

# microarray data is optional: if it cannot be read for this gene/species,
# continue with RNA-seq only
try:
    pval_micro_df = pd.read_feather(f"{root_path}{species}_pruned_affy_pv.f", columns=['index', gene]).set_index('index')
    fc_micro_df = pd.read_feather(f"{root_path}{species}_pruned_affy_fc.f", columns=['index', gene]).set_index('index')
    micro_exists = True
except Exception:
    # `except Exception` (not a bare `except:`) so KeyboardInterrupt/SystemExit
    # still stop the notebook instead of being mistaken for missing data
    pval_micro_df = None
    fc_micro_df = None
    micro_exists = False

# Bulk RNA-seq Appyter analysis instances
inst_df_input = pd.read_csv(f"{root_path}{species}_instances.tsv", sep='\t', index_col=0)

In [None]:
# configure color scheme
# The colormaps are taken as attributes of matplotlib.cm rather than through
# cm.get_cmap(), which was deprecated in Matplotlib 3.7 and removed in 3.9.
# Reversed maps: small p-values map to the dark end of the scale.
red_map = cm.Reds_r
red_norm = colors.Normalize(vmin=-0.25, vmax=1)
blue_map = cm.Blues_r
blue_norm = colors.Normalize(vmin=-0.25, vmax=1)

def map_color(fc, pv):
    """Return the hex color for one signature point.

    Args:
        fc: fold change of the gene in the signature.
        pv: p-value of the gene in the signature.

    Returns:
        '#808080' when fc is exactly 0; a red shade when fc is negative;
        otherwise a blue shade. The shade is chosen by passing pv through
        the module-level normalizer and colormap.
    """
    if fc == 0:
        return '#808080'
    if fc < 0:
        shade = red_map(red_norm(pv))
    else:
        shade = blue_map(blue_norm(pv))
    return colors.to_hex(shade)

In [None]:
def combine_data(pval_df, fc_df, gene, isRNA=False, inst_df=None):
    """Build one tidy frame of per-signature statistics for a gene.

    Args:
        pval_df: frame indexed by signature name with a column named `gene`
            holding p-values.
        fc_df: frame with a column named `gene` holding fold changes, in the
            same row order as `pval_df`.
        gene: column to extract from both frames.
        isRNA: when True, also attach the analysis session id per signature.
        inst_df: frame indexed by signature name with a 'session_id' column;
            only read when `isRNA` is True.

    Returns:
        DataFrame with columns 'sig', 'pval', 'logpv' (-log10 of pval), 'fc'
        and, when `isRNA` is True, 'inst'.
    """
    signatures = pval_df.index.tolist()
    pvalues = pval_df[gene].tolist()
    comb_df = pd.DataFrame({'sig': signatures, 'pval': pvalues})
    comb_df['logpv'] = -np.log10(comb_df['pval'])
    comb_df['fc'] = fc_df[gene].tolist()

    if not isRNA:
        return comb_df

    def session_for(signature):
        # look up the analysis session recorded for this signature
        return inst_df.loc[signature, 'session_id']

    comb_df['inst'] = comb_df['sig'].apply(session_for)
    return comb_df

def make_plot(comb_df, species, gene, micro=False, micro_df=None):
    """Draw the volcano plot (fold change vs. -log10 p-value) for one gene.

    Args:
        comb_df: RNA-seq frame with 'sig', 'pval', 'logpv', 'fc' and 'inst'
            columns. Its 'inst' column is rewritten IN PLACE from session ids
            to full Bulk RNA-seq Appyter URLs; the table code later uses those
            URLs as link targets.
        species: species name, used in the plot title.
        gene: gene symbol, used in the plot title and glyph names.
        micro: whether to overlay microarray signatures as squares.
        micro_df: microarray frame with 'sig', 'pval', 'logpv', 'fc' columns;
            the overlay is skipped when this is None even if `micro` is True.

    Returns:
        None. The plot is rendered with bokeh's `show`.
    """
    # create links from Bulk RNA-seq Appyter instance session IDs.
    # Values that already carry the prefix are left alone, so re-running the
    # plotting cell does not prepend the URL a second time.
    inst_prefix = 'https://appyters.maayanlab.cloud/Bulk_RNA_seq/'
    comb_df['inst'] = comb_df['inst'].apply(
        lambda x: x if str(x).startswith(inst_prefix) else f'{inst_prefix}{x}'
    )

    # only overlay microarray points when a frame was actually supplied
    show_micro = bool(micro) and micro_df is not None

    # set color and size for each point on plot (larger marker when p < 0.05)
    rna_colors = [map_color(r.fc, r.pval) for r in comb_df.itertuples()]
    rna_sizes = [12 if r.pval < 0.05 else 6 for r in comb_df.itertuples()]

    # generate data source
    data_source = ColumnDataSource(
        data=dict(
            x = comb_df['fc'],
            y = comb_df['logpv'],
            sig = comb_df['sig'],
            pval = comb_df['pval'],
            fc = comb_df['fc'],
            colors = rna_colors,
            sizes = rna_sizes,
        )
    )

    # create hover tooltip
    tools = [
        ("Signature", "@sig"),
        ("P-Value", "@pval"),
        ("Fold Change", "@fc")
    ]
    # generate plot and relevant plot labels
    plot = figure(
        plot_width=700,
        plot_height=500,
        tooltips=tools
    )
    # RNA-seq signatures are drawn as circles
    plot.circle(
        'x', 'y',
        size='sizes',
        alpha=0.7,
        line_alpha=0,
        line_width=0.01,
        source=data_source,
        fill_color='colors',
        name=f'{gene}_t2d_expression_volcano_plot',
    )

    # microarray signatures, when available, are drawn as squares
    if show_micro:
        micro_colors = [map_color(r.fc, r.pval) for r in micro_df.itertuples()]
        micro_sizes = [12 if r.pval < 0.05 else 6 for r in micro_df.itertuples()]
        micro_data_source = ColumnDataSource(
            data=dict(
                x = micro_df['fc'],
                y = micro_df['logpv'],
                sig = micro_df['sig'],
                pval = micro_df['pval'],
                fc = micro_df['fc'],
                colors = micro_colors,
                sizes = micro_sizes,
            )
        )
        plot.square(
            'x', 'y',
            size='sizes',
            alpha=0.7,
            line_alpha=0,
            line_width=0.01,
            source=micro_data_source,
            fill_color='colors',
            name=f'{gene}_t2d_expression_volcano_plot',
        )

    plot.xaxis.axis_label = 'Fold Change'
    plot.yaxis.axis_label = '-log10(P-value)'
    plot.title.text = f"Differential Expression of {gene} in {species} Type 2 Diabetes Transcriptomics Signatures"
    plot.title.align = 'center'
    plot.title.text_font_size = '14px'

    show(plot)

The volcano plot below positions each of the Type 2 Diabetes signatures according to the gene-specific fold change (x-position) and -log10(p-value) (y-position). Signatures were computed using the R package [limma](https://bioconductor.org/packages/release/bioc/html/limma.html). Fold changes are quantile-normalized log2 fold change values.

**Red** points indicate signatures where the fold change of the chosen gene was < 0, **blue** points indicate a fold change > 0, and **gray** points indicate a fold change of exactly 0. Signatures in which the gene was significantly differentially expressed (p < 0.05) in either direction are denoted by a darker color and a larger point size.

**Circle** points indicate bulk RNA-seq signatures, while **square** points indicate microarray signatures.

Hover over any point to display the corresponding signature name, the p-value, and the fold change. 

Use the toolbar on the right side of the plot to pan, zoom, or save the plot.

In [None]:
# assemble the per-signature frames for the selected gene, then draw the volcano plot
comb_df_input = combine_data(pval_rna_df, fc_rna_df, gene, isRNA=True, inst_df=inst_df_input)
# microarray frame is only built when the microarray data was loaded
micro_df_input = combine_data(pval_micro_df, fc_micro_df, gene) if micro_exists else None
make_plot(comb_df_input, species, gene, micro=micro_exists, micro_df=micro_df_input)

In [None]:
# get GEO links 
def geo_link(sig_name, clickable):
    """Build the GEO accession link for a signature.

    The accession id is the text before the first underscore of `sig_name`,
    with any '* ' significance marker removed.

    Args:
        sig_name: signature name (optionally prefixed with '* ').
        clickable: True for an HTML anchor, False for the bare URL.

    Returns:
        The anchor markup or the plain URL, as a string.
    """
    accession = sig_name.partition('_')[0].replace('* ', '')
    url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=' + accession
    if not clickable:
        return url
    return f'<a target="_blank" href="{url}">{accession}</a>'

def appyter_link(sig_name, clickable, inst=''):
    """Build the link to the Bulk RNA-seq analysis of a signature.

    Args:
        sig_name: signature name, used in the link text.
        clickable: True for an HTML anchor, False for the bare URL
            (same meaning as in `geo_link`; previously this flag was ignored).
        inst: URL of the analysis instance.

    Returns:
        The anchor markup when `clickable` is true, otherwise `inst` itself.
    """
    if not clickable:
        return inst
    text = f'Analysis of {sig_name}'
    return f'<a target="_blank" href="{inst}">{text}</a>'

# create tables of significant results with links to GEO 
def make_tables(comb_df, is_upreg, isRNA=False):
    """Build the display table for one regulation direction.

    Rows are the signatures whose fold change is > 0 (`is_upreg` True) or < 0
    (`is_upreg` False), sorted by ascending p-value. Signatures with
    p < 0.05 get a '* ' prefix, numbers are formatted as strings, and a GEO
    link column is appended.

    Unlike the earlier version, an empty selection is returned with the same
    renamed columns as a populated one, so downstream code that expects
    'Link to GEO Study' etc. keeps working when no signature matches.

    Args:
        comb_df: frame with 'sig', 'pval', 'logpv', 'fc' (and 'inst' when
            `isRNA` is True, holding the analysis URL).
        is_upreg: select up-regulated (True) or down-regulated (False) rows.
        isRNA: when True, turn 'inst' into an HTML link column.

    Returns:
        A new DataFrame; `comb_df` is not modified.
    """
    direction = comb_df['fc'] > 0 if is_upreg else comb_df['fc'] < 0
    dir_df = (
        comb_df[direction]
        .drop(columns='logpv')
        .sort_values(by='pval', ascending=True)
        .copy()
    )

    # List comprehensions (rather than DataFrame.apply(axis=1)) so that a
    # frame with zero rows still yields correctly shaped, empty columns.
    if isRNA:
        dir_df['inst'] = [
            appyter_link(sig, clickable=True, inst=inst)
            for sig, inst in zip(dir_df['sig'], dir_df['inst'])
        ]
    dir_df['sig'] = [
        f"* {sig}" if pval < 0.05 else sig
        for sig, pval in zip(dir_df['sig'], dir_df['pval'])
    ]
    dir_df['pval'] = [f'{x:.3e}' for x in dir_df['pval']]
    dir_df['fc'] = [f'{x:.4f}' for x in dir_df['fc']]

    column_names = {'sig': 'Signature', 'pval': 'P-value', 'fc': 'Log2 Fold Change'}
    if isRNA:
        column_names['inst'] = 'Link to Bulk RNA-seq Analysis'
    dir_df = dir_df.rename(columns=column_names)

    dir_df['Link to GEO Study'] = [geo_link(sig, clickable=True) for sig in dir_df['Signature']]
    return dir_df

# create download link for table results
def download_link(df, fname, isRNA=False):
    """Write a results table to a TSV file and return an HTML download link.

    The link columns of the display table contain HTML anchors; in the file
    they are reduced to the bare URL, and the '* ' significance marker is
    removed from the signature names.

    The work is done on a copy, so `df` is left untouched and the cell can be
    re-run. Columns that are absent are skipped.

    Args:
        df: table as produced by `make_tables`.
        fname: path of the TSV file to write; also used as the link target.
        isRNA: kept for backward compatibility. The RNA-seq link column is
            cleaned whenever it is present, whether or not this flag is set.

    Returns:
        HTML snippet containing the download link.
    """
    def _href_of(cell):
        # pull the URL out of '<a ... href="URL">text</a>'; leave other values as they are
        if not isinstance(cell, str) or 'href=' not in cell:
            return cell
        return cell.split('href=')[1].split('>')[0].replace('"', '')

    export_df = df.copy()
    for link_col in ('Link to Bulk RNA-seq Analysis', 'Link to GEO Study'):
        if link_col in export_df.columns:
            export_df[link_col] = [_href_of(cell) for cell in export_df[link_col]]
    if 'Signature' in export_df.columns:
        export_df['Signature'] = [
            sig.replace('* ', '') if isinstance(sig, str) else sig
            for sig in export_df['Signature']
        ]

    export_df.to_csv(fname, sep='\t', index=False)
    link = f'<div>Download full results: <a href="{fname}" target=_blank>{fname}</a></div>'
    return link

The tables below show the top 10 signatures, ranked by ascending p-value, in which the gene of interest was up-regulated or down-regulated, respectively. Links are included to the original GEO studies, as well as to [Bulk RNA-seq Appyter](https://appyters.maayanlab.cloud/#/Bulk_RNA_seq) instances that display the results of performing RNA-seq analysis on the original expression data used to generate each signature.

In [None]:
# RNA-seq tables: up-regulated first, then down-regulated
up_comb_df_input, dn_comb_df_input = (
    make_tables(comb_df_input, is_upreg=upregulated, isRNA=True)
    for upregulated in (True, False)
)
# microarray tables are only built when microarray data was loaded
if micro_exists:
    up_micro_df_input, dn_micro_df_input = (
        make_tables(micro_df_input, is_upreg=upregulated)
        for upregulated in (True, False)
    )

In [None]:
# Heading names the resolved symbol and, if a synonym lookup happened, the symbol the user typed.
gene_label = f'{gene} (synonym {input_gene})' if has_syn else gene
if up_comb_df_input.empty:
    # nothing to tabulate or export for this direction
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>There are no {species} RNA-seq signatures in which {gene_label} is up-regulated</b></div>'))
else:
    # 'padding:1rem' — the previous 'padding=1rem' is not valid CSS and was ignored
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top {species} RNA-seq signatures where {gene_label} is up-regulated</b></div>'))
    display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
    display(HTML(up_comb_df_input[:10].to_html(escape=False,index=False,col_space=70)))
    # isRNA=True so the analysis-link column is exported as a plain URL, not anchor markup
    display(HTML(download_link(up_comb_df_input, f'{gene}_upreg_expression_{species}_T2D_signatures.tsv', isRNA=True)))

In [None]:
# Heading names the resolved symbol and, if a synonym lookup happened, the symbol the user typed.
gene_label = f'{gene} (synonym {input_gene})' if has_syn else gene
if dn_comb_df_input.empty:
    # nothing to tabulate or export for this direction
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>There are no {species} RNA-seq signatures in which {gene_label} is down-regulated</b></div>'))
else:
    # 'padding:1rem' — the previous 'padding=1rem' is not valid CSS and was ignored
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top {species} RNA-seq signatures where {gene_label} is down-regulated</b></div>'))
    display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
    display(HTML(dn_comb_df_input[:10].to_html(escape=False,index=False,col_space=70)))
    # isRNA=True so the analysis-link column is exported as a plain URL, not anchor markup
    display(HTML(download_link(dn_comb_df_input, f'{gene}_downreg_expression_{species}_T2D_signatures.tsv', isRNA=True)))

In [None]:
# Show the table only when microarray data was loaded AND has at least one
# up-regulated signature; otherwise fall through to the "no signatures" note.
# (`and` short-circuits, so the table name is not touched when it was never built.)
if micro_exists and not up_micro_df_input.empty:
    gene_label = f'{gene} (synonym {input_gene})' if has_syn else gene
    # 'padding:1rem' — the previous 'padding=1rem' is not valid CSS and was ignored
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top {species} microarray signatures where {gene_label} is up-regulated</b></div>'))
    display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
    display(HTML(up_micro_df_input[:10].to_html(escape=False,index=False,col_space=70)))
    display(HTML(download_link(up_micro_df_input, f'{gene}_upreg_microarray_{species}_T2D_signatures.tsv')))
else:
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>There are no {species} microarray signatures in which {gene} is up-regulated</b></div>'))

In [None]:
# Show the table only when microarray data was loaded AND has at least one
# down-regulated signature; otherwise fall through to the "no signatures" note.
# (`and` short-circuits, so the table name is not touched when it was never built.)
if micro_exists and not dn_micro_df_input.empty:
    gene_label = f'{gene} (synonym {input_gene})' if has_syn else gene
    # 'padding:1rem' — the previous 'padding=1rem' is not valid CSS and was ignored
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top {species} microarray signatures where {gene_label} is down-regulated</b></div>'))
    display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
    display(HTML(dn_micro_df_input[:10].to_html(escape=False,index=False,col_space=70)))
    display(HTML(download_link(dn_micro_df_input, f'{gene}_downreg_microarray_{species}_T2D_signatures.tsv')))
else:
    display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>There are no {species} microarray signatures in which {gene} is down-regulated</b></div>'))

In [None]:
alt_species = 'mouse' if species == 'human' else 'human'
# get all gene symbols and find overlapping genes
with requests.get(f"{root_path}{alt_species}_sig_genes.txt") as r:
    # fail loudly on an HTTP error instead of treating an error page as a gene list
    r.raise_for_status()
    # drop blank lines and stray whitespace
    alt_sig_genes = [g.strip() for g in r.text.split('\n') if g.strip()]

# symbols are matched across species case-insensitively
alt_symbol_by_upper = {g.upper(): g for g in alt_sig_genes}
overlapping_genes = list(
    {x.strip().upper() for x in sig_genes if x.strip()} & set(alt_symbol_by_upper)
)
is_overlap = gene.upper() in overlapping_genes

# Conventional casing: upper-case for human symbols, capitalized for mouse.
conventional_alt_gene = gene.upper() if species == 'mouse' else gene.lower().capitalize()
# Prefer the conventional form when the other species' data really uses it;
# otherwise use the symbol exactly as it is spelled there, since the data
# columns are looked up by exact name.
if conventional_alt_gene in alt_sig_genes:
    alt_gene = conventional_alt_gene
else:
    alt_gene = alt_symbol_by_upper.get(gene.upper(), conventional_alt_gene)

In [None]:
# Intro for the second half of the report (only when the gene exists in both species).
if is_overlap:
    display(Markdown(f"## {alt_species.capitalize()} T2D-Related Expression Studies"))
    display(Markdown("The volcano plot and results tables below are generated from signatures from the species not chosen in the input form, in the case that the chosen gene overlaps between mouse and human species."))
    # 'padding:1rem 0' — the previous 'padding=1rem 0' is not valid CSS and was ignored
    display(HTML(f'<div style="font-size:1rem;padding:1rem 0;">Corresponding {alt_species} gene: <b>{alt_gene}</b>.</div>'))

In [None]:
# Load and plot the other species' signatures for the corresponding gene.
if is_overlap:
    pval_df_alt = pd.read_feather(f"{root_path}all_{alt_species}_pval.f", columns=['index', alt_gene]).set_index('index')
    fc_df_alt = pd.read_feather(f"{root_path}all_{alt_species}_fc.f", columns=['index', alt_gene]).set_index('index')
    inst_df_alt = pd.read_csv(f"{root_path}{alt_species}_instances.tsv", sep='\t', index_col=0)

    # Microarray data is optional here too, as for the primary species.
    # Availability is decided from THIS species' files rather than from the
    # primary species' `micro_exists` flag.
    try:
        micro_pval_alt = pd.read_feather(f"{root_path}{alt_species}_pruned_affy_pv.f", columns=['index', alt_gene]).set_index('index')
        micro_fc_alt = pd.read_feather(f"{root_path}{alt_species}_pruned_affy_fc.f", columns=['index', alt_gene]).set_index('index')
    except Exception:
        micro_pval_alt = None
        micro_fc_alt = None

    alt_comb_df = combine_data(pval_df_alt, fc_df_alt, alt_gene, isRNA=True, inst_df=inst_df_alt)
    # None signals "no microarray data for this species/gene"
    if micro_pval_alt is not None:
        alt_micro_df = combine_data(micro_pval_alt, micro_fc_alt, alt_gene)
    else:
        alt_micro_df = None
    make_plot(alt_comb_df, alt_species, alt_gene, micro=alt_micro_df is not None, micro_df=alt_micro_df)


In [None]:
# Up-regulated RNA-seq signatures for the other species.
if is_overlap:
    up_alt_comb_df = make_tables(alt_comb_df, is_upreg=True, isRNA=True)
    if up_alt_comb_df.empty:
        # nothing to tabulate or export for this direction
        display(HTML(f'<div style="font-size:1rem;padding:1rem 0;"><b>There are no {alt_species} RNA-seq signatures in which {alt_gene} is up-regulated</b></div>'))
    else:
        # 'padding:1rem 0' — the previous 'padding=1rem 0' is not valid CSS and was ignored
        display(HTML(f'<div style="font-size:1rem;padding:1rem 0;"><b>Top {alt_species} RNA-seq signatures where {alt_gene} is up-regulated</b></div>'))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(up_alt_comb_df[:10].to_html(escape=False,index=False,col_space=70)))
        # isRNA=True so the analysis-link column is exported as a plain URL, not anchor markup
        display(HTML(download_link(up_alt_comb_df, f'{alt_gene}_upreg_expression_{alt_species}_T2D_signatures.tsv', isRNA=True)))

In [None]:
# Down-regulated RNA-seq signatures for the other species.
if is_overlap:
    dn_alt_comb_df = make_tables(alt_comb_df, is_upreg=False, isRNA=True)
    if dn_alt_comb_df.empty:
        # nothing to tabulate or export for this direction
        display(HTML(f'<div style="font-size:1rem;padding:1rem 0;"><b>There are no {alt_species} RNA-seq signatures in which {alt_gene} is down-regulated</b></div>'))
    else:
        # 'padding:1rem 0' — the previous 'padding=1rem 0' is not valid CSS and was ignored
        display(HTML(f'<div style="font-size:1rem;padding:1rem 0;"><b>Top {alt_species} RNA-seq signatures where {alt_gene} is down-regulated</b></div>'))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(dn_alt_comb_df[:10].to_html(escape=False,index=False,col_space=70)))
        # isRNA=True so the analysis-link column is exported as a plain URL, not anchor markup
        display(HTML(download_link(dn_alt_comb_df, f'{alt_gene}_downreg_expression_{alt_species}_T2D_signatures.tsv', isRNA=True)))

In [None]:
# Up-regulated microarray signatures for the other species.
if is_overlap:
    # Decide from the other species' own microarray frame, not from the
    # primary species' `micro_exists` flag.
    up_alt_micro_df = make_tables(alt_micro_df, is_upreg=True) if alt_micro_df is not None else None
    if up_alt_micro_df is not None and not up_alt_micro_df.empty:
        # 'padding:1rem 0' — the previous 'padding=1rem 0' is not valid CSS and was ignored
        display(HTML(f'<div style="font-size:1rem;padding:1rem 0;"><b>Top {alt_species} microarray signatures where {alt_gene} is up-regulated</b></div>'))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(up_alt_micro_df[:10].to_html(escape=False,index=False,col_space=70)))
        display(HTML(download_link(up_alt_micro_df, f'{alt_gene}_upreg_microarray_{alt_species}_T2D_signatures.tsv')))
    else:
        display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>There are no {alt_species} microarray signatures in which {alt_gene} is up-regulated</b></div>'))

In [None]:
# Down-regulated microarray signatures for the other species.
if is_overlap:
    # Decide from the other species' own microarray frame, not from the
    # primary species' `micro_exists` flag.
    dn_alt_micro_df = make_tables(alt_micro_df, is_upreg=False) if alt_micro_df is not None else None
    if dn_alt_micro_df is not None and not dn_alt_micro_df.empty:
        # 'padding:1rem 0' — the previous 'padding=1rem 0' is not valid CSS and was ignored
        display(HTML(f'<div style="font-size:1rem;padding:1rem 0;"><b>Top {alt_species} microarray signatures where {alt_gene} is down-regulated</b></div>'))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(dn_alt_micro_df[:10].to_html(escape=False,index=False,col_space=70)))
        display(HTML(download_link(dn_alt_micro_df, f'{alt_gene}_downreg_microarray_{alt_species}_T2D_signatures.tsv')))
    else:
        display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>There are no {alt_species} microarray signatures in which {alt_gene} is down-regulated</b></div>'))

In [None]:
# Shown instead of the second report half when the gene has no counterpart in the other species.
if not is_overlap:
    # 'padding:1rem 0' — the previous 'padding=1rem 0' is not valid CSS and was ignored
    display(HTML(f'<div style="font-size:1rem;padding:1rem 0;">Corresponding gene not found in {alt_species}.</div>'))