In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter for this notebook; the lambda hands appyter a callable
# that returns this module's globals() when invoked.
magic.init(lambda _=globals: _())

In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import HTML
import requests

# bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource

# display graphics: configure bokeh so that show() renders plots inline in the notebook
output_notebook()

In [None]:
%%appyter hide
{# Declares the 'INPUT' form section; the species/gene field further down attaches to it through section = 'INPUT'. #}
{% do SectionField(
    name = 'INPUT',
    title = 'Input Parameters',
    subtitle = 'Fill in the species and gene of interest',
    img = 'volcano_plot_ex.png'
)%}

In [None]:
%%appyter code_exec
{# One tab per species; each tab holds a single gene autocomplete field whose name is prefixed with the lowercase species ('human_gene' / 'mouse_gene'). The species variable is later derived from that prefix. #}
{% set species_input = TabField(
    name = 'species_input',
    label = 'Species of Interest',
    default = 'Human',
    description = 'Select the species of interest.',
    section = 'INPUT',
    choices = {
        'Human': [
            AutocompleteField(
                name = 'human_gene',
                label = 'Human Gene of Interest',
                default = 'AKT1',
                description = 'Enter the gene symbol of interest (human).',
                file_path = 'https://appyters.maayanlab.cloud/storage/Gene_Centric_GEO_Reverse_Search/human_genes.json'
            )
        ],
        'Mouse': [
            AutocompleteField(
                name = 'mouse_gene',
                label = 'Mouse Gene of Interest',
                default = 'Mthfr',
                description = 'Enter the gene symbol of interest (mouse).',
                file_path = 'https://appyters.maayanlab.cloud/storage/Gene_Centric_GEO_Reverse_Search/mouse_genes.json'
            )
        ]
    }
)%}

In [None]:
%%appyter code_exec
# species is the prefix of the selected field's name ('human_gene' / 'mouse_gene' -> text before '_')
species = {{ species_input.value[0]["args"]["name"].split("_")[0]|jsonify }}
# gene may be reassigned to an official symbol by the synonym lookup below;
# input_gene keeps the value exactly as submitted in the form
gene = {{ species_input.value[0] }}
input_gene = {{ species_input.value[0] }}

# Gene Centric GEO Reverse Search

In [None]:
# store if synonym was used (tri-state, read by the cells below):
#   False -> the submitted symbol is present in the signature data as-is
#   True  -> the submitted symbol was a synonym; `gene` now holds the official symbol
#   None  -> nothing usable was found; the plotting/table cells are skipped
has_syn = False

# obtain all gene symbols
root_path = 'https://appyters.maayanlab.cloud/storage/Gene_Centric_GEO_Reverse_Search/'
gene_list = (requests
             .get(root_path + f"{species}_genes.json")
             .json()[f"{species}_genes"])
gene_set = set(gene_list)

# find gene synonym, if necessary
if gene not in gene_set:
    gene_info = {
        'Human': 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
        'Mouse': 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz'
    }
    gene_df = pd.read_csv(gene_info[species.capitalize()], sep='\t', compression='gzip')[['Symbol', 'Synonyms']]

    # flatten the '|'-separated synonym lists into (synonym, symbol) pairs;
    # a Synonyms value of '-' means the gene has no synonyms listed
    gene_map_ind = []
    gene_map_val = []
    for row in gene_df.itertuples():
        if row.Synonyms != '-':
            for syn in row.Synonyms.split('|'):
                # keep only the part after a 'prefix:' when one is present
                syn = syn.split(':')[1] if ':' in syn else syn
                gene_map_ind.append(syn)
                gene_map_val.append(row.Symbol)
    gene_map = pd.DataFrame.from_records(
        zip(gene_map_ind, gene_map_val),
        columns=['synonym', 'symbol']
    ).set_index('synonym')

    # a synonym listed more than once is ambiguous, so every such entry is dropped
    syn_counts = gene_map.index.value_counts()
    gene_map = gene_map.drop(syn_counts[syn_counts > 1].index)

    # only a missing synonym is an expected failure here; anything else should surface
    try:
        resolved = gene_map.loc[input_gene, 'symbol']
    except KeyError:
        resolved = None

    # the official symbol is only useful if the signature data actually contains it
    if resolved is not None and resolved in gene_set:
        gene = resolved
        has_syn = True
        display(HTML(f"<div style='font-size:1rem;padding:1rem 0;'>{input_gene} not in signature data, using gene synonym <b>{gene}</b>.</div>"))
    else:
        display(HTML(f"<div style='font-size:1rem;padding:1rem 0;'>{input_gene} not found in signature data and no synonyms found either. Please check your spelling, or try again later with a synonym.</div>"))
        has_syn = None

In [None]:
# import preprocessed signature data
# has_syn is None only when the gene could not be resolved; compare by identity
if has_syn is not None:
    sub_path = f'{species}/{species}_'
    # the lookup json maps each gene symbol to the number of the csv that contains it
    csv_num = requests.get(root_path + f'{species}_lookup.json').json()[gene] #identifies correct csv
    # read only the gene's column and transpose so the gene becomes the single row label
    pval_df_input = pd.read_csv(root_path + sub_path + f"pval/{species}_pval_{csv_num}.csv", usecols = [gene]).T
    adjpval_df_input = pd.read_csv(root_path + sub_path + f"adjpval/{species}_adjpval_{csv_num}.csv", usecols = [gene]).T
    fc_df_input = pd.read_csv(root_path + sub_path + f"fc/{species}_fc_{csv_num}.csv", usecols = [gene]).T
    score_df_input = pd.read_csv(root_path + f"all_{species}_score.csv", index_col=0)

In [None]:
# configure color scheme
# Reversed ('_r') colormaps are used for both directions: red for fc < 0, blue for fc > 0 (see map_color).
# NOTE(review): cm.get_cmap is deprecated in recent Matplotlib releases — confirm the pinned version still provides it.
red_map = cm.get_cmap('Reds_r')
# With vmin=-0.25 and vmax=1, a p-value in [0, 1] is linearly normalized into [0.2, 1.0]
red_norm = colors.Normalize(vmin=-0.25, vmax=1)
blue_map = cm.get_cmap('Blues_r')
blue_norm = colors.Normalize(vmin=-0.25, vmax=1)

def map_color(fc, pv):
    """Return the hex color for one point of the volcano plot.

    A fold change of exactly zero is gray. Otherwise the p-value is passed
    through the red ramp (fc < 0) or the blue ramp (any other fc) and
    converted to a hex string.
    """
    if fc == 0:
        return '#808080'
    ramp, scale = (red_map, red_norm) if fc < 0 else (blue_map, blue_norm)
    return colors.to_hex(ramp(scale(pv)))

In [None]:
def combine_data(pval_df, adjpval_df, fc_df, score_df, gene):
    """Assemble one row per signature for a single gene.

    The `gene` row of the p-value, adjusted p-value and fold-change frames is
    combined with the signature names (columns of `score_df`) and the values
    of its 'score' row. Returned columns, in order:
    sig, score, pval, adjpval, logpv (-log10 of pval), fc.
    """
    combined = pd.DataFrame({
        'sig': score_df.columns.tolist(),
        'score': score_df.loc['score'].to_list(),
        'pval': pval_df.loc[gene].tolist(),
        'adjpval': adjpval_df.loc[gene].tolist(),
    })
    combined['logpv'] = -np.log10(combined['pval'])
    combined['fc'] = fc_df.loc[gene].tolist()
    return combined

def make_plot(comb_df, species, gene):
    """Render the volcano plot (fold change vs. -log10 p-value) for one gene.

    `comb_df` must provide the columns sig, score, pval, adjpval, logpv and fc.
    Each point is colored by map_color(fc, pval) and drawn larger when its
    p-value is below 0.05. The figure is displayed with show(); nothing is
    returned.
    """
    # per-point styling, collected in a single pass over the rows
    point_colors = []
    point_sizes = []
    for rec in comb_df.itertuples():
        point_colors.append(map_color(rec.fc, rec.pval))
        point_sizes.append(12 if rec.pval < 0.05 else 6)

    # columns exposed to the glyph renderer and to the hover tooltips
    data_source = ColumnDataSource(data={
        'x': comb_df['fc'],
        'y': comb_df['logpv'],
        'sig': comb_df['sig'],
        'score': comb_df['score'],
        'pval': comb_df['pval'],
        'adjpval': comb_df['adjpval'],
        'fc': comb_df['fc'],
        'colors': point_colors,
        'sizes': point_sizes,
    })

    # (label, field) pairs shown when hovering over a point
    hover_fields = [
        ("Signature", "@sig"),
        ("Score", "@score"),
        ("P-Value", "@pval"),
        ("Adj P-Value", "@adjpval"),
        ("Fold Change", "@fc"),
    ]

    plot = figure(plot_width=700, plot_height=500, tooltips=hover_fields)
    plot.circle(
        'x', 'y',
        source=data_source,
        size='sizes',
        fill_color='colors',
        alpha=0.7,
        line_alpha=0,
        line_width=0.01,
        name=f'{gene}_expression_volcano_plot'
    )

    # axis labels
    plot.xaxis.axis_label = 'Fold Change'
    plot.yaxis.axis_label = '-log10(P-value)'

    # title
    plot.title.text = f"Differential Expression of {gene} in {species.capitalize()} Signatures"
    plot.title.align = 'center'
    plot.title.text_font_size = '14px'
    plot.min_border_top = 75

    show(plot)

The volcano plot below positions signatures according to the gene-specific fold change (x-position) and -log10(p-value) (y-position). Signatures were computed using the R package [limma](https://bioconductor.org/packages/release/bioc/html/limma.html). Fold changes are quantile-normalized log2 fold change values.

**Red** points indicate signatures where the fold change of the chosen gene was < 0, **blue** points indicate fold change > 0, and **gray** points indicate fold change == 0. Signatures where the specified gene was significantly differentially expressed in either direction are denoted by darker color and larger point size (points with p-value < 0.05 are drawn larger).

Signature names are of the form "{study name} {GSE number}\_{signature number}". 

Score indicates the confidence in the accuracy of the signature, calculated by tallying the number of extrapolations made when calculating said signature (e.g., labelling a sample as a control or perturbation group, and the direction of expression), with **lower scores indicating higher confidence in quality** (a score of 0 indicates highest confidence, and a score of 3 indicates lowest confidence).

Hover over any point to display the corresponding signature name, score, the p-value, the adjusted p-value, and the fold change.

Use the toolbar on the right side of the plot to pan, zoom, or save the plot.

In [None]:
# has_syn is None only when the gene could not be resolved; compare by identity
if has_syn is not None:
    comb_df_input = combine_data(pval_df_input, adjpval_df_input, fc_df_input, score_df_input, gene)
    make_plot(comb_df_input, species, gene)

In [None]:
# get GEO links
def geo_link(sig_name, clickable):
    """Build the GEO study link for a signature name.

    The accession is the text before the first '_' in the last
    whitespace-separated token of `sig_name`. Returns an HTML anchor when
    `clickable` is truthy, otherwise the bare URL.
    """
    geo_path = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='
    last_token = sig_name.split()[-1]
    gse_id = last_token.partition("_")[0]
    if not clickable:
        return f'{geo_path}{gse_id}'
    return f'<a target="_blank" href="{geo_path}{gse_id}">{gse_id}</a>'

# create tables of significant results with links to GEO
def make_tables(comb_df, is_upreg):
    """Build the display table for one direction of regulation.

    Keeps the rows of `comb_df` with fc > 0 (`is_upreg` truthy) or fc < 0,
    sorts them by ascending p-value, prefixes significant signatures
    (pval < 0.05) with '* ', formats the numeric columns as strings, renames
    the columns for display and appends a clickable 'Link to GEO Study'.

    Raises:
        ValueError: if no signature matches the requested direction. Callers
            use this to show a "no signatures" message, so the failure is
            raised explicitly rather than left to pandas' empty-frame behavior.
    """
    direction_mask = comb_df['fc'] > 0 if is_upreg else comb_df['fc'] < 0
    dir_df = comb_df[direction_mask]
    if dir_df.empty:
        direction = 'up' if is_upreg else 'down'
        raise ValueError(f'no {direction}-regulated signatures to tabulate')

    dir_df = dir_df.drop(columns='logpv').sort_values(by='pval', ascending=True)
    # built in the sorted row order, so positional assignment lines up
    dir_df['sig'] = [
        f"* {sig}" if pval < 0.05 else sig
        for sig, pval in zip(dir_df['sig'], dir_df['pval'])
    ]
    dir_df['pval'] = dir_df['pval'].map(lambda x: f'{x:.3e}')
    dir_df['adjpval'] = dir_df['adjpval'].map(lambda x: f'{x:.3e}')
    dir_df['fc'] = dir_df['fc'].map(lambda x: f'{x:.4f}')
    dir_df = dir_df.rename(columns={'sig': 'Signature',
                                    'score': 'Score',
                                    'pval': 'P-value',
                                    'adjpval': 'Adj P-value',
                                    'fc': 'Log2 Fold Change'})
    dir_df['Link to GEO Study'] = dir_df['Signature'].apply(geo_link, clickable=True)
    return dir_df

# create download link for table results
def download_link(df, fname):
    """Write the full results table to `fname` as TSV and return an HTML download link.

    In the exported file the 'Link to GEO Study' anchors are reduced to their
    bare href URL and the leading '* ' significance marker is removed from
    'Signature'. The export is prepared on a copy, so the caller's frame is
    left untouched and the function can be called repeatedly on it.
    """
    export_df = df.copy()
    # pull the URL out of <a ... href="URL">...</a>
    export_df['Link to GEO Study'] = export_df['Link to GEO Study'].map(
        lambda x: x.split('href=')[1].split('>')[0].replace('"', '')
    )
    # strip only the leading marker, not any '* ' inside a signature name
    export_df['Signature'] = export_df['Signature'].map(
        lambda x: x[2:] if x.startswith('* ') else x
    )
    export_df.to_csv(fname, sep='\t', index=False)
    return f'<div>Download full results: <a href="{fname}" target=_blank>{fname}</a></div>'

The tables below show the top 10 signatures (ranked by p-value) in which the gene of interest was up-regulated or down-regulated, respectively. Links to the original GEO studies are included, and each full table is available for download as a TSV file.

In [None]:
# has_syn is None only when the gene could not be resolved; compare by identity
if has_syn is not None:
    # shown only when the submitted symbol was resolved through a synonym
    syn_note = f"(synonym {input_gene}) " if has_syn else ""
    # check for matching rows up front instead of using a blanket except, which
    # would also report display or file-write errors as "no signatures"
    if (comb_df_input['fc'] > 0).any():
        up_comb_df_input = make_tables(comb_df_input, is_upreg=True)

        display(HTML(
            f'<div style="font-size:1rem;padding:1rem;"><b>\
            Top 10 {species.capitalize()} signatures where \
            {gene} {syn_note}is up-regulated\
            </b></div>'
        ))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(up_comb_df_input[:10].to_html(escape=False,index=False,col_space=70)))
        display(HTML(download_link(up_comb_df_input, f'{gene}_upreg_expression_{species}_signatures.tsv')))
    else:
        display(HTML(
            f'<div>\
            No signatures were found where {gene} {syn_note}is up-regulated.\
            </div>'
        ))

In [None]:
# has_syn is None only when the gene could not be resolved; compare by identity
if has_syn is not None:
    # shown only when the submitted symbol was resolved through a synonym
    syn_note = f"(synonym {input_gene}) " if has_syn else ""
    # check for matching rows up front instead of using a blanket except, which
    # would also report display or file-write errors as "no signatures"
    if (comb_df_input['fc'] < 0).any():
        dn_comb_df_input = make_tables(comb_df_input, is_upreg=False)

        display(HTML(
            f'<div style="font-size:1rem;padding:1rem;"><b>\
            Top 10 {species.capitalize()} signatures where \
            {gene} {syn_note}is down-regulated\
            </b></div>'
        ))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(dn_comb_df_input[:10].to_html(escape=False,index=False,col_space=70)))
        display(HTML(download_link(dn_comb_df_input, f'{gene}_downreg_expression_{species}_signatures.tsv')))
    else:
        display(HTML(
            f'<div>\
            No signatures were found where {gene} {syn_note}is down-regulated.\
            </div>'
        ))

# Alternate Species Search

If the chosen gene symbol is also present in the other species (human and mouse symbols are matched case-insensitively), the volcano plot and results tables will be generated below for signatures from the species not chosen in the input form.

In [None]:
# get all gene symbols and find overlapping genes
human_genes, mouse_genes = (
    requests.get(root_path + f'{organism}_genes.json').json()[f'{organism}_genes']
    for organism in ('human', 'mouse')
)
# mouse symbols are upper-cased so they can be compared with the human symbols
mouse_genes_upper = list(map(str.upper, mouse_genes))
overlapping_genes = list(set(human_genes) & set(mouse_genes_upper))

In [None]:
# overlapping_genes holds upper-cased symbols, so compare in upper case
is_overlap = gene.upper() in overlapping_genes
alt_species = 'mouse' if species == 'human' else 'human'
if species == 'mouse':
    # an overlapping symbol is, by construction, an upper-case human symbol
    alt_gene = gene.upper()
else:
    alt_gene = gene.lower().capitalize()
    # not every mouse symbol is "first letter upper, rest lower", so when the
    # capitalized guess is not a real mouse symbol, recover the original casing
    # from the mouse gene list via a case-insensitive match
    if alt_gene not in set(mouse_genes):
        mouse_symbol_by_upper = {g.upper(): g for g in mouse_genes}
        alt_gene = mouse_symbol_by_upper.get(gene.upper(), alt_gene)

if is_overlap:
    display(HTML(f'<div style="font-size:1rem;padding:1rem 0;">\
    Corresponding gene in {alt_species.capitalize()}: <b>{alt_gene}</b>.</div>'))

In [None]:
if is_overlap:
    sub_path = f'{alt_species}/{alt_species}_'
    # the lookup json maps each gene symbol to the number of the csv that contains it
    alt_lookup = requests.get(root_path + f'{alt_species}_lookup.json').json()
    alt_csv_num = alt_lookup[alt_gene]

    # read only the gene's column from each csv and transpose it into a single row
    alt_base = root_path + sub_path
    pval_df_alt = pd.read_csv(f"{alt_base}pval/{alt_species}_pval_{alt_csv_num}.csv", usecols=[alt_gene]).T
    adjpval_df_alt = pd.read_csv(f"{alt_base}adjpval/{alt_species}_adjpval_{alt_csv_num}.csv", usecols=[alt_gene]).T
    fc_df_alt = pd.read_csv(f"{alt_base}fc/{alt_species}_fc_{alt_csv_num}.csv", usecols=[alt_gene]).T
    score_df_alt = pd.read_csv(f"{root_path}all_{alt_species}_score.csv", index_col=0)

    comb_df_alt = combine_data(pval_df_alt, adjpval_df_alt, fc_df_alt, score_df_alt, alt_gene)
    make_plot(comb_df_alt, alt_species, alt_gene)

In [None]:
if is_overlap:
    # check for matching rows up front instead of using a blanket except, which
    # would also report display or file-write errors as "no signatures"
    if (comb_df_alt['fc'] > 0).any():
        up_comb_df_alt = make_tables(comb_df_alt, is_upreg=True)

        display(HTML(
            f'<div style="font-size:1rem;padding:1rem 0;"><b>\
            Top 10 {alt_species.capitalize()} signatures where {alt_gene} is up-regulated\
            </b></div>'
        ))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(up_comb_df_alt[:10].to_html(escape=False,index=False,col_space=70)))
        display(HTML(download_link(up_comb_df_alt, f'{alt_gene}_upreg_expression_{alt_species}_signatures.tsv')))
    else:
        display(HTML(
            f'<div>\
            No signatures were found where {alt_gene} is up-regulated.\
            </div>'
        ))

In [None]:
if is_overlap:
    # check for matching rows up front instead of using a blanket except, which
    # would also report display or file-write errors as "no signatures"
    if (comb_df_alt['fc'] < 0).any():
        dn_comb_df_alt = make_tables(comb_df_alt, is_upreg=False)

        display(HTML(
            f'<div style="font-size:1rem;padding:1rem 0;"><b>\
            Top 10 {alt_species.capitalize()} signatures where {alt_gene} is down-regulated\
            </b></div>'
        ))
        display(HTML('<div>Asterisk (*) denotes significance (p < 0.05)</div>'))
        display(HTML(dn_comb_df_alt[:10].to_html(escape=False,index=False,col_space=70)))
        display(HTML(download_link(dn_comb_df_alt, f'{alt_gene}_downreg_expression_{alt_species}_signatures.tsv')))
    else:
        display(HTML(
            f'<div>\
            No signatures were found where {alt_gene} is down-regulated.\
            </div>'
        ))

In [None]:
# tell the reader why the alternate-species section is empty
if not is_overlap:
    display(HTML(f'<div style="font-size:1rem;padding:1rem 0;">Corresponding gene not found in {alt_species}.</div>'))