In [None]:
#%%appyter init
# Set up appyter's notebook magics (presumably what enables the `%%appyter` cells
# further down — confirm against the appyter documentation).
from appyter import magic
# The argument is a zero-argument callable that returns this notebook's globals().
magic.init(lambda _=globals: _())

In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import HTML, Markdown
import warnings
import requests
import time

# bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource

# display graphics
output_notebook()

In [None]:
%%appyter hide
{% do SectionField(
    name = 'INPUT',
    title = 'Input Parameters',
    subtitle = 'Fill in the gene of interest',
    img = 'gene.png'
)%}

In [None]:
%%appyter code_exec
{% set gene_input = AutocompleteField(
    name = 'gene',
    label = 'Gene of Interest',
    default = 'A1BG',
    description = 'Enter the gene symbol of interest.',
    file_path = 'https://appyters.maayanlab.cloud/storage/L1000_RNAseq_Gene_Search/allgenes.json',
    section='INPUT'
)%}

In [None]:
%%appyter code_exec
# Bind the value of the 'gene' form field to `gene`; later cells read this name directly.
gene = {{ gene_input }}

# RNA-seq-like Gene Centric Signature Reverse Search (RGCSRS)

In [None]:
# Base URL under which the gene list (.tsv) and the per-perturbation feather tables are read.
root_path = 'https://appyters.maayanlab.cloud/storage/L1000_RNAseq_Gene_Search'

In [None]:
# Load the per-gene annotation table (first column is used as the index).
gene_info = pd.read_csv(f"{root_path}/L1000_to_RNAseq_gene_list.tsv", sep="\t", index_col=0)

# Each flag is turned into either "" or "not" so it can be spliced into a sentence.
# The explicit `== True` is kept on purpose: a missing value (NaN) is truthy in
# Python but compares unequal to True, so it is reported as "not".
landmark = "" if gene_info.loc[gene, "landmark"] == True else "not"
inferred_l1000 = "" if gene_info.loc[gene, "inferred l1000"] == True else "not"

In [None]:
# Short summary of the selected gene.
display(Markdown(f"**Input gene: {gene}**"))
# `landmark` and `inferred_l1000` are either "" or "not", so each sentence reads
# "<gene> is a ..." or "<gene> is not a ..." (Markdown collapses the extra space).
display(Markdown(f"{gene} is {landmark} a landmark gene."))
display(Markdown(f"{gene} is {inferred_l1000} an originally inferred L1000 gene."))
# NOTE(review): this sentence is shown for every gene, independent of the two
# flags above — confirm that this is intended.
display(Markdown(f"{gene} is a newly inferred (our model) gene."))
display(HTML(f"""More information about {gene} can be found at the <a href="https://cfde-gene-pages.cloud/gene/{gene}?CF=false&PS=true&Ag=true" target="_blank"> Gene and Drug Landing Page Aggregator</a>"""))

This Appyter provides visualizations of the top 5% of RNA-seq-like signatures induced by CRISPR knockouts and chemical perturbagens. Signatures are computed from transformed data profiles from the LINCS L1000 data. The transformation was performed using a two-step model:
1. A cycleGAN model was used to first predict the RNA-seq expression of the 978 L1000 landmark genes
2. A fully connected neural network was used to extrapolate the predicted RNA-seq expression of the 978 landmark genes to a full set of 23,614 genes

Signatures were computed using the characteristic direction method [(Clark et al., 2014)](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-79), as implemented [here](https://github.com/MaayanLab/maayanlab-bioinformatics/blob/master/maayanlab_bioinformatics/dge/characteristic_direction.py). 

The top 5% of signatures were identified based on the mean of the absolute values of the characteristic direction coefficients.

In [None]:
def _load_gene_column(table, value_name):
    """Read the selected gene's column from one feather table under ``root_path``.

    Only the ``index`` column and the column named after the notebook-level
    ``gene`` are requested from ``{root_path}/{table}.f``.

    Parameters
    ----------
    table : str
        File stem of the feather table, e.g. ``'xpr_fc'``.
    value_name : str
        Name given to the gene's column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Single-column frame indexed by the values of the ``index`` column.
    """
    frame = pd.read_feather(f"{root_path}/{table}.f", columns=['index', gene])
    return frame.set_index('index').rename(columns={gene: value_name})


# CRISPR knockout (xpr) tables: fold change, CD coefficient, rank
xpr_fc = _load_gene_column('xpr_fc', 'FC')
xpr_cd = _load_gene_column('xpr_cd', 'CD')
xpr_ranks = _load_gene_column('xpr_ranks', 'Rank')

# chemical perturbation (cp) tables: fold change, CD coefficient, rank
cp_fc = _load_gene_column('cp_fc', 'FC')
cp_cd = _load_gene_column('cp_cd', 'CD')
cp_ranks = _load_gene_column('cp_ranks', 'Rank')

In [None]:
# Colormaps and value normalisers used by map_color():
# red for points with a negative CD coefficient, blue otherwise.
# NOTE(review): `cm.get_cmap` is deprecated in recent matplotlib releases —
# confirm the pinned matplotlib version still provides it.
red_map = cm.get_cmap('Reds')
# vmin is below zero, so a value of 0 maps to one third of the way up the colormap
# rather than to its lightest end (presumably to keep faint points visible — confirm).
red_norm = colors.Normalize(vmin=-0.005, vmax=0.01)
blue_map = cm.get_cmap('Blues')
# Same range as red_norm.
blue_norm = colors.Normalize(vmin=-0.005, vmax=0.01)

def map_color(cd, logfc):
    """Return the hex colour for one point of the volcano plot.

    Parameters
    ----------
    cd : float
        Characteristic direction coefficient of the gene in the signature.
    logfc : float
        log2 fold change of the gene in the signature.

    Returns
    -------
    str
        ``'#D3D3D3'`` when ``cd * logfc`` is negative (the two values disagree
        in sign); otherwise a shade taken from the red colormap when ``cd`` is
        negative, or from the blue colormap in every other case. The shade is
        chosen by the normalised product ``cd * logfc``.
    """
    product = cd * logfc
    if product < 0:
        return '#D3D3D3'

    # Same-direction point: pick the palette from the sign of the CD coefficient.
    if cd < 0:
        palette, scale = red_map, red_norm
    else:
        palette, scale = blue_map, blue_norm
    return colors.to_hex(palette(scale(product)))

In [None]:
def combine_data(cd_df, fc_df, rank_df):
    """Join the CD, fold-change and rank frames and add a ``logFC`` column.

    Parameters
    ----------
    cd_df, fc_df, rank_df : pandas.DataFrame
        Frames sharing the same index; ``fc_df`` must provide an ``FC`` column.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the three inputs plus ``logFC``, the
        base-2 logarithm of ``FC``. Entries whose logarithm is NaN or
        infinite are stored as 0.0.
    """
    merged = pd.concat([cd_df, fc_df, rank_df], axis=1)
    # Record (and thereby silence) the warnings numpy emits for FC values
    # whose logarithm is undefined.
    with warnings.catch_warnings(record=True):
        log_fc = np.nan_to_num(
            np.log2(merged['FC']),
            nan=0.0,
            posinf=0.0,
            neginf=0.0,
        )
        merged['logFC'] = log_fc
    return merged

def make_plot(comb_df, gene, pert_type):
    """Draw an interactive volcano plot of one gene across a set of signatures.

    Parameters
    ----------
    comb_df : pandas.DataFrame
        Frame indexed by signature with ``CD``, ``FC`` and ``logFC`` columns.
    gene : str
        Gene symbol, used in the plot title and the glyph name.
    pert_type : str
        Perturbation label, used in the plot title and (with spaces removed)
        in the glyph name.

    The figure is shown in the notebook; nothing is returned.
    """
    n_points = comb_df.shape[0]

    # one colour per signature, derived from its CD coefficient and log2 fold change
    point_colors = [map_color(rec.CD, rec.logFC) for rec in comb_df.itertuples()]

    # columns referenced by the glyph ('x', 'y', 'colors', 'sizes') and by the hover tooltip
    source = ColumnDataSource(
        data={
            'x': comb_df['logFC'],
            'y': comb_df['CD'],
            'cd': comb_df['CD'],
            'sig': pd.Series(comb_df.index),
            'fc': comb_df['FC'],
            'logfc': comb_df['logFC'],
            'colors': point_colors,
            'sizes': [8] * n_points,
        }
    )

    # hover tooltip: (label, data-source column)
    hover_fields = [
        ("Signature", "@sig"),
        ("CD Coeff", "@cd"),
        ("Fold Change", "@fc"),
        ("Log2 Fold Change", "@logfc"),
    ]

    volcano = figure(plot_width=700, plot_height=500, tooltips=hover_fields)

    glyph_name = f"{gene}_expression_in_L1000_to_RNAseq_{pert_type.replace(' ','')}_volcano_plot"
    volcano.circle(
        'x', 'y',
        source=data_source if False else source,
        size='sizes',
        fill_color='colors',
        line_color='colors',
        line_width=1,
        line_alpha=1,
        alpha=0.7,
        name=glyph_name,
    )

    # title and axis labels
    volcano.title.text = f"Differential Expression of {gene} in Transformed {pert_type} L1000 Signatures"
    volcano.title.align = 'center'
    volcano.title.text_font_size = '14px'
    volcano.xaxis.axis_label = 'log2(Fold Change)'
    volcano.yaxis.axis_label = 'Characteristic Direction Coefficient'

    show(volcano)

In [None]:
def make_tables(comb_df, pert, is_upreg, n_genes=23614):
    """Build the display table of signatures in which the gene moves one way.

    Parameters
    ----------
    comb_df : pandas.DataFrame
        Frame indexed by underscore-separated signature IDs, with ``CD``,
        ``FC``, ``Rank`` and ``logFC`` columns.
    pert : str
        ``'xpr'`` adds a ``KO Gene`` column; any other value adds
        ``Perturbagen`` and ``Dose`` columns instead.
    is_upreg : bool
        ``True`` keeps signatures with ``CD > 0`` (largest first); ``False``
        keeps signatures with ``CD < 0`` (most negative first).
    n_genes : int, optional
        Number of ranked genes per signature. Used to flip the rank for the
        down-regulated table so that rank 1 is the most down-regulated gene
        (``n_genes + 1 - rank``). The default reproduces the previously
        hard-coded ``23615 - rank``.

    Returns
    -------
    pandas.DataFrame
        Sorted table with an index named ``Signature``. ``Fold Change`` and
        ``CD Coefficient`` are formatted as strings with four decimals. The
        same columns are returned when no signature matches, so empty and
        non-empty results share one schema. ``comb_df`` is not modified.
    """
    selected = comb_df['CD'] > 0 if is_upreg else comb_df['CD'] < 0
    dir_df = comb_df[selected].sort_values(by='CD', ascending=not is_upreg)

    dir_df['FC'] = [f'{value:.4f}' for value in dir_df['FC']]
    dir_df['CD'] = [f'{value:.4f}' for value in dir_df['CD']]
    if not is_upreg:
        # ranks are stored most-up-regulated first; invert them for this table
        dir_df['Rank'] = (n_genes + 1) - dir_df['Rank']

    # Split every signature ID once; the fields are addressed by position below.
    fields = [sig.split('_') for sig in dir_df.index]
    if pert == 'xpr':
        dir_df['KO Gene'] = [parts[4] for parts in fields]
    else:
        dir_df['Perturbagen'] = [parts[4] for parts in fields]
        # a dose is only present when the ID has exactly six fields
        dir_df['Dose'] = [parts[5] if len(parts) == 6 else '' for parts in fields]
    dir_df['Cell Line'] = [parts[1] for parts in fields]
    dir_df['Timepoint'] = [parts[2].lower() for parts in fields]

    dir_df = dir_df.rename(columns={
        'FC': 'Fold Change',
        'logFC': 'Log2(Fold Change)',
        'CD': 'CD Coefficient',
        'Rank': 'Rank in Signature',
    })
    return dir_df.rename_axis('Signature')

# create download link for table results
def download_link(df, fname):
    """Write ``df`` to ``fname`` as a tab-separated file and return an HTML link to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to save; its index is written as the first column.
    fname : str
        Path of the output file, also used as the link target and link text.

    Returns
    -------
    str
        HTML snippet containing a download link pointing at ``fname``.
    """
    df.to_csv(fname, sep='\t', index=True)
    return f'<div>Download full results: <a href="{fname}" target=_blank>{fname}</a></div>'

In [None]:
def enrichr(pert, top_perts, direction, gene_symbol=None, timeout=30):
    """Upload a list of perturbagens to Enrichr / DrugEnrichr and return the results URL.

    Parameters
    ----------
    pert : str
        ``'CRISPR'`` submits to Enrichr; any other value submits to DrugEnrichr.
    top_perts : iterable of str
        Names to submit, one per line of the uploaded list.
    direction : str
        Used only in the list description (``"<direction>-regulate"``).
    gene_symbol : str, optional
        Gene named in the list description. When omitted, the notebook-level
        ``gene`` is used, so existing three-argument calls behave as before.
    timeout : float, optional
        Seconds to wait for the upload request before giving up, so that an
        unresponsive server cannot hang the notebook indefinitely.

    Returns
    -------
    str
        URL of the enrichment results for the uploaded list.

    Raises
    ------
    Exception
        If the upload request does not succeed.
    """
    target_gene = gene if gene_symbol is None else gene_symbol
    if pert == 'CRISPR':
        base_url = 'https://maayanlab.cloud/Enrichr'
        desc = f"Top CRISPR targets from RNA-seq-like signatures that {direction}-regulate {target_gene}"
    else:
        base_url = 'https://maayanlab.cloud/DrugEnrichr'
        desc = f"Top compounds from RNA-seq-like signatures that {direction}-regulate {target_gene}"

    # multipart form fields; the (None, value) tuples send plain fields without a filename
    payload = {
        'list': (None, '\n'.join(top_perts)),
        'description': (None, desc)
    }
    response = requests.post(f"{base_url}/addList", files=payload, timeout=timeout)
    if not response.ok:
        raise Exception(f'Error analyzing gene list (HTTP {response.status_code})')
    # Short pause kept from the original implementation (presumably to give the
    # server time to register the list before the link is opened — confirm).
    time.sleep(0.5)
    return f"{base_url}/enrich?dataset={response.json()['shortId']}"


def enrichr_link(pert, df, direction, gene):
    """Return an HTML link to the enrichment analysis of the top 20 perturbagens.

    Parameters
    ----------
    pert : str
        ``'CRISPR'`` or any other value (treated as chemical compounds).
    df : pandas.DataFrame
        Frame indexed by underscore-separated signature IDs with a ``CD``
        column; the perturbagen is the fifth field of the ID. Not modified.
    direction : str
        ``'up'`` ranks by descending ``CD``; any other value by ascending ``CD``.
    gene : str
        Gene symbol shown in the link text and in the uploaded list description.

    Returns
    -------
    str
        ``<a>`` element pointing at the results page.
    """
    ranked = df.copy()
    ranked['pert'] = [sig.split('_')[4] for sig in ranked.index]

    # Keep each perturbagen once, at its strongest signature in the requested direction.
    top_perts = (
        ranked.sort_values(by='CD', ascending=(direction != 'up'))
        .drop_duplicates(subset=['pert'], keep='first')['pert']
        .head(20)
    )

    pert_type = 'CRISPR target genes' if pert == 'CRISPR' else 'chemical compounds'
    results_url = enrichr(pert, top_perts, direction, gene)
    # The href is quoted because the URL contains '=', which is not allowed in an
    # unquoted HTML attribute value.
    return f'<a href="{results_url}" target="_blank">Enrichr analysis of top 20 {pert_type} that {direction}-regulate {gene}</a>'


## CRISPR knockout signatures
### Volcano Plots

In the following volcano plot, each point represents a single CRISPR KO signature. The x-position indicates the log2(fold change) of the expression of the chosen gene in the signature, while the y-position indicates the characteristic direction coefficient of the chosen gene.

Note that the fold change and characteristic direction coefficients of the gene are not necessarily in the same direction for each signature; this is because in cases where a gene is both up- and down-regulated between replicate samples, the characteristic direction method prioritizes the more consistent direction of movement, which may not be consistent with the fold change. To read more about the characteristic direction method, please refer to [Clark et al., 2014](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-79).

Points with same-direction fold change and CD coefficient values are highlighted by coloring them blue (up-regulated) or red (down-regulated); all other points are shown in gray. Darker colored points indicate signatures in which the chosen gene has a larger CD coefficient and/or a larger fold change.

Drag the plot to pan around. Use the toolbar to the right of the plot to zoom, reset the plot view, or download the plot.

In [None]:
# CRISPR: join the CD-coefficient, fold-change and rank columns of the selected
# gene into one frame (adds logFC), then draw the volcano plot for it.
comb_df_xpr = combine_data(xpr_cd, xpr_fc, xpr_ranks)
make_plot(comb_df_xpr, gene, 'CRISPR')

### Tables

The tables below display the characteristic direction (CD) coefficients, fold change values, and log2(fold change) values corresponding to the expression of the chosen gene in each CRISPR KO signature. The rank of the gene in the signature, as determined by its CD coefficient relative to the CD coefficients of the other ~23,613 genes, is also provided. Lower numerical ranks indicate stronger differential expression in the respective direction; a rank of 1 in an up-regulated signature indicates that the gene was the most up-regulated gene in that signature, and a rank of 1 in a down-regulated signature indicates the gene was the most down-regulated gene in that signature.

While only the top 10 signatures for each direction are displayed, below each table is a link to download the top 100 signatures for each direction. 

A link to the **Enrichr analysis results** of the top 20 unique perturbations from the top signatures that up or down-regulate the input gene can be found below each table as well. 

In [None]:
# Build the up-regulated CRISPR table once; it feeds both the preview and the download.
xpr_up_table = make_tables(comb_df_xpr, pert='xpr', is_upreg=True)

display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top CRISPR KO signatures where {gene} is up-regulated (based on CD-coefficient)</b></div>'))
# preview: top 10 rows
display(HTML(xpr_up_table[:10].to_html(escape=False, col_space=70)))
# download: top 100 rows
display(HTML(download_link(xpr_up_table[:100], f"{gene}_UpReg_L1000_CRISPR_signatures.tsv")))
display(HTML(enrichr_link('CRISPR', comb_df_xpr, 'up', gene)))

In [None]:
# Build the down-regulated CRISPR table once; it feeds both the preview and the
# download, so the downloaded file always matches the table shown here.
xpr_down_table = make_tables(comb_df_xpr, pert='xpr', is_upreg=False)

display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top CRISPR KO signatures where {gene} is down-regulated (based on CD-coefficient)</b></div>'))
# preview: top 10 rows
display(HTML(xpr_down_table[:10].to_html(escape=False, col_space=70)))
# download: top 100 rows of the down-regulated table
display(HTML(download_link(xpr_down_table[:100], f"{gene}_DnReg_L1000_CRISPR_signatures.tsv")))
display(HTML(enrichr_link('CRISPR', comb_df_xpr, 'down', gene)))

## Chemical perturbation signatures
### Volcano Plots

In the following volcano plot, each point represents a single chemical perturbation signature. The x-position indicates the log2(fold change) of the expression of the chosen gene in the signature, while the y-position indicates the characteristic direction coefficient of the chosen gene. 

Note that the fold change and characteristic direction coefficients of the gene are not necessarily in the same direction for each signature; this is because in cases where a gene is both up- and down-regulated between replicate samples, the characteristic direction method prioritizes the more consistent direction of movement, which may not be consistent with the fold change. To read more about the characteristic direction method, please refer to [Clark et al., 2014](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-79).

Points with same-direction fold change and CD coefficient values are highlighted by coloring them blue (up-regulated) or red (down-regulated); all other points are shown in gray. Darker colored points indicate signatures in which the chosen gene has a larger CD coefficient and/or a larger fold change.

Drag the plot to pan around. Use the toolbar to the right of the plot to zoom, reset the plot view, or download the plot.


In [None]:
# CP (chemical perturbations): join the CD-coefficient, fold-change and rank columns
# of the selected gene into one frame (adds logFC), then draw the volcano plot for it.
comb_df_cp = combine_data(cp_cd, cp_fc, cp_ranks)
make_plot(comb_df_cp, gene, 'Chemical')

### Tables

The tables below display the characteristic direction (CD) coefficients, fold change values, and log2(fold change) values corresponding to the expression of the chosen gene in each chemical perturbation signature. The rank of the gene in the signature, as determined by its CD coefficient relative to the CD coefficients of the other ~23,613 genes, is also provided. Lower numerical ranks indicate stronger differential expression in the respective direction; a rank of 1 in an up-regulated signature indicates that the gene was the most up-regulated gene in that signature, and a rank of 1 in a down-regulated signature indicates the gene was the most down-regulated gene in that signature.

While only the top 10 signatures for each direction are displayed, below each table is a link to download the top 100 signatures for each direction. 

A link to the **Enrichr analysis results** of the top 20 unique perturbations from the top signatures that up or down-regulate the input gene can be found below each table as well. 

In [None]:
# Build the up-regulated chemical-perturbation table once; it feeds both the preview and the download.
cp_up_table = make_tables(comb_df_cp, pert='cp', is_upreg=True)

display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top chemical perturbation signatures where {gene} is up-regulated (based on CD-coefficient)</b></div>'))
# preview: top 10 rows
display(HTML(cp_up_table[:10].to_html(escape=False, col_space=70)))
# download: top 100 rows
display(HTML(download_link(cp_up_table[:100], f"{gene}_UpReg_L1000_CP_signatures.tsv")))
display(HTML(enrichr_link('chemical', comb_df_cp, 'up', gene)))

In [None]:
# Build the down-regulated chemical-perturbation table once; it feeds both the preview and the download.
cp_down_table = make_tables(comb_df_cp, pert='cp', is_upreg=False)

display(HTML(f'<div style="font-size:1rem;padding:1rem;"><b>Top chemical perturbation signatures where {gene} is down-regulated (based on CD-coefficient)</b></div>'))
# preview: top 10 rows
display(HTML(cp_down_table[:10].to_html(escape=False, col_space=70)))
# download: top 100 rows
display(HTML(download_link(cp_down_table[:100], f"{gene}_DnReg_L1000_CP_signatures.tsv")))
display(HTML(enrichr_link('chemical', comb_df_cp, 'down', gene)))