In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
%%appyter hide_code
{% do SectionField(
    name='PRIMARY',
    title='KEA3 Consensus Kinases',
    subtitle='This appyter returns consensus kinases using a set of gene sets',
    img='kea3.png'
) %}

In [None]:
%%appyter markdown

{% set title = StringField(
    name='title',
    label='Notebook name',
    default='KEA3 Consensus Kinases',
    section="PRIMARY",
    constraint='[^<>]*'
) %}

# {{ title.raw_value }}

In [None]:
import time
import requests
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, IFrame, Markdown
import math
import scipy.stats as st
import fastcluster

In [None]:
# Endpoint the clustergrammer() helper uploads a TSV matrix to; the response text is used as the visualization URL
clustergrammer_url = 'https://maayanlab.cloud/clustergrammer/matrix_upload/'
# KEA3 enrichment endpoint, POSTed to once per input gene set by get_kea3_results()
API_URL = 'https://maayanlab.cloud/kea3/api/enrich/'
# libraries = ["ChEA_2016", "GO_Biological_Process_2018" ,"GWAS_Catalog_2019" , "KEGG_2019_Human"]
# Running counters used to number the "Table N" / "Figure N" captions throughout the notebook
table = 1
figure = 1

## Get Input

In [None]:
%%appyter code_exec


{% set input_gene_set = FileField(
    name='input_gene_set',
    label='Gene Set',
    default='SARS-CoV-2_differentially_phosphorelated_proteins_down.gmt',
    section="PRIMARY",
    examples={
        'SARS-CoV-2_differentially_phosphorelated_proteins_down.gmt': 'https://appyters.maayanlab.cloud/storage/KEA3Consensus/SARS-CoV-2_differentially_phosphorelated_proteins_down.gmt',
        'SARS-CoV-2_differentially_phosphorelated_proteins_up.gmt': 'https://appyters.maayanlab.cloud/storage/KEA3Consensus/SARS-CoV-2_differentially_phosphorelated_proteins_up.gmt'
    }
) %}

input_gene_set = {{ input_gene_set }}

In [None]:
# Read the tab-separated gene-set file: the first field of each line is the
# signature id, every remaining non-empty field is a gene entry. Anything after
# a comma inside a gene entry is discarded.
enrichment = {}
input_sigs = {}
with open(input_gene_set) as gmt_file:
    for line in gmt_file:
        fields = line.strip().split("\t")
        # A line with a single field has a name but no genes (or is blank).
        if len(fields) == 1:
            raise ValueError("Line '%s' is either empty or not formatted properly. Please consult README for more information"%line)
        sigid, *members = fields
        symbols = [entry.split(",")[0] for entry in members if len(entry) > 0]
        # Display copy: one comma-joined string per signature.
        input_sigs[sigid] = {"genes": ", ".join(symbols)}
        # Query copy: the gene list sent to KEA3 later on.
        enrichment[sigid] = {"genes": symbols}

In [None]:
# Number of input signatures; reused later in the table captions
num_sigs = len(enrichment)
# Turn {signature: {"genes": "..."}} into a one-column table indexed by signature name
input_sigs = pd.DataFrame.from_dict(input_sigs, orient="index")
# Preview only the first rows, followed by a numbered caption
display(input_sigs.head(10))
display(Markdown("**Table %d** Input Signatures"%(table)), display_id="input_sigs")
# Advance the caption counter for the next table
table+=1

## User defined parameters

In [None]:
%%appyter code_exec
# Number of top-ranked kinases kept in the consensus tables and plots.
# NOTE(review): the form field is named 'min_count' although its label is 'Top results'.
top_results = {{IntField(name='min_count', label='Top results', description="Number of top results to keep", default=25, section='PRIMARY')}}
# Figure size passed as figsize=(width, height) to the heatmap() helper
width = {{FloatField(name='width', label='image width', default=15, section='PRIMARY')}}
height = {{FloatField(name='height', label='image height', default=15, section='PRIMARY')}}

## Enrichment

In [None]:
# KEA3 Functions
def get_kea3_results(gene_set, query_name, timeout=60):
    """Submit one gene set to the KEA3 enrichment API and return the parsed reply.

    Parameters
    ----------
    gene_set : list of str
        Gene symbols to analyse.
    query_name : str
        Label sent along with the query.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error. Without it a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    Exception
        If the server answers with a non-success HTTP status.
    requests.exceptions.RequestException
        On connection problems or when `timeout` expires.
    """
    payload = {
        'gene_set': gene_set,
        'query_name': query_name
    }
    response = requests.post(API_URL, data=json.dumps(payload), timeout=timeout)
    if not response.ok:
        # Include the status code so the retry log shows why the call failed.
        raise Exception('Error analyzing gene list (HTTP %d)' % response.status_code)
    # Pause after each successful call to space out consecutive requests.
    time.sleep(1)
    return json.loads(response.text)

In [None]:
# Query KEA3 once per gene set (up to 5 attempts each) and replace the
# {"genes": [...]} placeholder in `enrichment` with the API result.
failed_list = []
for description, values in enrichment.items():
    print("Querying %s"%(description), end="\r", flush=True)
    genes = values["genes"]
    for tries in range(5):
        try:
            enrichment[description] = get_kea3_results(genes, description)
            break
        except Exception as e:
            print(e)
            time.sleep(0.5)
    else:
        # All attempts failed for this gene set.
        failed_list.append(description)

# Gene sets without a result still hold the {"genes": [...]} placeholder, which
# has no score keys; remove them so the later table-building steps do not hit a KeyError.
for description in failed_list:
    del enrichment[description]

if len(failed_list):
    print("Failed to add %d list"%len(failed_list))
    print("Skipped: %s" % ", ".join(failed_list))
    # Keep the signature count used in table captions in line with what is left.
    num_sigs = len(enrichment)

if not enrichment:
    raise RuntimeError("KEA3 returned no results for any of the input gene sets")

## Visualization

In [None]:
# methods
def get_dataframe(results, method, table):
    """Build a kinase x gene-set score matrix for one KEA3 scoring method.

    Parameters
    ----------
    results : dict
        Maps gene-set name -> KEA3 result. Each result must contain `method`
        as a key holding a list of records with 'TF' and 'Score' entries.
    method : str
        Result key to extract (e.g. 'Integrated--meanRank').
    table : int
        Number to use in the table caption.

    Returns
    -------
    (DataFrame, int)
        The matrix (rows: kinases in order of first appearance, columns: gene
        sets, 0.0 where a kinase has no record for a gene set) and the next
        free table number.

    Side effects: writes '<method>_df.tsv' and displays a preview plus caption.
    Reads the notebook-level `num_sigs` for the caption.
    """
    method_renamed = method.replace("Integrated--", "").replace("Rank", " Rank").lower()
    set_names = list(results.keys())
    # Collect one zero-filled row per kinase and build the frame in a single
    # step. Adding one column per kinase to an existing DataFrame fragments it
    # and makes pandas emit PerformanceWarning for large kinase counts.
    rows = {}
    for col, name in enumerate(set_names):
        for s in results[name][method]:
            row = rows.setdefault(s['TF'], [0.0] * len(set_names))
            row[col] = float(s['Score'])
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=set_names, dtype=float)
    df.to_csv("%s_df.tsv"%method, sep="\t")
    display(df.head(10))
    display(Markdown("**Table %d** The table below shows the %s of kinases for each of the \
        %d input gene sets [Download complete table](%s_df.tsv)"%(table, method_renamed, num_sigs, method))
    )
    table+=1
    return df, table

def get_consensus(df, method, table):
    """Rank kinases across gene sets and derive the consensus matrices.

    Parameters
    ----------
    df : DataFrame
        Kinase x gene-set score matrix (as returned by get_dataframe).
    method : str
        KEA3 method key; used in file names and captions, and to choose the
        scaling formula.
    table : int
        Number to use for the first caption; three tables are displayed.

    Returns
    -------
    (top_kinase, consensus, consensus_scaled, table)
        top_kinase: one-column frame ('scores') with the first `top_results`
        kinases in ascending score order; consensus: rows of `df` for those
        kinases; consensus_scaled: per-row normalised and weighted version;
        table: the next free table number.

    Side effects: writes '<method>_top_kinase.tsv', '<method>_consensus_matrix.tsv'
    and '<method>_consensus_scaled.tsv' and displays previews with captions.
    Reads the notebook-level `top_results`.
    """
    method_renamed = method.replace("Integrated--", "").replace("Rank", " Rank").lower()
    # Average over the strictly positive entries only. Rank on the `df`
    # argument (not a notebook global) so the function works on whatever matrix
    # the caller passes in.
    kinases_ranked = (df.sum(axis=1)/(df>0).sum(axis=1)).sort_values().to_frame(name="scores")
    top_kinase = kinases_ranked.iloc[0:top_results]
    # Files carry a .tsv extension, so write them tab-separated.
    top_kinase.to_csv("%s_top_kinase.tsv"%method, sep="\t")
    display(top_kinase.head(10))
    display(Markdown("**Table %d** Top %d kinases ranked by the mean of %s scores \
        [Download complete list](%s_top_kinase.tsv)"%(table, top_results, method_renamed, method)))
    table +=1
    consensus = df.loc[top_kinase.index]
    consensus.to_csv("%s_consensus_matrix.tsv"%method, sep="\t")
    display(consensus.head(10))
    display(Markdown("**Table %d** Consensus matrix of top %d kinases ranked by the mean of %s scores \
        [Download table](%s_consensus_matrix.tsv)"%(table, top_results, method_renamed, method)))
    table +=1

    # Invert each row so that smaller scores produce larger values:
    # (row maximum + mean score) - score.
    c = consensus.max(axis=1) + top_kinase.scores
    csub = consensus.rsub(c, axis=0)

    minscore = kinases_ranked.scores.min()
    maxscore = kinases_ranked.scores.max()
    diff = maxscore - minscore

    # Per-kinase weight that decreases as the mean score increases.
    # NOTE(review): when every kinase has the same mean score `diff` is 0 and
    # the second formula yields NaN — confirm whether that case needs handling.
    if (method == "Integrated--topRank"):
        scaler=1-(top_kinase-minscore)
    else:
        scaler=-(top_kinase-maxscore)/diff

    # Normalise each inverted row to sum to 1, then apply the kinase weight.
    consensus_scaled = csub.div(csub.sum(axis=1), axis=0).mul(scaler.scores, axis=0)
    consensus_scaled.to_csv("%s_consensus_scaled.tsv"%method, sep="\t")

    display(consensus_scaled.head(10))
    display(Markdown("**Table %d** Consensus matrix of top %d kinases ranked by the scaled mean of %s scores \
        [Download table](%s_consensus_scaled.tsv)"%(table, top_results, method_renamed, method)))
    table +=1
    return top_kinase, consensus, consensus_scaled, table

# Two 50-step cubehelix palettes with no rotation (rot=0) and hue=0.05.
# `cmap` is built with light=1/dark=0 and `cmap_rev` with the two values
# swapped; heatmap() uses `cmap_rev` when called with rev=True.
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=1, dark=0)
cmap_rev = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0, dark=1)

def heatmap(df, method, figure, rev=False):
    """Draw a clustered heatmap of a consensus matrix and save it as PNG.

    Parameters
    ----------
    df : DataFrame
        Matrix to plot (rows and columns are clustered when there are at
        least two of them).
    method : str
        KEA3 method key; used in the file name and caption.
    figure : int
        Number to use in the figure caption.
    rev : bool, optional
        True selects the `cmap_rev` palette and captions the plot with the
        plain method name; False (default) selects `cmap` and labels the plot
        as "scaled".

    Returns
    -------
    int
        The next free figure number.

    Side effects: writes '<method>_heatmap.png' (rev=True) or
    'scaled_<method>_heatmap.png' (rev=False). Reads the notebook-level
    `width`, `height` and `top_results`.
    """
    col = cmap_rev if rev else cmap
    method_renamed = method.replace("Integrated--", "").replace("Rank", " Rank").lower()
    filename = "%s_heatmap.png"%method
    if not rev:
        method_renamed = "scaled %s"%method_renamed
        # Separate file for the scaled plot; otherwise it overwrites the
        # unscaled heatmap saved for the same method.
        filename = "scaled_%s"%filename
    cg = sns.clustermap(
        df,
        cmap=col,
        figsize=(width, height),
        cbar_pos=(0.02, 0.65, 0.05, 0.18),
        # Hierarchical clustering needs at least two observations per axis.
        row_cluster=df.shape[0] > 1,
        col_cluster=df.shape[1] > 1,
    )
    cg.ax_row_dendrogram.set_visible(False)
    cg.ax_col_dendrogram.set_visible(False)
    # Save before showing so the file is written from the live figure.
    cg.savefig(filename)
    display(cg)
    plt.show()
    display(Markdown("**Figure %d** Heatmap of top %d kinases ranked by %s"%(figure, top_results, method_renamed)))
    figure += 1
    return figure

def clustergrammer(df, method, figure):
    """Upload a matrix to Clustergrammer and embed the returned visualization.

    Parameters
    ----------
    df : DataFrame
        Matrix to upload; columns are relabelled 'Gene set: <name>' and rows
        'Kinase: <name>' before writing.
    method : str
        KEA3 method key; used in the file name and caption.
    figure : int
        Number to use in the figure caption.

    Returns
    -------
    int
        The next free figure number.

    Raises
    ------
    Exception
        After five unsuccessful upload attempts: the last caught exception is
        re-raised as-is, or the last error response text is wrapped in an
        Exception.

    Side effects: writes '<method>_clustergrammer.tsv' and displays an IFrame
    plus caption. Reads the notebook-level `top_results`.
    """
    clustergram_df = df.rename(columns={i:"Gene set: %s"%i for i in df.columns}, index={i:"Kinase: %s"%i for i in df.index})
    name = "%s_clustergrammer.tsv"%method
    clustergram_df.to_csv(name, sep="\t")

    method_renamed = method.replace("Integrated--", "").replace("Rank", " Rank").lower()
    method_renamed = "scaled %s"%method_renamed

    last_error = ''
    for i in range(5):
        try:
            # Reopen per attempt (the previous upload consumed the stream) and
            # make sure the handle is closed again afterwards.
            with open(name, 'rb') as upload:
                res = requests.post(clustergrammer_url, files={'file': upload})
            if not res.ok:
                last_error = res.text
                time.sleep(1)
            else:
                url = res.text.replace("http:","https:")
                break
        except Exception as e:
            last_error = e
            time.sleep(2)
    else:
        # isinstance also matches subclasses (e.g. requests' own exceptions),
        # so the original exception type and traceback are preserved.
        if isinstance(last_error, Exception):
            raise last_error
        else:
            raise Exception(last_error)
    display(IFrame(url, width="1000", height="1000"))
    display(Markdown("**Figure %d** Clustergrammer of top %d kinases \
                    ranked by %s. [Go to url](%s)"%(figure, top_results, method_renamed, url)))
    figure +=1
    return figure

def stackedBarPlot(df, method, figure, rev=False, width = 15, height = 15):
    """Draw a horizontal stacked bar chart of the top kinases and save it as SVG.

    Parameters
    ----------
    df : DataFrame
        Kinase x gene-set matrix. It is not modified.
    method : str
        KEA3 method key; used in the file name and caption.
    figure : int
        Number to use in the figure caption.
    rev : bool, optional
        True orders rows by descending row sum and uses the plain method name;
        False (default) orders by ascending row sum and labels the plot as
        "scaled".
    width, height : float, optional
        Figure size passed to pandas as figsize=(width, height).

    Returns
    -------
    int
        The next free figure number, or `figure` unchanged when there is
        nothing to plot.

    Side effects: writes '<method>_heatmap.svg' (rev=True) or
    'scaled_<method>_heatmap.svg' (rev=False); the '_heatmap' part of the name
    is kept from the existing output naming. Reads the notebook-level
    `top_results`.
    """
    method_renamed = method.replace("Integrated--", "").replace("Rank", " Rank").lower()
    filename = "%s_heatmap.svg"%method
    if not rev:
        method_renamed = "scaled %s"%method_renamed
        # Keep the .svg extension on the scaled variant as well.
        filename = "scaled_%s"%filename

    # Order rows by their total without adding a helper column to the
    # caller's DataFrame. Positions are used instead of labels so duplicate
    # index labels cannot multiply rows.
    totals = df.sum(axis=1).reset_index(drop=True)
    positions = totals.sort_values(ascending=not rev).index[0:top_results]
    ranked = df.iloc[positions]
    if ranked.shape[0]==0:
        # Nothing to draw; hand the counter back untouched so callers that do
        # `figure = stackedBarPlot(...)` keep a valid number.
        return figure
    plot = ranked.plot.barh(stacked = True, figsize = (width,height), fontsize = 20)
    plt.legend(bbox_to_anchor=(1.7, 0), loc='lower right', prop={'size': 16})
    plt.xlabel(method_renamed, labelpad = 20, fontsize = 'xx-large')
    display(plot)
    plt.savefig(filename, format = 'svg', bbox_inches='tight')
    plt.show()
    display(Markdown("**Figure %d** Stacked bar chart of top %d kinases ranked by %s"%(figure, top_results, method_renamed)))
    # Advance the counter so the next figure does not reuse this number.
    figure += 1
    return figure

## Mean Rank

In [None]:
method = 'Integrated--meanRank'
term_df,table = get_dataframe(enrichment, method, table)

### Top Kinases and Consensus Matrix

In [None]:
top_kinase, consensus_df, consensus_scaled, table = get_consensus(term_df, method, table)

### Heat map
#### Mean Rank

In [None]:
figure = heatmap(consensus_df, method, figure, rev=True)

#### Scaled Mean Rank

In [None]:
figure = heatmap(consensus_scaled, method, figure)

### Clustergrammer

In [None]:
figure = clustergrammer(consensus_scaled, method, figure)

### Stacked bar plot
#### Mean Rank

In [None]:
figure = stackedBarPlot(consensus_df, method, figure, rev=True)

#### Scaled Mean Rank

In [None]:
figure = stackedBarPlot(consensus_scaled, method, figure)

## Top Rank

In [None]:
method = 'Integrated--topRank'
term_df,table = get_dataframe(enrichment, method, table)

### Top Kinases and Consensus Matrix

In [None]:
top_kinase, consensus_df, consensus_scaled, table = get_consensus(term_df, method, table)

### Heat map
#### Top Rank

In [None]:
figure = heatmap(consensus_df, method, figure, rev=True)

#### Scaled Top Rank

In [None]:
figure = heatmap(consensus_scaled, method, figure)

### Clustergrammer

In [None]:
figure = clustergrammer(consensus_scaled, method, figure)

### Stacked bar plot
#### Top Rank

In [None]:
figure = stackedBarPlot(consensus_df, method, figure, rev=True)

#### Scaled Top Rank

In [None]:
figure = stackedBarPlot(consensus_scaled, method, figure)

## References
[1] Lachmann A, Ma’ayan A. KEA: Kinase enrichment analysis. Bioinformatics 25(5) 684-6 (2009) PMID: 19176546

[2] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A.
Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 14:128.

[3] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A.
Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377.

[4] Xie Z, Bailey A, Kuleshov MV, Clarke DJB, Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A.
Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: 10.1002/cpz1.90 

[5] Fernandez, N. F. et al. Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data. Sci. Data 4:170151 doi: 10.1038/sdata.2017.151 (2017).

[6] The COVID-19 Gene and Drug Set Library. Kuleshov et al. Patterns. 2020 Jul 25:100090

[7] Bouhaddou, Mehdi, et al. "The global phosphorylation landscape of SARS-CoV-2 infection." Cell 182.3 (2020): 685-712.