In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

# ChEA3 Appyter


The ChEA3 Appyter predicts transcription factors (TFs) associated with user-input sets of genes. Discrete query gene sets are compared to ChEA3 libraries of TF target gene sets assembled from multiple orthogonal 'omics' datasets. The Fisher's Exact Test, with a background size of 20,000, is used to compare the input gene set to the TF target gene sets in order to determine which TFs may be most closely associated with the input gene set. 

In [None]:
# imports
import json
import requests
import numpy as np
from time import sleep
from tabulate import tabulate
from IPython.display import HTML, display, Image, FileLink, Markdown
import plotly.graph_objects as go
import kaleido
import os

In [None]:
%%appyter hide_code 

{# Declares the 'input' section of the form; every field in this cell attaches to it via section = 'input'. #}
{% do SectionField(
    name = 'input',
    title = 'ChEA Appyter Gene Set Upload',
    subtitle = 'Upload a gene set you wish to analyze'
)%}

{# Gene set input with two tabs: 'Paste' (a TextField pre-filled with an example gene list)
   and 'Upload' (a FileField). Later cells branch on gene_input.raw_value and read the
   chosen value through gene_input.value[0]. #}
{% set gene_input = TabField(
    name = 'gene_input',
    label = 'Gene Set Input',
    default = 'Paste',
    description = 'Input your gene set',
    choices = {
        'Paste': [
            TextField(
                name = 'paste_gene_input',
                label = 'Input Gene Set',
                default = 'KIAA0907\nKDM5A\nCDC25A\nEGR1\nGADD45B\nRELB\nTERF2IP\nSMNDC1' +
                '\nTICAM1\nNFKB2\nRGS2\nNCOA3\nTEX10\nARID4B\nCHIC2\nFBXO11\nMTF2\nCDK2' +
                '\nDNTTIP2\nGADD45A\nGOLT1B\nPOLR2K\nNFKBIE\nGABPB1\nECD\nPHKG2\nRAD9A\nNET1' +
                '\nKIAA0753\nEZH2\nNRAS\nATP6V0B\nCDK7\nCCNH\nSENP6\nTIPARP\nFOS\nARPP19' +
                '\nTFAP2A\nKDM5B\nNPC1\nTP53BP2\nNUSAP1\nSCCPDH\nKIF20A\nFZD7\nUSP22\nPIP4K2B' +
                '\nCRYZ\nGNB5\nEIF4EBP1\nPHGDH\nRRAGA\nSLC25A46\nRPA1\nHADH\nDAG1\nRPIA\nP4HA2' +
                '\nMACF1\nTMEM97\nMPZL1\nPSMG1\nPLK1\nSLC37A4\nGLRX\nCBR3\nPRSS23\nNUDCD3' +
                '\nCDC20\nKIAA0528\nNIPSNAP1\nTRAM2\nSTUB1\nDERA\nMTHFD2\nBLVRA\nIARS2\nLIPA' +
                '\nPGM1\nCNDP2\nBNIP3\nCTSL1\nCDC25B\nHSPA8\nEPRS\nPAX8\nSACM1L\nHOXA5\nTLE1' +
                '\nPYGL\nTUBB6\nLOXL1',
                description = 'Input your list of genes (one gene per row)',
                section = 'input'
            )
        ],
        'Upload': [
            FileField(
                name = 'upload_gene_input',
                label = 'Upload File',
                default = '',
                description = 'Upload your list of genes as a text file (one gene per row)',
                section = 'input'
            )
        ]
    },
    section = 'input'
)%}


{# How many top-ranked transcription factors each chart and table shows (1-100, default 10).
   A later cell renders this field into the Python variable num_tfs. #}
{% set num_tfs = IntField(
    name = 'num_tfs',
    label = 'Top ranked transcription factors to display',
    description = 'Input the amount of top transcription factors you wish to be returned',
    default = 10,
    min = 1,
    max = 100,
    section = 'input'
)%}

{# Minimum number of contributing libraries a transcription factor needs in order to be
   kept in the mean rank chart and table (1-6, default 3). A later cell renders this field
   into the Python variable threshold. #}
{% set threshold = IntField(
    name = 'threshold',
    label = 'Transcription Factor Library Threshold',
    description = 'Only transcription factors with at least this threshold value of ' +
    'contributing libraries will be displayed in the mean rank chart and table',
    default = 3,
    min = 1,
    max = 6,
    section = 'input'
)%}


In [None]:
# Function to call the ChEA3 API
def get_chea3_results(gene_set, query_name):
    """Submit a gene set to the ChEA3 enrichment endpoint and return the decoded reply.

    Parameters
    ----------
    gene_set : list of str
        Gene symbols to analyze; sent as the ``gene_set`` field of the JSON payload.
    query_name : str
        Label sent as the ``query_name`` field of the JSON payload.

    Returns
    -------
    The JSON-decoded response body. Other cells of this notebook index it by
    library name (for example ``results['Integrated--meanRank']``).

    Raises
    ------
    RuntimeError
        If the server answers with a non-success HTTP status. The status code
        and reason are included so a failed run can be diagnosed.
    """
    ADDLIST_URL = 'https://maayanlab.cloud/chea3/api/enrich/'
    payload = {
        'gene_set': gene_set,
        'query_name': query_name
    }
    # The payload is posted as a JSON string in the request body
    response = requests.post(ADDLIST_URL, data=json.dumps(payload))
    if not response.ok:
        # response.ok is False for 4xx/5xx status codes
        raise RuntimeError(
            f'Error analyzing gene list (HTTP {response.status_code} {response.reason})'
        )
    # NOTE(review): this pause is kept from the original implementation; its purpose is not
    # evident from this file (possibly a courtesy delay for the public API) - confirm before removing
    sleep(1)
    return json.loads(response.text)

# Function for displaying tables
def display_tables(lib, description):
    """Show one HTML table per library and write a matching .tsv file next to the notebook.

    For every library name in ``lib`` this displays a heading, a table built from the
    first ``num_tfs`` entries of ``results[libname]``, the caption ``description[libname]``,
    and a link to a tab-separated copy of the same table. The copy is written to the
    current working directory as ``<libname with spaces replaced by underscores>.tsv``.

    Reads the notebook globals ``results`` and ``num_tfs``.

    Parameters
    ----------
    lib : iterable of str
        Library names; each must be a key of both ``results`` and ``description``.
    description : dict
        Maps a library name to the caption shown under its table.
    """
    headers = ['Rank',
               'TF',
               'Overlap',
               'FET p-value',
               'FDR',
               'Odds Ratio',
               'Overlapping Genes']

    for libname in lib:
        display(HTML(f'<h3>{libname}</h3>'))

        # Build exactly one row per available entry. A pre-sized list would keep
        # integer placeholders whenever a library returns fewer than num_tfs entries.
        table = [
            [entry['Rank'],
             entry['TF'],
             f"{entry['Intersect']}/{entry['Set length']}",
             entry['FET p-value'],
             entry['FDR'],
             entry['Odds Ratio'],
             # only the first 10 overlapping genes are listed, followed by '...'
             f"{', '.join(entry['Overlapping_Genes'].split(',')[0:10])}, ..."]
            for entry in results[libname][0:num_tfs]
        ]

        display(HTML(tabulate(table, headers, tablefmt='html')))

        display(HTML(f'<h5>{description[libname]}</h5>'))

        # Same rows, written as a tab-separated file and offered as a download link
        tsv_name = f"{libname.replace(' ', '_')}.tsv"
        with open(tsv_name, 'w') as tsv_file:
            tsv_file.write(tabulate(table, headers, tablefmt='tsv'))
        display(HTML(f'<a href="{tsv_name}">Download table in .tsv</a>'))
        
        
# Function for displaying the individual library bar charts
def display_charts(libs, description):
    """Show one horizontal bar chart of -log10(FET p-value) per library.

    For every library name in ``libs`` this displays a heading, a bar chart of the
    first ``num_tfs`` entries of ``results[libname]``, and the caption
    ``description[libname]``.

    Reads the notebook globals ``results`` and ``num_tfs``.

    Parameters
    ----------
    libs : iterable of str
        Library names; each must be a key of both ``results`` and ``description``.
    description : dict
        Maps a library name to the caption shown under its chart.
    """
    for libname in libs:

        display(HTML(f'<h3>{libname}</h3>'))

        # Slice before converting so only the displayed entries are processed, and
        # reverse so the first (best-ranked) entry comes last in the lists -
        # presumably so that it is drawn at the top of the horizontal chart.
        top_entries = results[libname][0:num_tfs][::-1]
        tfs = [entry['TF'] for entry in top_entries]

        # takes the -log10 of the p-values (np.log10 returns a numpy array)
        scores = -np.log10([float(entry['FET p-value']) for entry in top_entries])

        # pad the x-axis by 5% of the score range on both sides
        lowest_score = min(scores)
        highest_score = max(scores)
        score_range = highest_score - lowest_score
        x_lowerbound = lowest_score - (score_range * 0.05)
        x_upperbound = highest_score + (score_range * 0.05)

        libfig = go.Figure(data = go.Bar(name = libname,
                                         x = scores,
                                         y = tfs,
                                         marker = go.bar.Marker(color = 'rgb(255,127,80)'),
                                         orientation = 'h'))
        libfig.update_layout(
            title = {
                'text':'Bar Chart of Scores based on FET p-values',
                'y': 0.87,
                'x': 0.5,
                'xanchor':'center',
                'yanchor':'top'
            },
            # \u2081\u2080 are the unicode subscript digits "1" and "0" (subscript "10")
            xaxis_title = '-log\u2081\u2080(FET p-value)',
            yaxis_title = 'Transcription Factors',
            font = dict(
                size = 16,
                color = 'black'
            )
        )

        libfig.update_xaxes(range = [x_lowerbound, x_upperbound])

        libfig.show()

        display(HTML(f'<h5>{description[libname]}</h5>'))
        
def indexfinder(lib_score_list, value):
    """Return the 1-based position of ``value`` in ``lib_score_list``, not counting zeros.

    The list is scanned from the front. Every non-zero entry that comes before the
    first match pushes the position up by one; zero entries are skipped without
    counting. Returns ``None`` when ``value`` does not occur in the list.
    """
    nonzero_before = 0
    for score in lib_score_list:
        if score == value:
            return nonzero_before + 1
        if score != 0:
            nonzero_before += 1
    return None

In [None]:
%%appyter code_exec
# Copies the form values into plain Python variables. Depending on the selected tab this
# defines either `geneset` (pasted text) or `geneset_filename` (uploaded file); the next
# cell parses whichever of the two exists.

{%- if gene_input.raw_value == 'Paste' %} # for if the gene list was pasted in
geneset = {{ gene_input.value[0] }}
{%- else %} # for if the gene list was uploaded as a file (only two options)
geneset_filename = {{ gene_input.value[0] }}
{%- endif %}

# number of transcription factors to show, and the minimum number of contributing
# libraries a transcription factor needs for the mean rank chart/table
num_tfs = {{ num_tfs }}
threshold = {{ threshold }}

In [None]:
%%appyter code_exec 

# This cell parses the raw gene input into a Python list of gene symbols (one per entry).
# Surrounding whitespace is stripped and blank lines are dropped, so empty strings are
# never submitted to the ChEA3 API as if they were genes.

{%- if gene_input.raw_value == 'Paste' %} 
genes = [x.strip() for x in geneset.split('\n') if x.strip()]
{%- else %}
# `with` closes the uploaded file even if reading it fails
with open(geneset_filename, 'r') as open_gene_input:
    genes = [x.strip() for x in open_gene_input if x.strip()]
{%- endif %}

In [None]:
# Query the ChEA3 API with the parsed gene list
results = get_chea3_results(genes, 'query')

# `genes` is the user-supplied gene list (from the input page).
# `results` is the decoded API response; later cells index it by library name
# (e.g. 'Integrated--meanRank', 'GTEx--Coexpression') to get that library's TF entries.

# Libraries

ChEA3 has six different gene set libraries that are composed from multiple sources. ChEA3 also implements two integration techniques to encompass transcription factor analysis across all six of these libraries: Mean Rank and Top Rank. Mean Rank takes the average of a transcription factor's individual library scores and ranks according to this mean score. Top Rank takes the best score among a transcription factor's individual library scores and ranks according to this top score.

# Bar Charts

## Mean Rank Bar Chart

In [None]:
# Bar colour of every individual library in the stacked bar charts
c_lib_palette = {'ARCHS4 Coexpression':'rgb(196, 8, 8)',
                 'ENCODE ChIP-seq':'rgb(244, 109, 67)',
                 'Enrichr Queries':'rgb(242, 172, 68)',
                 'GTEx Coexpression':'rgb(236, 252, 68)',
                 'Literature ChIP-seq':'rgb(165, 242, 162)',
                 'ReMap ChIP-seq':'rgb(92, 217, 78)'}

# NOTE: Integrated mean/topRank are left out since those are compiled from the six
# libraries above, so no TF lists Integrated mean/topRank among its libraries

# Fixed library order; also the order in which the bar segments are stacked
libs_sorted = ['ARCHS4 Coexpression','ENCODE ChIP-seq','Enrichr Queries',
               'GTEx Coexpression','Literature ChIP-seq','ReMap ChIP-seq']

# One list per library with one slot per requested transcription factor
# (e.g. 15 slots each if the user asked for 15 TFs), initialised to 0
c_lib_means = {lib: [0] * num_tfs for lib in libs_sorted}

mr_results = results['Integrated--meanRank']
###### NOTE: for meanRank, the TFs are already ranked by Score ######

# Give every TF entry one integer field per library: 0 if the library does not appear in
# the entry's 'Library' string, otherwise the value parsed from that string, which has
# the form 'library name,value;library name,value;...'
for tfentry in mr_results:
    for lib in libs_sorted:
        tfentry[lib] = 0
    for pair in tfentry['Library'].split(';'):
        library, value = pair.split(',')
        tfentry[library] = int(value)

# For each library: the values of all TF entries in ascending order (the zeros of
# non-contributing entries come first). indexfinder() later skips those zeros when it
# turns a value into a rank.
ranking_dict = {lib: sorted(tfentry[lib] for tfentry in mr_results)
                for lib in libs_sorted}

In [None]:
# Attach 'SumRank' and 'AvgRank' to every mean-rank entry, then order the entries by 'AvgRank'.
for tfentry in mr_results:
    # (library name, value) pairs parsed from 'library name,value;library name,value;...'
    lib_value_pairs = [pair.split(',') for pair in tfentry['Library'].split(';')]

    rank_total = 0
    for lib_name, raw_value in lib_value_pairs:
        # position of this value among the non-zero values of its library
        rank_total += int(indexfinder(ranking_dict[lib_name], int(raw_value)))

    tfentry.update([('SumRank', rank_total),
                    ('AvgRank', rank_total / len(lib_value_pairs))])

sorted_results = sorted(mr_results, key = lambda k: k['AvgRank'])

In [None]:
# Pick the best `num_tfs` entries (lowest AvgRank first) that are backed by at least
# `threshold` libraries. Iterating over sorted_results (instead of indexing it in an
# open-ended while loop) simply stops at the end of the list when fewer than `num_tfs`
# entries meet the threshold, rather than running past it.
sorted_top_results = []
for tfentry in sorted_results:
    if len(sorted_top_results) >= num_tfs:
        break
    if len(tfentry['Library'].split(';')) >= threshold:
        sorted_top_results.append(tfentry)

# reverse so the best-ranked entry comes last
sorted_top_results = sorted_top_results[::-1]

# TF names in the same order as sorted_top_results
sorted_tfs = [tfentry.get('TF') for tfentry in sorted_top_results]

In [None]:
# Start from fresh zero-filled lists sized to the TFs actually selected. Only libraries
# that contribute to a TF are written below, so reusing lists from an earlier run could
# leave stale segment lengths behind.
c_lib_means = {lib: [0] * len(sorted_top_results) for lib in libs_sorted}

for i, tfentry in enumerate(sorted_top_results):
    avg = tfentry['AvgRank']
    tot = tfentry['SumRank']
    for a in tfentry['Library'].split(';'):
        lib, value = a.split(',')
        rank = indexfinder(ranking_dict[lib], int(value))
        # Segment length of this library for this TF. AvgRank is SumRank divided by the
        # number of contributing libraries, so the segments of one TF add up to its AvgRank.
        bar_length = (rank*avg)/tot
        c_lib_means[lib][i] = float(bar_length)

In [None]:
# Mean rank chart: one horizontal bar trace per library, stacked per transcription factor
mean_rank_traces = []
for c_lib in libs_sorted:
    mean_rank_traces.append(
        go.Bar(
            name = c_lib,
            x = c_lib_means[c_lib],
            y = sorted_tfs,
            marker = go.bar.Marker(color = c_lib_palette[c_lib]),
            orientation = 'h',
        )
    )

fig = go.Figure(data = mean_rank_traces)

# stacking mode, title placement, axis titles and font in a single layout update
fig.update_layout(
    barmode = 'stack',
    title = dict(
        text = 'Stacked Bar Chart of Sum of Ranks in Different Libraries',
        y = 0.87,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top',
    ),
    xaxis_title = 'Sum of Ranks in Different Libraries',
    yaxis_title = 'Transcription Factors',
    font = dict(size = 16, color = 'black'),
)

fig.show()

# Graph Info/Organization
# the top ranked TF is listed as the first TF (from top to bottom) on the bar chart 

##### Figure 1. Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors according to their average integrated scores across all the libraries.

## Top Rank Bar Chart

In [None]:
# Order the topRank entries by Score, lowest first. float() makes the comparison numeric
# even if the API delivers Score as text (a plain string sort would put '10' before '2').
tr_sortedtopresults = sorted(results['Integrated--topRank'],
                             key = lambda k: float(k['Score']))
# keep the first num_tfs entries, then reverse so the best-ranked entry comes last
tr_sortedtopresults = tr_sortedtopresults[0:num_tfs][::-1]

# Scores of the selected entries, in the same order
topranklibscores = [tfentry['Score'] for tfentry in tr_sortedtopresults]

# One list per library with one slot per selected TF; 0 where a library does not contribute.
# Sized from the selection itself so fewer than num_tfs available entries cannot cause
# an IndexError.
tr_lib_scores = {lib: [0] * len(tr_sortedtopresults) for lib in libs_sorted}

for i, tfentry in enumerate(tr_sortedtopresults):
    # 'Library' has the form 'library name,value;library name,value;...'
    for pair in tfentry['Library'].split(';'):
        lib, value = pair.split(',')
        tr_lib_scores[lib][i] = float(value)

# TF names in the same order as tr_sortedtopresults
tr_sortedtfs = [tfentry.get('TF') for tfentry in tr_sortedtopresults]


# Top rank chart: one horizontal bar trace per library, stacked per transcription factor
top_rank_traces = []
for c_lib in libs_sorted:
    top_rank_traces.append(
        go.Bar(
            name = c_lib,
            x = tr_lib_scores[c_lib],
            y = tr_sortedtfs,
            marker = go.bar.Marker(color = c_lib_palette[c_lib]),
            orientation = 'h',
        )
    )

tr_fig = go.Figure(data = top_rank_traces)

# stacking mode, title placement, axis titles and font in a single layout update
tr_fig.update_layout(
    barmode = 'stack',
    title = dict(
        text = 'Stacked Bar Chart of Sum of Ranks in Different Libraries',
        y = 0.87,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top',
    ),
    xaxis_title = 'Sum of Ranks in Different Libraries',
    yaxis_title = 'Transcription Factors',
    font = dict(size = 16, color = 'black'),
)

tr_fig.show()

##### Figure 2. Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors according to their top integrated score across all the libraries.

# Individual Library Bar Charts

In [None]:
# Caption shown under each individual library chart, keyed by the library's name in `results`.
# The insertion order of this dict is the order in which the charts are displayed.
chart_des = {'GTEx--Coexpression':'Figure 3. Horizontal bar chart, y-axis represents ' +
             'transcription factors. Transcription factors ranked by ' +
             'the GTEx Coexpression gene set library. Data from the TF-target coexpression ' +
             'data in the GTEx dataset.',
             'ReMap--ChIP-seq':'Figure 4. Horizontal bar chart, y-axis represents ' +
             'transcription factors. Transcription factors ranked by the ReMap ChIP seq ' +
             'gene set library. Interaction data mined from the ReMap Project',
             'Enrichr--Queries':'Figure 5. Horizontal bar chart, y-axis represents ' +
             'transcription factors. Transcription factors ranked by the Enrichr Query ' +
             'gene set library. Data from the TF-target co-occurrence data in Enrichr queries.',
             'ENCODE--ChIP-seq':'Figure 6. Horizontal bar chart, y-axis represents ' +
             'transcription factors. Transcription factors ranked by the ENCODE ChIP seq ' +
             'gene set library. Interaction data mined from the ENCODE project',
             'ARCHS4--Coexpression':'Figure 7. Horizontal bar chart, y-axis represents ' +
             'transcription factors. Transcription factors evaluated by the ARCHS4 ' +
             'Coexpression gene set library. Data from the TF-target coexpression data ' +
             'in the ARCHS4 dataset.',
             'Literature--ChIP-seq':'Figure 8. Horizontal bar chart, y-axis represents ' +
             'transcription factors. Transcription factor significance evaluated by the ' +
             'Literature ChIP seq gene set library. Interaction data mined from literature.'}

# list(chart_des) yields the library names in insertion order, so the list of charts
# cannot drift apart from the captions
display_charts(list(chart_des), chart_des)

# Tables

## Mean Rank

In [None]:
# sorted_top_results is stored with the best-ranked entry last; flip it so the table
# starts with the best-ranked transcription factor
tbl_sorted_top_results = sorted_top_results[::-1]

# Build one row per selected entry. Appending rows (instead of filling a list pre-sized
# to num_tfs) keeps the table free of placeholder values when fewer entries were selected.
mrtable = []
for position, tfentry in enumerate(tbl_sorted_top_results, start = 1):
    # the entry's 'Rank' field is overwritten with its position in this table
    tfentry['Rank'] = position
    mrtable.append([tfentry['Rank'],
                    tfentry['TF'],
                    tfentry['AvgRank'],
                    # 'name,value;name,value' -> 'name: value, name: value'
                    tfentry['Library'].replace(',', ': ').replace(';', ', '),
                    # only the first 10 overlapping genes are shown (with '...' after)
                    f"{', '.join(tfentry['Overlapping_Genes'].split(',')[0:10])}, ..."])

# now actually making/printing the table
display(HTML(tabulate(mrtable,
                      ['Rank',
                       'Transcription Factor',
                       'Avg Rank',
                       'Libraries Ranks',
                       'Overlapping Genes'],
                      tablefmt='html')))

##### Table 1. Displays the top specified number of transcription factors according to their average integrated rank across all the other libraries. Rows represent transcription factors, columns display data associated with the transcription factors. Avg Rank is the average rank a transcription factor scores, averaged across its ranking among individual libraries.

## Top Rank

In [None]:
# Build one row per entry from the first num_tfs topRank results. A comprehension
# (instead of filling a list pre-sized to num_tfs) keeps the table free of placeholder
# values when fewer than num_tfs results are available.
trtable = [
    [entry['Rank'],
     entry['TF'],
     entry['Score'],
     # 'name,value;name,value' -> 'name: value, name: value'
     entry['Library'].replace(',', ': ').replace(';', ', '),
     # only the first 10 overlapping genes are shown (with '...' after)
     f"{', '.join(entry['Overlapping_Genes'].split(',')[0:10])}, ..."]
    for entry in results['Integrated--topRank'][0:num_tfs]
]

# now actually making/printing the table
display(HTML(tabulate(trtable,
                      ['Rank',
                       'Transcription Factor',
                       'TopRank',
                       'Libraries Ranks',
                       'Overlapping Genes'],
                      tablefmt='html')))

##### Table 2. Displays the top specified number of transcription factors according to the top integrated rank across all the libraries. Rows represent transcription factors, columns display data associated with the transcription factors.

In [None]:
display(HTML('<h2>The following tables display the rankings of the top' +
             ' transcription factors from each individual library.</h2>'))

# Caption shown under each individual library table, keyed by the library's name in `results`.
# The insertion order of this dict is the order in which the tables are displayed.
table_des = {'GTEx--Coexpression':'Table 3. Displays the top specified number of ' +
             'transcription factors according to TF-target coexpression data from the ' +
             'GTEx dataset. Rows represent transcription factors, columns display data ' +
             'associated with the transcription factors.',
             'ReMap--ChIP-seq':'Table 4. Displays the top specified number of transcription ' +
             'factors according to interaction data mined from the ReMap project. Rows ' +
             'represent transcription factors, columns display data associated with the ' +
             'transcription factors.',
             'Enrichr--Queries':'Table 5. Displays the top specified number of transcription ' +
             'factors according to TF-target co-occurrence data from Enrichr queries. Rows ' +
             'represent transcription factors, columns display data associated with the ' +
             'transcription factors.',
             'ENCODE--ChIP-seq':'Table 6. Displays the top specified number of transcription ' +
             'factors according to interaction data mined from the ENCODE project. Rows ' +
             'represent the transcription factors, columns display data associated with the ' +
             'transcription factors.',
             'ARCHS4--Coexpression':'Table 7. Displays the top specified number of ' +
             'transcription factors according to TF-target coexpression data from the ' +
             'ARCHS4 dataset. Rows represent the transcription factors, columns display data ' +
             'associated with the transcription factors',
             'Literature--ChIP-seq':'Table 8. Displays the top specified number of ' +
             'transcription factors according to interaction data mined from literature. ' +
             'The rows represent the transcription factors, columns display data associated ' +
             'with the transcription factors.'}

# list(table_des) yields the library names in insertion order, so the list of tables
# cannot drift apart from the captions
display_tables(list(table_des), table_des)