In [442]:
#%%appyter init
# Initialise the appyter magics; the callback hands appyter this notebook's globals().
from appyter import magic
magic.init(lambda _=globals: _())

# X2K Appyter

The X2K Appyter (Expression2Kinases) predicts upstream regulatory networks associated with user-inputted sets of genes. Discrete query gene sets are compared first to ChEA3 libraries of transcription factor target gene sets assembled from orthogonal 'omics' datasets. Afterwards, ChEA3 results are put through a protein-protein interaction database to determine the transcription factor intermediate protein interactors. Finally, protein interactors are compared to the KEA3 background database—which contains measured and predicted kinase-substrate interactions, kinase-protein interactions, and interactions supported by co-expression and co-occurrence data—to determine which kinases may be most closely associated with the transcription factor intermediate protein interactors. Overall, Expression2Kinases can provide a method of identifying upstream regulators likely responsible for observed patterns in genome-wide gene expression.


In [443]:
# imports
import json
import requests
import numpy as np
import pandas as pd
from time import sleep
from tabulate import tabulate
from observable_jupyter import embed
from IPython.display import HTML, display, Image, FileLink, Markdown
import plotly.graph_objects as go
import kaleido
import os

In [444]:
%%appyter hide_code 

{# Form section named 'input' (titled 'Gene Set Upload'); fields opt in via section = 'input'. #}
{% do SectionField(
    name = 'input',
    title = 'Gene Set Upload',
    subtitle = 'Upload a gene set you wish to analyze',
    img = 'data_icon.png'
)%}

{# Form section named 'analysis' for the transcription factor / kinase filtering thresholds. #}
{% do SectionField(
    name = 'analysis',
    title = 'Analysis Filtering Options',
    subtitle = 'Set thresholds for including top enriched transcription factors and kinases',
    img = 'analysis.png'
)%}

In [445]:
%%appyter hide_code 

{# Gene set input: either pasted text (one gene per row) or an uploaded text file. #}
{# gene_input.raw_value is the selected tab name; gene_input.value[0] is that tab's field value. #}
{% set gene_input = TabField(
    name = 'gene_input',
    label = 'Gene Set Input',
    default = 'Paste',
    description = 'Input your gene set',
    choices = {
        'Paste': [
            TextField(
                name = 'paste_gene_input',
                label = 'Input Gene Set',
                default = 'KIAA0907\nKDM5A\nCDC25A\nEGR1\nGADD45B\nRELB\nTERF2IP\nSMNDC1' +
                '\nTICAM1\nNFKB2\nRGS2\nNCOA3\nTEX10\nARID4B\nCHIC2\nFBXO11\nMTF2\nCDK2' +
                '\nDNTTIP2\nGADD45A\nGOLT1B\nPOLR2K\nNFKBIE\nGABPB1\nECD\nPHKG2\nRAD9A\nNET1' +
                '\nKIAA0753\nEZH2\nNRAS\nATP6V0B\nCDK7\nCCNH\nSENP6\nTIPARP\nFOS\nARPP19' +
                '\nTFAP2A\nKDM5B\nNPC1\nTP53BP2\nNUSAP1\nSCCPDH\nKIF20A\nFZD7\nUSP22\nPIP4K2B' +
                '\nCRYZ\nGNB5\nEIF4EBP1\nPHGDH\nRRAGA\nSLC25A46\nRPA1\nHADH\nDAG1\nRPIA\nP4HA2' +
                '\nMACF1\nTMEM97\nMPZL1\nPSMG1\nPLK1\nSLC37A4\nGLRX\nCBR3\nPRSS23\nNUDCD3' +
                '\nCDC20\nKIAA0528\nNIPSNAP1\nTRAM2\nSTUB1\nDERA\nMTHFD2\nBLVRA\nIARS2\nLIPA' +
                '\nPGM1\nCNDP2\nBNIP3\nCTSL1\nCDC25B\nHSPA8\nEPRS\nPAX8\nSACM1L\nHOXA5\nTLE1' +
                '\nPYGL\nTUBB6\nLOXL1',
                description = 'Input your list of genes (one gene per row)',
                section = 'input'
            )
        ],
        'Upload': [
            FileField(
                name = 'upload_gene_input',
                label = 'Upload File',
                default = '',
                description = 'Upload your list of genes as a text file (one gene per row)',
                section = 'input'
            )
        ]
    },
    section = 'input'
)%}

In [446]:
%%appyter hide_code

{# Transcription factor filtering: keep the top N by rank, or everything at or below a score. #}
{# The nested fields use section = 'analysis' so they agree with the enclosing TabField #}
{# and with the parallel kin_filtering definition. #}
{% set tf_filtering = TabField(
    name = 'tf_filtering',
    label = 'Transcription Factor Filtering Method',
    default = 'Rank Threshold',
    description = 'Choose a method to filter the transcription factors by.',
    choices = {
        'Rank Threshold': [
            IntField(
                name = 'num_tfs',
                label = 'Top ranked transcription factors to display',
                description = 'Input the amount of top transcription factors you wish to be analyzed',
                default = 10,
                min = 1,
                max = 30,
                section = 'analysis'
            )
        ],
        'Score Threshold': [
            IntField(
                name = 'tf_score_threshold',
                label = 'Transcription Factor Score Threshold',
                description = 'Only transcription factors with a score equal to or below this ' +
                'threshold (lower score is better) will be considered significant and ' +
                'will be included.',
                default = 75,
                min = 1,
                max = 150,
                section = 'analysis'
            )
        ]
    },
    section = 'analysis'
)%}

{# Kinase filtering: keep the top N kinases by rank, or every kinase at or below a score. #}
{# kin_filtering.raw_value is the selected tab name; kin_filtering.value[0] is its field value. #}
{% set kin_filtering = TabField(
    name = 'kin_filtering',
    label = 'Kinase Filtering Method',
    default = 'Rank Threshold',
    description = 'Choose a method to filter the kinases by.',
    choices = {
        'Rank Threshold': [
            IntField(
                name = 'num_kins',
                label = 'Top ranked kinases to display',
                description = 'Input the amount of top kinases you wish to be analyzed',
                default = 10,
                min = 1,
                max = 30,
                section = 'analysis'
            )
        ],
        'Score Threshold': [
            IntField(
                name = 'kin_score_threshold',
                label = 'Kinase Score Threshold',
                description = 'Only kinases with a score equal to or below this ' +
                'threshold (lower score is better) will be considered significant and ' +
                'will be included.',
                default = 75,
                min = 1,
                max = 150,
                section = 'analysis'
            )
        ]
    },
    section = 'analysis'
)%}

In [447]:
### Might want to make this an input field? Where you can choose among a variety of databases?
#{% set ppi_dataset = ChoiceField(
    #name = 'ppi_dataset', 
    #label = 'Protein-Protein Interaction Database', 
    #choices = {'PCA': 'PCA', 'UMAP': 'UMAP', 't-SNE': 't-SNE'},
    #default = 'BioGrid', 
    #description = 'Select a protein-protein interaction database to be used in analysis.', 
    #section = 'input')
#%}

In [448]:
# Function to call the ChEA3 API
def get_chea3_results(gene_set, query_name):
    """Submit a gene set to the ChEA3 enrichment API and return the parsed JSON reply.

    Parameters
    ----------
    gene_set : list of str
        Gene symbols to test for transcription factor enrichment.
    query_name : str
        Label sent along with the query.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        If the server answers with a non-success HTTP status.
    requests.exceptions.RequestException
        If the request cannot be completed (connection failure or timeout).
    """
    # Same host as the KEA3 endpoint in get_kea3_results; the previous
    # amp.pharm.mssm.edu address is the lab's legacy domain.
    ADDLIST_URL = 'https://maayanlab.cloud/chea3/api/enrich/'
    payload = {
        'gene_set': gene_set,
        'query_name': query_name
    }
    # A timeout keeps the notebook from hanging forever if the service stalls.
    response = requests.post(ADDLIST_URL, data=json.dumps(payload), timeout=120)
    if not response.ok:
        # response.ok is False for any 4xx/5xx status
        raise Exception(f'Error analyzing gene list (HTTP {response.status_code})')
    sleep(1)  # short pause after each call (presumably to go easy on the API)
    return response.json()

def get_kea3_results(gene_set, query_name):
    """Submit a gene/protein set to the KEA3 enrichment API and return the parsed JSON reply.

    Parameters
    ----------
    gene_set : list of str
        Gene/protein symbols to test for kinase enrichment.
    query_name : str
        Label sent along with the query.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        If the server answers with a non-success HTTP status.
    requests.exceptions.RequestException
        If the request cannot be completed (connection failure or timeout).
    """
    ADDLIST_URL = 'https://maayanlab.cloud/kea3/api/enrich/'
    payload = {
        'gene_set': gene_set,
        'query_name': query_name
    }

    # A timeout keeps the notebook from hanging forever if the service stalls.
    response = requests.post(ADDLIST_URL, data=json.dumps(payload), timeout=120)
    if not response.ok:
        # response.ok is False for any 4xx/5xx status
        raise Exception(f'Error analyzing gene list (HTTP {response.status_code})')
    sleep(1)  # short pause after each call (presumably to go easy on the API)

    return response.json()

def indexfinder(lib_score_list, value):
    """Return the 1-based position of ``value`` among the non-zero entries of a list.

    Entries equal to 0 are skipped when counting, so the result is the position
    of ``value`` once the zeros before it are ignored.

    Parameters
    ----------
    lib_score_list : iterable of int
        Scores to scan, in the order they should be counted.
    value : int
        Score to look for.

    Returns
    -------
    int
        1 + the number of non-zero entries that precede the first match.

    Raises
    ------
    ValueError
        If ``value`` does not occur in ``lib_score_list``. The function used to
        fall off the end and return None, which only surfaced later as an
        unrelated ``int(None)`` TypeError in the callers.
    """
    index = 1
    for num in lib_score_list:
        if num == value:
            return index
        if num != 0:
            index += 1
    raise ValueError(f'{value!r} was not found in the library score list')

In [449]:
%%appyter code_exec
# this cell extracts the gene list, whether it was pasted in, or uploaded as a file

{%- if gene_input.raw_value == 'Paste' %} # for if the gene list was pasted in
geneset = {{ gene_input.value[0] }}
{%- else %} # for if the gene list was uploaded as a file (only two options)
geneset_filename = {{ gene_input.value[0] }}
{%- endif %}

{%- if tf_filtering.raw_value == 'Rank Threshold' %} 
num_tfs = {{ tf_filtering.value[0] }}
{%- else %} # transcription factors are filtered by score threshold instead (only two options)
tf_score_threshold = {{ tf_filtering.value[0] }}
{%- endif %}

{%- if kin_filtering.raw_value == 'Rank Threshold' %} 
num_kins = {{ kin_filtering.value[0] }}
{%- else %} # kinases are filtered by score threshold instead (only two options)
kin_score_threshold = {{ kin_filtering.value[0] }}
{%- endif %}

# minimum number of libraries a transcription factor / kinase must appear in to be selected

threshold = 3

```python
# this cell extracts the gene list, whether it was pasted in, or uploaded as a file # for if the gene list was pasted in
geneset = '''KIAA0907
KDM5A
CDC25A
EGR1
GADD45B
RELB
TERF2IP
SMNDC1
TICAM1
NFKB2
RGS2
NCOA3
TEX10
ARID4B
CHIC2
FBXO11
MTF2
CDK2
DNTTIP2
GADD45A
GOLT1B
POLR2K
NFKBIE
GABPB1
ECD
PHKG2
RAD9A
NET1
KIAA0753
EZH2
NRAS
ATP6V0B
CDK7
CCNH
SENP6
TIPARP
FOS
ARPP19
TFAP2A
KDM5B
NPC1
TP53BP2
NUSAP1
SCCPDH
KIF20A
FZD7
USP22
PIP4K2B
CRYZ
GNB5
EIF4EBP1
PHGDH
RRAGA
SLC25A46
RPA1
HADH
DAG1
RPIA
P4HA2
MACF1
TMEM97
MPZL1
PSMG1
PLK1
SLC37A4
GLRX
CBR3
PRSS23
NUDCD3
CDC20
KIAA0528
NIPSNAP1
TRAM2
STUB1
DERA
MTHFD2
BLVRA
IARS2
LIPA
PGM1
CNDP2
BNIP3
CTSL1
CDC25B
HSPA8
EPRS
PAX8
SACM1L
HOXA5
TLE1
PYGL
TUBB6
LOXL1'''
num_tfs = 10
num_kins = 10
# also extracts the integer inputted for number of transcription factors to be returned
threshold = 3
```

In [450]:
%%appyter code_exec 

# This cell parses the gene list input into an organized python list 
# (when we extracted the gene list, it was still in a text file format, not a python list)

{%- if gene_input.raw_value == 'Paste' %} 
genes = geneset.split('\n')
genes = [x.strip() for x in genes]
{%- else %}
open_gene_input = open(geneset_filename, 'r')
lines = open_gene_input.readlines()
genes = [x.strip() for x in lines]
open_gene_input.close()
{%- endif %}

```python
# This cell parses the gene list input into an organized python list
# (when we extracted the gene list, it was still in a text file format, not a python list)
genes = geneset.split('\n')
genes = [x.strip() for x in genes]
```

In [451]:
###### getting results from the ChEA3 API ######
chea3results = get_chea3_results(genes, 'query')

# so to clarify, "genes" are the user inputted genes (from the input page)
# "chea3results" is the parsed ChEA3 response for those genes; the cells below read its
# 'Integrated--meanRank' entry, a list with one record per transcription factor

In [452]:
###### Read in the PPI Database being used ######

ppi_url = 'https://appyters.maayanlab.cloud/storage/X2K_Appyter/BioGrid-6-23-21.csv'
ppi = pd.read_csv(ppi_url)
# the BioGrid table is downloaded from the URL above, so no local copy of the file is needed;
# later cells read its Gene_A and Gene_B columns

In [453]:
###### Database Processing ######

# This is only needed if the incoming database has no direction (TF could be in either column
# 1 or column 2, same for their interactors). Normally, I assume column 1 is ALWAYS the TF
# and column 2 is ALWAYS the interactor of the TF, but I'm not sure if I can assume that. 
# The code below sorts a "directionless" dataframe so that TFs are ALWAYs in column 1 and 
# their interactors are always in column 2

#print(ppi.shape)
#ppi_addition = pd.DataFrame({'Gene_A':ppi.Gene_B, 
                             #'Gene_B':ppi.Gene_A})

#updated_ppi = pd.concat([ppi,ppi_addition], ignore_index=True)

#print(updated_ppi.shape)
#print(updated_ppi.drop_duplicates().shape)

#updated_ppi.shape[0] - updated_ppi.drop_duplicates().shape[0]
# shows that there were 3213 unique interactions with "direction"

#updated_ppi['check'] = updated_ppi.apply(lambda row: ''.join(sorted([row['Gene_A'], 
                                                                     #row['Gene_B']])), 
                                 #axis = 1)

#pupdated_ppi = updated_ppi.drop_duplicates('check').loc[:,['Gene_A', 'Gene_B']]

#print(updated_ppi.shape)
#print(pupdated_ppi.shape)

In [454]:
# One record per transcription factor, taken from the integrated mean-rank results
mr_chea3results = chea3results['Integrated--meanRank']

chea3libs_sorted = ['ARCHS4 Coexpression','ENCODE ChIP-seq','Enrichr Queries',
                    'GTEx Coexpression','Literature ChIP-seq','ReMap ChIP-seq']

# Give every record a 0 placeholder for each library ...
for tf_record in mr_chea3results:
    tf_record.update(dict.fromkeys(chea3libs_sorted, 0))

# ... then overwrite the placeholder with the reported rank wherever the record's
# 'Library' string ("name,rank;name,rank;...") mentions that library
for tf_record in mr_chea3results:
    for lib_rank_pair in tf_record['Library'].split(';'):
        lib_name, lib_rank = lib_rank_pair.split(',')
        tf_record[lib_name] = int(lib_rank)

In [455]:
# The TF records sorted once per library, ascending by that library's value
(sortedARCHS4, sortedGTEx, sortedEnrichr,
 sortedENCODE, sortedReMap, sortedLit) = [
    sorted(mr_chea3results, key = lambda record, lib_name = lib_name: record[lib_name])
    for lib_name in ('ARCHS4 Coexpression', 'GTEx Coexpression', 'Enrichr Queries',
                     'ENCODE ChIP-seq', 'ReMap ChIP-seq', 'Literature ChIP-seq')
]

# library name -> that library's values in ascending order
chea3ranking_dict = {
    lib_name: [record[lib_name] for record in sorted_records]
    for lib_name, sorted_records in (
        ('ARCHS4 Coexpression', sortedARCHS4),
        ('ENCODE ChIP-seq', sortedENCODE),
        ('Enrichr Queries', sortedEnrichr),
        ('GTEx Coexpression', sortedGTEx),
        ('Literature ChIP-seq', sortedLit),
        ('ReMap ChIP-seq', sortedReMap),
    )
}

# Individual handles on the same lists that are stored in chea3ranking_dict
rankedARCHS4 = chea3ranking_dict['ARCHS4 Coexpression']
rankedENCODE = chea3ranking_dict['ENCODE ChIP-seq']
rankedEnrichr = chea3ranking_dict['Enrichr Queries']
rankedGTEx = chea3ranking_dict['GTEx Coexpression']
rankedLit = chea3ranking_dict['Literature ChIP-seq']
rankedReMap = chea3ranking_dict['ReMap ChIP-seq']

In [456]:
# For every TF: look up its position within each library it appears in (indexfinder skips
# the 0 placeholders), store the sum as 'SumRank' and the mean as 'AvgRank'
for tf_record in mr_chea3results:
    tf_record['SumRank'] = 0
    tf_record['AvgRank'] = 0
    lib_rank_pairs = [pair.split(',') for pair in tf_record['Library'].split(';')]
    for lib_name, raw_rank in lib_rank_pairs:
        position = indexfinder(chea3ranking_dict[lib_name], int(raw_rank))
        tf_record['SumRank'] += int(position)
    tf_record['AvgRank'] = tf_record['SumRank'] / len(lib_rank_pairs)

# Best (lowest) average rank first
sorted_chea3results = sorted(mr_chea3results, key = lambda record: record['AvgRank'])

In [457]:
# Only TFs reported by at least `threshold` libraries are eligible; the AvgRank order of
# sorted_chea3results is preserved
eligible_tfs = [entry for entry in sorted_chea3results
                if len(entry['Library'].split(';')) >= threshold]

# Which of the two filtering variables exists tells us which method was chosen in the form
try:
    tf_score_threshold
except NameError:
    # Rank-threshold mode: take the first num_tfs eligible TFs. Slicing simply returns
    # fewer entries when not enough TFs qualify (the previous while loop ran past the
    # end of the list and raised IndexError in that case).
    toptfs = eligible_tfs[:num_tfs]
else:
    # Score-threshold mode: keep every eligible TF whose score is at or below the threshold
    toptfs = [entry for entry in eligible_tfs
              if float(entry['Score']) <= tf_score_threshold]

toptfnames = [entry['TF'] for entry in toptfs]
# makes a list of just the names of the top TFs
# makes a list of just the names of the top TFs

In [458]:
###### Setting up the PPI Dictionary ######

# ppi_dict is a dictionary where the key is a tf (a Gene_A value), and the value is a list of
# all the possible interactors of this tf (the Gene_B values found on the same rows)

ppi_dict = {}

# A single pass over the two columns. Iterating the values directly avoids one pandas
# label lookup per row and no longer requires the frame to have a 0..n-1 index.
for gene_a, gene_b in zip(ppi['Gene_A'], ppi['Gene_B']):
    ppi_dict.setdefault(gene_a, []).append(gene_b)

### Transcription Factors and Their Interactors

In [459]:
###### Makes a dictionary for the top tfs only ######

# ppi_dict covers the entire ppi database
# top_dict keeps only the tfs in toptfnames; a tf without any entry in ppi_dict is mapped
# to the placeholder string 'No known interactors'

top_dict = {
    name: ppi_dict.get(name) or 'No known interactors'
    for name in toptfnames
}

In [460]:
###### Makes a list, top_tf_interactors ######

# tf_interactors = list containing all the interactors per tf in toptfnames
# top_tf_interactors = list containing all the interactors that are shared by at least 2 top tfs
#                      (each listed once, in order of first appearance in tf_interactors)

tf_interactors = [interactor
                  for interactors in top_dict.values() if interactors != 'No known interactors'
                  for interactor in interactors]

# Number of distinct top tfs that list each interactor. set() makes sure an interactor that is
# repeated within one tf's list still counts once for that tf, and the placeholder string
# is skipped rather than iterated character by character.
tf_count_by_interactor = {}
for interactors in top_dict.values():
    if interactors == 'No known interactors':
        continue
    for interactor in set(interactors):
        tf_count_by_interactor[interactor] = tf_count_by_interactor.get(interactor, 0) + 1

# dict.fromkeys removes duplicates while keeping first-appearance order
top_tf_interactors = list(dict.fromkeys(
    interactor for interactor in tf_interactors
    if tf_count_by_interactor[interactor] >= 2
))

In [461]:
###### Now putting the top_tf_interactors through KEA3 #######

# kea3results is the parsed KEA3 response for the shared interactors; the cells below read
# its 'Integrated--meanRank' entry
kea3results = get_kea3_results(top_tf_interactors, 'query')

In [462]:
###### Getting Ready To Investigate the Top Ranked Kinases ######

# One record per kinase, taken from the integrated mean-rank results
mr_kea3results = kea3results['Integrated--meanRank']

kea3libs_sorted = ['BioGRID', 'ChengKSIN', 'ChengPPI', 'HIPPIE', 'mentha', 
                   'MINT', 'PhosDAll', 'prePPI', 'PTMsigDB', 'STRING', 'STRING.bind']

# Give every record a 0 placeholder for each library ...
for kinase_record in mr_kea3results:
    kinase_record.update(dict.fromkeys(kea3libs_sorted, 0))

# ... then overwrite the placeholder with the reported rank wherever the record's
# 'Library' string ("name,rank;name,rank;...") mentions that library
for kinase_record in mr_kea3results:
    for lib_rank_pair in kinase_record['Library'].split(';'):
        lib_name, lib_rank = lib_rank_pair.split(',')
        kinase_record[lib_name] = int(lib_rank)

In [463]:
###### Sorting Kinases According to Rank in Each Library #######

# Library order used for both the per-library sorts and the keys of kea3ranking_dict
kea3_rank_order = ('PTMsigDB', 'prePPI', 'mentha', 'MINT', 'STRING', 'HIPPIE',
                   'ChengKSIN', 'BioGRID', 'PhosDAll', 'STRING.bind', 'ChengPPI')

# The kinase records sorted once per library, ascending by that library's value
kea3_sorted_views = [
    sorted(mr_kea3results, key = lambda record, lib_name = lib_name: record[lib_name])
    for lib_name in kea3_rank_order
]
(sortedPTMsigDB, sortedprePPI, sortedmentha, sortedMINT, sortedSTRING, sortedHIPPIE,
 sortedChengKSIN, sortedBioGRID, sortedPhosDAll, sortedSTRINGbind,
 sortedChengPPI) = kea3_sorted_views

# Composes a dictionary where each key is a library name and each value is the list of
# that library's values in ascending order
kea3ranking_dict = {
    lib_name: [record[lib_name] for record in sorted_records]
    for lib_name, sorted_records in zip(kea3_rank_order, kea3_sorted_views)
}

# Individual handles on the same lists that are stored in kea3ranking_dict
rankedPTMsigDB = kea3ranking_dict['PTMsigDB']
rankedprePPI = kea3ranking_dict['prePPI']
rankedmentha = kea3ranking_dict['mentha']
rankedMINT = kea3ranking_dict['MINT']
rankedSTRING = kea3ranking_dict['STRING']
rankedHIPPIE = kea3ranking_dict['HIPPIE']
rankedChengKSIN = kea3ranking_dict['ChengKSIN']
rankedBioGRID = kea3ranking_dict['BioGRID']
rankedPhosDAll = kea3ranking_dict['PhosDAll']
rankedSTRINGbind = kea3ranking_dict['STRING.bind']
rankedChengPPI = kea3ranking_dict['ChengPPI']

In [464]:
###### Sorting the kinases by AvgRank #######

# For every kinase: look up its position within each library it appears in (indexfinder
# skips the 0 placeholders), store the sum as 'SumRank' and the mean as 'AvgRank'
for kinase_record in mr_kea3results:
    kinase_record['SumRank'] = 0
    kinase_record['AvgRank'] = 0
    lib_rank_pairs = [pair.split(',') for pair in kinase_record['Library'].split(';')]
    for lib_name, raw_rank in lib_rank_pairs:
        position = indexfinder(kea3ranking_dict[lib_name], int(raw_rank))
        kinase_record['SumRank'] += int(position)
    kinase_record['AvgRank'] = kinase_record['SumRank'] / len(lib_rank_pairs)

# Best (lowest) average rank first
sorted_kea3results = sorted(mr_kea3results, key = lambda record: record['AvgRank'])

In [465]:
###### Getting the List of Sorted Kinases ######

# Only kinases reported by at least `threshold` libraries are eligible; the AvgRank order of
# sorted_kea3results is preserved
eligible_kinases = [entry for entry in sorted_kea3results
                    if len(entry['Library'].split(';')) >= threshold]

# Which of the two filtering variables exists tells us which method was chosen in the form
try:
    kin_score_threshold
except NameError:
    # Rank-threshold mode: take the first num_kins eligible kinases. Slicing simply returns
    # fewer entries when not enough kinases qualify (the previous while loop ran past the
    # end of the list and raised IndexError in that case).
    sorted_topkinresults = eligible_kinases[:num_kins]
else:
    # Score-threshold mode. The library check, the score check and the selected record now
    # all refer to the same entry of the AvgRank-sorted list; previously the library check
    # used sorted_kea3results[i] while the score and the result came from the differently
    # ordered mr_kea3results[i].
    sorted_topkinresults = [entry for entry in eligible_kinases
                            if float(entry['Score']) <= kin_score_threshold]

In [466]:
# Names of the selected kinases (each KEA3 record is read through its 'TF' key here)
topkinnames = [i['TF'] for i in sorted_topkinresults]

# Network Summary

## Transcription Factors and Their Interactors

### Highest Ranked Transcription Factors

In [467]:
# Show the selected transcription factors as one comma-separated line
display(HTML(', '.join(toptfnames)))

In [468]:
# Bar colour for each ChEA3 library
c_lib_palette = {
    'ARCHS4 Coexpression': 'rgb(196, 8, 8)',
    'ENCODE ChIP-seq': 'rgb(244, 109, 67)',
    'Enrichr Queries': 'rgb(242, 172, 68)',
    'GTEx Coexpression': 'rgb(236, 252, 68)',
    'Literature ChIP-seq': 'rgb(165, 242, 162)',
    'ReMap ChIP-seq': 'rgb(92, 217, 78)',
}

tf_amt = len(toptfs)

# library name -> one bar segment length per selected TF (0 where the TF is not in the library)
c_lib_means = {
    lib_name: [0] * tf_amt
    for lib_name in ('ARCHS4 Coexpression', 'ENCODE ChIP-seq', 'Enrichr Queries',
                     'GTEx Coexpression', 'Literature ChIP-seq', 'ReMap ChIP-seq')
}

# From here on toptfs is in reversed order (worst of the selection first); the table cell
# further down reverses it again
toptfs = list(reversed(toptfs))

# TF names in the same (reversed) order as toptfs
sorted_tfs = [tf_record.get('TF') for tf_record in toptfs]

# Each library's segment is its rank scaled by AvgRank / SumRank, so that the stacked
# segments of one TF add up to that TF's AvgRank
for position, tf_record in enumerate(toptfs):
    avg = tf_record['AvgRank']
    tot = tf_record['SumRank']
    for lib_rank_pair in tf_record['Library'].split(';'):
        lib_name, raw_rank = lib_rank_pair.split(',')
        rank = indexfinder(chea3ranking_dict[lib_name], int(raw_rank))
        c_lib_means[lib_name][position] = float((rank*avg)/tot)

In [469]:
# Plotting the actual bar chart: one horizontal bar trace per library, stacked per TF
chea3_traces = [
    go.Bar(
        name = c_lib,
        x = c_lib_means[c_lib],
        y = sorted_tfs,
        marker = go.bar.Marker(color = c_lib_palette[c_lib]),
        orientation = 'h',
    )
    for c_lib in chea3libs_sorted
]
chea3_fig = go.Figure(data = chea3_traces)

chea3_fig.update_layout(
    barmode = 'stack',
    title = dict(
        text = 'Stacked Bar Chart of Average Ranks in Different Libraries',
        y = 0.87,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top',
    ),
    xaxis_title = 'Average of Ranks Across All Libraries',
    yaxis_title = 'Transcription Factors',
    font = {'size': 16, 'color': 'black'},
)

chea3_fig.show()

In [470]:
# toptfs was reversed for plotting; reverse it again so the table runs from rank 1 downwards
tbl_toptfs = toptfs[::-1]

# Column headers shared by the on-screen table and the .tsv download
chea3_table_headers = ['Rank',
                       'Transcription Factor',
                       'Avg Rank',
                       'Library Ranks',
                       'Overlapping Genes']

chea3_mr_table = []
for position, tf_record in enumerate(tbl_toptfs, start = 1):
    tf_record['Rank'] = position
    overlapping_genes = tf_record['Overlapping_Genes'].split(',')
    # show at most 10 overlapping genes; the trailing '...' is only added when the list
    # really was cut short
    shown_genes = ', '.join(overlapping_genes[0:10])
    if len(overlapping_genes) > 10:
        shown_genes += ', ...'
    chea3_mr_table.append([tf_record['Rank'],
                           tf_record['TF'],
                           tf_record['AvgRank'],
                           # "lib,rank;lib,rank" -> "lib: rank, lib: rank"
                           tf_record['Library'].replace(',', ': ').replace(';', ', '),
                           shown_genes])

# now actually making/printing the table
display(HTML(tabulate(chea3_mr_table, chea3_table_headers, tablefmt='html')))


chea3_tsv_name = 'transcriptionfactors.tsv'
with open(chea3_tsv_name, 'w') as tsv_file:
    tsv_file.write(tabulate(chea3_mr_table, chea3_table_headers, tablefmt='tsv'))
# the link is shown only after the file has been closed, i.e. fully written
display(HTML(f'<a href="{chea3_tsv_name}">Download table in .tsv</a>'))

Rank,Transcription Factor,Avg Rank,Library Ranks,Overlapping Genes
1,RLF,22.3333,"ARCHS4 Coexpression: 15, Enrichr Queries: 22, GTEx Coexpression: 30","KDM5A, KDM5B, MACF1, EGR1, CHIC2, GADD45B, GADD45A, TIPARP, NCOA3, BNIP3, ..."
2,FOXM1,29.5,"Literature ChIP-seq: 6, ARCHS4 Coexpression: 63, ENCODE ChIP-seq: 51, Enrichr Queries: 33, ReMap ChIP-seq: 7, GTEx Coexpression: 17","KDM5A, ARID4B, TMEM97, CDC20, TUBB6, NRAS, NUSAP1, DAG1, PHGDH, PIP4K2B, ..."
3,E2F7,29.8,"Literature ChIP-seq: 53, ARCHS4 Coexpression: 18, Enrichr Queries: 55, ReMap ChIP-seq: 11, GTEx Coexpression: 12","EGR1, HSPA8, GOLT1B, GADD45A, USP22, NCOA3, PLK1, FOS, TICAM1, TRAM2, ..."
4,E2F1,39.8333,"Literature ChIP-seq: 7, ARCHS4 Coexpression: 80, ENCODE ChIP-seq: 42, Enrichr Queries: 51, ReMap ChIP-seq: 46, GTEx Coexpression: 13","KDM5B, CCNH, TMEM97, PRSS23, CDC20, SENP6, TUBB6, NUDCD3, NUSAP1, PHGDH, ..."
5,ZNF367,61.6667,"ARCHS4 Coexpression: 99, Enrichr Queries: 72, GTEx Coexpression: 14","EGR1, NCOA3, PLK1, RPA1, TMEM97, FOS, CDC25A, CDC20, NRAS, MTHFD2, ..."
6,BATF3,63.6667,"ARCHS4 Coexpression: 26, Enrichr Queries: 141, GTEx Coexpression: 24","EGR1, GADD45B, GADD45A, NCOA3, FOS, GABPB1, TICAM1, RELB, NFKB2, RGS2, ..."
7,MYBL2,67.0,"Literature ChIP-seq: 15, ARCHS4 Coexpression: 162, ENCODE ChIP-seq: 5, Enrichr Queries: 32, ReMap ChIP-seq: 165, GTEx Coexpression: 23","KDM5A, KDM5B, GOLT1B, GLRX, TMEM97, EPRS, TRAM2, ARPP19, CDC20, NUDCD3, ..."
8,GTF2B,70.0,"ARCHS4 Coexpression: 112, ENCODE ChIP-seq: 44, ReMap ChIP-seq: 19, GTEx Coexpression: 105","KDM5B, CHIC2, PYGL, TMEM97, GABPB1, ARPP19, CDC20, SENP6, TUBB6, NUSAP1, ..."
9,ATF3,72.5,"Literature ChIP-seq: 112, ARCHS4 Coexpression: 1, ENCODE ChIP-seq: 27, Enrichr Queries: 87, ReMap ChIP-seq: 122, GTEx Coexpression: 86","CCNH, GLRX, EPRS, LIPA, RELB, RGS2, TUBB6, NUDCD3, NRAS, PHKG2, ..."
10,HMGA2,74.6667,"ARCHS4 Coexpression: 115, Enrichr Queries: 29, GTEx Coexpression: 80","EGR1, GADD45B, GADD45A, NCOA3, FZD7, PLK1, FOS, EPRS, TRAM2, PRSS23, ..."


### Transcription Factor Interactors

"Transcription factor interactors" are any molecules that interact with the given transcription factor according to the protein-protein interaction database.

In [471]:
# For each selected TF, list its interactors from ppi_dict (or say that there are none),
# followed by a horizontal rule
for name in toptfnames:
    interactors = ppi_dict.get(name)
    if not interactors:
        display(HTML(f"{name} has no known interactors."))
    else:
        display(HTML(f"{name} interacts with {len(interactors)} gene(s)/protein(s):"))
        display(HTML(', '.join(interactors)))
    display(HTML("<hr>"))
    print('')































## Relevant Transcription Factor Interactors

"Relevant transcription factor interactors" are transcription factor interactors that are shared among at least 2 of the top transcription factors themselves.

In [472]:
# Report how many interactors are shared by the top TFs, then list them on one line
display(HTML(f"There are {len(top_tf_interactors)} relevant transcription factor interactors:"))
display(HTML(', '.join(top_tf_interactors)))

In [473]:
# One numbered row per relevant TF interactor
rtf_fortable = [
    [row_number, interactor]
    for row_number, interactor in enumerate(top_tf_interactors, start = 1)
]

rtf_headers = ['', 'TF Interactors']
display(HTML(tabulate(rtf_fortable, rtf_headers, tablefmt='html')))

# Save the same table as a .tsv file and show a download link for it
rtf_tsv_name = 'tf_interactors.tsv'
with open(rtf_tsv_name, 'w') as tsv_file:
    tsv_file.write(tabulate(rtf_fortable, rtf_headers, tablefmt='tsv'))
    display(HTML(f'<a href="{rtf_tsv_name}">Download table in .tsv</a>'))

Unnamed: 0,TF Interactors
1,SP1
2,MYBL2
3,SMARCA5
4,GSK3B
5,EZH2
6,CCNF
7,CREBBP
8,HDAC1
9,RBBP4
10,FHL2


## Kinases

In [474]:
# Show the selected kinases as one comma-separated line
display(HTML(', '.join(topkinnames)))

In [475]:
###### Preparing to Plot the Kinases Bar Chart ######

# Bar colour for each KEA3 library shown in the chart
k_lib_palette = {
    'BioGRID': 'rgb(196, 8, 8)',
    'ChengKSIN': 'rgb(244, 109, 67)',
    'ChengPPI': 'rgb(242, 172, 68)',
    'HIPPIE': 'rgb(236, 252, 68)',
    'mentha': 'rgb(165, 242, 162)',
    'MINT': 'rgb(92, 217, 78)',
    'PhosDAll': 'rgb(0, 138, 64)',
    'prePPI': 'rgb(96, 191, 235)',
    'PTMsigDB': 'rgb(14, 130, 201)',
    'STRING': 'rgb(58, 50, 168)',
    'STRING.bind': 'rgb(158, 50, 168)',
}

kin_amt = len(sorted_topkinresults)

# library name -> one bar segment length per selected kinase (0 where the kinase is not
# in the library)
k_lib_means = {
    lib_name: [0] * kin_amt
    for lib_name in ('STRING.bind', 'ChengPPI', 'PhosDAll', 'BioGRID', 'HIPPIE',
                     'ChengKSIN', 'STRING', 'MINT', 'mentha', 'prePPI', 'PTMsigDB')
}

# Reverse the order for plotting; the table cell further down reverses it again
sorted_topkinresults = list(reversed(sorted_topkinresults))

# Kinase names (stored under the 'TF' key) in the same reversed order
sorted_kins = [kinase_record.get('TF') for kinase_record in sorted_topkinresults]

# Each library's segment is its rank scaled by AvgRank / SumRank, so that the stacked
# segments of one kinase add up to that kinase's AvgRank
for position, kinase_record in enumerate(sorted_topkinresults):
    avg = kinase_record['AvgRank']
    tot = kinase_record['SumRank']
    for lib_rank_pair in kinase_record['Library'].split(';'):
        lib_name, raw_rank = lib_rank_pair.split(',')
        rank = indexfinder(kea3ranking_dict[lib_name], int(raw_rank))
        k_lib_means[lib_name][position] = float((rank*avg)/tot)

In [476]:
###### Plotting the mean rank data ######

# One horizontal bar trace per library, stacked per kinase
kea3_traces = [
    go.Bar(
        name = k_lib,
        x = k_lib_means[k_lib],
        y = sorted_kins,
        marker = go.bar.Marker(color = k_lib_palette[k_lib]),
        orientation = 'h',
    )
    for k_lib in kea3libs_sorted
]
kea3_fig = go.Figure(data = kea3_traces)

kea3_fig.update_layout(
    barmode = 'stack',
    title = dict(
        text = 'Stacked Bar Chart of Average Ranks in Different Libraries',
        y = 0.87,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top',
    ),
    xaxis_title = 'Average of Ranks Across All Libraries',
    yaxis_title = 'Kinases',
    font = {'size': 16, 'color': 'black'},
)

kea3_fig.show()

In [477]:
# need to "re-reverse" the ordering for the table (so now we're back to normal ordering)
tbl_sorted_topkinresults = sorted_topkinresults[::-1]

# Column headers shared by the on-screen table and the .tsv download
kea3_table_headers = ['Rank',
                      'Kinase',
                      'Avg Rank',
                      'Library Ranks',
                      'Overlapping Genes']

kea3_mr_table = []
for position, kinase_record in enumerate(tbl_sorted_topkinresults, start = 1):
    kinase_record['Rank'] = position
    overlapping_genes = kinase_record['Overlapping_Genes'].split(',')
    # show at most 10 overlapping genes; the trailing '...' is only added when the list
    # really was cut short
    shown_genes = ', '.join(overlapping_genes[0:10])
    if len(overlapping_genes) > 10:
        shown_genes += ', ...'
    kea3_mr_table.append([kinase_record['Rank'],
                          kinase_record['TF'],
                          kinase_record['AvgRank'],
                          # "lib,rank;lib,rank" -> "lib: rank, lib: rank"
                          kinase_record['Library'].replace(',', ': ').replace(';', ', '),
                          shown_genes])

display(HTML(tabulate(kea3_mr_table, kea3_table_headers, tablefmt='html')))


kea3_tsv_name = 'kinases.tsv'
with open(kea3_tsv_name, 'w') as tsv_file:
    tsv_file.write(tabulate(kea3_mr_table, kea3_table_headers, tablefmt='tsv'))
# the link is shown only after the file has been closed, i.e. fully written
display(HTML(f'<a href="{kea3_tsv_name}">Download table in .tsv</a>'))

Rank,Kinase,Avg Rank,Library Ranks,Overlapping Genes
1,MAPK9,4.2,"ChengPPI: 1, STRING.bind: 2, PhosDAll: 4, BioGRID: 2, ChengKSIN: 6, HIPPIE: 3, STRING: 4, mentha: 6, prePPI: 3, PTMsigDB: 11","ATF2, GSK3B, CEBPA, HDAC1, CEBPE, CCNF, CEBPG, FHL2, RELA, MTA1, ..."
2,MAPK8,11.6364,"ChengPPI: 2, STRING.bind: 1, PhosDAll: 36, BioGRID: 1, ChengKSIN: 1, HIPPIE: 2, STRING: 45, MINT: 28, mentha: 1, prePPI: 9, PTMsigDB: 2","ATF2, GSK3B, CEBPA, HDAC1, CEBPE, CEBPG, FHL2, RELA, MTA1, RBBP4, ..."
3,PRKDC,21.9091,"ChengPPI: 3, STRING.bind: 14, PhosDAll: 32, BioGRID: 60, ChengKSIN: 22, HIPPIE: 10, STRING: 52, MINT: 5, mentha: 5, prePPI: 16, PTMsigDB: 22","GSK3B, CEBPA, ATF2, HDAC1, CEBPE, CEBPG, CCNF, SRA1, RELA, RBBP4, ..."
4,MAPK1,22.0,"ChengPPI: 19, STRING.bind: 18, PhosDAll: 31, BioGRID: 3, ChengKSIN: 2, HIPPIE: 7, STRING: 84, MINT: 34, mentha: 3, prePPI: 35, PTMsigDB: 6","GSK3B, ATF2, CEBPA, HDAC1, CEBPE, CCNF, CEBPG, FHL2, RELA, MTA1, ..."
5,CHUK,24.4545,"ChengPPI: 8, STRING.bind: 11, PhosDAll: 41, BioGRID: 13, ChengKSIN: 7, HIPPIE: 24, STRING: 40, MINT: 12, mentha: 18, prePPI: 86, PTMsigDB: 9","GSK3B, CEBPA, ATF2, CREBBP, JUN, TBP, HDAC1, CEBPG, FHL2, SMARCA5, ..."
6,MAPK10,25.0,"ChengPPI: 5, STRING.bind: 4, PhosDAll: 15, BioGRID: 5, ChengKSIN: 19, HIPPIE: 1, STRING: 61, mentha: 2, prePPI: 10, PTMsigDB: 128","GSK3B, ATF2, CEBPA, CREBBP, JUN, JUND, TBP, HDAC1, CEBPG, FHL2, ..."
7,MAPK14,25.2727,"ChengPPI: 4, STRING.bind: 61, PhosDAll: 28, BioGRID: 25, ChengKSIN: 10, HIPPIE: 4, STRING: 81, MINT: 26, mentha: 9, prePPI: 25, PTMsigDB: 5","GSK3B, ATF2, CEBPA, HDAC1, CEBPE, CCNF, CEBPG, FHL2, RELA, MTA1, ..."
8,VRK1,28.6,"ChengPPI: 13, STRING.bind: 12, PhosDAll: 9, BioGRID: 10, ChengKSIN: 5, HIPPIE: 25, STRING: 136, mentha: 16, prePPI: 57, PTMsigDB: 3","ATF2, GSK3B, CEBPA, JUN, TBP, HDAC1, CEBPE, CCNF, CEBPG, SMARCA5, ..."
9,CDK1,29.0,"ChengPPI: 51, STRING.bind: 47, PhosDAll: 37, BioGRID: 16, HIPPIE: 11, STRING: 66, MINT: 3, mentha: 7, prePPI: 32, PTMsigDB: 20","GSK3B, CEBPA, ATF2, HDAC1, CEBPE, CCNF, CEBPG, FHL2, RELA, MTA1, ..."
10,ATR,30.9,"ChengPPI: 18, STRING.bind: 72, PhosDAll: 7, BioGRID: 20, ChengKSIN: 32, HIPPIE: 39, STRING: 46, mentha: 27, prePPI: 22, PTMsigDB: 26","GSK3B, ATF2, CEBPA, HDAC1, CEBPE, CEBPG, CCNF, RELA, MTA1, RBBP4, ..."


In [478]:
# Input structure for the X2K network visualizer: a flat node list plus an
# (initially empty) list of interactions between node positions.
x2k_visualizer_input = {'nodes': [], 'interactions': []}
x2k_nodes = x2k_visualizer_input['nodes']

# Nodes are added in three consecutive groups -- transcription factors, then
# kinases, then intermediate proteins. The index dictionaries built afterwards
# read node positions in exactly this order. Every node gets pvalue -1.
x2k_nodes.extend(
    {'name': tf_result['TF'], 'type': 'tf', 'pvalue': -1}
    for tf_result in toptfs
)
x2k_nodes.extend(
    {'name': kin_result['TF'], 'type': 'kinase', 'pvalue': -1}
    for kin_result in sorted_topkinresults
)
x2k_nodes.extend(
    {'name': interactor_name, 'type': 'other', 'pvalue': -1}
    for interactor_name in top_tf_interactors
)

In [479]:
###### Creating 3 Dictionaries For Indexing, One for Each Protein Category ######

# Each dictionary maps a node name to its position in
# x2k_visualizer_input['nodes']. Positions are read in three consecutive runs:
# TFs first, kinases next, intermediate proteins last.
x2k_node_list = x2k_visualizer_input['nodes']

tf_stop = len(toptfnames)
x2k_visualizer_tfindices = {
    x2k_node_list[pos].get('name'): pos
    for pos in range(0, tf_stop)
}

kin_stop = tf_stop + len(sorted_topkinresults)
x2k_visualizer_kinindices = {
    x2k_node_list[pos].get('name'): pos
    for pos in range(tf_stop, kin_stop)
}

# index_counter marks the position of the first intermediate-protein node
index_counter = kin_stop
x2k_visualizer_intindices = {
    x2k_node_list[pos].get('name'): pos
    for pos in range(index_counter, index_counter + len(top_tf_interactors))
}

In [480]:
###### Connecting TFs and Interactors ######

# For every top TF, add one edge (by node position) to each of its interactors
# that is also present in top_tf_interactors. Duplicate edges are skipped.
for tf in toptfnames:
    tf_interactors = top_dict.get(tf)

    # top_dict appears to hold the string 'No known interactors' in place of an
    # interactor collection for some TFs; check it once, before iterating, so the
    # sentinel string is never walked character by character. A TF missing from
    # top_dict is treated the same way instead of raising a TypeError.
    if tf_interactors is None or tf_interactors == 'No known interactors':
        continue

    for interactor in tf_interactors:
        # only interactors that were added as visualizer nodes can be linked
        if interactor not in top_tf_interactors:
            continue

        tfint_interaction = {'source': x2k_visualizer_tfindices.get(tf),
                             'target': x2k_visualizer_intindices.get(interactor)}
        if tfint_interaction not in x2k_visualizer_input['interactions']:
            x2k_visualizer_input['interactions'].append(tfint_interaction)

In [481]:
# Raw comma-separated overlap strings, one per kinase, in sorted_topkinresults order
overlapping_genelist1 = [kin_result['Overlapping_Genes'] for kin_result in sorted_topkinresults]

# overlapping_genelist2 is a list of lists: position n holds the overlapping
# genes of the kinase at position n of sorted_topkinresults
overlapping_genelist2 = [overlap_string.split(',') for overlap_string in overlapping_genelist1]

# Copies of the kinase entries whose 'Overlapping_Genes' is the split list
# rather than the raw string; the originals are not modified
new_sorted_topkinresults = []
for kin_result, gene_names in zip(sorted_topkinresults, overlapping_genelist2):
    kin_result_copy = kin_result.copy()
    kin_result_copy['Overlapping_Genes'] = gene_names
    new_sorted_topkinresults.append(kin_result_copy)

# Maps each overlapping gene to the kinases (by their 'TF' name) whose overlap
# list contains it. Genes appear in first-seen order, kinases in table order.
int_kin_interactions = {}
for kin_result_copy in new_sorted_topkinresults:
    # dict.fromkeys drops repeats within one kinase's list while keeping order,
    # so a kinase is recorded at most once per gene for each entry
    for gene_name in dict.fromkeys(kin_result_copy['Overlapping_Genes']):
        int_kin_interactions.setdefault(gene_name, []).append(kin_result_copy['TF'])

In [482]:
# Add one edge (by node position) from each intermediate protein to every
# kinase whose overlapping genes include it. Duplicate edges are skipped.
for gene_name, kinase_names in int_kin_interactions.items():
    source_index = x2k_visualizer_intindices.get(gene_name)
    # A gene that is not one of the intermediate-protein nodes has no position;
    # skip it rather than emitting an edge with a None endpoint (the
    # TF-to-interactor edges apply the same membership filter).
    if source_index is None:
        continue

    for kinase_name in kinase_names:
        target_index = x2k_visualizer_kinindices.get(kinase_name)
        if target_index is None:
            continue

        intkin_interaction = {'source': source_index,
                              'target': target_index}
        if intkin_interaction not in x2k_visualizer_input['interactions']:
            x2k_visualizer_input['interactions'].append(intkin_interaction)

In [483]:
# Payload handed to the network embed: the graph built above goes under 'X2K';
# the remaining keys are supplied as empty containers.
full_x2k_visualizer_input = {
    'X2K': x2k_visualizer_input,
    'ChEA': [],
    'KEA': [],
    'G2N': {},
    'input': [],
}

In [484]:
# CSS override that fixes the height of the iframe following the Observable link
x2k_iframe_css = "<style>.observable-link ~ iframe { height: 1500px !important; }</style>"
display(HTML(x2k_iframe_css))

# Embed the 'x2k' cell of the Observable notebook, feeding it the assembled payload
embed(
    '@maxim-k/x2k-network',
    cells = ['x2k'],
    inputs = {'json': full_x2k_visualizer_input},
)