In [None]:
#%%appyter init
# Initialise appyter for this notebook. The lambda's default argument captures the
# built-in globals() function and calls it, handing appyter this notebook's namespace.
# Presumably this is what enables the %%appyter cell magics used in the cells below.
from appyter import magic
magic.init(lambda _=globals: _())

# Enkefalos

This appyter correlates your omics results with neural electrophysiological and morphological measures. It takes your genes of interest (GOI) and an FDR threshold as input, conducts a series of analyses, and displays the significant correlations.

Other parameters (such as the species identifier) are set to default values in the cells below. You can download the notebook, change these parameters, and rerun it if you wish. The transcriptomic correlation data was derived by the Allen Brain Institute, and the significance values for each correlation were tabulated into the data files that Enkefalos uses. The study can be found [here](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007113).

In [None]:
# Imports
import pandas as pd 
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import time
from matplotlib.ticker import MaxNLocator
from IPython.display import display, FileLink, HTML, Markdown
import base64
from tkinter import *
from tkinter import scrolledtext
import matplotlib.pyplot as plt
import imageio as iio
import matplotlib
import math
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
import requests
from time import sleep
import webbrowser
import networkx as nx
from matplotlib.pyplot import figure
import itertools
import uuid
import urllib
import re

In [None]:
%%appyter hide_code

{# Declares the three input-form sections (gene list, FDR, relationship type); the fields defined in the next cell refer to them by name through their `section` argument. #}
{% do SectionField(
    name='section1', 
    title = '1. Submit Your Gene List', 
    img = "608c5f65f4c35b0027d51be5.png",
    subtitle = 'Upload a text file containing your gene list or copy and paste your gene list into the text box below (One gene per row). There are some genes already in the input as an example, though this can be changed. Please do fill in this field, as if you do not you will get an ERROR.', 
) %}
{% do SectionField(
    name='section2', 
    title = '2. Choose FDR', 
    img = "608c5f65f4c35b0027d51be5.png",
    subtitle = 'Select one FDR for analyses. We recommend choosing a FDR value <= 10% (our increments only allow you to choose values from 1%-10%). A default value has been provided, though it can be changed.', 
) %}

{% do SectionField(
    name='section3', 
    title = '3. Select method for modeling relationships with respect to cell class', 
    img = "608c5f65f4c35b0027d51be5.png",
    subtitle = 'Select either class-driven model, non-class driven model, or interaction model for analysis. A default selection of non-class driven has been provided, though it can be changed.', 
) %}

In [None]:
%%appyter hide_code

{# Gene list input: a tab field offering either a pasted list or an uploaded text file (one gene per row). #}
{% set gene_list_kind = TabField(
    name='gene_list_kind',
    label='Gene List',
    default='Paste',
    description='Paste or upload your gene list',
    required=True,
    choices={
        'Paste': [
            TextListField(
                name='gene_list_input',
                label='Gene List',
                default= ['EGFR', 'HDAC1', 'STAT3', 'MYC', 'JUN'],
                description='Paste your gene list (One gene per row).',
                required = True,
                section='section1'
            ),
        ],
        'Upload': [
            FileField(
                name='gene_list_filename',
                label='Gene List File',
                description='Upload your gene list as a text file (One gene per row).',
                required = True,
                section='section1'
            ),
        ],
    },
    section = 'section1',
) %}

{# FDR threshold as a whole-number percentage between 1 and 10; the analysis cell divides it by 100. #}
{% set FDR = IntField(
    name='FDR',
    description='Select one FDR for analyses.', 
    min = 1,
    max= 10,
    step = 1,
    default = 5, 
    required = True,
    label='FDR', 
    section='section2',
) %}

{# NOTE(review): this is a multi-choice field, but the analysis cell compares the flattened selection against single labels only, so choosing more than one option matches none of them. #}
{% set relationship_type = MultiChoiceField(
    name='relationship_type', 
    description='Select a gene-property relationship type for which to do you analysis.', 
    label='Relationship Type', 
    default=['Non-class Driven'], 
    section = 'section3',
    choices=[
    'Class Driven', 'Non-class Driven', 'Interaction',
    ]
) %}

In [None]:
%%appyter code_exec

{%- if gene_list_kind.raw_value == 'Paste' %}
gene_list_input = {{ gene_list_kind.value[0] }}
{%- else %}
gene_list_filename = {{ gene_list_kind.value[0] }}
{%- endif %}
# FDR is the percentage chosen in the form (integer field, 1-10) and is converted to a
# fraction in the analysis cell; relationship_type holds the selected relationship option(s).
FDR = {{ FDR }}
relationship_type = {{relationship_type}}

In [None]:
%%appyter code_exec

{%- if gene_list_kind.raw_value == 'Paste' %}
genes_of_interest = [x.strip() for x in gene_list_input]
{%- else %}
open_gene_list_file = open(gene_list_filename,'r')
lines = open_gene_list_file.readlines()
genes_of_interest = [x.strip() for x in lines]
open_gene_list_file.close()
{%- endif %}

In [None]:
# Error handling
class NoResults(Exception):
    """Error type for an analysis step that produced no results."""
class APIFailure(Exception):
    """Error type for a failed call to an external API."""

In [None]:
# Plot appearance: matplotlib's stock style plus two fixed hex colours.
matplotlib.style.use('default')
color_exc, color_inh = '#006DDB', '#920000'

# Relative locations of the four Bomkamp CSV tables.
path_1, path_2 = "data/online_table1.csv", "data/online_table2.csv"
path_3, path_4 = "data/online_table3.csv", "data/online_table4.csv"

# Every table takes its first column as the index; low_memory=False is passed for the
# third and fourth tables only. The tables are read in the order 1, 2, 3, 4.
scores, scores_all = (
    pd.read_csv(table_path, index_col = 0) for table_path in (path_1, path_2)
)
ephys, morph = (
    pd.read_csv(table_path, index_col = 0, low_memory = False)
    for table_path in (path_3, path_4)
)

# Significant Correlations

This is the first analysis. A table of the genes from your list that had significant electrophysiological/morphological correlations will be displayed with their respective properties and FDR values. The FDR value is an adjusted p-value calculated using the Benjamini-Hochberg method, which corrects for multiple hypothesis testing. Which FDR column is used depends on the relationship type you selected before running Enkefalos. More detail about the different relationship types can be found in our user guide and in the Pavlidis Lab study. Only the 10 most significant results (lowest FDR first) are displayed in this notebook, but the full table containing all significant results can be downloaded using the "Download CSV file of the full table of significant results" link.

In [None]:
# NOTE: FDR is rescaled in place from a percentage to a fraction, so this cell must run
# exactly once after the inputs cell; re-running it on its own divides by 100 again.
FDR = float(FDR)/100
# Flatten the rendered selection to plain text by stripping brackets, braces and quotes,
# e.g. "['Non-class Driven']" becomes "Non-class Driven".
relationship_type = re.sub(r"['\[{}\]']", "", str(relationship_type))
# Gene symbols are compared in upper case.
genes_of_interest = [gene.upper() for gene in genes_of_interest]

# Every gene symbol listed in the Bomkamp table. The set copy makes each membership
# test constant-time instead of scanning the whole list once per gene.
genes_in_data = scores_all['gene_symbol'].values.tolist()
_genes_in_data_lookup = set(genes_in_data)

# Genes from the user's list that also occur in the Bomkamp data, in the order supplied.
genes_of_interest_in_data = []

def contains_gene (x):
    """Append gene symbol ``x`` to ``genes_of_interest_in_data`` if the Bomkamp table lists it."""
    if x in _genes_in_data_lookup:
        genes_of_interest_in_data.append(x)

# dict.fromkeys removes repeated symbols while keeping first-seen order, so a gene
# pasted twice is collected (and later reported) only once.
for gene in dict.fromkeys(genes_of_interest):
    contains_gene(gene)

def create_download_link(df, title = "Download CSV file of the full table of significant results", filename = "Significant_Correlations.csv"):
    """Write ``df`` to ``filename`` as CSV and return a link to that file.

    Parameters
    ----------
    df : object with a pandas-style ``to_csv(path, index=...)`` method
        Table to export; it is written without its index.
    title : str
        Text shown for the link.
    filename : str
        Path of the CSV file to create; also used as the link target.

    Returns
    -------
    IPython.display.HTML
        Anchor element that opens ``filename`` in a new tab.
    """
    # Imported locally so the helper does not rely on extra notebook-level imports.
    from html import escape
    from urllib.parse import quote

    # to_csv returns None when given a path, so its result is not kept.
    df.to_csv(filename, index = False)
    # Percent-encode the target and escape the label so spaces or characters such as
    # "&" and "<" cannot break the generated markup.
    html = f'<a href="{quote(filename)}" target=_blank>{escape(title)}</a>'
    return HTML(html)

# Column of scores_all that holds the adjusted p-value for each relationship type.
FDR_COLUMN_BY_RELATIONSHIP = {
    "Class Driven": "FDR_gene",
    "Non-class Driven": "FDR_gene|class_anova",
    "Interaction": "FDR_int_anova",
}
# Fail with an explicit message when the selection is not exactly one known type
# (for example when several options were ticked, or none), instead of leaving
# FDR_type undefined or stale from an earlier run.
if relationship_type not in FDR_COLUMN_BY_RELATIONSHIP:
    raise ValueError(
        f"Unsupported relationship type {relationship_type!r}: select exactly one of "
        + ", ".join(FDR_COLUMN_BY_RELATIONSHIP)
    )
FDR_type = FDR_COLUMN_BY_RELATIONSHIP[relationship_type]


# Rows for the user's genes whose adjusted p-value is under the chosen threshold, sorted
# from lowest to highest FDR. This is computed unconditionally so that filtered_data is
# defined for the following cells even when nothing matches.
filtered_data = scores_all[(scores_all['gene_symbol'].isin(genes_of_interest_in_data)) & (scores_all[FDR_type] < FDR)]
filtered_data = filtered_data.sort_values(by=[FDR_type])

if len(filtered_data) == 0:
    # Covers both situations named in the message: none of the genes are in the data,
    # or none of them pass the FDR threshold.
    print("Sorry, either the genes you are interested in are not included in our data, or do not have any significant relations under the FDR threshold you specified. Perhaps try again with another list!")
else:
    # Show the ten most significant rows inline and offer the full table as a CSV download.
    display(HTML(filtered_data[['gene_symbol', 'property', FDR_type]][:10].to_html(index = False)))
    display(create_download_link(filtered_data[['gene_symbol', 'property', FDR_type]]))

# Genes with Significant Interactions

A table of the genes from your list that had at least one significant electrophysiological/morphological correlation will be displayed. Only the first 10 genes (in the order you supplied them) are displayed in this notebook, but the full gene list can be downloaded using the "Download CSV file of the full table of genes with significant interactions." link.

In [None]:
# Lists only the genes that had at least one significant result. The genes keep the
# order in which the user supplied them, so a list pasted from highest to lowest
# expression stays ranked that way (and vice versa).
# A set of the significant symbols avoids re-scanning the whole column for every gene.
significant_symbols = set(filtered_data['gene_symbol'].values)
goi_id = pd.DataFrame(
    [gene for gene in genes_of_interest_in_data if gene in significant_symbols],
    columns = ['Genes'],
)

def create_download_link2(df, title = "Download CSV file of the full table of genes with significant interactions.", filename = "Significant_Interactions.csv"):
    """Write ``df`` to ``filename`` as CSV and return an HTML link to that file.

    This differs from ``create_download_link`` (defined in the significant-correlations
    cell) only in its default ``title`` and ``filename``, so it delegates to that
    function instead of repeating the export logic.

    Parameters
    ----------
    df : table with a ``to_csv`` method
        Data to export.
    title : str
        Text shown for the link.
    filename : str
        Path of the CSV file to create and link to.
    """
    return create_download_link(df, title = title, filename = filename)
    
# Preview the first ten genes inline, then offer the complete list as a CSV download.
goi_preview_html = goi_id.head(10).to_html(index = False)
display(HTML(goi_preview_html))
display(create_download_link2(goi_id))


# STRING Call and Network Analysis

A network diagram of all your enriched genes will be created using the STRING database. Nodes of the interactome will be tabulated for the number of known interactions each gene has with the other genes in the network. A table with the number of interactions each gene in the network has (from greatest to least) will be displayed. Only the top 10 results are displayed in this notebook, but the full table containing the entire gene/protein and number of interactions list can be downloaded using the Download CSV file link. Moreover, a link to the StringDB analysis will be printed which you can copy and use for your own purposes.

From these results, if you would like to conduct further analyses, please refer to the appyter catalog and find the ENKEFALOS_2 appyter. This second appyter will allow you to obtain a correlation plot for a gene and an electrophysiological/morphological measure of your interest, as well as a data table with the points plotted, and a subset network with your specified gene as the central node and other proteins from your enriched list connected to it if they are known to have significant interactions. For the purpose of the subset network, this second appyter will take your table of enriched genes as an input, so be sure to save this result from this first analysis.

In [None]:
# Unique genes with at least one significant result. Sorting makes the request identical
# on every run instead of following set iteration order.
enriched_genes = sorted(set(filtered_data['gene_symbol'].values.tolist()))
if not enriched_genes:
    raise NoResults("No genes passed the FDR threshold, so there is nothing to send to STRING.")

# STRING API endpoints: "network" returns the interactions as tab-separated text and
# "get_link" returns a URL to the same analysis on the STRING website.
string_api_url = "https://version-11-5.string-db.org/api"
output_format = "tsv-no-header"
method_1 = "network"
method_2 = "get_link"
request_url_1 = "/".join([string_api_url, output_format, method_1])
request_url_2 = "/".join([string_api_url, output_format, method_2])

#Parameters for String API
params = {
    "identifiers" : "%0d".join(enriched_genes), # your proteins
    "species" : 9606, # species NCBI identifier 
    "network_flavor": "confidence", # show confidence links
    "network_type": "functional"
    }

# Seconds to wait for STRING before giving up, so an unresponsive server cannot hang the notebook.
string_timeout_seconds = 60
try:
    response_1 = requests.post(request_url_1, data=params, timeout=string_timeout_seconds)
    response_2 = requests.post(request_url_2, data=params, timeout=string_timeout_seconds)
except requests.RequestException as error:
    raise APIFailure(f"Could not reach the STRING API: {error}") from error

# Stop here on an HTTP error rather than parsing an error page as if it were results.
for string_response in (response_1, response_2):
    if not string_response.ok:
        raise APIFailure(
            f"STRING request to {string_response.url} failed with HTTP status {string_response.status_code}."
        )

# Opens the STRING diagram in the default browser of the machine running the kernel.
# Surrounding whitespace is stripped so only the URL itself is passed on.
webbrowser.open(response_2.text.strip())


In [None]:
# Parse the tab-separated STRING "network" response into one row per reported interaction.
# The two gene names are read from the third and fourth fields of each line. Blank or
# shorter lines (for example an empty response) are skipped instead of raising IndexError.
edge_records = []
for line in response_1.text.strip().split("\n"):
    fields = line.strip().split("\t")
    if len(fields) < 4:
        continue
    edge_records.append((fields[2], fields[3]))
# Building the frame once from the collected records avoids growing it row by row.
all_interactions = pd.DataFrame(edge_records, columns = ['Gene1', 'Gene2'])

# Drop repeated rows so each (Gene1, Gene2) pair is counted once.
interactions = all_interactions.drop_duplicates(ignore_index = True)

# Every appearance of a gene on either side of an interaction; a gene's number of
# appearances in this list is its number of interactions.
genes_in_interactions = interactions['Gene1'].tolist() + interactions['Gene2'].tolist()

# Distinct genes in first-seen order (deterministic, unlike set iteration order).
single_genes_in_interactions = list(dict.fromkeys(genes_in_interactions))

# Count all genes in one pass instead of calling list.count once per gene.
interaction_counts = pd.Series(genes_in_interactions, dtype = object).value_counts().to_dict()
interactions_per_gene = pd.DataFrame({
    'Gene': single_genes_in_interactions,
    '# of Interactions': [int(interaction_counts[gene]) for gene in single_genes_in_interactions],
})
# Most-connected genes first; ties are ordered by gene name so the table is reproducible.
interactions_per_gene = interactions_per_gene.sort_values(
    by=['# of Interactions', 'Gene'], ascending = [False, True]
)

def create_download_link3(df, title = "Download CSV file of the full table of the number of interactions per gene, and open link to the StringDB analysis", filename = "Number_of_Interactions.csv"):
    """Write ``df`` to ``filename`` as CSV and return an HTML link to that file.

    This differs from ``create_download_link`` (defined in the significant-correlations
    cell) only in its default ``title`` and ``filename``, so it delegates to that
    function instead of repeating the export logic.

    Parameters
    ----------
    df : table with a ``to_csv`` method
        Data to export.
    title : str
        Text shown for the link.
    filename : str
        Path of the CSV file to create and link to.
    """
    return create_download_link(df, title = title, filename = filename)
    
# Show the ten most-connected genes inline and offer the full table as a CSV download.
display(HTML(interactions_per_gene[:10].to_html(index = False)))
display(create_download_link3(interactions_per_gene))
# The response text may carry surrounding whitespace such as a trailing newline, so it is
# stripped. The href is quoted so the whole URL is taken as the attribute value.
string_network_url = response_2.text.strip()
html_string = f'<a href="{string_network_url}" target=_blank>{"STRING Network"}</a>'
display(HTML(html_string))

