In [None]:
##%%appyter init
from appyter import *
# Initialise appyter's `magic` helper with a callable that returns this
# notebook's globals(): the lambda binds the `globals` builtin as its default
# argument and calls it when invoked.
# NOTE(review): presumably this is what enables the %%appyter cell magics used below - confirm against the appyter docs.
magic.init(lambda _=globals: _())

# Enkefalos (Part 2)

This appyter continues the first Enkefalos appyter and provides further downstream analysis. Given a gene of interest, the enriched genes from a prior Enkefalos analysis, and an electrophysiological or morphological property of your choice, it produces a correlation plot covering both excitatory and inhibitory neural cell types, the data plotted on that correlation plot, and a subset StringDB network with your gene of interest as the central node.

The transcriptomic correlation data were derived by the Allen Brain Institute, and the significance values for each correlation were tabulated and compiled into the data files that Enkefalos uses in a study by Bomkamp et al. This study can be found [here](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007113).

In [None]:
# Imports
import re
import pandas as pd 
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import time
from matplotlib.ticker import MaxNLocator
from IPython.display import display, FileLink, HTML, Markdown
import base64
from tkinter import *
from tkinter import scrolledtext
import matplotlib.pyplot as plt
import imageio as iio
import matplotlib
import math
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
import requests
from time import sleep
import webbrowser
import networkx as nx
from matplotlib.pyplot import figure
import itertools
import uuid
import urllib

In [None]:
%%appyter hide_code

# Declares the three sections of the input form (gene, property, enriched gene
# list). The input fields declared elsewhere in this notebook attach to these
# sections through their `section=` argument.
{% do SectionField(
    name='section1', 
    title = '1. Submit a gene for further downstream analysis.', 
    img = "608c5f65f4c35b0027d51be5.png",
    subtitle = 'Input a gene name from your prior analysis. There is an example gene already in the input, though this can be changed.', 
) %}
{% do SectionField(
    name='section2', 
    title = '2. Choose an electrophysiological/morphological property.', 
    img = "608c5f65f4c35b0027d51be5.png",
    subtitle = 'Select a property of interest for which you would like to see the correlatory plot.', 
) %}
{% do SectionField(
    name='section3', 
    title = '3. Input the enriched genes from your prior Enkefalos analysis.', 
    img = "608c5f65f4c35b0027d51be5.png",
    subtitle = 'Upload a text file containing the genes that had significant correlations from your prior Enkefalos analysis or copy and paste that gene list into the text box below (One gene per row).', 
) %}

In [None]:
%%appyter hide_code

# Form input 1: the gene of interest, a required free-text field defaulting to 'EGFR'.
{% set gene = StringField(
    name='gene',
    label = 'Gene',
    description= 'Input a gene from your prior analysis to obtain a correlation plot for.', 
    default = 'EGFR', 
    required = True,
    section='section1',
) %}

# Form input 2: the property to plot. The choice strings below are the same
# labels (with units) that the analysis code maps back to abbreviations.
# NOTE(review): this is a MultiChoiceField, so several properties can be
# selected, while the plotting cell resolves a single label - confirm whether
# a single-choice field was intended.
{% set property = MultiChoiceField(
    name='properties', 
    description='Select an electrophysiological/morphological property for which you would like to obtain your graph.', 
    label='Property', 
    default=['AP Amplitude (mV)'], 
    section = 'section2',
    choices=[
    'AP Amplitude (mV)', 'AHP Amplitude (mV)', 'AP Half-width (ms)',
    'Max Firing Frequency (Hz)', 'Time Constant Tau (ms)', 'Resting Membrane Potential (mV)',
    'Average Interspike Interval (ms)', 'Input-Output Curve Slope (Hz/pA)', 'Capacitance (pF)', 
    'Sag', 'Input Resistance (MΩ)', 'Rheobase (pA)', 'Adaptation Ratio', 'AP Threshold (mV)', 
    'Latency (s)', 'ISI CoV', 'Bifurcation Angle (degrees)', 'Max Branch Order', 'Branchiness (branches/μm)',
    'Total Length (μm)', 'Total Volume (μm^3)', 'Soma Surface (μm^2)', 'Electrophysiology PC1', 
    'Electrophysiology PC2', 'Electrophysiology PC3', 'Morphology PC1', 'Morphology PC2', 'Morphology PC3' ]
) %}

# Form input 3: the enriched gene list, entered through one of two tabs -
# 'Paste' (a TextListField) or 'Upload' (a FileField). 'Paste' is the default tab.
{% set gene_list_kind = TabField(
    name='gene_list_kind',
    label='Gene List',
    default='Paste',
    description='Paste or upload your list of genes that had significant correlations from your prior analysis.',
    required=True,
    choices={
        'Paste': [
            TextListField(
                name='gene_list_input',
                label='Gene List',
                description='Paste your gene list (One gene per row).',
                default = ['EGFR', 'MYC', 'HDAC1'],
                required = True,
                section='section3'
            ),
        ],
        'Upload': [
            FileField(
                name='gene_list_filename',
                label='Gene List File',
                description='Upload your gene list as a text file (One gene per row).',
                required = True,
                section='section3'
            ),
        ],
    },
    section = 'section3',
) %}

In [None]:
%%appyter code_exec

{%- if gene_list_kind.raw_value == 'Paste' %}
gene_list_input = {{ gene_list_kind.value[0] }}
{%- else %}
gene_list_filename = {{ gene_list_kind.value[0] }}
{%- endif %}
# Only one of gene_list_input / gene_list_filename is rendered above, depending
# on which gene-list tab was used. The two assignments below render the
# remaining form values into Python variables.
# NOTE: `property` shadows the Python builtin of the same name from here on.
gene = {{gene}}
property = {{property}}

In [None]:
%%appyter code_exec

{%- if gene_list_kind.raw_value == 'Paste' %}
enriched_genes = [x.strip() for x in gene_list_input]
{%- else %}
open_gene_list_file = open(gene_list_filename,'r')
lines = open_gene_list_file.readlines()
enriched_genes = [x.strip() for x in lines]
open_gene_list_file.close()
{%- endif %}

In [None]:
# Error handling
class NoResults(Exception):
    """Error type for an analysis that produced no results."""
class APIFailure(Exception):
    """Error type for a failed call to an external API."""

In [None]:
# Plot styling: default matplotlib style, one colour per cell class
matplotlib.style.use('default')
color_exc, color_inh = '#006DDB', '#920000'

# File paths for the Bomkamp data tables
path_1 = "data/online_table1.csv"
path_2 = "data/online_table2.csv"
path_3 = "data/online_table3.csv"
path_4 = "data/online_table4.csv"

# Upper-case the user's gene symbols
enriched_genes = [symbol.upper() for symbol in enriched_genes]

# Load the four tables; the first CSV column becomes the row index of each
scores = pd.read_csv(path_1, index_col=0)
scores_all = pd.read_csv(path_2, index_col=0)
ephys = pd.read_csv(path_3, index_col=0, low_memory=False)
morph = pd.read_csv(path_4, index_col=0, low_memory=False)

# Row labels are looked up as strings further down, so stringify them
ephys.index = list(map(str, ephys.index))
morph.index = list(map(str, morph.index))

# Build both directions of the Entrez id <-> gene symbol mapping from the
# first row seen for each Entrez id
tmp = scores_all.drop_duplicates('gene_entrez_id')
tmp.index = tmp.gene_entrez_id.astype(str)
id_to_symbol = dict(zip(tmp.index, tmp.gene_symbol))
symbol_to_id = {symbol: entrez_id for entrez_id, symbol in id_to_symbol.items()}

# Abbreviation -> display label (without units) for every supported property.
# Insertion order matters: it determines the order of props_all and of the
# rows in property_table.
property_labels = {
    'apamp': 'AP Amplitude',
    'ahpamp': 'AHP Amplitude',
    'aphw': 'AP Half-width',
    'maxfreq': 'Max Firing Frequency',
    'tau': 'Time Constant Tau',
    'rmp': 'Resting Membrane Potential',
    'avg_isi': 'Average Interspike Interval',
    'f_i_curve_slope': 'Input-Output Curve Slope',
    'cap': 'Capacitance',
    'sag': 'Sag',
    'ri': 'Input Resistance',
    'rheo': 'Rheobase',
    'adratio': 'Adaptation Ratio',
    'apthr': 'AP Threshold',
    'latency': 'Latency',
    'isi_cv': 'ISI CoV',
    'average_bifurcation_angle_local': 'Bifurcation Angle',
    'max_branch_order': 'Max Branch Order',
    'branchiness': 'Branchiness',
    'total_length': 'Total Length',
    'total_volume': 'Total Volume',
    'soma_surface': 'Soma Surface',
    'E_PC1': 'Electrophysiology PC1',
    'E_PC2': 'Electrophysiology PC2',
    'E_PC3': 'Electrophysiology PC3',
    'M_PC1': 'Morphology PC1',
    'M_PC2': 'Morphology PC2',
    'M_PC3': 'Morphology PC3',
}

# Abbreviation -> display label including units. These labels are the strings
# offered in the property form field and are used to map a selection back to
# its abbreviation.
property_labels_units = {
    'apamp': 'AP Amplitude (mV)',
    'ahpamp': 'AHP Amplitude (mV)',
    'aphw': 'AP Half-width (ms)',
    'maxfreq': 'Max Firing Frequency (Hz)',
    'tau': 'Time Constant Tau (ms)',
    'rmp': 'Resting Membrane Potential (mV)',
    'avg_isi': 'Average Interspike Interval (ms)',
    'f_i_curve_slope': 'Input-Output Curve Slope (Hz/pA)',
    'cap': 'Capacitance (pF)',
    'sag': 'Sag',
    'ri': 'Input Resistance (MΩ)',
    'rheo': 'Rheobase (pA)',
    'adratio': 'Adaptation Ratio',
    'apthr': 'AP Threshold (mV)',
    'latency': 'Latency (s)',
    'isi_cv': 'ISI CoV',
    'average_bifurcation_angle_local': 'Bifurcation Angle (degrees)',
    'max_branch_order': 'Max Branch Order',
    'branchiness': 'Branchiness (branches/μm)',
    'total_length': 'Total Length (μm)',
    'total_volume': 'Total Volume (μm^3)',
    'soma_surface': 'Soma Surface (μm^2)',
    'E_PC1': 'Electrophysiology PC1',
    'E_PC2': 'Electrophysiology PC2',
    'E_PC3': 'Electrophysiology PC3',
    'M_PC1': 'Morphology PC1',
    'M_PC2': 'Morphology PC2',
    'M_PC3': 'Morphology PC3',
}

# Properties read from the morphology table; every other property is treated
# as electrophysiological.
props_morph = [
    'average_bifurcation_angle_local',
    'branchiness',
    'max_branch_order',
    'total_length',
    'total_volume',
    'soma_surface',
    'M_PC1',
    'M_PC2',
    'M_PC3',
]
props_all = property_labels.keys()
props_ephys = sorted(set(props_all) - set(props_morph))

# Lookup table: one row per display label, holding its abbreviation
property_table = pd.DataFrame(
    {'Abbreviation': list(property_labels.keys())},
    index = list(property_labels.values()),
)


def _linear_fit(x, y, log):
    """Return degree-1 np.polyfit coefficients of y (or log10(y)) on x, or None when there are fewer than two points."""
    if len(x) < 2:
        return None
    return np.polyfit(x, np.log10(y) if log else y, deg = 1)


def make_scatter(gene, prop, ax = None, alpha = 0.6, s = 20, log = False, annotate_qvals = True):
    """Scatter a gene's expression against a property, split by cell class.

    Parameters
    ----------
    gene : str or int
        A gene symbol (str, translated through ``symbol_to_id``) or an Entrez
        id (Python or numpy integer). A symbol missing from ``symbol_to_id``
        raises ``KeyError``.
    prop : str
        Property abbreviation. Values are read from ``morph`` when ``prop`` is
        in ``props_morph`` and from ``ephys`` otherwise.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure is created when omitted.
    alpha, s :
        Marker transparency and size, passed to ``ax.scatter``.
    log : bool
        Use a logarithmic y axis and fit the regression to ``log10(y)``.
    annotate_qvals : bool
        Annotate the plot with the ``FDR_gene``, ``FDR_gene|class_anova`` and
        ``FDR_int_anova`` columns of ``scores_all`` (labelled CI, CC and Int)
        when a row exists for this gene/property pair.

    Notes
    -----
    Columns whose expression value is not greater than zero are left out of
    both the points and the fits. Three regression lines are drawn: all points
    (dashed), excitatory only and inhibitory only (solid). A line is skipped
    when its subset has fewer than two points, instead of failing in
    ``np.polyfit``.
    """

    # Set up axes
    if ax is None:
        _, ax = plt.subplots()
    if log:
        ax.set_yscale('log')

    # Resolve the gene to the string row label used by the data tables
    if isinstance(gene, str):
        gene = str(symbol_to_id[gene])
    elif isinstance(gene, (int, np.integer)):
        gene = str(gene)
    df = morph if prop in props_morph else ephys

    # Column names end in '__exc' or '__inh'; any other suffix raises KeyError
    line_labels = np.array([{'exc': 1, 'inh': 0}[line.split('__')[-1]] for line in df.columns])
    x_all = df.loc[gene]
    y_all = df.loc[prop]

    # Keep only the columns with non-zero expression
    not_zero = np.asarray(x_all > 0, dtype = bool)
    exc = (line_labels > 0.5) & not_zero
    inh = (line_labels <= 0.5) & not_zero

    x, y = x_all[not_zero], y_all[not_zero]
    x_exc, y_exc = x_all[exc], y_all[exc]
    x_inh, y_inh = x_all[inh], y_all[inh]

    # Draw lines of best fit, each spanning the x-range of its own subset
    for x_subset, y_subset, linestyle in zip([x, x_exc, x_inh], [y, y_exc, y_inh], ['k--', 'k-', 'k-']):
        fit = _linear_fit(x_subset, y_subset, log)
        if fit is None:
            continue
        lim = pd.Series([x_subset.min(), x_subset.max()])
        if log:
            fit_line = 10**(lim * fit[0] + fit[1])
        else:
            fit_line = lim * fit[0] + fit[1]
        ax.plot(lim, fit_line, linestyle, linewidth = 1)

    for x_subset, y_subset, color, marker in zip([x_inh, x_exc], [y_inh, y_exc], [color_inh, color_exc], ['o', 'D']):
        ax.scatter(x_subset, y_subset, color = color, s = s, marker = marker, alpha = alpha)

    ax.set_xlabel(id_to_symbol[gene] + ' ($log_2$ CPM+1)')
    ax.set_ylabel(property_labels_units[prop])
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # Annotate q-values for the CI, CC, and Int models (first matching row)
    if annotate_qvals:
        matches = scores_all[(scores_all.property == prop) & (scores_all.gene_entrez_id == int(gene))]
        if matches.shape[0] > 0:
            pvals = [np.round(matches[column].values[0], 3) for column in ['FDR_gene', 'FDR_gene|class_anova', 'FDR_int_anova']]
            ax.annotate('\n'.join([label + ': q=' + str(pval) for label, pval in zip(['CI', 'CC', 'Int'], pvals)]), (1, 1), xycoords = 'axes fraction', ha = 'right', va = 'top')
    
def make_legend(ax, alpha = 0.6, s = 20):
    """Attach an Inhibitory/Excitatory legend to ``ax``, anchored outside its upper-right corner.

    Two empty scatter collections are added to ``ax`` purely to serve as
    legend handles; they use the module colours ``color_inh`` / ``color_exc``
    with circle and diamond markers respectively.
    """
    shared_style = dict(s = s, alpha = alpha)
    inhibitory_handle = ax.scatter([], [], color = color_inh, marker = 'o', **shared_style)
    excitatory_handle = ax.scatter([], [], color = color_exc, marker = 'D', **shared_style)
    handles = [inhibitory_handle, excitatory_handle]
    names = ['Inhibitory', 'Excitatory']
    ax.legend(handles, names, bbox_to_anchor=(1, 1), loc= 'upper left', frameon = True)


# Correlation Graph (Single Relationship)

This is the first analysis. It takes your gene of interest and property of interest and draws a correlation plot from the same data that Enkefalos runs on. The data are plotted by neural cell type (inhibitory or excitatory), with linear regression lines overlaid.

In [None]:
# Scatter plot for a single gene and property
def print_plot(x, y):
    """Draw the scatter plot for gene ``x`` and property ``y``, add the legend to the current axes, and show the figure."""
    make_scatter(x, y)
    current_axes = plt.gca()
    make_legend(current_axes)
    plt.show()

# Normalise the gene symbol and fail early, with a readable message, when it
# is not present in the expression data.
gene = gene.strip().upper()
if gene not in symbol_to_id:
    raise NoResults(f"The gene '{gene}' was not found in the Enkefalos expression data.")

# `property` may arrive as a collection of selections rather than a single
# string. Only one property can be plotted, so use the first selection and say
# so, instead of failing on a comma-joined label that matches nothing.
selected_properties = [property] if isinstance(property, str) else list(property)
if not selected_properties:
    raise ValueError("No property was selected; please choose one electrophysiological/morphological property.")
if len(selected_properties) > 1:
    print(f"Multiple properties were selected; only the first one ('{selected_properties[0]}') is analysed.")
property = re.sub(r"['\[{}\]']", "", str(selected_properties[0]))

# Map the label (with units) back to its abbreviation
abbreviation_by_label = {label: abbreviation for abbreviation, label in property_labels_units.items()}
if property not in abbreviation_by_label:
    raise ValueError(f"Unknown property '{property}'.")
property_abb = abbreviation_by_label[property]
print_plot(gene, property_abb)

# Plot Data

Tables of the excitatory and inhibitory plot data on the correlation plot can be downloaded using the two Download CSV file links below.

In [None]:
#Plot Data for a gene of interest and its significant correlations
def create_download_link1(df, title = "Download CSV file of the excitatory data plotted on the correlation plots.", filename = "Excitatory_data.csv"):
    """Write ``df`` to ``filename`` as CSV and return an HTML link to that file, labelled ``title``."""
    df.to_csv(filename)
    anchor = '<a href="{}" target=_blank>{}</a>'.format(filename, title)
    return HTML(anchor)

def create_download_link2(df, title = "Download CSV file of the inhibitory data plotted on the correlation plots.", filename = "Inhibitory_data.csv"):
    """Write ``df`` to ``filename`` as CSV and return an HTML link to that file, labelled ``title``."""
    df.to_csv(filename)
    anchor = '<a href="{}" target=_blank>{}</a>'.format(filename, title)
    return HTML(anchor)

gene_plot_data_id = str(symbol_to_id[gene])

# Read from the same table make_scatter plots from: morphological properties
# come from `morph`, all others from `ephys`. Always using `ephys` made the
# export disagree with the plot for morphological properties.
df = morph if property_abb in props_morph else ephys

# Column names end in '__exc' or '__inh'; build one boolean mask per cell class
line_labels = [{'exc': 1, 'inh': 0}[line.split('__')[-1]] for line in df.columns]
exc = [n > 0.5 for n in line_labels]
inh = [n <= 0.5 for n in line_labels]
# Note: unlike make_scatter, columns with zero expression are not removed here.

# Excitatory table: one row per column of `df`, with the property value and
# the gene's expression value side by side
exc_data_prop = pd.DataFrame(df.loc[:, exc].loc[property_abb])
exc_data_gene = pd.DataFrame(df.loc[:, exc].loc[gene_plot_data_id])
exc_data_gene = exc_data_gene.rename(columns = {gene_plot_data_id : gene})
exc_data_prop = exc_data_prop.rename(columns = {property_abb : property})
exc_data_prop[gene] = exc_data_gene[gene].values
display(create_download_link1(exc_data_prop))

# Inhibitory table, built the same way
inh_data_gene = pd.DataFrame(df.loc[:, inh].loc[gene_plot_data_id])
inh_data_prop = pd.DataFrame(df.loc[:, inh].loc[property_abb])
inh_data_gene = inh_data_gene.rename(columns = {gene_plot_data_id : gene})
inh_data_prop = inh_data_prop.rename(columns = {property_abb : property})
inh_data_prop[gene] = inh_data_gene[gene].values
display(create_download_link2(inh_data_prop))


# Subset Network

This analysis provides a subset of the StringDB network from your first Enkefalos analysis, with your gene of interest as the central node (i.e. every node in the subset is connected to this gene, directly or indirectly). A link to this subset network is also displayed for your convenience.

In [None]:
# STRING API endpoints: `network` returns the interactions among the enriched
# genes, `get_link` returns a URL to the network page on the STRING website.
string_api_url = "https://version-11-5.string-db.org/api"
output_format = "tsv-no-header"
method_1 = "network"
method_2 = "get_link"
request_url_1 = "/".join([string_api_url, output_format, method_1])
request_url_2 = "/".join([string_api_url, output_format, method_2])

params = {
    "identifiers" : "%0d".join(enriched_genes), # your proteins
    "species" : 9606, # species NCBI identifier
    "network_flavor": "confidence", # show confidence links
    "network_type": "functional"
    }

response_1 = requests.post(request_url_1, data=params)
if not response_1.ok:
    raise APIFailure(f"The STRING network request failed with HTTP status {response_1.status_code}.")

# Collect the two interactor names of every interaction row (third and fourth
# tab-separated fields). Rows with fewer than four fields - e.g. the single
# empty line of an empty response - are skipped instead of raising IndexError.
interaction_rows = []
for line in response_1.text.strip().split("\n"):
    fields = line.strip().split("\t")
    if len(fields) < 4:
        continue
    interaction_rows.append([fields[2], fields[3]])
# Build the table once rather than appending one row at a time
all_interactions = pd.DataFrame(interaction_rows, columns = ['Gene1', 'Gene2'])

G = nx.from_pandas_edgelist(all_interactions, 'Gene1', 'Gene2')

if gene not in G.nodes:
    print(f"The gene of interest '{gene}' has no interactions with the other genes in the list of enriched genes.")
else:
    # Every gene reachable from the gene of interest, directly or indirectly
    connected_nodes = nx.node_connected_component(G, gene)
    params_1 = {
        "identifiers": "%0d".join(sorted(connected_nodes)),  # your proteins
        "species": 9606,  # species NCBI identifier
        "network_flavor": "confidence",  # show confidence links
        "network_type": "functional"
    }
    response_2 = requests.post(request_url_2, data=params_1)
    if not response_2.ok:
        raise APIFailure(f"The STRING link request failed with HTTP status {response_2.status_code}.")
    # Strip surrounding whitespace so the URL is usable both by the browser
    # call and inside the quoted href attribute.
    subset_network_url = response_2.text.strip()
    webbrowser.open(subset_network_url)
    html_subset = f'<a href="{subset_network_url}" target="_blank">Subset STRING Network</a>'
    display(HTML(html_subset))