In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter's cell magics, passing a callable that returns this notebook's globals().
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown

<center> <h1> Gene Network Analysis</h1>
<h3>An appyter for the visualization and analysis of gene networks and sub-clusters within gene sets.</h3></center>

In [None]:
import csv
import plotly.offline as py
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from networkx.algorithms import community
from collections import OrderedDict
import time
import requests
import seaborn as sns
import math
import numpy as np
import os
from pyvis import network as net
import random
import unidecode
import json

In [None]:
%%appyter hide

{# Declare the "GENES" form section that the input fields attach to. #}
{% do SectionField(
    name="GENES",
    title="Submit a gene list",
) %}


{# Free-text field for pasted gene lists, bound to a template variable so the gene-list loading cell can test its value. #}
{% set user_gene_lists = TextField(
    name="user_gene_lists",
    label="Gene list(s)",
    description="Paste with a single gene on each line. Separate gene lists with two line breaks.",
    section="GENES",
    default="",
) %}

In [None]:
%%appyter code_exec

# Cluster-size threshold applied when the clustering cells filter their results.
n_genes = {{ IntField(
    name="n_genes",
    label="Minimum number of genes per cluster",
    section="GENES",
    default=20,
    minimum=2,
    maximum=1000000
)}}

# Name of the example gene list; only consulted when no user gene list was pasted.
gene_list_selection = '''{{ ChoiceField(
    name = "gene_list_selection",
    label="Sample gene lists to load",
    description="If not using your own gene lists, you can load examples.",
    section="GENES",
    default="SARS-CoV-2_down",
    choices=["SARS-CoV-2_down"]
)}}'''

# Edge sources to include; checked by name when edges are added to each graph.
edge_types = {{ MultiCheckboxField(
    name = "edge_types",
    label="Types of edges to use to construct the network",
    section="GENES",
    default=["Gene-gene co-expression","Protein-protein interactions"],
    choices=["Gene-gene co-expression","Protein-protein interactions"]
)}}

# Text pasted into the gene list field (its default is the empty string).
user_gene_lists = {{user_gene_lists}}

# Load data
---

In [None]:
# Load data
# Base URL that the example gene list and both edge tables are downloaded from.
cloud_url = 'https://appyters.maayanlab.cloud/storage/Gene_Network_Analysis/'

In [None]:
# Echo the edge types selected in the form.
print(edge_types)

In [None]:
%%appyter code_exec

# Load gene lists
{% if not user_gene_lists.value == "" %}
gene_lists = {}

for i, genes in enumerate(user_gene_lists.split("\n\n")):
    gene_lists[i] = genes.split("\n")
    
{% else %}
if gene_list_selection == "ULK4_293_coIP_hits":
    with open("ULK4_293_coIP_hits.txt","r") as f_in:
        writer = csv.reader(f_in, lineterminator='\n')
        sample_genes = [item for sublist in writer for item in sublist if len(sublist) > 0]
        gene_lists = [[ x.upper() for x in sample_genes ]]
elif gene_list_selection == "SARS-CoV-2_down":
    df_genes = pd.read_csv(cloud_url + "gene_lists/SARS-CoV-2_down.csv",header=None).drop(columns=[0,1])
    df_genes = df_genes.transpose()
    gene_lists = df_genes.to_dict("list")
    for k,v in gene_lists.items():
        gene_lists[k] = [unidecode.unidecode(x) for x in v if isinstance(x, str)] # filter out NaNs  

{% endif %}

In [None]:
# Report how many gene lists were loaded.
print(f"Loaded {len(gene_lists)} gene lists")

In [None]:
# Download the two edge tables: the PPI edge list has no header row,
# the co-expression table keeps its header.
ppi_edges_url = cloud_url + 'ppi_edges_list.csv'
coexpression_edges_url = cloud_url + 'top_500_correlation.csv'

df_ppi_edges = pd.read_csv(ppi_edges_url, header=None)
df_gene_edges = pd.read_csv(coexpression_edges_url)

# Preview each table and report its dimensions.
for shape_label, frame in (
    ("PPI dataframe shape:", df_ppi_edges),
    ("Gene-gene coexpression dataframe shape:", df_gene_edges),
):
    display(frame.head())
    print(shape_label, frame.shape)

In [None]:
# Adjacency lists for the PPI table: value in column 0 -> list of the values
# found in column 1 on the same rows (row order preserved).
# Zipping the two columns avoids building a Series per row as iterrows() does.
ppi_edges_dict = {}

for gene_a, gene_b in zip(df_ppi_edges[0], df_ppi_edges[1]):
    ppi_edges_dict.setdefault(gene_a, []).append(gene_b)

# Column name -> list of that column's values.
# Presumably each column is a gene followed by its correlated genes; verify against the CSV.
gene_edges_dict = df_gene_edges.to_dict('list')

# Create the networks

In [None]:

'''
def get_relevant_ppi_edges(gene_list):
    edges = []
    for gene in gene_list:
        if gene in ppi_edges_dict:  
            edges = [*edges,  *[(gene, x) for x in ppi_edges_dict[gene]]]
    return edges

def get_relevant_gene_edges(gene_list):
    edges = []
    for gene in gene_list:
        if gene in gene_edges_dict:  
            edges = [*edges,  *[(gene, x) for x in gene_edges_dict[gene]]]
    return edges
'''

def get_relevant_ppi_edges(gene_list):
    """Collect the PPI edges whose two endpoints are both in `gene_list`.

    Parameters
    ----------
    gene_list : iterable of gene symbols.

    Returns
    -------
    (edges, missing)
        edges   -- list of (gene_a, gene_b) tuples taken from `ppi_edges_dict`,
                   self-loops excluded, in gene_list / partner order.
        missing -- genes from `gene_list` that are not keys of `ppi_edges_dict`.
    """
    # Set membership keeps the inner test O(1); scanning the list for every
    # partner made the whole function quadratic in the list size.
    members = set(gene_list)
    edges = []
    missing = []
    for gene_a in gene_list:
        if gene_a not in ppi_edges_dict:
            missing.append(gene_a)
            continue
        edges.extend(
            (gene_a, gene_b)
            for gene_b in ppi_edges_dict[gene_a]
            if gene_b != gene_a and gene_b in members
        )
    return edges, missing

def get_relevant_gene_edges(gene_list, max_edges=3):
    """Collect up to `max_edges` co-expression edges per gene, within `gene_list`.

    Parameters
    ----------
    gene_list : iterable of gene symbols.
    max_edges : maximum number of edges kept per source gene (default 3).

    Returns
    -------
    (edges, missing)
        edges   -- list of (gene_a, gene_b) tuples; for each gene the first
                   `max_edges` partners from `gene_edges_dict[gene_a]` that are
                   in `gene_list` and differ from the gene itself.
        missing -- genes from `gene_list` that are not keys of `gene_edges_dict`.

    The partners are taken in stored order; presumably the table is sorted by
    decreasing correlation so these are the "top" edges -- confirm with the data.
    """
    members = set(gene_list)  # O(1) membership instead of scanning the list
    edges = []
    missing = []
    for gene_a in gene_list:
        if gene_a not in gene_edges_dict:
            missing.append(gene_a)
            continue
        kept = 0
        for gene_b in gene_edges_dict[gene_a]:
            if kept >= max_edges:
                break
            if gene_b == gene_a or gene_b not in members:
                continue
            edges.append((gene_a, gene_b))
            kept += 1
    return edges, missing

def pretty_heading(content):
    """Print `content` between two dashed rules.

    The rules are two characters longer than `content`, but never shorter
    than 30 characters.
    """
    rule = "-" * max(30, len(content) + 2)
    print(rule, content, rule, sep="\n")

In [None]:
# Build one undirected graph per gene list.
networks = {}             # gene list key -> nx.Graph
clustering_coeffs = {}    # gene list key -> average clustering coefficient

# Number of genes that end up without any edge, one entry per gene list.
nums_missing_nodes = []

for list_num, gene_list in gene_lists.items():

    # create the Network object
    pretty_heading(f"Constructing network for gene list {list_num}")

    ppi_edges, ppi_missing = get_relevant_ppi_edges(gene_list)
    gene_edges, gene_missing = get_relevant_gene_edges(gene_list)

    print("Missing PPI nodes:\n\n", ppi_missing, "\n")
    print("Missing gene-gene co-expression nodes:\n\n", gene_missing, "\n")

    both_missing = set(ppi_missing) & set(gene_missing)
    print("Missing nodes for both types of edges:\n\n", both_missing, "\n")

    G = nx.Graph(name=list_num)
    G.add_nodes_from(gene_list)

    # Only the edge types selected in the form are added. When both are
    # selected and a pair occurs in both, the later call overwrites edge_type.
    if "Protein-protein interactions" in edge_types:
        G.add_edges_from(ppi_edges, edge_type="PPI")

    if "Gene-gene co-expression" in edge_types:
        G.add_edges_from(gene_edges, edge_type="Coexpression")

    # Every node that is an endpoint of at least one edge.
    connected_nodes = {node for edge in G.edges for node in edge}

    num_missing = len(gene_list) - len(connected_nodes)
    print(num_missing, " disconnected or missing nodes\n")
    nums_missing_nodes.append(num_missing)

    # nx.info() was removed in NetworkX 3.0, so print the summary directly;
    # these calls exist in both the 2.x and 3.x series.
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    avg_degree = 2 * n_edges / n_nodes if n_nodes else 0.0
    print(
        f"Name: {G.name}\n"
        f"Number of nodes: {n_nodes}\n"
        f"Number of edges: {n_edges}\n"
        f"Average degree: {avg_degree:.4f}",
        "\n",
    )

    networks[list_num] = G
    clustering_coeffs[list_num] = nx.average_clustering(G)
    

# Compute clusters
---

In [None]:
# Clustering
# all_clusters maps a clustering method name to {gene list key: list of clusters};
# each clustering cell adds its own entry.
all_clusters = {}
num_clusters_by_method = {}  # NOTE(review): never read in the visible notebook -- confirm it is still needed

### k-clique communities

In [None]:
# k-clique communities (k = 3) for every network.
all_clusters["k_clique_communities"] = {}

for num, G in networks.items():

    pretty_heading(f"Computing k_clique_communities for gene list {num}")

    communities = community.k_clique_communities(G, 3)
    # n_genes is the *minimum* cluster size, so a community with exactly
    # n_genes members is kept (the previous `>` silently dropped it).
    clusters = [list(x) for x in communities if len(x) >= n_genes]
    print(f"Computed {len(clusters)} cluster(s)\n")

    print("Cluster sizes:", [len(x) for x in clusters], "\n")

    all_clusters["k_clique_communities"][num] = clusters

### Girvan-Newman communities

In [None]:
# Girvan-Newman communities for every network.
all_clusters["girvan_newman"] = {}

for num, G in networks.items():
    pretty_heading(f"Computing girvan_newman communities for gene list {num}")

    communities_generator = community.girvan_newman(G)
    # Use the second partition the generator yields. Graphs with very few
    # edges only yield one level, so fall back to the first level instead of
    # letting next() raise StopIteration.
    top_level_communities = next(communities_generator, ())
    next_level_communities = next(communities_generator, top_level_communities)

    # n_genes is the *minimum* cluster size, hence >= rather than >.
    clusters = [list(x) for x in next_level_communities if len(x) >= n_genes]
    print(f"Computed {len(clusters)} cluster(s)\n")
    print("Cluster sizes:", [len(x) for x in clusters], "\n")

    all_clusters["girvan_newman"][num] = clusters

### Greedy modularity communities

In [None]:
# Greedy modularity communities for every network.
all_clusters["greedy_modularity_communities"] = {}

for num, G in networks.items():
    pretty_heading(f"Computing greedy_modularity_communities for gene list {num}")

    communities = community.greedy_modularity_communities(G)
    # n_genes is the *minimum* cluster size, so a community with exactly
    # n_genes members is kept (the previous `>` silently dropped it).
    clusters = [list(x) for x in communities if len(x) >= n_genes]

    print(f"Computed {len(clusters)} cluster(s)\n")
    print("Cluster sizes:", [len(x) for x in clusters], "\n")

    all_clusters["greedy_modularity_communities"][num] = clusters



In [None]:
# Connected components for every network, largest first.
all_clusters["connected_components"] = {}

for num, G in networks.items():
    pretty_heading(f"Computing connected_components for gene list {num}")

    components = sorted(nx.connected_components(G), key=len, reverse=True)
    # n_genes is the *minimum* cluster size, so a component with exactly
    # n_genes members is kept (the previous `>` silently dropped it).
    clusters = [list(x) for x in components if len(x) >= n_genes]

    print(f"Computed {len(clusters)} cluster(s)\n")
    print("Cluster sizes:", [len(x) for x in clusters], "\n")

    all_clusters["connected_components"][num] = clusters



### Compile all clustering results

In [None]:
all_cluster_dfs = {}  # NOTE(review): not read in the visible notebook -- confirm it is still needed

# One record per (clustering method, gene list, cluster, gene).
clustering_results = []

# The inner dict must NOT be called `gene_lists`: that name holds the loaded
# gene lists, which later cells (overall Enrichr results, STRING comparison)
# still read. Rebinding it here replaced them with cluster dicts.
for clustering_method, clusters_by_list in all_clusters.items():
    for num, clusters in clusters_by_list.items():
        for index, cluster in enumerate(clusters):
            clustering_results.extend(
                {"gene": gene, "gene_list": num, "clustering_method": clustering_method, "cluster": index}
                for gene in cluster
            )

# Explicit columns keep the frame usable (no KeyError on column access)
# even when no method produced a cluster.
df_clusters = pd.DataFrame(
    clustering_results,
    columns=["gene", "gene_list", "clustering_method", "cluster"],
)

display(df_clusters)

# Network visualizations
---

Nodes are color-coded by their `greedy_modularity_communities` cluster membership.

In [None]:
# generate colors to color code clusters
def random_color():
    """Return a random colour as an upper-case '#RRGGBB' string, never '#000000'."""
    hex_digits = '0123456789ABCDEF'
    while True:
        candidate = "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        if candidate != "#000000":
            return candidate

# Colour assigned to each cluster so far (filled lazily by color_by_cluster).
cluster_color_dict = {}
# Fixed colour per edge type.
edge_color_dict = {"PPI": "#bd34eb", "Coexpression": "#2dc2b0"}

def color_by_cluster(cluster):
    """Return the colour for `cluster`, picking a random one on first use.

    The colour is cached in `cluster_color_dict`, so repeated calls with the
    same cluster return the same value.
    """
    # The cache is cluster_color_dict; the previous test against the
    # undefined name `cluster_colors` raised NameError on every call.
    if cluster not in cluster_color_dict:
        cluster_color_dict[cluster] = random_color()
    return cluster_color_dict[cluster]

In [None]:
# networkx and plotly code
def network_graph(num, G, df, seed=None):
    """Show `G` as an interactive Plotly figure with one node trace per cluster.

    Parameters
    ----------
    num : gene list key; the figure title shows ``num + 1``.
    G : networkx graph whose edges carry an ``edge_type`` attribute of
        "PPI" or "Coexpression".
    df : DataFrame with "gene" and "cluster" columns. Genes that do not
        appear in it are drawn in the "not assigned" group.
    seed : optional seed forwarded to ``nx.spring_layout`` so the layout can
        be reproduced; the default ``None`` keeps the layout random.

    Side effects
    ------------
    Stores the computed coordinates in ``G.nodes[n]['pos']`` and displays the
    figure with ``fig.show()``. Returns nothing.
    """
    unassigned = "not assigned"
    unassigned_color = "#c5d0d1"

    pos = nx.spring_layout(G, k=1, iterations=800, seed=seed)
    for n, p in pos.items():
        G.nodes[n]['pos'] = p

    # One polyline per edge type; a None entry breaks the line between edges.
    edge_x = {"PPI": [], "Coexpression": []}
    edge_y = {"PPI": [], "Coexpression": []}

    for node_a, node_b, attrs in G.edges.data():
        x0, y0 = G.nodes[node_a]['pos']
        x1, y1 = G.nodes[node_b]['pos']

        edge_type = attrs['edge_type']
        edge_x[edge_type].extend((x0, x1, None))
        edge_y[edge_type].extend((y0, y1, None))

    ppi_edge_trace = go.Scatter(
        x=edge_x["PPI"], y=edge_y["PPI"],
        line=dict(width=1, color=edge_color_dict["PPI"]),
        hoverinfo='none',
        mode='lines',
        name="Protein-protein interaction")

    coexp_edge_trace = go.Scatter(
        x=edge_x["Coexpression"], y=edge_y["Coexpression"],
        line=dict(width=1, color=edge_color_dict["Coexpression"]),
        hoverinfo='none',
        mode='lines',
        name="Gene-gene coexpression")

    # Gene -> cluster lookup built once, instead of filtering the frame for
    # every node. setdefault keeps the first row when a gene appears twice.
    gene_to_cluster = {}
    for gene, cluster in zip(df["gene"], df["cluster"]):
        gene_to_cluster.setdefault(gene, cluster)

    # Cluster -> coordinates and names of its nodes. The "not assigned" group
    # always exists so its legend entry is shown even when it is empty.
    cluster_node_data = {unassigned: {'x': [], 'y': [], 'genes': []}}

    for node in G.nodes():
        cluster = gene_to_cluster.get(node, unassigned)
        group = cluster_node_data.setdefault(cluster, {'x': [], 'y': [], 'genes': []})

        x, y = G.nodes[node]['pos']
        group['x'].append(x)
        group['y'].append(y)
        group['genes'].append(node)

    # Clusters in ascending order of their key, "not assigned" last. Sorting
    # on the string of the (key, data) pair put cluster 10 before cluster 2.
    ordered_clusters = sorted(c for c in cluster_node_data if c != unassigned)
    ordered_clusters.append(unassigned)

    node_traces = []
    for cluster in ordered_clusters:
        data = cluster_node_data[cluster]
        trace = go.Scatter(
                    x=data['x'],
                    y=data['y'],
                    mode='markers',
                    hoverinfo='text',
                    text=data['genes'],
                    name=str(cluster),
                    textposition='middle center',
                    marker=dict(
                        showscale=False,
                        reversescale=True,
                        size=10,
                        line_width=2))
        # Assigned clusters keep Plotly's default per-trace colours.
        if cluster == unassigned:
            trace.marker.color = unassigned_color

        node_traces.append(trace)

    fig = go.Figure(
        data=[ppi_edge_trace, coexp_edge_trace, *node_traces],
        layout=go.Layout(
            # title.text / title.font replace the legacy `titlefont_size`
            # shortcut, which current Plotly releases no longer accept.
            title=dict(text=f"Gene list {num + 1}<br>", font=dict(size=16)),
            showlegend=True,
            hovermode='closest',
            margin=dict(b=10, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
    fig.show()


In [None]:
# Only the greedy-modularity clusters are visualised.
plotted_method = 'greedy_modularity_communities'

for num, G in networks.items():
    print("Average clustering: ", nx.average_clustering(G))

    # Nothing is drawn unless that method produced at least one row overall.
    if plotted_method not in set(df_clusters['clustering_method'].unique()):
        continue

    in_method = df_clusters["clustering_method"] == plotted_method
    in_gene_list = df_clusters["gene_list"] == num
    network_graph(num, G, df_clusters[in_method & in_gene_list])

# Comparison of clustering coefficients to numbers of clusters computed
---

In [None]:
# Compile the number of clusters computed by each method for each gene list:
# method name -> list of counts, in the order the gene lists were clustered.
cluster_counts = {
    method: [len(clusters) for clusters in network_clusters.values()]
    for method, network_clusters in all_clusters.items()
}

print(cluster_counts)


In [None]:
# Long-format table: one row per (gene list, method) with its cluster count.
# Gene lists are numbered from 1 by their position in each count list.
count_rows = []
for method, counts in cluster_counts.items():
    for position, count in enumerate(counts):
        count_rows.append((position + 1, method, count))

df_cluster_counts = pd.DataFrame(count_rows, columns=["Gene list", "Method", "Number of clusters"])
display(df_cluster_counts)

# Grouped bars: one group per gene list, one bar per method.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="Gene list", hue="Method", y="Number of clusters", data=df_cluster_counts, ax=ax)
plt.show()

# Validation of cluster quality with Enrichr scores
---

In [None]:
# Validation with Enrichr
# Libraries to query, grouped by category: category name -> list of library names.
enrichr_libraries = OrderedDict([
    ('Ontologies', ['GO_Biological_Process_2018']),
])

# Util functions
def enrichr_link_from_genes(genes, description='', enrichr_link='https://amp.pharm.mssm.edu/Enrichr'):
    """Upload a gene list to Enrichr and return the server's answer plus a link.

    Parameters
    ----------
    genes : iterable of gene symbols; sent newline-separated.
    description : free-text description stored with the list.
    enrichr_link : base URL of the Enrichr instance.

    Returns
    -------
    dict
        The JSON object returned by ``/addList`` with an extra ``link`` key
        built from its ``shortId``.

    Raises
    ------
    Exception
        If the server answers with a status other than 200.

    The function sleeps one second before and after the request.
    """
    time.sleep(1)
    resp = requests.post(enrichr_link + '/addList', files={
        'list': (None, '\n'.join(genes)),
        'description': (None, description),
    })
    if resp.status_code != 200:
        raise Exception('Enrichr failed with status {}: {}'.format(
            resp.status_code,
            resp.text,
        ))
    # wait a tiny bit before returning link (backoff)
    time.sleep(1)
    # Parse the body once and reuse it for both the payload and the link.
    result = resp.json()
    return dict(result, link=enrichr_link + '/enrich?dataset=' + result['shortId'])

def enrichr_get_top_results(userListId, bg, enrichr_link='https://amp.pharm.mssm.edu/Enrichr'):
    """Fetch the enrichment table of one uploaded list for one library.

    Parameters
    ----------
    userListId : identifier of a previously uploaded list.
    bg : library name; sent as ``backgroundType`` and used as the key under
        which the rows are read from the JSON answer.
    enrichr_link : base URL of the Enrichr instance.

    Returns
    -------
    pandas.DataFrame
        One row per term with the columns rank, term, pvalue, zscore,
        combinedscore, overlapping_genes, adjusted_pvalue and two trailing
        columns with an empty name.

    Raises
    ------
    Exception
        If the server answers with a status other than 200.

    The function sleeps one second before and after the request.
    """
    time.sleep(1)
    # Let requests build and URL-encode the query string rather than
    # formatting the values into the URL by hand.
    resp = requests.get(
        enrichr_link + '/enrich',
        params={'userListId': userListId, 'backgroundType': bg},
    )
    if resp.status_code != 200:
        raise Exception('Enrichr failed with status {}: {}'.format(
            resp.status_code,
            resp.text,
        ))
    time.sleep(1)
    return pd.DataFrame(resp.json()[bg], columns=['rank', 'term', 'pvalue', 'zscore', 'combinedscore', 'overlapping_genes', 'adjusted_pvalue', '', ''])


In [None]:
# Get Enrichr links for each method, for each gene list, for each cluster
# enrichr_links[method][gene list key][cluster index] holds the dict returned
# by enrichr_link_from_genes, or None when the upload failed.
enrichr_links = {}

for clustering_method, cluster_groups in all_clusters.items():
    enrichr_links[clustering_method] = {}
    for num, clusters in cluster_groups.items():
        enrichr_links[clustering_method][num] = {}
        for index, genes in enumerate(clusters):
            try:
                link = enrichr_link_from_genes(genes, f'gene list {num}, {clustering_method} cluster {index}')
            except Exception as err:
                # Best effort: record the failure and carry on, but report the
                # cause. A bare `except:` also swallowed KeyboardInterrupt.
                link = None
                print(f'Enrichr failed for {clustering_method}, gene list {num}, cluster {index} genes: {err}')

            enrichr_links[clustering_method][num][index] = link

In [None]:
# analysis parameters
# NOTE(review): top_n_results is no longer overwritten inside the loop below;
# the per-cluster count is derived from num_overall_results instead.
top_n_results = 5
num_overall_results = 100
sort_by = 'combinedscore'

# Grab top results for each cluster
all_enrichr_results = []

for clustering_method, cluster_groups in enrichr_links.items():
    if clustering_method == 'overall':
        continue
    for num, links in cluster_groups.items():
        num_clusters = len(all_clusters[clustering_method][num])
        if num_clusters == 0:
            continue
        # Split the overall budget evenly across this gene list's clusters.
        results_per_cluster = num_overall_results // num_clusters
        for cluster, link in links.items():
            if link is None:  # upload failed earlier; nothing to query
                continue
            for category, libraries in enrichr_libraries.items():
                for library in libraries:
                    try:
                        results = (
                            enrichr_get_top_results(link['userListId'], library)
                            # Highest scores first: an ascending sort would
                            # keep the weakest terms rather than the top ones.
                            .sort_values(sort_by, ascending=False)
                            .head(results_per_cluster)
                            # assign() returns a new frame, so no columns are
                            # set on a slice of the fetched table.
                            .assign(
                                clustering_method=clustering_method,
                                gene_list=num,
                                link=link['link'],
                                library=library,
                                category=category,
                                cluster=cluster,
                            )
                        )
                        all_enrichr_results.append(results)
                    except Exception as err:
                        print('{}: {} {} {} gene list {} cluster {} failed, continuing ({})'.format(
                            link, library, category, clustering_method, num, cluster, err))

df_clustering_enrichr = pd.concat(all_enrichr_results).reset_index()

In [None]:
# Show the per-cluster Enrichr results table.
display(df_clustering_enrichr)

In [None]:
# Per clustering method: the distinct enriched terms and the number of result rows.
unique_terms = {}
num_unique_terms = []

num_unique_hits = []

cluster_methods = [
    name for name in df_clustering_enrichr["clustering_method"].unique() if name != "overall"
]

for clustering_method in cluster_methods:
    is_method = df_clustering_enrichr["clustering_method"] == clustering_method
    df_method = df_clustering_enrichr.loc[is_method]

    unique = list(set(df_method["term"].values))
    unique_terms[clustering_method] = unique
    num_unique_terms.append(len(unique))

    num_unique_hits.append(df_method.shape[0])

# avg number of unique terms per method
avg_unique_terms = int(np.mean(np.array(num_unique_terms)))
print("Average unique terms per clustering method: ", avg_unique_terms)

# avg number of enrichr entries per method
avg_hits = int(np.mean(np.array(num_unique_hits)))
print("Average hits per clustering method: ", avg_hits)

In [None]:
# get data on the overall gene sets

# Enrichr results for each complete gene list ("overall"), for comparison
# with the per-cluster results.
overall_results = []
# Give every gene list an equal share of the average number of hits per
# clustering method, but always at least one row.
num_results = max(1, avg_hits // len(gene_lists))

for num, genes in gene_lists.items():
    # Upload the list once and reuse the link for every library.
    try:
        link = enrichr_link_from_genes(genes, 'overall')
    except Exception as err:
        print(f'Failed to get Enrichr results for overall gene set: {err}')
        continue

    for category, libraries in enrichr_libraries.items():
        for library in libraries:
            # try/except now share one indentation level; the misaligned
            # `except` made the whole cell fail with an IndentationError.
            try:
                results = (
                    enrichr_get_top_results(link['userListId'], library)
                    # Highest scores first, limited to the num_results share
                    # computed above (previously computed but never used).
                    .sort_values(sort_by, ascending=False)
                    .head(num_results)
                    .assign(
                        gene_list=num,
                        clustering_method='overall',
                        link=link['link'],
                        library=library,
                        category=category,
                        cluster="",
                    )
                )
                overall_results.append(results)
            except Exception as err:
                print(f'Failed to get Enrichr results for overall gene set: {err}')

In [None]:
# Rebuild the combined per-cluster results table from all_enrichr_results.
# NOTE(review): same statement as the one that ends the results-collection cell -- confirm this cell is still needed.
df_clustering_enrichr = pd.concat(all_enrichr_results).reset_index()

In [None]:
# Stack the overall results, then drop the columns whose name is the empty
# string from both tables before combining them (overall rows first).
df_overall_results = pd.concat(overall_results)
df_overall_results = df_overall_results.loc[:, df_overall_results.columns != ""]

df_clustering_enrichr = df_clustering_enrichr.loc[:, df_clustering_enrichr.columns != ""]

combined_frames = [df_overall_results, df_clustering_enrichr]
df_enrichr_results = pd.concat(combined_frames).reset_index()

In [None]:
# accumulate results for each method
# Per clustering method (including "overall"): -ln(p-value) of every result
# row and the matching combined scores.
neg_log_pvalues = {}
combined_scores = {}

method_column = df_enrichr_results["clustering_method"]

for clustering_method in method_column.unique():
    df_method = df_enrichr_results.loc[method_column == clustering_method]
    neg_log_pvalues[clustering_method] = [-math.log(p) for p in df_method["pvalue"].values]
    combined_scores[clustering_method] = df_method["combinedscore"].values

In [None]:
# accumulate results for each cluster
# Per gene list, one entry per plot title: "Overall" for the complete list and
# "<method>, cluster <n>" for every cluster of every clustering method.
neg_log_pvalues_cluster = {}   # gene list -> {title: [-ln(p-value), ...]}
combined_scores_cluster = {}   # gene list -> {title: array of combined scores}

for gene_list in df_enrichr_results["gene_list"].unique():
    neg_log_pvalues_cluster[gene_list] = {}
    combined_scores_cluster[gene_list] = {}

    df_gene_list = df_enrichr_results.loc[df_enrichr_results["gene_list"] == gene_list]

    for clustering_method in df_gene_list["clustering_method"].unique():

        df_method = df_gene_list.loc[df_gene_list["clustering_method"] == clustering_method]

        if clustering_method == "overall":
            # The overall rows form a single group. The combined scores must
            # come from df_method: `df_cluster` is undefined (or left over
            # from another method) at this point.
            groups = [("Overall", df_method)]
        else:
            groups = [
                (f'{clustering_method}, cluster {cluster}', df_method[df_method["cluster"] == cluster])
                for cluster in df_method["cluster"].unique()
            ]

        for title, df_group in groups:
            neg_log_pvalues_cluster[gene_list][title] = [-math.log(p) for p in df_group["pvalue"].values]
            combined_scores_cluster[gene_list][title] = df_group["combinedscore"].values

In [None]:
# plot one distribution per method and cluster

# One figure per gene list, two panels per row, one panel per title.
for gene_list, methods in neg_log_pvalues_cluster.items():
    print(f"Gene list {gene_list}")
    num_rows = int(math.ceil(len(methods) / 2))
    fig = plt.figure(figsize=(15, 5 * num_rows))

    for panel, (method, data) in enumerate(methods.items(), start=1):
        ax = fig.add_subplot(num_rows, 2, panel)
        sns.distplot(data, ax=ax)
        ax.set_title(method)
        ax.set_xlabel("-log (p-value)")
        ax.set_ylabel("Frequency")
        # Same x-range on every panel so the distributions can be compared.
        ax.set_xlim([-1, 10])

    fig.tight_layout(pad=3, w_pad=2, h_pad=6)
    plt.show()

In [None]:
# Bivariate kernel density estimate of combined score against -log(p-value),
# drawn on a separate figure for each clustering method.
sns.set(color_codes=True)

for clustering_method, scores in combined_scores.items():
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.kdeplot(scores, neg_log_pvalues[clustering_method], label=clustering_method, shade=True, ax=ax)
    ax.set_title(clustering_method)
    ax.set_xlabel("Combined score")
    ax.set_ylabel("-log(p-value)")
    ax.legend()


In [None]:
# Bivariate kernel density estimate of combined score against -log(p-value)
# for every cluster: one figure per gene list, two panels per row.
# (The disabled draft that trailed this cell as a string literal was removed:
# as the cell's last expression it was echoed as cell output.)
for gene_list, methods in combined_scores_cluster.items():
    print(f"Gene list {gene_list}")
    num_rows = int(math.ceil(len(methods) / 2))
    fig = plt.figure(figsize=(15, 5 * num_rows))

    for panel, (method, data) in enumerate(methods.items(), start=1):
        ax = fig.add_subplot(num_rows, 2, panel)
        # Draw on this panel explicitly rather than on the implicit current axes.
        sns.kdeplot(data, neg_log_pvalues_cluster[gene_list][method], label=method, shade=True, ax=ax)
        ax.set_title(method)
        ax.set_xlabel("Combined score")
        ax.set_ylabel("-log(p-value)")
        ax.legend()

    fig.tight_layout(pad=3, w_pad=2, h_pad=6)
    plt.show()

In [None]:
%%appyter hide_code

'''
cluster_colors = {}
edge_colors = {"PPI": "#bd34eb", "Coexpression": "#2dc2b0"}

os.makedirs("network_visualizations/PPI_and_coexpression_graphs",exist_ok=True)
os.makedirs("network_visualizations/PPI",exist_ok=True)
os.makedirs("network_visualizations/coexpression",exist_ok=True)

folder = "./network_visualizations/"

if len(edge_types) == 2:
    folder += "PPI_and_coexpression_graphs"
elif "Protein-protein interactions" in edge_types:
    folder += "PPI"
elif "Gene-gene co-expression" in edge_types:
    folder += "coexpression"

# make the clustering method graphs
for num, G in networks.items():
    for clustering_method in df_clusters.clustering_method.unique():
        
        df_method = df_clusters[df_clusters["clustering_method"] == clustering_method]
        df_method = df_method[df_method["gene_list"] == num]
    
        nt = net.Network(width="100%", height = 800, notebook=True)
        nt.from_nx(G)
        # nt.show_buttons()
        for node in nt.nodes:
            cluster = df_method[df_method["gene"] == node["id"]]
            if cluster.shape[0] == 0: node["color"] = "#000"
            else: 
                #print(cluster)
                cluster_num = cluster["cluster"].values[0]
                if not cluster_num in cluster_colors:
                    cluster_colors[cluster_num] = random_color()
                
                node["color"] = cluster_colors[cluster_num]
                
        for edge in nt.edges:
            edge["color"] = edge_colors[edge["edge_type"]]
            edge["title"] = edge["edge_type"]
        nt.prep_notebook()
        display(f"Gene list {num}, {clustering_method}")
        display(nt.show(f"{folder}/graph_{num}_{clustering_method}.html"))
'''

# Comparison to [STRING](https://string-db.org) results

In [None]:
# comparison to STRING results

print("Calculating number of missed genes in each STRING network:\n\n")
string_api_url = "https://string-db.org/api"
output_format = "json"
method = "network"

# Assemble the endpoint from its parts so the pieces declared above are the
# ones actually used: https://string-db.org/api/json/network
request_url = "/".join([string_api_url, output_format, method])

# Number of genes without any STRING interaction, one entry per gene list.
nums_misses_string = []

for num, genes in gene_lists.items():
    print("List ", num + 1)
    params = {
        "identifiers" : "%0d".join(genes), # your protein
        "species": 9606
    }

    response = requests.post(request_url, data=params)
    # Stop with a clear HTTP error instead of failing while iterating an
    # error payload.
    response.raise_for_status()

    # Names of all proteins that take part in at least one returned interaction.
    hits = set()
    for interaction in response.json():
        hits.update((interaction["preferredName_A"], interaction["preferredName_B"]))

    # NOTE(review): hits holds STRING's preferred names, which may differ from
    # the submitted symbols, so this difference is an approximation -- confirm.
    num_misses = len(genes) - len(hits)
    print(f"{num_misses} gene misses out of {len(genes)}\n")
    nums_misses_string.append(num_misses)

In [None]:
# Compare missing/disconnected nodes in our network to those in STRING

%matplotlib inline
gene_list_sizes = [len(genes) for genes in gene_lists.values()]

fig, ax = plt.subplots(figsize=(15, 7))

# One pair of side-by-side bars per gene list, numbered from 1.
ind = np.arange(1, len(gene_lists) + 1)
width = 0.35

bar1 = ax.bar(ind, nums_misses_string, width, label="STRING misses", color="#bd34eb")
bar2 = ax.bar(ind + width, nums_missing_nodes, width, label="Our misses", color="#2dc2b0")

# Centre each tick between the two bars of its pair.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(ind)
ax.legend(loc='best')
ax.set_ylabel("Number of disconnected nodes")
ax.set_xlabel("Gene list")
plt.show()