In the previous tutorial, we have shown how to create a custom notebook, explore, and filter the data using tables generated by BGCFlow.

In this notebook, we will do an integrated analysis utilizing networks of knowledge bases from different tools: `BiG-SCAPE`, `antiSMASH KnownClusterBlast`, `BiG-FAM database query`, and `ARTS2`. We will generate a `graphml` file with annotations that can be loaded into Cytoscape, and we will also attempt to visualize the network in this notebook.

[Download Notebook](https://github.com/NBChub/bgcflow/blob/main/.examples/notebooks/integrated_network.ipynb){:target="_blank" .md-button}

In [None]:
import pandas as pd
import networkx as nx
from pathlib import Path
import json
import numpy as np
import plotly.graph_objects as go
import seaborn as sns
from IPython.display import display, Markdown, HTML

from itables import to_html_datatable as DT
import itables.options as opt

# Extra CSS handed to itables: smaller, slanted text for table cells and headers.
opt.css = """
.itables table td { font-style: italic; font-size: .8em;}
.itables table th { font-style: oblique; font-size: .8em; }
"""

# CSS classes applied to the rendered tables.
opt.classes = ["display", "compact"]
# Page-length choices for the tables (presumably the DataTables "entries per page" menu).
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

def create_node_trace(G, node_trace_category, color, showtextlabel=False, nodesize=10, nodeopacity=0.8,
                      nodesymbol="circle", linewidth=1, linecolor="black", textposition="top center", showlegend=False):
    """
    Create a plotly scatter trace for all nodes of one category.

    Nodes are selected by comparing their 'node_trace' attribute with
    `node_trace_category`; nodes without that attribute are ignored.
    Every selected node must carry a 'pos' attribute holding its (x, y)
    coordinates. The hover text is read from the 'text' attribute; a node
    without one contributes an empty string so that texts stay aligned
    with the coordinates.

    Parameters:
    G (networkx.Graph): The graph to create the node trace for.
    node_trace_category (str): The category of the node trace.
    color (str): The color of the nodes.
    showtextlabel (bool): Whether to show text labels for the nodes.
    nodesize (int): The size of the nodes.
    nodeopacity (float): The opacity of the nodes.
    nodesymbol (str): The symbol used for the nodes.
    linewidth (int): The width of the marker outlines.
    linecolor (str): The color of the marker outlines.
    textposition (str): The position of the text labels.
    showlegend (bool): Whether to show the trace in the legend.

    Returns:
    go.Scatter: The node trace.
    """
    markermode = "markers+text" if showtextlabel else "markers"

    # Work on the attribute dicts directly instead of round-tripping node ids
    # through a numpy array, which would coerce mixed-type ids to strings.
    selected = [
        attrs for _, attrs in G.nodes(data=True)
        if attrs.get("node_trace") == node_trace_category
    ]
    xs = [float(attrs["pos"][0]) for attrs in selected]
    ys = [float(attrs["pos"][1]) for attrs in selected]
    # One text per node, in the same order as xs/ys.
    texts = [attrs.get("text", "") for attrs in selected]

    node_trace = go.Scatter(
        x=xs,
        y=ys,
        text=texts,
        textposition=textposition,
        mode=markermode,
        hoverinfo='text',
        name=node_trace_category,
        showlegend=showlegend,
        marker=dict(
            symbol=nodesymbol,
            opacity=nodeopacity,
            showscale=False,
            color=color,
            size=nodesize,
            line=dict(width=linewidth, color=linecolor)))
    return node_trace

def create_edge_trace(Graph, name, showlegend=False, color='#888', width=0.5, opacity=0.8, dash="solid"):
    """
    Create a plotly line trace for all edges of one relation type.

    Edges are selected by comparing their 'relation_type' attribute with
    `name`; edges without that attribute are ignored. Both end nodes of a
    selected edge must carry a 'pos' attribute holding (x, y) coordinates.

    Parameters:
    Graph (networkx.Graph): The graph to create the edge trace for.
    name (str): The relation type to draw; also used as the trace name.
    showlegend (bool): Whether to show the trace in the legend.
    color (str): The color of the edges.
    width (float): The width of the edges.
    opacity (float): The opacity of the edges.
    dash (str): The dash style of the edges.

    Returns:
    go.Scatter: The edge trace.
    """
    xs = []
    ys = []
    # Read everything from the `Graph` argument (not from a notebook global),
    # so the function works for any graph that is passed in.
    for u, v, attrs in Graph.edges(data=True):
        if attrs.get("relation_type") != name:
            continue
        x0, y0 = Graph.nodes[u]['pos'][0], Graph.nodes[u]['pos'][1]
        x1, y1 = Graph.nodes[v]['pos'][0], Graph.nodes[v]['pos'][1]
        # A None entry breaks the line, so consecutive edges are not joined.
        xs.extend([x0, x1, None])
        ys.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=xs,
        y=ys,
        name=name,
        opacity=opacity,
        line=dict(width=width, color=color, dash=dash),
        hoverinfo='none',
        mode='lines',
        showlegend=showlegend)

    return edge_trace

def get_graph_stats(g, graph_name):
    """
    Summarize a graph as a one-line Markdown bullet.

    Parameters:
    g (networkx.Graph): The graph to get statistics for.
    graph_name (str): The name of the graph, used as the bullet label.

    Returns:
    str: A Markdown string with the total node count, the number of nodes
    whose 'node_trace' attribute is 'BGC', the edge count, and the average
    degree (reported as 0.00 for a graph without nodes).
    """
    num_nodes_g = g.number_of_nodes()
    num_edges_g = g.number_of_edges()
    # A filtered subgraph can be empty; avoid dividing by zero in that case.
    if num_nodes_g:
        avg_degree_g = sum(dict(g.degree()).values()) / num_nodes_g
    else:
        avg_degree_g = 0.0
    num_bgc_g = sum(1 for _, data in g.nodes(data=True) if data.get('node_trace') == 'BGC')
    return f" - {graph_name} : **{num_nodes_g}** total nodes (**{num_bgc_g}** BGC nodes), **{num_edges_g}** edges, {avg_degree_g:.2f} average degree"

## Handling Graphs: Reading, Filtering, and Merging Networks

We will start by defining some parameters for filtering:

In [None]:
# BiG-SCAPE cutoff; a string because it is inserted verbatim into file names
bigscape_cutoff = "0.30"
bigfam_rank_filter = 1 # only keep BiG-FAM hits with rank below this value (rank 0 with the default)
knownclusterblast_similarity_cutoff = 0.8 # select antismash knownclusterblast hits with similarity above this value

Next, we list all the files that we need:

In [None]:
# Report folder, given relative to the directory the notebook runs in
report_dir = Path("../")
# First the path of the JSON file with tool versions; the same name is then
# rebound to the parsed content of that file
dependency_version = report_dir / "metadata/dependency_versions.json"
with open(dependency_version, "r") as file:
    dependency_version = json.load(file)

# antiSMASH version, used to build the version-specific file names below
antismash_version = dependency_version["antismash"]

assets_dir = report_dir / "docs/assets"
# graphml network for the selected BiG-SCAPE cutoff
integrated_bigscape_network = assets_dir / f"data/bigscape_{bigscape_cutoff}_as{antismash_version}.graphml"
# BiG-FAM query network stored as JSON
bigfam_network = assets_dir / f"data/query_bigfam_as{antismash_version}_network.json"
# ARTS hits table stored as CSV
arts_hits = assets_dir / f"tables/arts_hits_as{antismash_version}.csv"

We will start by reading the input files into memory:

In [None]:
# Load the ARTS hits table
df_arts_hits = pd.read_csv(arts_hits)

# Load the integrated BiG-SCAPE / KnownClusterBlast network
integrated_bigscape_network_graph = nx.read_graphml(integrated_bigscape_network)

# Give every node of integrated_bigscape_network_graph a 'node_trace' category.
# Nodes that already have one are left alone; otherwise ids starting with
# 'BGC' are labelled 'MIBIG_BiG-SCAPE' and all remaining nodes 'BGC'.
for n, data in integrated_bigscape_network_graph.nodes(data=True):
    if 'node_trace' in data:
        continue
    data['node_trace'] = 'MIBIG_BiG-SCAPE' if n.startswith('BGC') else 'BGC'

# Load the BiG-FAM network from its node-link JSON representation
with open(bigfam_network, "r") as f:
    graph_data = json.load(f)
bigfam_network_graph = nx.readwrite.json_graph.node_link_graph(graph_data)

# In the BiG-FAM network the category is simply the node's 'node_type'
for n, data in bigfam_network_graph.nodes(data=True):
    data['node_trace'] = data['node_type']

# Use string ids for every node of the BiG-FAM network
bigfam_network_graph = nx.relabel_nodes(bigfam_network_graph, lambda x: str(x))

And then continue by filtering the networks based on our cutoff definition.

In this first part, we will again split the BiG-SCAPE - antiSMASH KnownClusterBlast network into its parts:

In [None]:
# Sort the edges into two buckets in a single pass:
#  - every edge with relation_type 'bigscape_similarity'
#  - 'knownclusterblast' edges whose similarity exceeds the cutoff
edges_with_bigscape_similarity = []
edges_with_knownclusterblast = []
for u, v, d in integrated_bigscape_network_graph.edges(data=True):
    relation = d['relation_type']
    if relation == 'bigscape_similarity':
        edges_with_bigscape_similarity.append((u, v))
    elif relation == 'knownclusterblast' and d['similarity'] > knownclusterblast_similarity_cutoff:
        edges_with_knownclusterblast.append((u, v))

# Build one subgraph per edge bucket
bigscape_network_graph_knownclusterblast = integrated_bigscape_network_graph.edge_subgraph(edges_with_knownclusterblast)
bigscape_network_graph_bigscape_similarity = integrated_bigscape_network_graph.edge_subgraph(edges_with_bigscape_similarity)

# Report the statistics of the full network first, then of each subgraph
for graph, label in (
    (integrated_bigscape_network_graph, "integrated_bigscape_network_graph"),
    (bigscape_network_graph_knownclusterblast, f"bigscape_network_graph_knownclusterblast with similarity > {knownclusterblast_similarity_cutoff}"),
    (bigscape_network_graph_bigscape_similarity, "bigscape_network_graph_bigscape_similarity"),
):
    display(Markdown(get_graph_stats(graph, label)))

Next, we will filter the BiG-FAM hits based on the top taxa distribution in the BiG-FAM database:

In [None]:
# Table of all non-BGC nodes of the BiG-FAM network. The dict is keyed by
# node id, so after the transpose each row is a node and each column an attribute.
bigfam_models_stats = pd.DataFrame.from_dict({n:data for n, data in bigfam_network_graph.nodes(data=True) if data["node_trace"] != "BGC"}).T

deleted_bigfam = []
# Nodes with top_taxa_proportion at or below this value are removed
bigfam_taxa_cutoff = 0.3
bigfam_filter = bigfam_models_stats[bigfam_models_stats.top_taxa_proportion <= bigfam_taxa_cutoff]
for n in bigfam_filter.index:
    try:
        # Removes the node from bigfam_network_graph in place
        bigfam_network_graph.remove_node(n)
        deleted_bigfam.append(n)  
    except nx.NetworkXError as e:
        # Report the problem and continue with the remaining nodes
        print(e)

# Show the ids of the removed nodes as one comma-separated line
deleted_bigfam = ', '.join([str(i) for i in deleted_bigfam])
display(Markdown(f"Deleted BigFam models: {deleted_bigfam}"))

In [None]:
# Keep edges whose rank is below bigfam_rank_filter (only rank 0 when the filter is 1)
edges_with_rank_filtered = [(u, v) for u, v, d in bigfam_network_graph.edges(data=True) if d['rank'] < bigfam_rank_filter]

# Create subgraph
bigfam_network_graph_filtered = bigfam_network_graph.edge_subgraph(edges_with_rank_filtered)
# Iterate over the edges of bigfam_network_graph_filtered
for u, v, data in bigfam_network_graph_filtered.edges(data=True):
    # Label edges that have no relation_type yet as BiG-FAM similarity edges
    if "relation_type" not in data.keys():
        data["relation_type"] = "bigfam_similarity"

# Calculate stats for bigfam_network_graph
display(Markdown(get_graph_stats(bigfam_network_graph, "bigfam_network_graph")))

# Calculate stats for bigfam_network_graph_filtered
display(Markdown(get_graph_stats(bigfam_network_graph_filtered, "bigfam_network_graph_filtered")))

Once we have cleaned up the subgraphs, we recombine them:

In [None]:
# Combine the three filtered subgraphs into one graph G:
# first the two BiG-SCAPE derived subgraphs, then the filtered BiG-FAM network
G = nx.compose(bigscape_network_graph_bigscape_similarity, bigscape_network_graph_knownclusterblast)
G = nx.compose(G, bigfam_network_graph_filtered)
# Report the statistics of the combined graph
display(Markdown(get_graph_stats(G, "integrated_graph")))

Finally, we will add the metadata of the ARTS2 hits for prioritization:

In [None]:
def custom_agg(x):
    """
    Aggregate one column of a group into a single value.

    Parameters:
    x (pandas.Series): The values of one column within one group.

    Returns:
    The sum of the values for numeric (including boolean) columns;
    otherwise a string with the non-missing values joined by ', '.
    """
    # Decide by "is numeric" rather than "is object dtype": string columns are
    # not guaranteed to use the object dtype, and summing strings would
    # concatenate them without a separator.
    if pd.api.types.is_numeric_dtype(x):
        return x.sum()
    # str() keeps the join from failing on non-string entries; missing values are skipped.
    return ', '.join(str(value) for value in x.dropna())

# Keep rows that are assigned to a BGC and have more than one hit, restricted to the columns of interest
df_arts_hits_filtered = df_arts_hits[(~df_arts_hits.bgc_id.isna()) & (df_arts_hits.hits > 1)].loc[:, ["bgc_id", "profile", "hits", "Dup", "HGT", "Res", "BGC"]]
# Replace the check/cross marks inside string cells with '1' / '0'
df_arts_hits_filtered = df_arts_hits_filtered.apply(lambda x: x.map(lambda y: y.replace('✔', '1').replace('✖', '0') if isinstance(y, str) else y))
# Turn string cells that consist only of digits into integers so they can be summed
df_arts_hits_filtered = df_arts_hits_filtered.apply(lambda x: x.map(lambda y: int(y) if isinstance(y, str) and y.isdigit() else y))
# One row per BGC, with each column aggregated by custom_agg
df_arts_hits_filtered = df_arts_hits_filtered.groupby("bgc_id").agg(custom_agg)
# Prefix the columns so they are recognizable as ARTS attributes
df_arts_hits_filtered = df_arts_hits_filtered.rename(columns={c:f"ARTS_{c}" for c in df_arts_hits_filtered.columns})
display(HTML(DT(df_arts_hits_filtered.reset_index(), scrollX=True)))

This table is then loaded as node information for the graph:

In [None]:
# Copy the ARTS columns onto the matching graph nodes.
# G was assembled from edge-based subgraphs, so a BGC listed in the ARTS table
# is not guaranteed to be a node of G; such ids are skipped instead of
# raising a KeyError.
skipped_arts_bgcs = []
for n in df_arts_hits_filtered.index:
    if n not in G:
        skipped_arts_bgcs.append(n)
        continue
    for c in df_arts_hits_filtered.columns:
        G.nodes[n][c] = df_arts_hits_filtered.loc[n, c]

if skipped_arts_bgcs:
    print(f"{len(skipped_arts_bgcs)} BGC(s) with ARTS hits are not part of the graph and were skipped")

Followed by some sanitization:

In [None]:
# Collect the self-loops into a list before removing them: nx.selfloop_edges()
# returns a lazy iterator over G, which should not be consumed while G is
# being modified. An empty list simply removes nothing.
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)

# Make the node attributes writable as GraphML
for n, data in G.nodes(data=True):
    # Iterate over a snapshot so that keys can be deleted inside the loop
    for k, v in list(data.items()):
        if isinstance(v, list):
            # Lists are flattened into a comma-separated string
            data[k] = ", ".join([str(i) for i in v])
        elif v is None:
            # None cannot be stored, so the attribute is dropped
            del data[k]
        elif not isinstance(v, (int, float, str, bool, np.int64)):
            # Anything else is only reported; the value is left untouched
            print(f"Node {n} has attribute {k} of incompatible type {type(v)}")

And now, the graph is ready and can be downloaded for display in network visualization tools:

In [None]:
# The output file name records the three filter settings used to build the graph
outfile = Path(f"assets/integrated_network__bigscape_{bigscape_cutoff}__bigfam_{bigfam_rank_filter}__knownclusterblast_{knownclusterblast_similarity_cutoff}.graphml")
# Make sure the target folder exists before writing
outfile.parent.mkdir(parents=True, exist_ok=True)
nx.write_graphml(G, outfile)
# Markdown link to the file, followed by the attribute list that styles it as a button
display(Markdown(f"[Download Graph]({str(outfile)})"+'{:target="_blank" .md-button}'))

## Network Visualization

While we recommend that users use network visualization tools such as Cytoscape or Gephi, we can also attempt to visualize the network in Python:

In [None]:
# define layout options
options = {
    'prog': 'twopi',
    #'args': ' '.join(['-Gstart=10', '-Goverlap_scaling=-100'])
}

# position nodes
pos = nx.nx_agraph.pygraphviz_layout(G, **options)#, args='-Goverlap=false -Elen=weight')
for n, p in pos.items():
    # store the layout coordinates on the node; the trace helpers read them from 'pos'
    G.nodes[n]['pos'] = p
    # set up text display
    node_trace = G.nodes[n]["node_trace"]
    # nodes that already have a 'text' attribute keep it
    if "text" not in G.nodes[n].keys():
        # the text starts with the node id; BGC nodes additionally list selected attributes
        text = [n]
        for k, v in G.nodes[n].items():
            if node_trace == "BGC":
                if k in ["genome_id", "product", "bigscape_class", "Organism"]:
                    text.append(f"{k} : {v}")
        # lines are joined with HTML line breaks
        text = "<br>".join(text)
        G.nodes[n]['text'] = text

# distinct bigscape_class values of the BGC nodes; every BGC node must have this attribute
bigscape_class_labels = set([data['bigscape_class'] for n, data in G.nodes(data=True) if data["node_trace"] == "BGC"])
# one colorblind-friendly color per class
# NOTE(review): this palette is not used by the code visible in this notebook - confirm whether it is still needed
bigscape_class_colors = sns.color_palette("colorblind", len(bigscape_class_labels)).as_hex()

# define visualization
# edge styles, keyed by the 'relation_type' of the edges
# NOTE(review): the plotting cell passes only 'color' to create_edge_trace, not 'width' - confirm this is intended
edge_annotation_map = {'bigscape_similarity' : {'color':'black',
                                                'width':10
                                               },
                       'knownclusterblast' : {'color':'grey',
                                                'width':0.1
                                               },
                       'bigfam_similarity' : {'color':'grey',
                                                'width':0.1
                                               },
                      }

# node styles, keyed by the 'node_trace' category of the nodes
node_annotation_map = {'MIBIG_BiG-SCAPE' : {'color':'green',
                                  'node_symbol' : 'star'},
                       'MIBIG_knownclusterblast': {'color':'blue',
                                  'node_symbol' : 'star'}, 
                       "BGC" : {'color':'grey',
                                'node_symbol' : 'circle'},
                       "BiG-FAM GCFs" : {'color':'yellow',
                                'node_symbol' : 'triangle-up'},
                      }

In [None]:
# all plotly traces of the figure, in drawing order
traces = []
# placeholders; both names are rebound to plotly traces below
node_trace = []
edge_trace = []

# one line trace per relation type; knownclusterblast edges are drawn dotted
for e in edge_annotation_map.keys():
    dash = "solid"
    if 'knownclusterblast' in e:
        dash = "dot"
    edge_trace = create_edge_trace(G, e, color=edge_annotation_map[e]['color'], dash=dash, showlegend=True)
    traces.append(edge_trace)

# one marker trace per node category
for trace in node_annotation_map.keys():
    nodeopacity = 0.5
    showtextlabel = False
    linecolor = "black"
    linewidth = 0.5
    textposition="top left"
    node_size = 8
    # NOTE(review): `trace` is a node_trace category while bigscape_class_labels holds
    # bigscape_class values, so this only applies when both share a label - confirm intent
    if trace in bigscape_class_labels:
        nodeopacity = 0.8
    node_trace = create_node_trace(G, trace, node_annotation_map[trace]['color'], showtextlabel=showtextlabel, 
                                   nodesymbol=node_annotation_map[trace]['node_symbol'], nodeopacity=nodeopacity, 
                                   showlegend=True, linecolor=linecolor, linewidth=linewidth, nodesize=node_size,
                                   textposition=textposition)
    traces.append(node_trace)

# on a copy of G, re-tag every node that appears in the ARTS table as "ARTS_hits"
G_arts = G.copy()
for n in G_arts.nodes:
    if n in df_arts_hits_filtered.index:
        G_arts.nodes[n]["node_trace"] = "ARTS_hits"

# the ARTS nodes are drawn last as grey circles with a red outline
nodeopacity = 0.5
showtextlabel = False
node_symbol = "circle"
node_color = "grey"
linecolor = "red"
linewidth = 1
textposition="top left"
node_size = 8
node_trace = create_node_trace(G_arts, "ARTS_hits", node_color, showtextlabel=showtextlabel, 
                                   nodesymbol=node_symbol, nodeopacity=nodeopacity, 
                                   showlegend=True, linecolor=linecolor, linewidth=linewidth, nodesize=node_size,
                                   textposition=textposition)

traces.append(node_trace)

# assemble the figure: white plot area, transparent paper, framed axes without grid or tick labels
fig = go.Figure(data=traces,
                layout=go.Layout(
                    paper_bgcolor='rgba(0,0,0,0)',
                    plot_bgcolor='white',
                    showlegend=True,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, linecolor='black', mirror=True, linewidth=1),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, linecolor='black', mirror=True, linewidth=1),
                    width=800, height=600)
                )

# write the figure to an HTML file and embed that file in the notebook output
outfile = Path("assets/figures/integrated_network.html")
outfile.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(outfile)

display(HTML(filename=str(outfile)))

## How to add this notebook to the report

Convert this file:

```bash
# cd to the docs folder containing this notebook
jupyter nbconvert --to markdown \
    --execute "integrated_network.ipynb" \
    --output "integrated_network.md" \
    --template "admonition" \
    --TemplateExporter.extra_template_basedirs="../../../../workflow/notebook/nb_convert"
```

Edit the mkdocs.yaml:
```yaml
markdown_extensions:
  - attr_list
  - admonition
  - pymdownx.details
  - pymdownx.superfences
theme:
  icon:
    admonition:
      note: octicons/tag-16
      code: material/code-tags

```