# Exploratory Data Analysis

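This notebook explores the Hetionet v1.0 knowledge graph: it loads the node, edge and metaedge tables, reports descriptive statistics for node and edge types, and renders the neighborhood of a selected node as an interactive graph.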

## Load data

In [20]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Directory holding the Hetionet v1.0 files, relative to this notebook.
datadir = "../data"

# All three inputs are read as tab-separated text, in the order:
# node table, metaedge (edge type) table, edge table.
node_df, edge_type_df, edge_df = (
    pd.read_csv(f"{datadir}/{filename}", sep="\t")
    for filename in (
        "hetionet-v1.0-nodes.tsv",
        "metaedges.tsv",
        "hetionet-v1.0-edges.sif",
    )
)

## Descriptive statistics

### Nodes

In [19]:
# Summarize the node table: total row count, number of distinct values in the
# "kind" column, then one tab-indented line per distinct kind.
node_kinds = node_df["kind"].unique()
separator = "-" * 40

print(f"Number of nodes: {len(node_df)}")
print(f"Number of node types: {len(node_kinds)}")
print(separator)
for node_type in node_kinds:
    print(f"\t{node_type}")
print(separator)

Number of nodes: 47031
Number of node types: 11
----------------------------------------
	Anatomy
	Biological Process
	Cellular Component
	Compound
	Disease
	Gene
	Molecular Function
	Pathway
	Pharmacologic Class
	Side Effect
	Symptom
----------------------------------------


In [70]:
# TODO: plot a bar chart of the node type distribution here

### Edges

In [31]:
# Derive a grouped edge type from each metaedge name: take the second
# " - "-separated token; metaedges with no such token get "regulates".
metaedge_parts = edge_type_df["metaedge"].str.split(" - ", expand=True)
edge_type_df["edge_type"] = metaedge_parts[1].fillna("regulates")
grouped_edge_types = edge_type_df["edge_type"].unique()

print(f"Number of edges: {len(edge_df)}")
print(f"Number of edge types: {len(edge_type_df)}")
print(f"Number of grouped edge types: {len(grouped_edge_types)}")
print("-" * 40)
for et in grouped_edge_types:
    print(f"\t{et}")

Number of edges: 2250197
Number of edge types: 24
Number of grouped edge types: 16
----------------------------------------
	downregulates
	expresses
	upregulates
	binds
	causes
	palliates
	resembles
	treats
	associates
	localizes
	presents
	covaries
	interacts
	participates
	regulates
	includes


In [91]:
def tsv2networkx(data, node_df, edge_type_df, node_colors=None, edge_colors=None):
    """Build an undirected networkx graph from the node, edge and metaedge tables.

    Every row of ``node_df`` becomes a node with ``label``, ``entity`` and
    ``color`` attributes. Every row of ``data`` whose "metaedge" value appears
    in ``edge_type_df["abbreviation"]`` becomes an edge with ``label``,
    ``color`` and ``dashes`` attributes.

    Args:
        data (pd.DataFrame): edge table; the "source", "metaedge" and
            "target" columns are read.
        node_df (pd.DataFrame): node table; the "id", "name" and "kind"
            columns are read.
        edge_type_df (pd.DataFrame): metaedge table; the "metaedge" and
            "abbreviation" columns are read. The frame is not modified.
        node_colors (dict, optional): maps node kind -> color. When omitted,
            a module-level ``node_colors`` is used if one exists, otherwise a
            built-in palette is assigned to the kinds found in ``node_df``.
        edge_colors (dict, optional): maps grouped edge type -> color, with
            the same fallback rules as ``node_colors``.

    Returns:
        nx.Graph: the assembled graph.

    Raises:
        KeyError: if a color mapping has no entry for a node kind or edge type.
    """
    import networkx as nx

    # Used only when no color mapping is passed in and none exists at module
    # level; colors repeat if there are more keys than palette entries.
    fallback_palette = (
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
        "#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
        "#bcbd22", "#17becf", "#aec7e8", "#ffbb78",
        "#98df8a", "#ff9896", "#c5b0d5", "#c49c94",
    )

    def resolve_colors(explicit, global_name, keys):
        # Precedence: explicit argument > module-level mapping > fallback.
        if explicit is not None:
            return explicit
        inherited = globals().get(global_name)
        if inherited is not None:
            return inherited
        return {key: fallback_palette[i % len(fallback_palette)]
                for i, key in enumerate(keys)}

    # Grouped edge type = second " - "-separated token of the metaedge name;
    # names without that token yield a missing value and become "regulates".
    # Computed as a local Series so the caller's frame is left untouched, and
    # via .str[1] so it also works when no metaedge contains " - ".
    edge_types = (edge_type_df["metaedge"]
                  .str.split(" - ")
                  .str[1]
                  .fillna("regulates"))
    link_dict = dict(zip(edge_type_df["abbreviation"], edge_types))

    node_colors = resolve_colors(node_colors, "node_colors",
                                 node_df["kind"].unique())
    edge_colors = resolve_colors(edge_colors, "edge_colors",
                                 edge_types.unique())

    # NOTE(review): nx.Graph is undirected and keeps a single edge per node
    # pair, so a pair linked by several metaedges keeps only the attributes of
    # the last one added - confirm this is intended.
    g_nx = nx.Graph()

    # nodes (add node ids)
    for node, node_name, node_type in zip(node_df["id"],
                                          node_df["name"],
                                          node_df["kind"]):
        g_nx.add_node(node,
                      label=node_name,
                      entity=node_type,
                      color=node_colors[node_type])

    # edges, added one metaedge abbreviation at a time
    for abrv in edge_type_df["abbreviation"].unique():
        links = data.loc[data["metaedge"] == abrv,
                         ["source", "target"]].itertuples(index=False, name=None)
        link_type = link_dict[abrv]
        g_nx.add_edges_from(links,
                            label=link_type,
                            color=edge_colors[link_type],
                            dashes=False)

    return g_nx

def networkx2pyvis(G, **options):
    """Convert a networkx graph into a pyvis ``Network``.

    Each node's and edge's attribute dict is forwarded to pyvis as keyword
    arguments. A node that has a ``color`` attribute also gets a matching
    font color, unless it already carries its own ``font`` attribute.

    Args:
        G (nx graph): networkx graph to convert.
        **options: keyword arguments forwarded to ``pyvis.network.Network``
            (e.g. ``notebook``, ``height``, ``width``). ``directed`` defaults
            to True and can be overridden here.

    Returns:
        pyvis graph: corresponding pyvis graph
    """
    from pyvis.network import Network

    # Merge instead of passing directed=True alongside **options, so that a
    # caller-supplied "directed" overrides the default rather than raising a
    # duplicate-keyword TypeError.
    # NOTE(review): when G is undirected, arrow orientation presumably follows
    # the order in which G.edges() yields each pair, not the original
    # source/target columns - confirm before reading direction off the plot.
    network_kwargs = {"directed": True, **options}
    H = Network(**network_kwargs)

    # Add nodes
    for node, data in G.nodes(data=True):
        node_attrs = dict(data)
        # Nodes without a color (e.g. created implicitly by add_edges_from)
        # are passed through as-is instead of failing on a missing key.
        if "color" in node_attrs:
            node_attrs.setdefault("font", {"color": node_attrs["color"]})
        H.add_node(node, **node_attrs)

    # Add edges
    for node1, node2, data in G.edges(data=True):
        H.add_edge(node1, node2, **data)

    return H

In [80]:
import networkx as nx

# Scratch graph for checking how node attributes are stored.
# Keyword arguments to add_nodes_from are applied unchanged to every node, so
# label=["a", "b"] would store the whole list on both nodes. Per-node labels
# are passed as (node, attribute-dict) pairs; the color is shared by both.
G_test = nx.Graph()
G_test.add_nodes_from([(1, {"label": "a"}), (2, {"label": "b"})],
                      color="#DFFF00")

In [88]:
# Build the full graph from the edge, node and metaedge tables.
G = tsv2networkx(data=edge_df, node_df=node_df, edge_type_df=edge_type_df)

In [89]:
# Collect the selected node together with every node that shares an edge
# with it, looking at both the source and the target column.
node_name = "Compound::DB01231"
touches_node = edge_df["source"].eq(node_name) | edge_df["target"].eq(node_name)
sel_df = edge_df[touches_node]
selected_nodes = list(set(sel_df["source"]).union(sel_df["target"]))
len(selected_nodes)

40

In [92]:
# Restrict the full graph to the nodes collected in the previous cell.
subgraph = G.subgraph(selected_nodes)

# Keyword arguments that networkx2pyvis forwards to the pyvis Network.
options = dict(
    notebook=True,
    height="750px",
    width="100%",
    cdn_resources="in_line",
)

# Display options (node/edge styling and physics), passed verbatim to
# H.set_options below.
pyvis_opts = """
var options = {
  "nodes": {
    "font": {
      "face": "verdana"
    },
    "size": 15
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "font": {
      "size": 10,
      "face": "verdana"
    },
    "smooth": false,
    "width": 2
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2000,
      "centralGravity": 0,
      "springLength": 200
    },
    "minVelocity": 0.25
  }
}
"""

H = networkx2pyvis(subgraph, **options)
H.set_options(pyvis_opts)
H.show("test.html")

test.html


In [8]:
# Preview the first five rows of the edge table.
edge_df.head(n=5)

Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780
2,Gene::19,GpBP,Biological Process::GO:0055088
3,Gene::3176,GpBP,Biological Process::GO:0010243
4,Gene::3039,GpBP,Biological Process::GO:0006898
