# Exploratory Data Analysis

## Load data

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt

datadir = "../data"

# All three tables are tab-delimited text files; read_table uses "\t" as its
# default separator, so no explicit `sep` is needed.
node_df = pd.read_table(f"{datadir}/hetionet-v1.0-nodes.tsv")       # one row per node
edge_type_df = pd.read_table(f"{datadir}/metaedges.tsv")            # one row per metaedge
edge_df = pd.read_table(f"{datadir}/hetionet-v1.0-edges.sif")       # one row per edge

## Descriptive statistics

### Nodes

In [3]:
# Summarise the node table: total row count, then each distinct value of the
# "kind" column framed by two separator lines.
node_kinds = node_df["kind"].unique()
separator = "-" * 40

print("Number of nodes: {}".format(len(node_df)))
print(f"Number of node types: {len(node_kinds)}")
print(separator)
for node_type in node_kinds:
    print(f"\t{node_type}")
print(separator)

Number of nodes: 47031
Number of node types: 11
----------------------------------------
	Anatomy
	Biological Process
	Cellular Component
	Compound
	Disease
	Gene
	Molecular Function
	Pathway
	Pharmacologic Class
	Side Effect
	Symptom
----------------------------------------


In [15]:
%matplotlib inline
df = edge_df.copy()
df["source_kind"] = df["source"].str.split("::", expand=True)[0]
df["target_kind"] = df["target"].str.split("::", expand=True)[0]
source_types = df["source_kind"]
target_types = df["target_kind"]

### Edges

In [3]:
# Derive a grouped edge type from each metaedge: split on " - " and keep the
# second piece. Rows where the split yields no second piece fall back to
# "regulates".
metaedge_parts = edge_type_df["metaedge"].str.split(" - ", expand=True)
edge_type_df["edge_type"] = metaedge_parts[1].fillna("regulates")
grouped_edge_types = edge_type_df["edge_type"].unique()

print(f"Number of edges: {len(edge_df)}")
print(f"Number of edge types: {len(edge_type_df)}")
print(f"Number of grouped edge types: {len(grouped_edge_types)}")
print("-" * 40)
for et in grouped_edge_types:
    print(f"\t{et}")

Number of edges: 2250197
Number of edge types: 24
Number of grouped edge types: 16
----------------------------------------
	downregulates
	expresses
	upregulates
	binds
	causes
	palliates
	resembles
	treats
	associates
	localizes
	presents
	covaries
	interacts
	participates
	regulates
	includes


## Connected components in Hetionet

In [10]:
from grapharm._utils import tsv2networkx
from grapharm.viz import pyvis_opts, networkx2pyvis
import networkx as nx

# Assemble the graph from the edge, node and edge-type tables.
print("Build graph")
G = tsv2networkx(edge_df, node_df, edge_type_df)
print("Done")
print("=" * 20)

print("Find connected component:")
print("  Warning: this step may take around 30s")
# Materialise every connected component as an independent copy of its subgraph.
connected_components = [
    G.subgraph(component).copy() for component in nx.connected_components(G)
]
print(f"* Number of subgraphs: {len(connected_components)}")

# Map each component's position in `connected_components` to its node count.
subgraph_num_nodes = {
    index: component.number_of_nodes()
    for index, component in enumerate(connected_components)
}

# Largest component: the index with the maximal node count (first one on ties).
id = max(subgraph_num_nodes, key=subgraph_num_nodes.get)
subgraph = connected_components[id]
nodes = list(subgraph.nodes)
# The text before "::" in a node name is counted as its entity type.
entity_types = {n.split("::")[0] for n in nodes}
print(
    f"* The largest subgraph has {subgraph.number_of_nodes()} nodes "
    f"({len(entity_types)} types) and {subgraph.number_of_edges()} edges."
)

# Second largest component: rank indices by (node count, index) ascending and
# take the penultimate entry.
id = sorted(
    subgraph_num_nodes,
    key=lambda index: (subgraph_num_nodes[index], index),
)[-2]
subgraph = connected_components[id]
print(
    f"* The second largest subgraph has {subgraph.number_of_nodes()} nodes "
    f"and {subgraph.number_of_edges()} edges."
)

# Count the components that contain fewer than 10 nodes.
small_component_count = sum(
    1 for count in subgraph_num_nodes.values() if count < 10
)
print(f"Number of subgraphs with less than 10 nodes: {small_component_count}")

Build graph
Done
Find connected component:
* Number of subgraphs: 1874
* The largest subgraph has 45158 nodes (11 types) and 2107709 edges.
* The second largest subgraph has 1 nodes and 0 edges.
Number of subgraphs with less than 10 nodes: 1873
