In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

In [None]:
# Read the tab-separated edge table into a pandas DataFrame and preview
# its first two rows. Later cells use the columns "tail", "head" and "edge_weight".
df = pd.read_csv(
    "./data/2018-05-02-human-ppi-weighted-cap0_75.txt",
    sep="\t",
)
df.head(2)

In [None]:
def graph_stats(graph, extras=False):
    """Print summary statistics for a networkx graph.

    Always prints the vertex count and the edge count. When ``extras`` is
    true, additionally prints the average clustering coefficient (ACC) and
    the average shortest path length (ASPL), both computed with the
    ``"edge_weight"`` edge attribute passed as the weight.

    Parameters
    ----------
    graph : object exposing ``number_of_nodes()`` and ``number_of_edges()``
        (a networkx graph when ``extras`` is requested).
    extras : bool, default False
        Whether to also compute and print ACC and ASPL.

    Returns
    -------
    None
        Results are written to stdout only.

    NOTE(review): networkx documents ``average_shortest_path_length`` as
    raising when the graph is not connected -- confirm the graph is
    connected before calling with ``extras=True``.
    """
    vertex_count = graph.number_of_nodes()
    print("Number of vertices: %d" % vertex_count)
    edge_count = graph.number_of_edges()
    print("Number of edges:    %d" % edge_count)

    if not extras:
        return

    # Each statistic is printed as soon as it is available, so ACC is still
    # reported even if the ASPL computation fails afterwards.
    clustering = nx.average_clustering(graph, weight="edge_weight")
    print("ACC:                %f" % clustering)
    path_length = nx.average_shortest_path_length(graph, weight="edge_weight")
    print("ASPL:               %f" % path_length)

In [None]:
# Build a directed graph straight from the edge table: each row becomes an
# edge from its "tail" vertex to its "head" vertex, carrying "edge_weight"
# as an edge attribute.
# Reference numbers from Jeffrey Law to compare against: 16613 vertices and 562267 edges.
dG = nx.from_pandas_edgelist(
    df,
    source="tail",
    target="head",
    edge_attr="edge_weight",
    create_using=nx.DiGraph,
)
graph_stats(dG)

In [None]:
# Convert the directed graph into an undirected one, keeping an edge only
# when it exists in both directions in dG (one-way edges are discarded).
# Jeffrey reported 277246 edges at this step; any mismatch with the number
# printed below is accepted for now.
uG = dG.to_undirected(
    reciprocal=True,  # drop edges that do not go both ways
)
graph_stats(uG)

In [None]:
# Degree distribution among vertices (generally): for every vertex, count the
# rows of the edge table in which it appears as the "tail".
# Original observation: basically all 16000 vertices have deg < 500.
# Maybe worth thresholding > 250 (approximately)?
#
# groupby(...).size() is the built-in group count; it yields the same
# per-vertex counts as apply(len) without going through the generic apply path.
tail_counts = df.groupby("tail").size()
ax = tail_counts.hist(bins=20, log=True)
# Label the figure so it can be read on its own when skimming the notebook.
ax.set_xlabel('Rows per "tail" vertex (degree)')
ax.set_ylabel("Number of vertices (log scale)")
ax.set_title("Degree distribution of tail vertices")
plt.show()

In [None]:
# Count how often each vertex appears as a "tail", sort from highest to lowest
# degree, and keep only the vertices ABOVE an arbitrary threshold.
# This list of high-degree nodes is then removed from the undirected graph to
# filter out dense data in the plot.
#
# Fix: the filter used to be `source_counts <= 200`, which selected the
# LOW-degree vertices and therefore removed almost the whole graph while
# keeping the hubs -- the opposite of the stated intent.
MAX_TAIL_DEGREE = 200  # arbitrary cut-off; vertices with more tail rows than this are dropped
source_counts = df.groupby("tail").size().sort_values(ascending=False)
source_counts = source_counts[source_counts > MAX_TAIL_DEGREE]
drop_nodes = list(source_counts.index)
uG.remove_nodes_from(drop_nodes)  # mutates uG in place
print("Just dropped %d nodes from the graph!" % len(drop_nodes))

In [None]:
# Draw the undirected graph using a spring layout.
# NOTE: the original author flagged this step with "Nothing is working...".
spring_positions = nx.spring_layout(uG)
nx.draw(uG, spring_positions)