In [1]:
import pandas as pd

In [55]:
import numpy as np

In [16]:
import networkx as nx

In [44]:
from scipy import sparse

# Pandas

In [2]:
!head datasets/en-wikipedia.humans.ungraph.names.txt

# Node names
# NodeId NodeName
0	Alexander Seton (d. 1332)
31	Ularbek Baitailaq
108	Albert P. Forsythe
159	Luca Segantini
171	Jacques Moeschal (architect)
172	Jacques Moeschal (footballer)
174	Fred Armstrong (mayor)
185	James Williamson (priest)


In [3]:
df_nodes = pd.read_csv("datasets/en-wikipedia.humans.ungraph.names.txt", comment="#", delimiter="\t", names=["id", "name"])

In [4]:
df_nodes.head()

Unnamed: 0,id,name
0,0,Alexander Seton (d. 1332)
1,31,Ularbek Baitailaq
2,108,Albert P. Forsythe
3,159,Luca Segantini
4,171,Jacques Moeschal (architect)


In [5]:
!head datasets/en-wikipedia.humans.ungraph.txt

# Directed graph: ../datasets/en-wikipedia.humans.ungraph.txt 
# Nodes: 1014428 Edges: 3923957
# FromNodeId	ToNodeId
185	186
185	191
186	185
186	187
187	186
187	188
188	187


In [6]:
df_edges = pd.read_csv("datasets/en-wikipedia.humans.ungraph.txt", comment="#", delimiter="\t", names=["id", "src", "dst"])

In [7]:
df_edges.head()

Unnamed: 0,id,src,dst
0,185,186,
1,185,191,
2,186,185,
3,186,187,
4,187,186,


In [8]:
!head datasets/en-wikipedia.humans.ungraph.features.txt

# Node features
# NodeId FeatureList
0	758752
31	758758 758772
108	50 91 138 479 554 861 1668 3405 6387 11242 12041 19378 20462 39172 55878 66377 143837 187927 203505 367505
159	283447
171	67
172	1 67 1418 2456 9781 19864 229996
174	28508
185	35535


In [9]:
df_features = pd.read_csv("datasets/en-wikipedia.humans.ungraph.features.txt", comment="#", delimiter="\t", names=["id", "features"])

In [10]:
df_features.head()

Unnamed: 0,id,features
0,0,758752
1,31,758758 758772
2,108,50 91 138 479 554 861 1668 3405 6387 11242 120...
3,159,283447
4,171,67


In [11]:
df_features["features"] = df_features.features.str.split(" ")

In [12]:
df_features.head()

Unnamed: 0,id,features
0,0,[758752]
1,31,"[758758, 758772]"
2,108,"[50, 91, 138, 479, 554, 861, 1668, 3405, 6387,..."
3,159,[283447]
4,171,[67]


In [13]:
!head datasets/en-wikipedia.humans.ungraph.features.names.txt

# Node feature names
# FeatureId FeatureName
0	United States
1	Association football
2	England
3	Germany
4	World War II
5	London
6	France
7	United Kingdom


In [14]:
df_features_names = pd.read_csv("datasets/en-wikipedia.humans.ungraph.features.names.txt", comment="#", delimiter="\t", names=["id", "name"])

In [15]:
df_features_names.head()

Unnamed: 0,id,name
0,0,United States
1,1,Association football
2,2,England
3,3,Germany
4,4,World War II


# NetworkX

In [18]:
graph = nx.read_edgelist("datasets/en-wikipedia.humans.ungraph.txt", comments="#", delimiter="\t")

In [19]:
print(nx.info(graph))

Name: 
Type: Graph
Number of nodes: 731293
Number of edges: 3266258
Average degree:   8.9328


In [25]:
to_select = list()
for node in graph:
    if graph.degree(node) > 200:
        to_select.append(node)

In [26]:
len(to_select)

1472

In [27]:
subgraph = nx.subgraph(graph, to_select)

In [28]:
print(nx.info(subgraph))

Name: 
Type: SubGraph
Number of nodes: 1472
Number of edges: 20255
Average degree:  27.5204


In [41]:
for node in subgraph:
    name = df_nodes[df_nodes["id"]==int(node)].name.values[0]
    subgraph.node[node]["name"] = name

In [43]:
nx.write_graphml(subgraph, "test.graphml")

# CSR Sparse Matrix

In [51]:
nodes = list(graph.nodes())

In [52]:
csr_graph = nx.adj_matrix(graph, nodelist=nodes)

In [47]:
csr_graph.shape

(731293, 731293)

In [48]:
csr_graph[0]

<1x731293 sparse matrix of type '<type 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [49]:
csr_graph[0].indices

array([4, 5], dtype=int32)

In [50]:
csr_graph[0].data

array([1, 1])

In [53]:
node_set = {0, 1, 2, 3}

In [57]:
vec = np.zeros(len(nodes))

In [58]:
vec[0] = 1.
vec[1] = 1.
vec[2] = 1.

In [60]:
np.where(csr_graph * vec)

(array([     4,      5, 112125, 379398, 494600, 549793]),)

In [63]:
list(graph.neighbors(nodes[0]))

[u'287142', u'287143']