In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))

In [3]:
import wiki

path_base = '/Users/harangju/Developer/data/wiki/dumps/'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
path_xml = path_base + name_xml
path_index = path_base + name_index
dump = wiki.Dump(path_xml, path_index)

In [4]:
topic = 'evolutionary biology'
%time dump.load_page('Index of {} articles'.format(topic))
links = dump.links
dump.links[:5]

Dump: Loading index...
Dump: Loaded.
CPU times: user 1min 19s, sys: 2.17 s, total: 1min 21s
Wall time: 1min 21s


['File:charles Darwin by Julia Margaret Cameron 2.jpg',
 'Charles Darwin',
 'Evolutionary biology',
 'The Origin of Species',
 'Abiogenesis']

In [5]:
net = wiki.Net()
net.build_graph(name=topic, dump=dump, nodes=[str(l) for l in links],
                compute_core_periphery=False, compute_communities=False)

wiki.Net: traversing Wikipedia...
wiki.Net: depth = 0
wiki.Net: len(queue) = 160
wiki.Net: depth = 1
wiki.Net: removing isolates...
wiki.Net: adding years...
wiki.Net: filling empty years...


In [6]:
net.graph.nodes['Darwinism']

{'year': 1800}

In [7]:
[len(links), len(net.graph.nodes)]
# [291, 265] with redirects
# [291, 234] without

[291, 265]

In [8]:
[len(links), len(net.graph.nodes)]

[291, 265]

#### Test network generation

In [9]:
import pickle
import gensim.utils as gu

path_save = '/Users/harangju/Developer/data/wiki/models/'
tfidf = gu.SaveLoad.load(path_save + 'tfidf.model')
dct = pickle.load(open(path_save + 'dict.model','rb'))

In [10]:
net = wiki.Net()
net.build_graph(name=topic, dump=dump, nodes=[str(l) for l in links],
                model=tfidf, dct=dct)

wiki.Net: traversing Wikipedia...
wiki.Net: depth = 0
wiki.Net: len(queue) = 160
wiki.Net: depth = 1
wiki.Net: removing isolates...
wiki.Net: adding years...
wiki.Net: filling empty years...
wiki.Net: calculating weights...
wiki.Net: computing core-periphery...
wiki.Net: computing communities...


In [11]:
[len(links), len(net.graph.nodes)]
# [291, 265] with redirects
# [291, 234] without

[291, 265]

In [12]:
net.save_graph(topic)

In [13]:
new_net = wiki.Net()
new_net.load_graph(topic)
len(new_net.graph.nodes)

265

In [14]:
new_net.graph.graph

{'name': 'evolutionary biology',
 'tfidf': <2080851x265 sparse matrix of type '<class 'numpy.float64'>'
 	with 253947 stored elements in Compressed Sparse Column format>,
 'coreness': 0.7845415242880858,
 'modularity': 0.26006018498783096}

In [15]:
[n for n in net.graph.nodes][:5]

['The Voyage of the Beagle',
 'Charles Darwin',
 'Charles Lyell',
 'Thomas Henry Huxley',
 'Evolution']

In [16]:
[y for y in net.years][:5]

[-9000, -3500, -1900, -600, -500]

In [17]:
net._numbered = None
[n for n in net.numbered.nodes][:5]

[0, 1, 36, 23, 85]

#### Test weighting with tf-idf model

In [18]:
net.graph['Evolution']

AtlasView({'Charles Darwin': {'weight': 0.4313447568679566}, 'Evolutionary biology': {'weight': 0.4400523916182571}, 'The Origin of Species': {'weight': 0.5765181173462256}, 'Adaptation': {'weight': 0.6818558586652332}, 'Anti-predator adaptation': {'weight': 0.3528309736849022}, 'Archaeopteryx': {'weight': 0.27985701037935673}, 'Atavism': {'weight': 0.38007029772108963}, 'Gene-centered view of evolution': {'weight': 0.564823956705098}, 'Cephalization': {'weight': 0.29803593027631003}, 'Chronospecies': {'weight': 0.3264603555788205}, 'Co-evolution': {'weight': 0.435903518311745}, 'Co-operation (evolution)': {'weight': 0.5113070391815111}, 'Common descent': {'weight': 0.5480997893873277}, 'Convergent evolution': {'weight': 0.49094971306719426}, 'Creation-evolution controversy': {'weight': 0.5432976990825654}, 'Darwin (unit)': {'weight': 0.4049276501924059}, 'Darwinism': {'weight': 0.4040596533555473}, 'Directed mutagenesis': {'weight': 0.31889300576344215}, 'Directed evolution': {'weight

In [19]:
net.graph['Transposon']

AtlasView({'List of biology topics': {'weight': 0.1966936076996275}})

#### Test coreness

In [20]:
net.graph.graph['coreness']

0.7845415242880858

In [21]:
import networkx as nx
import matplotlib.pyplot as plt

matrix = nx.convert_matrix.to_numpy_array(net.graph,
                                          nodelist=[n for n in net.graph.nodes
                                                    if net.graph.nodes[n]['core']] +
                                                   [n for n in net.graph.nodes
                                                    if not net.graph.nodes[n]['core']])
plt.imshow(matrix)

<matplotlib.image.AxesImage at 0x7fdd69238d90>

In [22]:
import networkx as nx
pos = nx.drawing.nx_agraph.graphviz_layout(net.graph, prog='neato')
nx.drawing.nx_pylab.draw_networkx(net.graph,
                                  pos,
                                  font_size=0,
                                  alpha=0.9,
                                  node_size=5,
                                  node_color=[net.graph.nodes[n]['core']
                                              for n in net.graph.nodes])

ImportError: ('requires pygraphviz ', 'http://pygraphviz.github.io/')

#### Test community structure

In [None]:
net.graph.graph['modularity']

In [None]:
import matplotlib.pyplot as plt

num_communities = max([net.graph.nodes[n]['community'] for n in net.graph.nodes])
nodelist = [[n for n in net.graph.nodes if net.graph.nodes[n]['community']==i]
            for i in range(num_communities)]
nodelist = [n for ns in nodelist for n in ns]
matrix = nx.convert_matrix.to_numpy_array(net.graph,
                                          nodelist=nodelist)
plt.imshow(matrix)

#### Test clique to barcodes

In [None]:
net._cliques = None
%time net.cliques[:4]
%time net.cliques[:4]

In [None]:
net.nodes_for_year[-4000]

In [None]:
%time net.filtration

In [None]:
%time net.persistence

In [None]:
import pandas as pd
pd.options.display.max_rows = 12

In [None]:
net._barcodes = None
%time net.barcodes
%time net.barcodes

In [None]:
net.barcodes[net.barcodes.lifetime!=0]