### WikiEngine

In [1]:
# from wiki.dump import WikiDump
import os

import wiki

# Root directory that holds the Wikipedia dump files.
# Set the WIKI_DATA_DIR environment variable to point somewhere else; when it
# is unset, the original location is used unchanged.
# os.path.join(..., '') guarantees a trailing separator, because paths are
# built from path_base by plain string concatenation (here and in the
# graph-saving cell, which uses path_base + 'graphs/').
path_base = os.path.join(
    os.environ.get('WIKI_DATA_DIR', '/Users/harangju/Developer/data/wiki/'), '')

# File names of the multistream article archive and its companion index.
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
path_xml = path_base + name_xml
path_index = path_base + name_index

# wiki.Dump receives the archive path first and the index path second.
dump = wiki.Dump(path_xml, path_index)

### Exploring the wiki dump

In [2]:
# Time the first page load. The recorded output shows the dump printing
# "Dump: Loading index..." during this call, which took over a minute.
%time dump.load_page('Portal:Physics/Topics')
# Preview the first five entries of dump.links after the load.
dump.links[:5]

Dump: Loading index...
Dump: Loaded.
CPU times: user 1min 16s, sys: 2.56 s, total: 1min 19s
Wall time: 1min 19s


['Classical physics', 'Mechanics', 'Optics', 'Electricity', 'Magnetism']

In [3]:
# Load a second page and preview the first three entries of dump.links.
# NOTE(review): dump.links appears to be refreshed by every load_page call
# (the preview differs from the previous cell) — confirm in wiki.Dump.
dump.load_page('Danielle Bassett')
dump.links[:3]

['University of pennsylvania',
 'Pennsylvania state university',
 'University of cambridge']

In [4]:
# Load the 'Matter' page with filter_top=True and keep the returned object.
# NOTE(review): filter_top presumably restricts the page to its lead
# section — confirm in wiki.Dump.load_page.
matter_page = dump.load_page('Matter', filter_top=True)
# Display the first 200 characters of the strip_code() result.
matter_page.strip_code()[:200]

'In classical physics and general chemistry, matter is any substance that has mass and takes up space by having volume. All everyday objects that can be touched are ultimately composed of atoms, which '

### Get indices of articles by topic (including physics)

* [all indices on Wikipedia](https://en.wikipedia.org/wiki/Portal:Contents/Indices)
* Topics not searched:
  * international trade ("topics"), theory of constraints (small)
  * too big: mathematics, neuroscience

In [16]:
import string

# Article titles per topic, keyed by topic name; starts empty and is filled
# in by the index-loading cells.
links = dict()

# Topic names grouped by subject area. Each name is later substituted into
# the page title 'Index of %s articles'.
topic_groups = [
    # natural & physical sciences
    ['anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
     'genetics', 'immunology', 'molecular biology'],
    ['chemistry', 'biophysics', 'energy', 'optics',
     'earth science', 'geology', 'meteorology'],
    # philosophy
    ['philosophy of language', 'philosophy of law',
     'philosophy of mind', 'philosophy of science'],
    # social sciences
    ['economics', 'accounting', 'education', 'linguistics',
     'law', 'psychology', 'sociology'],
    # technology & applied sciences
    ['electronics', 'software engineering', 'robotics'],
]

# Flatten the groups into one list, preserving the order written above.
topics = [name for group in topic_groups for name in group]

In [17]:
# For every topic, load its 'Index of <topic> articles' page and record the
# linked article titles under that topic.
for topic in topics:
    dump.load_page('Index of %s articles' % topic)
    # Read dump.article_links right after the load and keep plain-str copies.
    titles = list(map(str, dump.article_links))
    links[topic] = titles
    print('Topic "' + topic + '" has ' + str(len(titles)) + ' articles.')

Topic "anatomy" has 2331 articles.
Topic "biochemistry" has 1216 articles.
Topic "cognitive science" has 127 articles.
Topic "evolutionary biology" has 287 articles.
Topic "genetics" has 1441 articles.
Topic "immunology" has 572 articles.
Topic "molecular biology" has 507 articles.
Topic "chemistry" has 1088 articles.
Topic "biophysics" has 773 articles.
Topic "energy" has 158 articles.
Topic "optics" has 386 articles.
Topic "earth science" has 135 articles.
Topic "geology" has 116 articles.
Topic "meteorology" has 761 articles.
Topic "philosophy of language" has 275 articles.
Topic "philosophy of law" has 208 articles.
Topic "philosophy of mind" has 109 articles.
Topic "philosophy of science" has 448 articles.
Topic "economics" has 562 articles.
Topic "accounting" has 154 articles.
Topic "education" has 872 articles.
Topic "linguistics" has 420 articles.
Topic "law" has 3657 articles.
Topic "psychology" has 1801 articles.
Topic "sociology" has 772 articles.
Topic "electronics" has 127

In [18]:
# The physics index is spread over several pages, one per suffix:
# '!$@', '0–9', and each uppercase ASCII letter.
links['physics'] = []
physics_titles = links['physics']  # same list object, shorter handle

suffixes = ['!$@', '0–9']
suffixes.extend(string.ascii_uppercase)

for letter in suffixes:
    dump.load_page('Index of physics articles (%s)' % letter)
    # Append plain-str copies of this page's article links to the running list.
    physics_titles += [str(link) for link in dump.article_links]

print('Topic "' + 'physics' + '" has ' + str(len(physics_titles)) + ' articles.')

Topic "physics" has 15215 articles.


### Build graphs of topics

In [19]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
# The recorded output shows the extension was already loaded when this ran.
%load_ext autoreload
# Mode 2: reload all modules before executing code.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import networkx as nx

# Crawl each topic's article list into a directed graph and save it as GEXF.
graphs = {}       # topic -> nx.DiGraph passed to the crawler
page_noload = {}  # topic -> value returned by wiki.Crawler.bfs
                  # (presumably the pages that could not be loaded — confirm)
depth = 2         # passed to the crawler as depth_goal and embedded in the file name

# Create the output folder before crawling, so a missing directory cannot
# make the save fail only after a long crawl has already finished.
path_graphs = path_base + 'graphs/'
os.makedirs(path_graphs, exist_ok=True)

for topic in links:
    print('Graph topic: ' + topic)
    graphs[topic] = nx.DiGraph()
    # The topic's article list is passed twice: as the third positional
    # argument and as `nodes`.
    page_noload[topic] = wiki.Crawler.bfs(graphs[topic], dump, links[topic],
                                          depth_goal=depth, nodes=links[topic])
    # Saved as <path_base>graphs/<topic>_d<depth>.gexf
    path_save = path_graphs + topic + '_d' + str(depth) + '.gexf'
    nx.write_gexf(graphs[topic], path_save)

Graph topic: anatomy
Depth: 0
Crawler: len(queue) = 7760
Depth: 1
Crawler: len(queue) = 788
Depth: 2
Graph topic: biochemistry
Depth: 0
Crawler: len(queue) = 4390
Depth: 1
Crawler: len(queue) = 437
Depth: 2
Graph topic: cognitive science
Depth: 0
Crawler: len(queue) = 480
Depth: 1
Crawler: len(queue) = 49
Depth: 2
Graph topic: evolutionary biology
Depth: 0
Crawler: len(queue) = 116
Depth: 1
Crawler: len(queue) = 960
Depth: 2
Graph topic: genetics
Depth: 0
Crawler: len(queue) = 4260
Depth: 1
Crawler: len(queue) = 368
Depth: 2
Graph topic: immunology
Depth: 0
Crawler: len(queue) = 142
Depth: 1
Crawler: len(queue) = 990
Depth: 2
Graph topic: molecular biology
Depth: 0
Crawler: len(queue) = 160
Depth: 1
Crawler: len(queue) = 135
Depth: 2
Graph topic: chemistry
Depth: 0
Crawler: len(queue) = 4120
Depth: 1
Crawler: len(queue) = 435
Depth: 2
Graph topic: biophysics
Depth: 0
Crawler: len(queue) = 119
Depth: 1
Crawler: len(queue) = 610
Depth: 2
Graph topic: energy
Depth: 0
Crawler: len(queue) =