# Wiki knowledge graph

In [1]:
import json
from joblib import Parallel, delayed
from tqdm import tqdm
from urllib import request, parse
from itertools import chain
import networkx as nx

In [2]:
def link_to_title(link):
    """Return the value stored under the ``"title"`` key of a link record.

    Raises ``KeyError`` if the record has no ``"title"`` entry.
    """
    title = link["title"]
    return title

def clean_if_key(page, key):
    """Return the titles of the link records stored under ``page[key]``.

    Args:
        page: Mapping describing one page; ``page[key]``, when present, is an
            iterable of link records understood by ``link_to_title``.
        key: Name of the entry to read (e.g. ``"links"``).

    Returns:
        A list of titles, or an empty list when ``key`` is absent. A list is
        returned in both cases so callers never receive a lazy, one-shot
        ``map`` object in one branch and a list in the other.
    """
    if key not in page:
        return []
    return [link_to_title(link) for link in page[key]]
    
def get_wiki_links(pageTitle):
    """Fetch the pages linking to and from one English-Wikipedia page.

    Sends a single MediaWiki API query requesting the ``links`` and
    ``linkshere`` properties of ``pageTitle``. Each property is capped at 500
    entries by the request and continuation is not followed, so heavily
    linked pages are truncated.

    Args:
        pageTitle: Title of the page to look up; it is percent-encoded before
            being placed in the URL.

    Returns:
        A dict with keys ``"title"`` (``pageTitle`` as given), ``"in-links"``
        (titles of pages that link to this page) and ``"out-links"`` (titles
        of pages this page links to). A property missing from the response
        yields an empty list.
    """
    safe_title = parse.quote(pageTitle)
    url = "https://en.wikipedia.org/w/api.php?action=query&"\
          "prop=links|linkshere&pllimit=500&lhlimit=500&titles={}&"\
          "format=json&formatversion=2".format(safe_title)
    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with request.urlopen(url) as response:
        payload = json.loads(response.read())
    jpage = payload["query"]["pages"][0]
    # In the MediaWiki API, prop=links lists links going OUT of the page and
    # prop=linkshere lists pages linking IN to it.
    outbound = clean_if_key(jpage, "links")
    inbound = clean_if_key(jpage, "linkshere")
    pagelinks = {"title": pageTitle,
                 "in-links": list(inbound),
                 "out-links": list(outbound)}
    return pagelinks

def flatten_network(page):
    """Concatenate a page's ``"in-links"`` and ``"out-links"`` sequences.

    The in-links come first; duplicates are kept.
    """
    incoming = page["in-links"]
    outgoing = page["out-links"]
    return incoming + outgoing

def page_to_edges(page):
    """Turn one page record into a list of directed ``(source, target)`` pairs.

    Every entry of ``page["out-links"]`` yields ``(title, entry)`` and every
    entry of ``page["in-links"]`` yields ``(entry, title)``; the out-link
    edges come first in the returned list.
    """
    edge_list = []
    for target in page["out-links"]:
        edge_list.append((page["title"], target))
    for source in page["in-links"]:
        edge_list.append((source, page["title"]))
    return edge_list

In [3]:
%%time
# Seed the crawl: fetch the link record of the starting page, then collect
# every title that appears in either of its link lists.
seed_title = "Parallel_computing"
root = get_wiki_links(seed_title)
initial_network = flatten_network(root)

Wall time: 1.59 s


In [4]:
# Fetch the link record of every page in the initial network in parallel;
# tqdm wraps the input so progress is reported as tasks are dispatched.
wiki_links_worker = delayed(get_wiki_links)
init_iterable = tqdm(initial_network)
fetch_tasks = (wiki_links_worker(title) for title in init_iterable)
all_pages = Parallel(n_jobs=-2)(fetch_tasks)

100%|████████████████████████████████████████████████| 895/895 [01:06<00:00, 13.50it/s]


In [6]:
# Convert every fetched page record into its list of directed edges, in
# parallel; the result is one edge list per page.
pages_iterable = tqdm(all_pages)
edge_worker = delayed(page_to_edges)
edges = Parallel(n_jobs=-2)(edge_worker(page) for page in pages_iterable)

100%|███████████████████████████████████████████████| 895/895 [00:06<00:00, 132.74it/s]


In [7]:
%%time
# Flatten the per-page edge lists and load them into a directed graph.
# `edges` is intentionally not rebound to the flattening iterator: a consumed
# one-shot iterator left in `edges` would make a re-run of this cell silently
# build an empty graph.
G = nx.DiGraph()
G.add_edges_from(chain.from_iterable(edges))

Wall time: 2.33 s


In [8]:
# Export the graph in GEXF format.
nx.write_gexf(G, "/output/02_wiki.gexf")