# Wikipedia Network
Scrape all wikipedia links on a seed Wikipedia page to build a directed network where each node is a wiki page and each edge is a link

In [35]:
# original implementation using list and pop(0) 7m 43s for d=2
import requests
from bs4 import BeautifulSoup
import networkx as nx

url_cache = {str: {str}}

def fetch_page_wiki_links(url: str) -> set[str]:
    if url in url_cache:
        return url_cache[url]
    try:
        response = requests.get(url, timeout=10)
    except:
        return set()

    if response.ok:
        links = set()
        soup = BeautifulSoup(response.content, 'html.parser')
        a_tags = soup.find_all('a', href=True)
        for tag in a_tags:
            href = tag['href']
            if href.startswith('/wiki/') and ':' not in href:
                links.add(f"https://en.wikipedia.org{href}")
        # cache set of links
        url_cache[url] = links
        return links
        
def build_wikipedia_network(seed_url, max_depth=1):
    G = nx.DiGraph()
    queue = [(seed_url, 0)]

    while queue:
        current_url, depth = queue.pop(0)

        if depth > max_depth:
            break
        else:
            links = fetch_page_wiki_links(current_url)
            for link in links:
                G.add_edge(current_url, link)
                queue.append((link, depth + 1))

    return G

In [36]:
# recursive implementation 6m 9s for d=2
import requests
from bs4 import BeautifulSoup
import networkx as nx
from collections import deque 

url_cache = {str : {str}}

def fetch_page_wiki_links(url: str) -> set[str]:
    if url in url_cache:
        return url_cache[url]
    try:
        response = requests.get(url, timeout=10)
    except:
        return set()

    if response.ok:
        links = set()
        soup = BeautifulSoup(response.content, 'html.parser')
        a_tags = soup.find_all('a', href=True)
        for tag in a_tags:
            href = tag['href']
            if href.startswith('/wiki/') and ':' not in href:
                links.add(f"https://en.wikipedia.org{href}")
        # cache set of links
        url_cache[url] = links
        return links
        
def build_cache(seed_url, remaining_depth):
  links = fetch_page_wiki_links(seed_url)
  if remaining_depth <= 0:
    result = []
    for l in links:
      result.append((seed_url, l))
    return result
  
  result = []
  for l in links:
    result.extend(build_cache(l, remaining_depth-1))

  return result

def build_wikipedia_network(seed_url, max_depth=1):
  G = nx.DiGraph()
  cache = build_cache(seed_url, max_depth)
  edge_list = [(u,v) for u, v in cache]
  G.add_edges_from(edge_list)
  return G

In [2]:
# original implementation using deque 5m 40s for d = 2
import requests
from bs4 import BeautifulSoup
import networkx as nx
from collections import deque
url_cache = {str: {str}}

def fetch_page_wiki_links(url: str) -> set[str]:
    if url in url_cache:
        return url_cache[url]
    try:
        response = requests.get(url, timeout=10)
    except:
        return set()

    if response.ok:
        links = set()
        soup = BeautifulSoup(response.content, 'html.parser')
        a_tags = soup.find_all('a', href=True)
        for tag in a_tags:
            href = tag['href']
            if href.startswith('/wiki/') and ':' not in href:
                links.add(f"https://en.wikipedia.org{href}")
        # cache set of links
        url_cache[url] = links
        return links
        
def build_wikipedia_network(seed_url, max_depth=1):
    G = nx.DiGraph()
    queue = deque()
    queue.append((seed_url, 0))

    while queue:
        current_url, depth = queue.popleft()

        if depth > max_depth:
            break
        else:
            links = fetch_page_wiki_links(current_url)
            for link in links:
                G.add_edge(current_url, link)
                queue.append((link, depth + 1))

    return G

seed_url = 'https://en.wikipedia.org/wiki/Example'
wiki_network = build_wikipedia_network(seed_url, max_depth=1)

# Print the number of nodes and edges
print("Number of nodes:", wiki_network.number_of_nodes())
print("Number of directed edges:", wiki_network.number_of_edges())

Number of nodes: 1166
Number of directed edges: 1217


In [2]:
# async implementation using deque
from bs4 import BeautifulSoup
import networkx as nx
from collections import deque
import asyncio
import aiohttp

url_cache: dict[str: set[str]] = {}

async def afetch_page_wiki_links(url: str) -> tuple[str, set[str]]:
    if url in url_cache:
        return (url, url_cache[url])
    links = set()
    sem = asyncio.Semaphore(10)
    async with sem:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=10) as response:
                    if response.ok:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')
                        a_tags = soup.find_all('a', href=True)
                        for tag in a_tags:
                            href = tag['href']
                            if href.startswith('/wiki/') and ':' not in href:
                                links.add(f"https://en.wikipedia.org{href}")
        except Exception as e:
            print(f'Failed to fetch {url}: {e}')
            return (url, set())
    # cache set of links
    url_cache[url] = links
    return (url, links)
        
async def abuild_wikipedia_network(seed_url: str, max_depth: int = 1):
    G = nx.DiGraph()
    queue = deque([(seed_url, 0)])

    while queue:
        curr_depth = queue[0][1]
        if curr_depth > max_depth:
            print("Quitting: ", queue[0])
            return G
        
        tasks = {url : afetch_page_wiki_links(url) for url, depth in queue}
        queue.clear()
        
        
        link_map = dict(await asyncio.gather(*tasks.values())) # returns list containing set of links on page
        # print(link_map)
        
        for root_link, link_set in link_map.items():
            print(f"{curr_depth} Key: {root_link}, Value: {link_set}")
            for link in link_set:
                G.add_edge(root_link, link)
                queue.append((link, curr_depth + 1))

seed_url = 'https://en.wikipedia.org/wiki/Example'
wiki_network = await abuild_wikipedia_network(seed_url, 2)
# Print the number of nodes and edges
print("Number of nodes:", wiki_network.number_of_nodes())
print("Number of directed edges:", wiki_network.number_of_edges())

0 Key: https://en.wikipedia.org/wiki/Example, Value: {'https://en.wikipedia.org/wiki/The_Example_(comics)', 'https://en.wikipedia.org/wiki/Example_(album)', 'https://en.wikipedia.org/wiki/The_Example', 'https://en.wikipedia.org/wiki/.example', 'https://en.wikipedia.org/wiki/Main_Page', 'https://en.wikipedia.org/wiki/Exemplar_(disambiguation)', 'https://en.wikipedia.org/wiki/Example.com', 'https://en.wikipedia.org/wiki/Exemplum', 'https://en.wikipedia.org/wiki/HMS_Example_(P165)', 'https://en.wikipedia.org/wiki/Eixample', 'https://en.wikipedia.org/wiki/Exempli_gratia', 'https://en.wikipedia.org/wiki/Example', 'https://en.wikipedia.org/wiki/Example_(musician)'}
1 Key: https://en.wikipedia.org/wiki/The_Example_(comics), Value: {'https://en.wikipedia.org/wiki/The_Example_(comics)', 'https://en.wikipedia.org/wiki/Colin_Wilson_(comics)', 'https://en.wikipedia.org/wiki/Main_Page', 'https://en.wikipedia.org/wiki/The_Australian', 'https://en.wikipedia.org/wiki/Edinburgh_Fringe_Festival', 'https

  tasks = {url : afetch_page_wiki_links(url) for url, depth in queue}


ValueError: too many file descriptors in select()

: 

In [None]:
seed_url = 'https://en.wikipedia.org/wiki/Example'
wiki_network = await abuild_wikipedia_network(seed_url, 1)

# Print the number of nodes and edges
print("Number of nodes:", wiki_network.number_of_nodes())
print("Number of directed edges:", wiki_network.number_of_edges())

In [None]:
seed_url = 'https://en.wikipedia.org/wiki/Example'
wiki_network = build_wikipedia_network(seed_url, max_depth=1)

# Print the number of nodes and edges
print("Number of nodes:", wiki_network.number_of_nodes())
print("Number of directed edges:", wiki_network.number_of_edges())

In [None]:
start = 'https://en.wikipedia.org/wiki/Example'
target = 'https://en.wikipedia.org/wiki/David_Sherry_(philosopher)'
print(nx.shortest_path(wiki_network, start, target))
list(wiki_network.nodes)[-5:]

In [None]:
import matplotlib.pyplot as plt
start = 'https://en.wikipedia.org/wiki/Example'
target = 'https://en.wikipedia.org/wiki/Melbourne'
nx.shortest_path(wiki_network, start, target)
pos = nx.planar_layout(wiki_network)
nx.draw_networkx_nodes(wiki_network, pos)
nx.draw_networkx_edges(wiki_network, pos)
labels = {node: str(node).split("/")[-1] for node in wiki_network.nodes()}
#nx.draw_networkx_labels(wiki_network, pos, labels, font_size=6)
plt.axis("off")
plt.show()

In [8]:
nx.set_node_attributes(wiki_network, {l:l.split("/")[-1] for l in wiki_network}, 'label')