# Wikipedia Network
Scrape all Wikipedia links on a seed Wikipedia page to build a directed network in which each node is a wiki page and each edge is a link from one page to another.

In [2]:
from collections import deque

import networkx as nx
import requests
from bs4 import BeautifulSoup

# Memo of page URL -> set of Wikipedia article URLs found on that page.
# Starts empty; filled in by fetch_page_wiki_links after each successful fetch.
url_cache: dict[str, set[str]] = {}

def fetch_page_wiki_links(url: str, timeout: float = 10.0) -> set[str]:
    """Return the Wikipedia article URLs linked from the page at ``url``.

    Successful results are stored in the module-level ``url_cache`` and
    served from there on later calls, so a page is downloaded at most once.

    Args:
        url: Absolute URL of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would hang the whole crawl.

    Returns:
        A set of ``https://en.wikipedia.org/wiki/...`` URLs, one for every
        anchor whose ``href`` starts with ``/wiki/`` and contains no ``:``.
        An empty set is returned when the response is not OK; that result is
        not cached, so a later call will try the request again.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on connection failure or timeout).
    """
    try:
        return url_cache[url]
    except KeyError:
        pass

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Always hand back a set so callers can iterate unconditionally.
        return set()

    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep only site-relative /wiki/ links; hrefs containing ':' are skipped.
    links = {
        f"https://en.wikipedia.org{tag['href']}"
        for tag in soup.find_all('a', href=True)
        if tag['href'].startswith('/wiki/') and ':' not in tag['href']
    }

    # cache set of links
    url_cache[url] = links
    return links
        
def build_wikipedia_network(seed_url: str, max_depth: int = 1) -> "nx.DiGraph":
    """Build a directed link graph by breadth-first crawling from ``seed_url``.

    Every page whose distance from the seed is at most ``max_depth`` is
    fetched with ``fetch_page_wiki_links``, and an edge ``page -> link`` is
    added for each link found on it. Pages one step beyond ``max_depth``
    therefore appear as nodes but are not themselves fetched.

    Args:
        seed_url: URL of the page the crawl starts from (depth 0).
        max_depth: Largest depth at which pages are still fetched. A negative
            value yields an empty graph.

    Returns:
        The resulting ``networkx.DiGraph``.
    """
    G = nx.DiGraph()
    if max_depth < 0:
        return G

    # Each page is expanded once, at the shallowest depth BFS reaches it.
    visited = {seed_url}
    queue = deque([(seed_url, 0)])

    while queue:
        current_url, depth = queue.popleft()

        # Tolerate a fetcher that yields nothing (or None) for a failed page.
        links = fetch_page_wiki_links(current_url) or ()
        for link in links:
            G.add_edge(current_url, link)
            # Only enqueue pages that will actually be expanded; this avoids
            # filling the queue with entries that would just be discarded.
            if depth < max_depth and link not in visited:
                visited.add(link)
                queue.append((link, depth + 1))

    return G

In [4]:
# Crawl outward from one seed article and keep the resulting graph.
seed_url = 'https://en.wikipedia.org/wiki/Example'
wiki_network = build_wikipedia_network(seed_url, max_depth=1)

# Report the size of the graph: one line per statistic.
for label, count in (
    ("Number of nodes:", wiki_network.number_of_nodes()),
    ("Number of directed edges:", wiki_network.number_of_edges()),
):
    print(label, count)

Number of nodes: 1154
Number of directed edges: 1204


In [5]:
%%timeit
# Time one direct GET of the seed page. This calls requests.get itself rather
# than fetch_page_wiki_links, so url_cache is not consulted and every timed
# run issues a fresh request.
requests.get(seed_url)

107 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
