In [53]:
import requests
import bs4
import re
from time import sleep
import random
import networkx as nx
from urllib.parse import unquote, urlparse
import csv

In [54]:
def getText(url):
    headers = {
        'User-Agent': 'IR Project 1 - Web Crawler -'
        }
    
    output = ""
    response = requests.get(url, headers=headers)
    parsed = bs4.BeautifulSoup(response.text)
    for p in parsed.select('p'):
        output += p.getText()
        

    return output

In [55]:

with open("output.csv", 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['url', 'name', 'text']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
   

In [56]:
def append_to_csv(parsed, url):
    with open("output.csv", 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'name', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        name = parsed.select_one('h1#firstHeading').getText()
        text = ""
        for p in parsed.select('p'):
            text += p.getText()
        writer.writerow({'url': url, 'name': name, 'text': text})

In [57]:
def bfs_graph(start_link, max_nodes=3000, per_page=3):
    """
    BFS crawling from start_link, build and plot a NetworkX graph.
    - max_nodes: limit total unique pages (nodes)
    - per_page: max outgoing links to follow per page
    Returns the built graph.
    """
    G = nx.DiGraph()
    queue = [start_link]
    visited = set()

    def short_name(url):
        path = urlparse(url).path
        if path.startswith('/wiki/'):
            name = unquote(path.split('/wiki/')[1])
            return name.replace('_', ' ')
        return url

    while queue and len(visited) < max_nodes:
        current = queue.pop(0)
        if current in visited:
            continue
        try:
            response = requests.get(current, headers=headers, timeout=10)
        except Exception:
            continue
        if response.status_code != 200:
            continue

        parsed = bs4.BeautifulSoup(response.text, 'html.parser')
        visited.add(current)
        print(f"[{len(visited)}/{max_nodes}] {current}")
        append_to_csv(parsed, current)
        # avoid non-article wiki namespaces
        links = parsed.find_all('a', attrs={'href': re.compile(r'^/wiki/(?!Help:|File:|Special:|Talk:|Category:|Portal:|Template:)')})
        def is_good_href(h):
            title = unquote(h.split('/wiki/')[1]) if '/wiki/' in h else h
            bad_keywords = ('upload', 'wizard', 'file', 'help', 'special', 'talk', 'category', 'portal', 'template')
            return not any(k.lower() in title.lower() for k in bad_keywords)
        links = [a for a in links if a.get('href') and is_good_href(a.get('href'))]
        random.shuffle(links)

        for next_link in links[:per_page]:
            href = next_link.get('href')
            if not href:
                continue
            full_link = "https://en.wikipedia.org" + href
            G.add_edge(short_name(current), short_name(full_link))

            if full_link not in visited and full_link not in queue and len(visited) + len(queue) < max_nodes:
                queue.append(full_link)

        sleep(random.random() * 1.5)  

    pos = nx.spring_layout(G, k=0.6, seed=42)
    nx.draw(G, pos, with_labels=True, node_size=300, font_size=8, arrows=True)

    return G


In [58]:
bfs_graph('https://en.wikipedia.org/wiki/Baldur%27s_Gate_3')

[1/3000] https://en.wikipedia.org/wiki/Baldur%27s_Gate_3
[2/3000] https://en.wikipedia.org/wiki/Idle_animation
[3/3000] https://en.wikipedia.org/wiki/Larian_Studios
[4/3000] https://en.wikipedia.org/wiki/Baldur%27s_Gate:_Descent_into_Avernus
[5/3000] https://en.wikipedia.org/wiki/Main_Page
[6/3000] https://en.wikipedia.org/wiki/ISBN_(identifier)
[7/3000] https://en.wikipedia.org/wiki/Divinity_(series)
[8/3000] https://en.wikipedia.org/wiki/Kickstarter
[9/3000] https://en.wikipedia.org/wiki/Forgotten_Realms
[10/3000] https://en.wikipedia.org/wiki/Dungeon_Master%27s_Guide
[11/3000] https://en.wikipedia.org/wiki/Giant_(Dungeons_%26_Dragons)
[12/3000] https://en.wikipedia.org/wiki/Christ_Child
[13/3000] https://en.wikipedia.org/wiki/Wikipedia:About
[14/3000] https://en.wikipedia.org/wiki/English_language
[15/3000] https://en.wikipedia.org/wiki/ISO_2709
[16/3000] https://en.wikipedia.org/wiki/Graph_Query_Language
[17/3000] https://en.wikipedia.org/wiki/Divinity_II
[18/3000] https://en.wikip

KeyboardInterrupt: 