In [31]:
import requests
import bs4
import re
from time import sleep
import random
import networkx as nx
from urllib.parse import unquote, urlparse
import csv

In [21]:
# Request headers shared by every HTTP call in this notebook (the probe below,
# getText and bfs_graph all pass headers=headers).
headers = {
    'User-Agent': 'IR_lab2_project0',
}

# Probe the start article once. The timeout matches the one used by the crawler
# and keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get('https://en.wikipedia.org/wiki/Baldur%27s_Gate_3', headers=headers, timeout=10)
response.status_code

200

In [22]:
# Name the parser explicitly (the same one bfs_graph uses) so the parse tree does
# not depend on which optional parsers happen to be installed and Beautiful Soup
# does not warn about having to guess one.
parsed = bs4.BeautifulSoup(response.text, 'html.parser')
parsed

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Baldur's Gate 3 - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-fe

In [23]:
def getText(url):
    """
    Download ``url`` and return the text of all of its ``<p>`` elements.

    The request is sent with the notebook-level ``headers`` dict. Paragraph
    texts are concatenated in document order with no separator added between
    them. The HTTP status code is not checked, so an error page is parsed like
    any other response.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str
        Concatenated paragraph text; empty when the page has no ``<p>`` tags.
    """
    # Bound the wait, as bfs_graph does, so a stalled connection cannot hang
    # the kernel.
    response = requests.get(url, headers=headers, timeout=10)
    # Explicit parser: keeps results identical across environments and matches
    # the parser used by bfs_graph.
    parsed = bs4.BeautifulSoup(response.text, 'html.parser')
    return "".join(p.getText() for p in parsed.select('p'))

In [24]:
# Smoke-test getText on the start article; the cell output is the returned
# paragraph text.
getText("https://en.wikipedia.org/wiki/Baldur%27s_Gate_3")

'\n\nBaldur\'s Gate 3 (also known as BG3 and Baldur\'s Gate III) is a 2023 role-playing video game by Larian Studios. It is the third installment in the Baldur\'s Gate series. The game\'s full release for Windows happened in August, with PlayStation 5, macOS, and Xbox Series X/S later in the same year. In the game\'s narrative, the party seeks to cure themselves of a parasitic tadpole infecting their brain. It can be played alone or in a group.    \nAdapted from the fifth edition of tabletop role-playing game Dungeons & Dragons, Baldur\'s Gate 3 takes its mechanics and setting, the Forgotten Realms, from the tabletop game. Players create a highly customisable character and embark on quests with a party of voiced companions. Alternatively, they can play as a companion instead. The gameplay comprises real-time exploration of large areas, turn-based combat, and narrative choices which impact the party and the wider world. Outcomes for combat, dialogue and world interaction are generally d

In [37]:

# (Re)create output.csv so that it holds only the header row. Mode 'w'
# truncates any existing file; rows are added later in append mode by
# append_to_csv.
fieldnames = ['url', 'name', 'text']
with open("output.csv", mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
   

In [33]:
def append_to_csv(parsed, url):
    """
    Append one page as a row of ``output.csv``.

    The row has three columns: ``url`` (the argument as given), ``name`` (text
    of the ``h1#firstHeading`` element) and ``text`` (text of every ``<p>``
    element, concatenated in document order with no separator).

    A page without an ``h1#firstHeading`` element is stored with an empty
    ``name`` instead of raising, so one unusual page cannot abort a crawl.

    Parameters
    ----------
    parsed : object
        Parsed document exposing ``select_one`` and ``select`` (for example the
        BeautifulSoup object built in ``bfs_graph``).
    url : str
        Address the document was fetched from.

    Notes
    -----
    The file is opened in append mode and no header row is written here.
    """
    # Extract everything before touching the file so a parsing problem never
    # leaves a half-written row behind.
    heading = parsed.select_one('h1#firstHeading')
    name = heading.getText() if heading is not None else ""
    text = "".join(p.getText() for p in parsed.select('p'))

    with open("output.csv", 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['url', 'name', 'text'])
        writer.writerow({'url': url, 'name': name, 'text': text})

In [38]:
def bfs_graph(start_link, max_nodes=3000, per_page=3):
    """
    Crawl Wikipedia breadth-first from ``start_link``, then build and plot a
    NetworkX graph of the links that were followed.

    Every page fetched with HTTP 200 is appended to ``output.csv`` through
    ``append_to_csv`` and contributes up to ``per_page`` randomly chosen
    outgoing ``/wiki/`` links as directed edges. Link fragments (``#section``)
    are dropped and repeated links on a page are collapsed, so one article is
    never queued under several URLs. A URL is queued at most once; a page that
    fails to download is reported and not retried.

    Parameters
    ----------
    start_link : str
        Absolute URL of the first page to fetch.
    max_nodes : int
        Limit on the total number of unique pages fetched successfully.
    per_page : int
        Max outgoing links to follow per page.

    Returns
    -------
    networkx.DiGraph
        The built graph. Nodes are page titles derived from the URL path; a
        linked page appears as a node even if it was never fetched.
    """
    from collections import deque

    G = nx.DiGraph()
    queue = deque([start_link])   # FIFO frontier with O(1) pops from the left
    queued = {start_link}         # every URL ever put on the frontier
    visited = set()               # URLs fetched successfully

    # avoid non-article wiki namespaces
    wiki_href = re.compile(r'^/wiki/(?!Help:|File:|Special:|Talk:|Category:|Portal:|Template:)')
    # NOTE: these are matched as substrings of the whole title, so ordinary
    # articles whose title merely contains one of them are skipped as well.
    bad_keywords = ('upload', 'wizard', 'file', 'help', 'special', 'talk', 'category', 'portal', 'template')

    def short_name(url):
        """Return a readable node label: the decoded page title, or the URL itself."""
        path = urlparse(url).path
        if path.startswith('/wiki/'):
            name = unquote(path.split('/wiki/')[1])
            return name.replace('_', ' ')
        return url

    def is_good_href(h):
        """Return True when the title in ``h`` contains none of ``bad_keywords``."""
        title = unquote(h.split('/wiki/')[1]) if '/wiki/' in h else h
        lowered = title.lower()
        return not any(k in lowered for k in bad_keywords)

    while queue and len(visited) < max_nodes:
        current = queue.popleft()
        if current in visited:
            continue
        try:
            response = requests.get(current, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Best effort: report the failure and move on to the next page.
            print(f"[skip] {current}: {exc}")
            continue
        if response.status_code != 200:
            print(f"[skip] {current}: HTTP {response.status_code}")
            continue

        parsed = bs4.BeautifulSoup(response.text, 'html.parser')
        visited.add(current)
        print(f"[{len(visited)}/{max_nodes}] {current}")
        append_to_csv(parsed, current)

        # Collect candidate links: drop the fragment so "/wiki/X#Y" and
        # "/wiki/X" count as the same page, filter, then de-duplicate while
        # keeping document order.
        hrefs = []
        for a in parsed.find_all('a', attrs={'href': wiki_href}):
            href = a['href'].split('#', 1)[0]
            if is_good_href(href):
                hrefs.append(href)
        hrefs = list(dict.fromkeys(hrefs))
        random.shuffle(hrefs)

        for href in hrefs[:per_page]:
            full_link = "https://en.wikipedia.org" + href
            G.add_edge(short_name(current), short_name(full_link))

            # Stop growing the frontier once fetched + pending pages reach the cap.
            if full_link not in queued and len(visited) + len(queue) < max_nodes:
                queue.append(full_link)
                queued.add(full_link)

        # Random pause between successful fetches to stay polite to the server.
        sleep(random.random() * 1.5)

    pos = nx.spring_layout(G, k=0.6, seed=42)
    nx.draw(G, pos, with_labels=True, node_size=300, font_size=8, arrows=True)

    return G


In [None]:
# Run the crawl from the start article and keep the resulting graph in a named
# variable: with the default max_nodes the crawl is long-running, and an unbound
# result would only be reachable through the transient Out[...] history.
wiki_graph = bfs_graph('https://en.wikipedia.org/wiki/Baldur%27s_Gate_3')

[1/3000] https://en.wikipedia.org/wiki/Baldur%27s_Gate_3
[2/3000] https://en.wikipedia.org/wiki/Baldur%27s_Gate_II:_Shadows_of_Amn
[3/3000] https://en.wikipedia.org/wiki/Borislav_Slavov
[4/3000] https://en.wikipedia.org/wiki/Tracy_Wiles
[5/3000] https://en.wikipedia.org/wiki/Next_Generation_(magazine)
[6/3000] https://en.wikipedia.org/wiki/Al-Qadim:_The_Genie%27s_Curse
[7/3000] https://en.wikipedia.org/wiki/Empire_(film_magazine)
[8/3000] https://en.wikipedia.org/wiki/Main_Page
[9/3000] https://en.wikipedia.org/wiki/The_Game_Awards
[10/3000] https://en.wikipedia.org/wiki/McCallum_(TV_series)
[11/3000] https://en.wikipedia.org/wiki/Manhunt_(2019_TV_series)
[12/3000] https://en.wikipedia.org/wiki/Law_%26_Order:_UK
[13/3000] https://en.wikipedia.org/wiki/Nintendo
[14/3000] https://en.wikipedia.org/wiki/Future_plc
[15/3000] https://en.wikipedia.org/wiki/Virtua_Fighter_(video_game)
[16/3000] https://en.wikipedia.org/wiki/Eye_of_the_Beholder_(video_game)
[17/3000] https://en.wikipedia.org/wi