In [1]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

domain = "en.wikipedia.org/wiki/Ferrari" # <- put your domain to be crawled
full_url = "https://en.wikipedia.org/wiki/Ferrari" # <- put your domain to be crawled with https or http

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

In [2]:
# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

In [3]:
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))

In [None]:
def crawl(url):

    #Max file length
    Max_Length = 225
    
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
            os.mkdir("text/")

    if not os.path.exists("text/"+local_domain+"/"):
            os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
            os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url) # for debugging and to see the progress


        if len(url) > Max_Length:
            print(f"Skipping {url} due to long filename")
            continue

        # Save text from the url to a <url>.txt file
        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the text but remove the tags
            text = soup.get_text()

            # If the crawler gets to a page that requires JavaScript, it will stop the crawl
            if ("You need to enable JavaScript to run this app." in text):
                print("Unable to parse page " + url + " due to JavaScript being required")

            # Otherwise, write the text to the file in the text directory
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)

https://en.wikipedia.org/wiki/Ferrari
https://en.wikipedia.org/wiki/LaFerrari#LaFerrari_Aperta_(2016–2018)
https://en.wikipedia.org/wiki/Category:Use_British_English_from_April_2019
https://en.wikipedia.org/wiki/Category:Use_British_English_from_November_2023
https://en.wikipedia.org/wiki/Jonny_Buckland
https://en.wikipedia.org/w/index.php?title=Jonny_Buckland&action=info
https://en.wikipedia.org/wiki/Module:Category_handler/blacklist
https://en.wikipedia.org/wiki/Special:PageHistory/Module:Category_handler/blacklist/doc
https://en.wikipedia.org/wiki/Special:Contributions/66.87.95.197
https://en.wikipedia.org/w/index.php?title=Special:Log/block&page=User%3A66.87.95.197
https://en.wikipedia.org/wiki/Special:Log/gblblock
https://en.wikipedia.org/w/index.php?title=User_talk:185.193.240.192&action=edit&redlink=1
https://en.wikipedia.org//en.m.wikipedia.org/w/index.php?title=User_talk:185.193.240.192&action=edit&redlink=1&mobileaction=toggle_view_mobile
HTTP Error 404: Not Found
https://en.



https://en.wikipedia.org/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentChangesLinked%26days%3D3%26hideWikibase%3D0%26hidecategorization%3D0%26hideminor%3D1%26hidemyself%3D1%26hidenondamaging%3D1%26limit%3D100%26target%3DTemplate%253ASic%26userExpLevel%3Dregistered
Skipping https://en.wikipedia.org/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentChangesLinked%26days%3D3%26hideWikibase%3D0%26hidecategorization%3D0%26hideminor%3D1%26hidemyself%3D1%26hidenondamaging%3D1%26limit%3D100%26target%3DTemplate%253ASic%26userExpLevel%3Dregistered due to long filename
https://en.wikipedia.org/w/index.php?title=Special:RecentChangesLinked&days=14&from=&userExpLevel=registered&hidemyself=1&hideminor=1&hidecategorization=0&hideWikibase=0&hidenondamaging=1&limit=100&target=Template%3ASic
https://en.wikipedia.org/w/index.php?title=Special:RecentChangesLinked&limit=250