In [None]:
import re
import time
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup


In [None]:
# Crawl state shared by the cells below.
# Frontier of URLs still to fetch, seeded with the crawl's starting page.
to_visit_urls = ["http://www.mit.edu"]
# URLs of pages that have been crawled and written to the output file.
visited_urls = []
# Running count of crawled pages; the crawl loop stops once it reaches its limit.
num_pages_visited = 0

In [None]:
# Function to check if URL is allowed by robots.txt
def is_allowed_by_robots(url):
    """Return True if the site's robots.txt permits fetching ``url``.

    The robots.txt file is downloaded from the root of the URL's scheme and
    host on every call (there is no caching) and evaluated with the standard
    library's ``RobotFileParser`` for the generic ``*`` user agent. This means
    that:

    * an empty ``Disallow:`` line means "allow everything" instead of
      blocking every path;
    * rules inside a group addressed to some other named user agent do not
      apply to this crawler;
    * ``Allow`` lines and paths containing ``:`` are handled correctly.

    Args:
        url: Absolute URL that is about to be fetched.

    Returns:
        bool: False only when robots.txt was retrieved successfully and its
        rules for ``*`` disallow the URL. True otherwise, including when
        robots.txt cannot be downloaded or the server answers with an error
        status (best-effort policy: assume allowed on error).
    """
    parsed_url = urlparse(url)
    robots_url = parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt"
    try:
        # A timeout keeps one unresponsive host from hanging the whole crawl.
        robots = requests.get(robots_url, timeout=10)
    except requests.RequestException:
        # Assume URL is allowed if there is an error. Only request failures
        # are caught, so a kernel interrupt still stops the notebook.
        return True

    # An error page (e.g. a 404 body) is not a set of robots rules.
    if not robots.ok:
        return True

    parser = RobotFileParser()
    parser.parse(robots.text.splitlines())
    return parser.can_fetch("*", url)

In [None]:
# Loop until we've visited 100 pages or there are no more pages to visit.
#
# `visited_urls` keeps its meaning (queue URLs of pages that were crawled and
# written out). `seen_urls` additionally remembers every URL that has been
# attempted, plus the post-redirect URL of each crawled page, so that nothing
# is fetched twice. It is seeded from `visited_urls` so the cell can be re-run.
seen_urls = set(visited_urls)

while num_pages_visited < 100 and len(to_visit_urls) > 0:
    # Get next URL to visit (oldest first)
    next_url = to_visit_urls.pop(0)

    # Skip URLs that were already attempted. Recording the attempt up front
    # means a URL rejected below (robots.txt, request error, non-HTML) is not
    # requested again each time another page links to it.
    if next_url in seen_urls:
        continue
    seen_urls.add(next_url)

    # Check if URL is allowed by robots.txt
    if not is_allowed_by_robots(next_url):
        continue

    # Send HTTP request and get response
    try:
        # The timeout keeps one unresponsive server from stalling the crawl.
        response = requests.get(next_url, timeout=10)
    except requests.RequestException:
        # Skip this URL if the request fails. Only request errors are caught,
        # so interrupting the kernel still stops the loop.
        continue

    # Check if content type is HTML
    content_type = response.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        # Skip this URL if content type is not HTML
        continue

    # Get canonical URL (the address after any redirects)
    canonical_url = response.url
    if canonical_url != next_url:
        if canonical_url in seen_urls:
            # The redirect landed on a page that was already handled.
            continue
        seen_urls.add(canonical_url)

    # Get list of outgoing links
    outgoing_links = []

    # Parse HTML content and extract links
    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href", "")
        try:
            # Resolve relative links against the page URL *before* filtering,
            # otherwise every relative link would be discarded.
            href = urljoin(canonical_url, href)
            # Drop the fragment: it does not identify a different page.
            href = re.sub(r"#.*", "", href)
            target = urlparse(href)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); ignore this link.
            continue
        # Keep only web links (drops mailto:, javascript:, tel:, ...)
        if target.scheme not in ("http", "https"):
            continue
        # Keep only mit.edu and its subdomains. Comparing the hostname with a
        # leading dot rejects look-alikes such as "summit.edu" and ignores
        # any port number.
        hostname = target.hostname or ""
        if hostname != "mit.edu" and not hostname.endswith(".mit.edu"):
            continue
        if href not in outgoing_links and href != canonical_url:
            outgoing_links.append(href)

    # Add outgoing links to list of URLs to visit
    to_visit_urls.extend(outgoing_links)

    # Mark this URL as visited
    visited_urls.append(next_url)

    # Write URLs to output file: the page URL followed by its outgoing links,
    # space-separated, one page per line
    with open("output.txt", "a") as f:
        f.write(canonical_url + " " + " ".join(outgoing_links) + "\n")

    # Increment number of pages visited
    num_pages_visited += 1

    # Wait five seconds before visiting next page
    time.sleep(5)