In [1]:
import threading
import queue
import time
import os
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from dotenv import load_dotenv
import openai

# Load environment variables from .env file
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Base settings
BASE_URL = "https://www.mit.edu"
DOMAIN = "mit.edu"
MAX_DEPTH = 7       # Maximum crawl depth (adjust as needed)
NUM_WORKERS = 4     # Number of parallel worker threads

In [2]:
# Shared data structures and locks
url_queue = queue.Queue()
visited = set()
visited_lock = threading.Lock()

application_links = {}  # Mapping: {normalized_link: page_url}
application_links_lock = threading.Lock()

forms_found = []        # List of dicts: {"form_url": ..., "page_url": ..., "form_html": ...}
forms_found_lock = threading.Lock()

def normalize_url(url):
    """Normalize a URL by removing its fragment (the part after #)."""
    parsed = urlparse(url)
    normalized = parsed._replace(fragment="").geturl()
    return normalized

def get_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=options)


In [3]:
def crawl_url(driver, url, depth):
    if depth <= 0:
        return
    normalized_url = normalize_url(url)
    # Check if we've already visited this URL (ignoring fragments)
    with visited_lock:
        if normalized_url in visited:
            return
        visited.add(normalized_url)

    try:
        driver.set_page_load_timeout(10)
        driver.get(url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
        )
    except Exception as e:
        print(f"Error loading {url}: {e}")
        return

    current_url = driver.current_url
    normalized_current_url = normalize_url(current_url)

    # Process forms on the page
    try:
        forms = driver.find_elements(By.TAG_NAME, "form")
    except Exception as e:
        print(f"Error finding forms on {current_url}: {e}")
        forms = []
    for form in forms:
        try:
            form_action = form.get_attribute("action")
            if not form_action:
                continue
            full_form_url = form_action if form_action.startswith("http") else urljoin(current_url, form_action)
            form_html = form.get_attribute("outerHTML")
            with forms_found_lock:
                forms_found.append({
                    "form_url": full_form_url,
                    "page_url": normalized_current_url,
                    "form_html": form_html
                })
            print(f"Found form: {full_form_url} on page: {normalized_current_url}")
        except Exception as e:
            print(f"Error processing form on {normalized_current_url}: {e}")
            continue

    # Extract and process anchor tags
    try:
        anchors = driver.find_elements(By.TAG_NAME, "a")
    except Exception as e:
        print(f"Error finding anchors on {current_url}: {e}")
        anchors = []

    # Extract href and text from anchors (to avoid stale element issues)
    anchor_data = []
    for a in anchors:
        try:
            href = a.get_attribute("href")
            text = a.text
            if href:
                anchor_data.append((href, text))
        except StaleElementReferenceException:
            continue

    for href, text in anchor_data:
        try:
            full_url = href if href.startswith("http") else urljoin(current_url, href)
            normalized_full_url = normalize_url(full_url)
            parsed_url = urlparse(normalized_full_url)
            if DOMAIN not in parsed_url.netloc:
                continue

            # Check for keywords to identify potential application links
            keywords = ["bachelor", "undergraduate", "apply", "application", "admission", "major"]
            if any(keyword in text.lower() or keyword in normalized_full_url.lower() for keyword in keywords):
                with application_links_lock:
                    if normalized_full_url not in application_links:
                        application_links[normalized_full_url] = normalized_current_url
                        print(f"Found potential application link: {normalized_full_url} on page: {normalized_current_url}")

            # Add new URL to the queue if it hasn't been visited
            with visited_lock:
                if normalized_full_url not in visited:
                    url_queue.put((normalized_full_url, depth - 1))
        except Exception as e:
            print(f"Error processing link on {normalized_current_url}: {e}")
            continue


In [4]:
def worker():
    """Worker function to process URLs from the queue."""
    driver = get_driver()
    while True:
        try:
            url, depth = url_queue.get(timeout=10)
        except queue.Empty:
            break
        crawl_url(driver, url, depth)
        url_queue.task_done()
    driver.quit()


In [5]:
# Seed the queue with the base URL
url_queue.put((BASE_URL, MAX_DEPTH))

threads = []
for _ in range(NUM_WORKERS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

# Wait for all URLs to be processed
url_queue.join()
for t in threads:
    t.join()

print("\nCandidate Application Links:")
for link, page in application_links.items():
    print(f"Link: {link} found on {page}")

print("\nForms Found:")
for form in forms_found:
    print(f"Form URL: {form['form_url']}, Found on: {form['page_url']}")


Found form: https://www.mit.edu/search on page: https://www.mit.edu/
Found potential application link: https://www.mit.edu/admissions-aid on page: https://www.mit.edu/
Found form: https://www.mit.edu/search on page: https://www.mit.edu/education/
Found form: https://www.mit.edu/search on page: https://www.mit.edu/
Found potential application link: https://advising.mit.edu/ on page: https://www.mit.edu/education/
Found form: https://www.mit.edu/search on page: https://www.mit.edu/innovation/
Found form: https://www.mit.edu/search on page: https://www.mit.edu/research/
Found form: https://www.mit.edu/search on page: https://www.mit.edu/admissions-aid/
Found potential application link: https://www.mit.edu/admissions-aid/ on page: https://www.mit.edu/admissions-aid/
Found potential application link: https://sfs.mit.edu/undergraduate-financial-aid on page: https://www.mit.edu/admissions-aid/
Found potential application link: http://gradadmissions.mit.edu/programs on page: https://www.mit.ed

Exception in thread Thread-6 (worker):
Traceback (most recent call last):
  File "/home/mike/miniconda3/envs/tooling/lib/python3.11/site-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/tooling/lib/python3.11/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/tooling/lib/python3.11/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/miniconda3/envs/tooling/lib/python3.11/http/client.py", line 1395, in getresponse
    response.begin()
  File "/home/mike/miniconda3/envs/tooling/lib/python3.11/http/client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/home

Error loading https://catalog.mit.edu/mit/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))Error loading https://catalog.mit.edu/mit/overview/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error loading https://catalog.mit.edu/mit/overview/campus/: HTTPConnectionPool(host='localhost', port=60919): Max retries exceeded with url: /session/84e474c8a642d366d2a8f335d563edc8/timeouts (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bb94a150bd0>: Failed to establish a new connection: [Errno 111] Connection refused'))

Error loading https://catalog.mit.edu/mit/overview/administration/: HTTPConnectionPool(host='localhost', port=41237): Max retries exceeded with url: /session/3303f665823db6127179086b4c7bc6cf/timeouts (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bb94a150810>: Failed to establish a new connection: [Errno 111] Connection refuse

KeyboardInterrupt: 

Error loading https://engineering.mit.edu/engage/engineering-in-action/pioneering-the-future-of-materials-extraction/: HTTPConnectionPool(host='localhost', port=60919): Max retries exceeded with url: /session/84e474c8a642d366d2a8f335d563edc8/timeouts (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bb9540887d0>: Failed to establish a new connection: [Errno 111] Connection refused'))


Error loading https://engineering.mit.edu/connect/the-infinite/newsletter-signup/: HTTPConnectionPool(host='localhost', port=41237): Max retries exceeded with url: /session/3303f665823db6127179086b4c7bc6cf/timeouts (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bb948892790>: Failed to establish a new connection: [Errno 111] Connection refused'))
Error loading https://web.mit.edu/people.html: HTTPConnectionPool(host='localhost', port=60919): Max retries exceeded with url: /session/84e474c8a642d366d2a8f335d563edc8/timeouts (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bb948893790>: Failed to establish a new connection: [Errno 111] Connection refused'))
Error loading https://science.mit.edu/research-and-academics/: HTTPConnectionPool(host='localhost', port=60919): Max retries exceeded with url: /session/84e474c8a642d366d2a8f335d563edc8/timeouts (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7bb

In [None]:
def classify_form(form_html, page_url):
    """
    Uses the OpenAI API to determine if a given HTML form (with its page context)
    appears to be the application form for MIT's bachelor's majors.
    """
    prompt = (
        f"Given the following HTML form and the page URL where it was found, "
        f"determine if this form appears to be the application form for MIT's bachelor's majors. "
        f"Answer with a clear 'Yes' or 'No' and provide a brief explanation.\n\n"
        f"Page URL: {page_url}\n\nForm HTML:\n{form_html}"
    )
    
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are an assistant that analyzes HTML forms to determine if they are application forms for MIT bachelor's majors.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        )
        classification = response.choices[0].message.content
        return classification
    except Exception as e:
        print(f"Error during classification for form on {page_url}: {e}")
        return "Classification error."

In [None]:
print("\nForm Classification Results:")
for idx, form in enumerate(forms_found, start=1):
    print(f"\nForm {idx}:")
    print(f"Form URL: {form['form_url']}")
    print(f"Found on Page: {form['page_url']}")
    classification = classify_form(form["form_html"], form["page_url"])
    print(f"Classification: {classification}")