# Site Crawler
A Python script to analyze a website's links and sitemap, now with link status checking.
- To run this script, you may need to install the required libraries.
    - If there is packages needed just uncomment the ones you need and run with the script build

The below notebook was built with Python 3.12 as a base runtime.


In [1]:
# Uncomment the following command(s) to install any required libraries:
%pip install --upgrade pip
%pip install requests
%pip install beautifulsoup4
%pip install lxml

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import sys
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from urllib.parse import urljoin, urlparse
from typing import Set, List, Dict

import warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [3]:
# Define a set to store visited URLs to prevent infinite loops on circular links
visited_urls: Set[str] = set()

def is_valid_url(url: str) -> bool:
    """
    Checks if a URL is valid by parsing it and ensuring it has a scheme and netloc.
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

## Link Finder
This will scan the base domain and check for the level of internal links within the site.
It runs a recursive check on the webpage as well as its subpages (children) to provide clear detail to the domain structure

In [4]:
def find_all_links(url: str, base_domain: str) -> Set[str]:
    """
    Recursively finds all internal links on a given webpage and its subpages.
    
    Args:
        url (str): The URL of the page to crawl.
        base_domain (str): The domain of the website to stay within.
    
    Returns:
        Set[str]: A set of unique internal URLs found.
    """
    if url in visited_urls:
        return set()
    
    print(f"Crawling: {url}")
    # Add a limit to the number of URLs to crawl to prevent the script from running forever
    # adjust the limit for the number of URLs to check as needed
    if len(visited_urls) > 10000000:
        print("Crawl limit reached (1000 URLs). Stopping further crawling.")
        return set()
    
    visited_urls.add(url)
    
    internal_links: Set[str] = set()
    try:
        response = requests.get(url, timeout=500)
        response.raise_for_status() # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'lxml')
        
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            full_url = urljoin(url, href)
            
            # Normalize URL to remove fragments and query parameters
            normalized_url = urlparse(full_url)._replace(fragment='', query='').geturl()
            
            if is_valid_url(normalized_url) and urlparse(normalized_url).netloc == base_domain:
                internal_links.add(normalized_url)
                
    except requests.RequestException as e:
        print(f"Error crawling {url}: {e}")
        
    return internal_links

## Sitemap check
The below section will scan the domain for a sitemap file that can be used as a starting point to what is contained within the site.
It will produce the number of what is contained in the sitemap and provide it to the subsequent steps.

In [5]:
def check_sitemap(url: str) -> List[str]:
    """
    Attempts to find and parse a sitemap.xml file for a given URL.
    
    Args:
        url (str): The base URL of the website.
        
    Returns:
        List[str]: A list of URLs found within the sitemap, or an empty list if not found.
    """
    parsed_url = urlparse(url)
    sitemap_url = f"{parsed_url.scheme}://{parsed_url.netloc}/sitemap.xml"
    
    print(f"\nAttempting to find sitemap at: {sitemap_url}")
    try:
        response = requests.get(sitemap_url, timeout=500)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'lxml')
        urls_from_sitemap = [loc.text for loc in soup.find_all('loc')]
        
        if urls_from_sitemap:
            print(f"Found {len(urls_from_sitemap)} URLs in the sitemap.")
            return urls_from_sitemap
        else:
            print("Sitemap found but no URLs were listed.")
            return []
            
    except requests.RequestException as e:
        print(f"Sitemap not found or an error occurred: {e}")
        return []

## Progress Bar
Just a simple progress bar that will show how many pages scanned and how much to go.

In [6]:
def check_link_status(urls: Set[str]) -> Dict[str, str]:
    """
    Checks the HTTP status for a set of URLs.
    
    Args:
        urls (Set[str]): A set of URLs to check.
        
    Returns:
        Dict[str, str]: A dictionary of URLs and their status codes or error messages.
    """
    print("\n" + "=" * 30)
    print("CHECKING LINK STATUSES...")
    print("=" * 30)
    
    status_results = {}
    total_links = len(urls)
    
    for i, url in enumerate(urls, 1):
        # Progress indicator
        sys.stdout.write(f"\rChecking link {i}/{total_links}: {url[:60000]}...")
        sys.stdout.flush()
        
        try:
            response = requests.head(url, timeout=500, allow_redirects=True)
            status_results[url] = str(response.status_code)
        except requests.RequestException as e:
            status_results[url] = f"Error: {e}"
            
    sys.stdout.write("\n") # Newline after progress bar
    return status_results


## URL Request and Sitemap

In [7]:
# Requests the user to input a that is wanted to be checked
if __name__ == "__main__":
    # This will ask the user for the URL to analyse
    start_url = input("Please enter the website URL to check (e.g., https://example.com): ")
    if not start_url.startswith('http'):
        start_url = 'https://' + start_url
    
    base_domain = urlparse(start_url).netloc
    if not base_domain:
        print("Invalid URL provided. Please include a domain.")
        sys.exit()

    print(f"Starting analysis for website: {start_url}")
    print("-" * 30)
    
    # Find all links on the initial page
    found_links = find_all_links(start_url, base_domain)
    
    # Check for a sitemap
    sitemap_urls = check_sitemap(start_url)
    
    print("\n" + "=" * 30)
    print("ANALYSIS RESULTS")
    print("=" * 30)
    
    # Print the links found by crawling
    print(f"\nLinks found by crawling {start_url} and its internal pages:")
    if found_links:
        for link in sorted(list(found_links)):
            print(f"- {link}")
    else:
        print("No internal links were found.")
        
    print("-" * 30)
    
    # Print the links found in the sitemap
    print("\nURLs found in the sitemap:")
    if sitemap_urls:
        for url in sitemap_urls:
            print(f"- {url}")
    else:
        print("No sitemap was found or processed.")
        
    print("-" * 30)
    
    # Compare the two sets of links within the sitemap
    combined_links = found_links.union(sitemap_urls)
    
    if combined_links:
        print("\nChecking the status of all unique URLs found...")
        link_statuses = check_link_status(combined_links)
        
        # Report any bad links (4xx or 5xx) that were found during the crawl or in the sitemap
        bad_links = {url: status for url, status in link_statuses.items() if not status.startswith('2') and not status.startswith('3')}
        
        if bad_links:
            print("\n" + "=" * 30)
            print("BROKEN LINK REPORT")
            print("=" * 30)
            for url, status in sorted(bad_links.items()):
                print(f"[{status}] {url}")
        else:
            print("\n" + "=" * 30)
            print("BROKEN LINK REPORT")
            print("=" * 30)
            print("No broken links (4xx or 5xx) found!")
        
    # Final comparison between crawled and sitemap links
    def _parent_url(link: str) -> str:
        parsed = urlparse(link)
        base = f"{parsed.scheme}://{parsed.netloc}"
        path = parsed.path.rstrip('/')
        if not path:
            return f"{base}/"
        segments = [segment for segment in path.split('/') if segment]
        if len(segments) <= 1:
            return f"{base}/"
        parent_path = '/' + '/'.join(segments[:-1])
        return f"{base}{parent_path}"

    def _summarize(source_name: str, urls) -> None:
        url_set = set(urls or [])
        if not url_set:
            print(f"\nNo URLs available for {source_name}.")
            return

        parent_map: Dict[str, Set[str]] = {}
        for link in url_set:
            parent = _parent_url(link)
            parent_map.setdefault(parent, set()).add(link)

        print("\n" + "=" * 30)
        print(f"{source_name.upper()} PARENT/SUBLINK COUNTS")
        print("=" * 30)
        print(f"Total parents: {len(parent_map)}")
        print(f"Total URLs: {len(url_set)}")
        for parent, children in sorted(parent_map.items()):
            child_list = sorted(children)
            print(f"- {parent}: {len(child_list)} sublinks")
            for child in child_list:
                print(f"    - {child}")

    _summarize("Crawled URLs", found_links)
    _summarize("Sitemap URLs", sitemap_urls)
    _summarize("All URLs", combined_links)

Starting analysis for website: https://www.wgtn.ac.nz
------------------------------
Crawling: https://www.wgtn.ac.nz

Attempting to find sitemap at: https://www.wgtn.ac.nz/sitemap.xml
Found 3 URLs in the sitemap.

ANALYSIS RESULTS

Links found by crawling https://www.wgtn.ac.nz and its internal pages:
- https://www.wgtn.ac.nz
- https://www.wgtn.ac.nz/
- https://www.wgtn.ac.nz/LinkedIn
- https://www.wgtn.ac.nz/YouTube
- https://www.wgtn.ac.nz/about
- https://www.wgtn.ac.nz/about/campuses-facilities/campuses
- https://www.wgtn.ac.nz/about/campuses-facilities/campuses/auckland
- https://www.wgtn.ac.nz/about/campuses-facilities/campuses/kelburn
- https://www.wgtn.ac.nz/about/campuses-facilities/campuses/pipitea
- https://www.wgtn.ac.nz/about/campuses-facilities/campuses/te-aro
- https://www.wgtn.ac.nz/about/campuses-facilities/faculties-schools
- https://www.wgtn.ac.nz/about/contacts
- https://www.wgtn.ac.nz/about/contacts/media-enquiries
- https://www.wgtn.ac.nz/about/governance/universi

## JavaScript Components

In [8]:
from collections import deque
from typing import Any, Dict, List, Set
import hashlib
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Helper Functions (Required for crawling/parsing)
def is_valid_url(url: str) -> bool:
    """Checks if a URL has a valid scheme (http or https)."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc]) and result.scheme in ('http', 'https')
    except ValueError:
        return False

# Main Component Functions
# NOTE: Adjust the max_pages parameter as needed to control depth of the crawl and how much information is requested
def crawl_javascript_components(start_url: str, base_domain: str, max_pages: int = 500000) -> Dict[str, List[Dict[str, Any]]]:
    """
    Crawls a website to identify and collect metadata about JavaScript components
    (external scripts and inline blocks).
    """
    visited: Set[str] = set()
    queue = deque([start_url])
    js_components: Dict[str, List[Dict[str, Any]]] = {}

    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)

        try:
            # Send a GET request to the current URL
            response = requests.get(current_url, timeout=500)
            response.raise_for_status() # Raises an exception for bad status codes (4xx or 5xx)
        except requests.RequestException as exc:
            print(f"Error loading {current_url} for JavaScript scan: {exc}")
            continue

        # Parse the HTML content
        # Ensure you have 'lxml' installed: pip install lxml
        soup = BeautifulSoup(response.text, 'lxml')

        # 1. New links for crawling
        # Extract all links (<a> tags)
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href')
            full_url = urljoin(current_url, href)
            normalized_url = urlparse(full_url)._replace(fragment='', query='').geturl()
    
        # Check if link is valid, on the base domain, and hasn't been visited/queued
        if is_valid_url(normalized_url) and urlparse(normalized_url).netloc == base_domain:
            if normalized_url not in visited and normalized_url not in queue:
                # Add the child page to the queue for future processing
                queue.append(normalized_url)

        # 2. Extract JavaScript components from the current page
        page_scripts: List[Dict[str, Any]] = []
        for script in soup.find_all('script'):
            raw_src = script.get('src')
            normalized_src = ''

            # Handle external scripts (with a 'src' attribute)
            if raw_src:
                full_src = urljoin(current_url, raw_src.strip())
                # Normalize the external script URL
                normalized_src = urlparse(full_src)._replace(fragment='').geturl()

            # Extract custom data-* attributes
            data_attrs: Dict[str, str] = {}
            for key, value in script.attrs.items():
                if key.startswith('data-'):
                    data_attrs[key] = ' '.join(value) if isinstance(value, list) else str(value)

            # Handle inline scripts (no 'src' attribute)
            inline_hash = ''
            inline_length = 0
            if not raw_src:
                script_content = script.string or script.get_text()
                if script_content:
                    normalized_text = script_content.strip()
                    inline_length = len(normalized_text)
                    if normalized_text:
                        # Hash the content for identification
                        inline_hash = hashlib.sha256(normalized_text.encode('utf-8')).hexdigest()[:12]

            # Compile script information dictionary
            script_info: Dict[str, Any] = {
                "src": normalized_src,
                "is_external": bool(raw_src),
                "type": script.get('type') or 'text/javascript',
                "async": script.has_attr('async'),
                "defer": script.has_attr('defer'),
                "module": script.get('type') == 'module',
                "crossorigin": script.get('crossorigin') or '',
                "data_attributes": data_attrs
            }
            if inline_hash:
                script_info["inline_hash"] = inline_hash
                script_info["inline_length"] = inline_length

            page_scripts.append(script_info)

        if page_scripts:
            js_components[current_url] = page_scripts

    return js_components

# Provides a formatted output of the JavaScript components within the site
def print_javascript_components(component_map: Dict[str, List[Dict[str, Any]]]) -> None:
    """
    Prints a formatted summary of the collected JavaScript components.
    """
    if not component_map:
        print("No JavaScript components found.")
        return

    print("\n" + "=" * 30)
    print("JAVASCRIPT COMPONENTS REPORT")
    print("=" * 30)

    # 1. Detailed breakdown per page
    for page_url, scripts in sorted(component_map.items()):
        print(f"\nPage: {page_url}")
        for script in scripts:
            if script["is_external"]:
                print(f"  - external src: {script['src']}")
            else:
                print("  - inline script")
            
            # Print common attributes
            print(f"    type: {script['type']}")
            if script["async"]:
                print("    async: True")
            if script["defer"]:
                print("    defer: True")
            if script["module"]:
                print("    module: True")
            
            # Print conditional attributes
            if script["crossorigin"]:
                print(f"    crossorigin: {script['crossorigin']}")
            if script["data_attributes"]:
                print(f"    data-* attrs: {script['data_attributes']}")
            if "inline_hash" in script:
                print(f"    inline hash: {script['inline_hash']} (length={script['inline_length']})")

    # 2. Summary statistics
    unique_external_set: Set[str] = set()
    for scripts in component_map.values():
        for script in scripts:
            if script['is_external']:
                src = script.get('src')
                if isinstance(src, str):
                    unique_external_set.add(src)

    unique_external = sorted(unique_external_set)
    inline_total = sum(1 for scripts in component_map.values() for script in scripts if not script['is_external'])
    
    print("\n" + "-" * 30)
    print("SUMMARY")
    print("-" * 30)
    
    print("\nUnique external script sources:")
    if unique_external:
        for src in unique_external:
            print(f"  - {src}")
    else:
        print("  None detected.")

    print(f"\nTotal inline scripts detected: {inline_total}")

if __name__ == '__main__':
    # This will take the URL that was provided earlier and use it in this step
    TARGET_URL = start_url
    TARGET_DOMAIN = start_url # e.g., 'www.google.com'
    MAX_PAGES_TO_CRAWL = 2000000 # Set a reasonable limit to prevent excessively long runs

    # Note: To run this against a real site, you must have 'requests', 'beautifulsoup4', and 'lxml' installed:
    # pip install requests beautifulsoup4 lxml
    print("=" * 40)
    print(f"Starting JavaScript Component Crawl on: {TARGET_URL}")
    print(f"Restricted to base domain: {TARGET_DOMAIN}")
    print(f"Max pages to visit: {MAX_PAGES_TO_CRAWL}")
    print("=" * 40)
    
    # Run the crawler
    components = crawl_javascript_components(
        start_url=TARGET_URL,
        base_domain=TARGET_DOMAIN,
        max_pages=MAX_PAGES_TO_CRAWL 
    )
    
    # Print the report
    print_javascript_components(components)

Starting JavaScript Component Crawl on: https://www.wgtn.ac.nz
Restricted to base domain: https://www.wgtn.ac.nz
Max pages to visit: 2000000

JAVASCRIPT COMPONENTS REPORT

Page: https://www.wgtn.ac.nz
  - inline script
    type: text/javascript
    inline hash: d6d13a21ec41 (length=227)
  - inline script
    type: text/javascript
    inline hash: 3dceac6b400c (length=433)
  - inline script
    type: application/ld+json
    inline hash: 39346a0b334c (length=913)
  - external src: https://www.wgtn.ac.nz/__data/assets/git_bridge/0005/2009624/dist/assets/index.js
    type: module
    defer: True
    module: True
  - inline script
    type: text/javascript
    inline hash: d6d13a21ec41 (length=227)
  - inline script
    type: text/javascript
    inline hash: dd8f10c560da (length=814)
  - inline script
    type: text/javascript
    inline hash: e7f391bc0e11 (length=192)
  - external src: https://www.wgtn.ac.nz/__data/assets/git_bridge/0007/2075596/dist/assets/index.js?h=0089d46
    type: mod

In [9]:
import csv
import os

# Define output directory
output_dir = "outputs"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# 1. Export Crawled Links
if 'found_links' in locals() and found_links:
    crawled_file = os.path.join(output_dir, "crawled_links.csv")
    with open(crawled_file, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        for link in sorted(found_links):
            writer.writerow([link])
    print(f"Exported {len(found_links)} crawled links to {crawled_file}")
else:
    print("No crawled links data found to export.")

# 2. Export Sitemap Links
if 'sitemap_urls' in locals() and sitemap_urls:
    sitemap_file = os.path.join(output_dir, "sitemap_links.csv")
    with open(sitemap_file, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        for link in sitemap_urls:
            writer.writerow([link])
    print(f"Exported {len(sitemap_urls)} sitemap links to {sitemap_file}")
else:
    print("No sitemap data found to export.")

# 3. Export Broken Links
if 'bad_links' in locals() and bad_links:
    broken_file = os.path.join(output_dir, "broken_links.csv")
    with open(broken_file, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["URL", "Status Code/Error"])
        for url, status in sorted(bad_links.items()):
            writer.writerow([url, status])
    print(f"Exported {len(bad_links)} broken links to {broken_file}")
else:
    print("No broken links data found to export.")

# 4. Export JavaScript Components
if 'components' in locals() and components:
    js_file = os.path.join(output_dir, "javascript_components.csv")
    with open(js_file, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Define headers
        headers = [
            "Page URL", "Script Type", "Source/Hash", "Type Attribute", 
            "Async", "Defer", "Module", "Crossorigin", "Data Attributes"
        ]
        writer.writerow(headers)
        
        count = 0
        for page_url, scripts in sorted(components.items()):
            for script in scripts:
                # Determine script type and identifier
                s_type = "External" if script["is_external"] else "Inline"
                identifier = script["src"] if script["is_external"] else f"Hash: {script.get('inline_hash', 'N/A')}"
                
                # Format data attributes as a string
                data_attrs_str = "; ".join([f"{k}={v}" for k, v in script["data_attributes"].items()])
                
                row = [
                    page_url,
                    s_type,
                    identifier,
                    script["type"],
                    script["async"],
                    script["defer"],
                    script["module"],
                    script["crossorigin"],
                    data_attrs_str
                ]
                writer.writerow(row)
                count += 1
                
    print(f"Exported {count} JavaScript component records to {js_file}")
else:
    print("No JavaScript component data found to export.")

Created directory: outputs
Exported 102 crawled links to outputs\crawled_links.csv
Exported 3 sitemap links to outputs\sitemap_links.csv
Exported 6 broken links to outputs\broken_links.csv
Exported 17 JavaScript component records to outputs\javascript_components.csv
