Given a domain, e.g. cs.utexas.edu, scrape and generate a site.json
containing all text content of all pages available through
domain/sitemap.xml.

1. Get list of sites (and metadata) from sitemap.
2. Iterate through sites, scrape text content.
3. Store text content, scrape_timestamp
together with site metadata to output list of site objects.

In [None]:
# Env imports.
from os import getenv
from dotenv import load_dotenv

# Retrieve the API keys from environment variables
# (load_dotenv is called first so getenv sees whatever it loads).
load_dotenv()


def _require_env(name: str) -> str:
    '''Return the non-empty value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here names
            the missing setting, instead of letting it surface later as an
            opaque `TypeError` from `int(None)` or as a client built with a
            `None` API key.
    '''
    value = getenv(name)
    if not value:
        raise RuntimeError(f'Required environment variable {name} is not set.')
    return value


# ScraperAPI credentials and tuning values, all read from the environment.
API_KEY: str = _require_env('SCRAPERAPI_API_KEY')
NUM_RETRIES: int = int(_require_env('SCRAPERAPI_NUM_RETRIES'))
MAX_WORKERS: int = int(_require_env('SCRAPERAPI_MAX_WORKERS'))

In [None]:
from pprint import pprint
from scraperapi_sdk import ScraperAPIClient
from scraperapi_sdk.exceptions import ScraperAPIException

# Shared ScraperAPI client built from API_KEY; the helper functions in this
# notebook take it as an explicit `client` argument.
client: ScraperAPIClient = ScraperAPIClient(API_KEY)

In [None]:
from datetime import datetime
from lxml import etree
from typing import List, Dict, Optional

# Prefix -> namespace URI mapping passed to find()/findall() so that paths
# written as 's:<tag>' resolve against the sitemap XML namespace.
SITEMAP_NAMESPACE = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

class EmptySitemapDataException(Exception):
    '''Raised when a domain's sitemap.xml yields no valid URL entries.'''

def get_subfield(url_tag: etree._Element, field_name: str) -> Optional[str]:
    '''Return the text of the `field_name` child of a sitemap element.

    The child is looked up in the sitemap namespace; None is returned when
    the element has no such child.
    '''
    child = url_tag.find(f's:{field_name}', SITEMAP_NAMESPACE)
    if child is None:
        return None
    return child.text

def get_sitemap_data(client: ScraperAPIClient, sitemap_url: str) -> List[Dict[str, str]]:
    '''Download a sitemap and return one metadata dict per usable <url> entry.

    Each returned dict has the keys 'loc', 'lastmod', 'priority' and
    'changefreq'. 'loc' is the whitespace-stripped URL; the other three are
    the raw element text, or None when the sitemap omits the element.

    Args:
        client: ScraperAPI client used to fetch the sitemap.
        sitemap_url: URL of the sitemap.xml document.

    Returns:
        The list of entry dicts, in document order.

    Raises:
        EmptySitemapDataException: if no <url> entry with a non-empty <loc>
            was found.
    '''
    # Use 'make_request' instead of 'get' so the XML is parsed from bytes,
    # i.e. compatible w/ an encoding declaration such as
    # <?xml version="1.0" encoding="UTF-8"?>
    response = client.make_request(url=sitemap_url, params={'retry': NUM_RETRIES})
    root = etree.fromstring(response.content)
    sitemap_data = []

    for url in root.findall('.//s:url', SITEMAP_NAMESPACE):
        # An empty <loc/> has text None, and pretty-printed sitemaps may pad
        # the value with whitespace; neither should yield a scrape target.
        loc = (get_subfield(url_tag=url, field_name='loc') or '').strip()
        if not loc:
            pprint(f'Warning: Missing or empty <loc> in URL data: {url}. Skipping...')
            continue

        sitemap_data.append({
            'loc': loc,
            'lastmod': get_subfield(url_tag=url, field_name='lastmod'),
            'priority': get_subfield(url_tag=url, field_name='priority'),
            'changefreq': get_subfield(url_tag=url, field_name='changefreq'),
        })

    if not sitemap_data:
        # Carry the URL so a caller printing the exception sees what failed.
        raise EmptySitemapDataException(
            f'No valid <url> entries found in sitemap: {sitemap_url}'
        )

    return sitemap_data

In [None]:
# Manual check: fetch and print the parsed sitemap entries for cs.utexas.edu.
pprint(get_sitemap_data(client=client, sitemap_url='https://www.cs.utexas.edu/sitemap.xml'))

In [None]:
# Manual check: fetch and print the parsed sitemap entries for oia.osu.edu.
pprint(get_sitemap_data(client=client, sitemap_url='https://oia.osu.edu/sitemap.xml'))

In [None]:
from bs4 import BeautifulSoup

class EmptyTextContentException(Exception):
    '''Raised when a requested URL yields no text content.'''

def get_text_content(client: ScraperAPIClient, url: str) -> str:
    '''Fetch `url` through ScraperAPI and return the page's text content.

    The HTML is parsed with BeautifulSoup. The first <header> and the first
    <footer> element, if present, are removed before the text is extracted.
    Text nodes are stripped and joined with single spaces.

    Args:
        client: ScraperAPI client used to fetch the page.
        url: Address of the page to scrape.

    Raises:
        EmptyTextContentException: if no text is left after extraction.
    '''
    raw_html = client.get(url, params={'retry': NUM_RETRIES})
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Drop the first <header> and <footer> (if present) so their text is not
    # included in the extracted content. Header is handled before footer.
    for tag_name in ('header', 'footer'):
        tag = soup.find(tag_name)
        if tag is not None:
            tag.decompose()

    text_content = soup.get_text(separator=' ', strip=True)

    if not text_content:
        # Include the URL: callers print the exception, and a bare exception
        # would print as an empty line.
        raise EmptyTextContentException(f'No text content found at {url}')

    return text_content

In [None]:
# Manual check: scrape a single page and print its extracted text.
pprint(get_text_content(client=client, url='https://www.cs.utexas.edu/about'))

In [None]:
from urllib.parse import urlparse, ParseResult

def needs_sanitation(loc: str, domain: str) -> bool:
    '''Return True when `loc` has to be rewritten.

    A loc is acceptable as-is only when its scheme is https and its host is
    exactly `www.<domain>`; anything else needs sanitation.
    '''
    parts: ParseResult = urlparse(loc)
    expected_host = f'www.{domain}'
    return not (parts.scheme == 'https' and parts.hostname == expected_host)

def sanitize_loc(loc: str, domain: str) -> str:
    '''Rebase `loc` onto `https://www.<domain>`, keeping the rest of the URL.

    The scheme is forced to https and the network location to `www.<domain>`;
    the original path, path parameters, query and fragment are preserved.

    Args:
        loc: URL taken from a sitemap entry.
        domain: Bare domain, without the `www.` prefix.

    Returns:
        The rewritten absolute URL.
    '''
    # Parse (instead of regexp) the loc URL, then swap scheme and host.
    parsed_loc: ParseResult = urlparse(loc)

    # Reassemble with geturl() rather than string concatenation: it puts a
    # '/' between host and a path that lacks a leading slash, and keeps
    # ';params', which manual path + query + fragment joining dropped.
    return parsed_loc._replace(scheme='https', netloc=f'www.{domain}').geturl()
    

def sanitize_sitemap_data(sitemap_data: List[Dict[str, str]], domain: str) -> List[Dict[str, str]]:
    '''Return the sitemap entries with every loc normalised for `domain`.

    Entries without a truthy 'loc' are dropped. Entries whose loc already
    passes `needs_sanitation` are kept as the same dict object; the others
    are shallow-copied with 'loc' replaced by its sanitized form.
    '''
    def normalised(entry: Dict[str, str]) -> Dict[str, str]:
        if not needs_sanitation(loc=entry['loc'], domain=domain):
            return entry
        return {**entry, 'loc': sanitize_loc(loc=entry['loc'], domain=domain)}

    return [normalised(entry) for entry in sitemap_data if entry.get('loc')]

In [None]:
# Manual check of the sitemap stage for one domain: fetch the sitemap, then
# normalise the loc of each entry and print the result.
domain = 'oia.osu.edu'
sitemap_url = f'https://www.{domain}/sitemap.xml'
sitemap_data = get_sitemap_data(client, sitemap_url)
sanitized_sitemap_data = sanitize_sitemap_data(sitemap_data, domain)

pprint(sanitized_sitemap_data)

In [None]:
from datetime import datetime
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

def get_lastmod_timestamp(data: Dict[str, str]) -> Optional[str]:
    '''Return the entry's 'lastmod' value if it is a valid ISO timestamp.

    Args:
        data: Sitemap entry; its 'lastmod' value may be missing or None.

    Returns:
        The whitespace-stripped lastmod string when it parses with
        `datetime.fromisoformat`, otherwise None.
    '''
    lastmod = data.get('lastmod')
    # Missing, None and non-string values all count as "no timestamp".
    if not isinstance(lastmod, str):
        return None

    # Element text taken from pretty-printed XML may carry stray whitespace.
    lastmod = lastmod.strip()
    if not lastmod:
        return None

    candidates = [lastmod]
    if lastmod.endswith('Z'):
        # fromisoformat only understands the 'Z' (UTC) suffix from Python
        # 3.11 on; retry with the equivalent explicit offset for older ones.
        candidates.append(f'{lastmod[:-1]}+00:00')

    for candidate in candidates:
        try:
            datetime.fromisoformat(candidate)
        except ValueError:
            continue
        return lastmod

    # Explicitly return None if every parse attempt fails.
    return None

def scrape_domain_from_sitemap_data(
    client: ScraperAPIClient,
    sitemap_url: str,
    sitemap_data: List[Dict[str, str]], 
) -> (List[Dict[str, str]], List[Dict[str, str]]):
    '''Scrape the text of every sitemap entry concurrently.

    Each entry is processed on a thread pool of MAX_WORKERS workers. A
    successful entry produces a dict with the keys 'url', 'src',
    'text_content', 'lastmod_timestamp' and 'scrape_timestamp'; when the
    entry has no valid lastmod, the scrape time is used in its place.

    Args:
        client: ScraperAPI client used to fetch each page.
        sitemap_url: Sitemap the entries came from; stored as 'src'.
        sitemap_data: Entries to scrape, each expected to carry a 'loc' URL.

    Returns:
        A `(domain_data, failure_sites)` pair: the scraped site dicts, and
        the input entries that could not be scraped (suitable for a retry).
        Result order follows task completion, not input order.
    '''
    # Both result lists are appended to from worker threads.
    lock = Lock()
    domain_data: List[Dict[str, str]] = []
    failure_sites: List[Dict[str, str]] = []

    def process_url(data: Dict[str, str]):
        try:
            if 'loc' not in data:
                raise KeyError("The key 'loc' is missing in sitemap data item.")

            url = data['loc']
            text_content: str = get_text_content(client, url)
            scrape_timestamp: str = datetime.now().isoformat()
            lastmod_timestamp: str = get_lastmod_timestamp(data) or scrape_timestamp

            site_data = {
                'url': url,
                'src': sitemap_url,
                'text_content': text_content,
                'lastmod_timestamp': lastmod_timestamp,
                'scrape_timestamp': scrape_timestamp,
            }

            with lock:
                domain_data.append(site_data)

        except Exception as e:
            # Worker boundary: the iterator returned by executor.map is never
            # consumed, so an exception escaping here would vanish and the
            # entry would be missing from both result lists. Record every
            # failure instead (KeyError, ScraperAPIException,
            # EmptyTextContentException and anything unexpected).
            with lock:
                failure_sites.append(data)
            print(f'{type(e).__name__}: {e}')

    # Leaving the `with` block waits for all submitted tasks to finish.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        executor.map(process_url, sitemap_data)

    print(f'Successfully scraped text content from {len(domain_data)} out of {len(sitemap_data)} total sites provided.')
    return domain_data, failure_sites

In [None]:
# Target domain option. Several cells assign `domain`; the last one executed
# decides which site the cells below scrape.
domain: str = 'cs.utexas.edu'

In [None]:
# Target domain option. Several cells assign `domain`; the last one executed
# decides which site the cells below scrape.
domain: str = 'aede.osu.edu'

In [None]:
# Target domain option. Several cells assign `domain`; the last one executed
# decides which site the cells below scrape.
domain: str = 'oia.osu.edu'

In [None]:
# Sitemap stage for the selected `domain`: build the sitemap URL, fetch and
# parse its entries, then normalise each entry's loc.
sitemap_url: str = f'https://www.{domain}/sitemap.xml'
sitemap_data: List[Dict[str, str]] = get_sitemap_data(client, sitemap_url)
sanitized_sitemap_data: List[Dict[str, str]] = sanitize_sitemap_data(sitemap_data, domain)

In [None]:
# First pass: scrape every sanitized entry; entries that failed are kept in
# `failure_sites`.
domain_data, failure_sites = scrape_domain_from_sitemap_data(client, sitemap_url, sanitized_sitemap_data)

In [None]:
# Second pass: retry only the entries that failed the first time.
additional_data, additional_failure_sites = scrape_domain_from_sitemap_data(client, sitemap_url, failure_sites)

In [None]:
# Merge the retried successes into the main result list. `extend` adds each
# site dict individually; `append` would nest the whole list as one element
# and break the list-of-dicts shape that the JSON dump and DataFrame expect.
domain_data.extend(additional_data)

In [None]:
def prepare_output_filename(domain: str) -> str:
    '''Return the JSON output filename for `domain`.

    Dots in the domain are replaced with underscores, e.g. 'cs.utexas.edu'
    becomes 'cs_utexas_edu_domain_data.json'.
    '''
    # Build the slug outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before
    # Python 3.12.
    domain_slug = domain.replace('.', '_')
    return f'{domain_slug}_domain_data.json'

# Manual check: show the filename that will be used for the selected domain.
print(prepare_output_filename(domain))

In [None]:
import json
# Write the scraped site dicts to '<domain with underscores>_domain_data.json'
# as indented JSON (UTF-8 file; json.dump's defaults are otherwise unchanged).
output_filename = prepare_output_filename(domain)
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(domain_data, f, indent=2)

In [None]:
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode
# Set up Plotly's offline notebook mode before any figure is shown.
# NOTE(review): `connected=True` presumably loads plotly.js from a CDN
# instead of embedding it - confirm against the Plotly docs.
init_notebook_mode(connected=True)

In [None]:
# One row per scraped site dict; 'text_length' is the character count of each
# page's extracted text.
df = pd.DataFrame(domain_data)
df['text_length'] = df['text_content'].apply(len)

In [None]:
# Histogram of per-page text length across the scraped domain.
fig = px.histogram(
    df, 
    x='text_length', 
    title='Distribution of Text Content Length',
    labels={'text_length': 'Text Length (chars)'},
    nbins=5000  # Adjust number of bins as needed
)

# Display the figure using Plotly's "iframe" renderer.
fig.show(renderer="iframe")