Given a domain, e.g. cs.utexas.edu, scrape it and generate a site.json
containing the text content of every page listed in
domain/sitemap.xml.

1. Get the list of sites (and their metadata) from the sitemap.
2. Iterate through the sites and scrape their text content.
3. Store the text content and scrape_timestamp
together with the site metadata in an output list of site objects.

In [None]:
# Env imports.
from os import getenv
from dotenv import load_dotenv

In [None]:
# Retrieve the API keys from environment variables.
# load_dotenv() runs first (presumably to pull variables from a local .env
# file into the environment -- confirm against python-dotenv docs).
# NOTE(review): os.getenv returns None when SCRAPERAPI_API_KEY is unset, so
# the value may not actually be a str despite the annotation.
load_dotenv()
scraperapi_api_key: str = getenv('SCRAPERAPI_API_KEY')

In [None]:
from pprint import pprint
from scraperapi_sdk import ScraperAPIClient
from scraperapi_sdk.exceptions import ScraperAPIException

# Single ScraperAPI client built from the API key read from the environment;
# the cells below pass it to every function that performs a request.
client = ScraperAPIClient(scraperapi_api_key)

In [19]:
from datetime import datetime
from lxml import etree
from typing import List, Dict, Optional

# Prefix -> namespace URI map passed to find()/findall() so that paths written
# as 's:<tag>' resolve against the sitemaps.org 0.9 schema namespace.
SITEMAP_NAMESPACE = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

class EmptySitemapDataException(Exception):
    '''Raised when the requested domain's sitemap.xml yields no valid URLs.'''

def get_subfield(url_tag: etree._Element, field_name: str) -> Optional[str]:
    '''Returns the text of a namespaced child element, or None if it is absent.

    Args:
        url_tag (etree._Element): The element whose children are searched.
        field_name (str): Local tag name, looked up under the 's' prefix of SITEMAP_NAMESPACE.

    Returns:
        Optional[str]: The text of the first matching child, or None when no such child exists.
    '''
    match = url_tag.find(f's:{field_name}', SITEMAP_NAMESPACE)
    if match is None:
        return None
    return match.text

def get_sitemap_data(client: ScraperAPIClient, domain: str) -> List[Dict[str, str]]:
    '''Fetches and parses the sitemap.xml of a domain.

    Every <url> entry with a non-empty <loc> becomes one dictionary with the
    keys 'loc', 'lastmod', 'priority' and 'changefreq'. The last three are
    None when the corresponding tag is missing from the entry.

    Args:
        client (ScraperAPIClient): The ScraperAPI synchronous client.
        domain (str): The domain to retrieve sitemap data from.

    Returns:
        List[Dict[str, str]]: A list of dictionaries containing URL data.

    Raises:
        ScraperAPIException: If failure requesting to sitemap.xml.
        EmptySitemapDataException: If no URLs are found in the sitemap.
    '''
    sitemap_url = f'https://{domain}/sitemap.xml'

    # Use 'make_request' instead of 'get' for byte str compatibility.
    # i.e. w/ encoding declaration <?xml version="1.0" encoding="UTF-8"?>
    response = client.make_request(url=sitemap_url)
    root = etree.fromstring(response.content)
    sitemap_data = []

    for url in root.findall('.//s:url', SITEMAP_NAMESPACE):
        # Pretty-printed sitemaps may pad the URL with whitespace, and an empty
        # <loc/> has no text at all; neither should reach the scraper as-is.
        loc = (get_subfield(url_tag=url, field_name='loc') or '').strip()
        if not loc:
            pprint(f'Warning: Missing or empty <loc> tag in URL data: {url}. Skipping...')
            continue

        sitemap_data.append({
            'loc': loc,
            'lastmod': get_subfield(url_tag=url, field_name='lastmod'),
            'priority': get_subfield(url_tag=url, field_name='priority'),
            'changefreq': get_subfield(url_tag=url, field_name='changefreq'),
        })

    if not sitemap_data:
        raise EmptySitemapDataException(f'No valid URLs found in {sitemap_url}.')

    return sitemap_data

# Smoke test: fetch the cs.utexas.edu sitemap and pretty-print the parsed entries.
pprint(get_sitemap_data(client=client, domain='cs.utexas.edu'))

[{'changefreq': 'daily',
  'lastmod': None,
  'loc': 'https://www.cs.utexas.edu/',
  'priority': '1.0'},
 {'changefreq': 'yearly',
  'lastmod': '2024-08-27T15:33Z',
  'loc': 'https://www.cs.utexas.edu/news/2021/kristen-grauman-named-finalist-2021-blavatnik-national-awards-young-scientists',
  'priority': None},
 {'changefreq': 'monthly',
  'lastmod': '2024-09-11T20:16Z',
  'loc': 'https://www.cs.utexas.edu/undergraduate-program/academics/curriculum/courses',
  'priority': None},
 {'changefreq': 'yearly',
  'lastmod': '2024-06-24T21:07Z',
  'loc': 'https://www.cs.utexas.edu/engage/outreach/academies/all',
  'priority': None},
 {'changefreq': 'yearly',
  'lastmod': '2024-06-24T21:08Z',
  'loc': 'https://www.cs.utexas.edu/engage/outreach/academies/women',
  'priority': None},
 {'changefreq': 'yearly',
  'lastmod': '2024-06-24T21:44Z',
  'loc': 'https://www.cs.utexas.edu/engage/outreach/academies/robotics',
  'priority': None},
 {'changefreq': 'yearly',
  'lastmod': '2024-06-24T21:41Z',
  

In [None]:
from bs4 import BeautifulSoup

class EmptyTextContentException(Exception):
    '''Raised when the requested URL yields no text content.'''

def get_text_content(client: ScraperAPIClient, url: str) -> str:
    '''Fetches and parses all textual content from a web page, excluding header and footer.

    Args:
        client (ScraperAPIClient): The ScraperAPI client instance to make requests.
        url (str): The URL of the website to retrieve text content from.

    Returns:
        str: The extracted text content from the website as a single string,
            with text fragments joined by single spaces.

    Raises:
        ScraperAPIException: If failure requesting to given URL.
        EmptyTextContentException: If no text content is found at the given URL.
    '''
    raw_html = client.get(url)
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove the first <header> and the first <footer> (when present) so that
    # their text is not part of the extracted content.
    for tag_name in ('header', 'footer'):
        tag = soup.find(tag_name)
        if tag is not None:
            tag.decompose()

    text_content = soup.get_text(separator=' ', strip=True)

    if not text_content:
        # Carry the URL in the exception so callers that log the error
        # (e.g. 'Failed to scrape ...: {e}') show a meaningful reason.
        raise EmptyTextContentException(f'No text content found at {url}.')

    return text_content

# Smoke test: scrape a single known page and pretty-print its extracted text.
pprint(get_text_content(client=client, url='https://www.cs.utexas.edu/turing-scholars/benefits'))

In [20]:
import json
from threading import Thread, Lock

def scrape_domain(
    client: ScraperAPIClient,
    sitemap_data: List[Dict[str, str]],
    max_thread_count: int = 5,
    output_filename: str = 'domain_data.json'
) -> List[Dict[str, str]]:
    '''Scrapes text content from URLs in sitemap_data using n threads and records successes and failures.

    Each successfully scraped record is appended to output_filename as soon as
    it is available, so the file is built up incrementally as a JSON array.
    URLs that fail on the first pass are retried exactly once.

    Args:
        client (ScraperAPIClient): The ScraperAPI client instance for making HTTP requests.
        sitemap_data (List[Dict[str, str]]): List of dictionaries containing URL data;
            each must have a 'loc' key holding the URL to scrape.
        max_thread_count (int): The number of concurrent threads to use.
        output_filename (str): The name of the output data streaming json file.

    Returns:
        List[Dict[str, str]]: A list of dictionaries with successful URL data and text content.
            Each record is the input dictionary plus 'text_content' and 'scrape_timestamp'.
    '''
    # Lock guarding the output file and the two shared result lists.
    lock = Lock()
    domain_data = []
    failure_sites = []

    # Start JSON array in output file.
    with open(output_filename, 'w') as f:
        f.write('[')

    def scrape_single_url(data: Dict[str, str]) -> None:
        '''Scrapes one URL and records it as either a success or a failure.'''
        loc = data['loc']
        try:
            site_data = {
                **data,
                'text_content': get_text_content(client=client, url=loc),
                'scrape_timestamp': datetime.now().isoformat()
            }
        except (ScraperAPIException, EmptyTextContentException) as e:
            with lock:
                failure_sites.append(data)
            pprint(f'Failed to scrape {loc}: {e}')
            return

        with lock:
            # Append new scraped data to json file.
            # Write comma only if it's not the first entry.
            with open(output_filename, 'a') as f:
                if domain_data:
                    f.write(',\n')
                json.dump(site_data, f)
            domain_data.append(site_data)
        pprint(f'Successfully scraped text content of {loc}.')

    def scrape_in_batches(items: List[Dict[str, str]]) -> None:
        '''Scrapes items in batches of up to max_thread_count threads and waits for all of them.'''
        threads = []
        for data in items:
            # Batch is full: wait for every thread in it before starting more.
            if len(threads) >= max_thread_count:
                for thread in threads:
                    thread.join()
                threads.clear()

            new_thread = Thread(target=scrape_single_url, args=(data,))
            new_thread.start()
            threads.append(new_thread)

        # Final join for remaining threads.
        for thread in threads:
            thread.join()

    # Main scraping pass.
    scrape_in_batches(sitemap_data)

    # Retry once for all previously failed attempts.
    if failure_sites:
        pprint('Start retrying once all previously failed attempts...')
        # Snapshot then clear the failure list to avoid dups and inf looping;
        # sites that fail again are re-added by scrape_single_url.
        retry_failures = failure_sites[:]
        failure_sites.clear()
        scrape_in_batches(retry_failures)

    # Before returning, close JSON array in output file.
    with open(output_filename, 'a') as f:
        f.write(']')

    if failure_sites:
        pprint(f'''
        Scrape and retry concluded. 
        These site failed again: {failure_sites}. 
        Returning {len(domain_data)} successful records.
        ''')

    return domain_data

# Full pipeline run: read the sitemap of the target domain, then scrape every
# listed page. scrape_domain is called with its defaults, so it uses 5 threads
# and writes its results to 'domain_data.json'.
domain = 'cs.utexas.edu'
sitemap_data = get_sitemap_data(client, domain)
domain_data = scrape_domain(client, sitemap_data)

('Successfully scraped text content of '
 'https://www.cs.utexas.edu/news/2021/kristen-grauman-named-finalist-2021-blavatnik-national-awards-young-scientists.')
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/undergraduate-program/academics/curriculum/courses.')
'Successfully scraped text content of https://www.cs.utexas.edu/.'
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/engage/outreach/academies/all.')
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/engage/outreach/academies/women.')
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/engage/outreach/academies/game-development.')
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/undergraduate-program/academics/concentrations.')
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/concentrations/game-development.')
('Successfully scraped text content of '
 'https://www.cs.utexas.edu/engage/outreach/academies/ios-develo

In [None]:
# Display the scraped records as the cell output.
domain_data

In [21]:
# Number of pages that were scraped successfully.
pprint(len(domain_data))

1137


In [22]:
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode
# Initialise plotly's offline notebook mode before any figure is shown.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead
# of embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)

In [23]:
# One row per scraped page; text_length is the character count of each
# page's text_content.
df = pd.DataFrame(domain_data)
df['text_length'] = df['text_content'].apply(len)

In [24]:
# Display the DataFrame as the cell output.
df

Unnamed: 0,loc,lastmod,priority,changefreq,text_content,scrape_timestamp,text_length
0,https://www.cs.utexas.edu/news/2021/kristen-gr...,2024-08-27T15:33Z,,yearly,Kristen Grauman Named Finalist in 2021 Blavatn...,2024-10-30T14:54:01.387365,2745
1,https://www.cs.utexas.edu/undergraduate-progra...,2024-09-11T20:16Z,,monthly,Undergraduate Courses | Department of Computer...,2024-10-30T14:54:01.697593,5233
2,https://www.cs.utexas.edu/,,1.0,daily,Department of Computer Science Skip to main co...,2024-10-30T14:54:02.054062,2692
3,https://www.cs.utexas.edu/engage/outreach/acad...,2024-06-24T21:07Z,,yearly,Academy for All | Department of Computer Scien...,2024-10-30T14:54:02.102898,9448
4,https://www.cs.utexas.edu/engage/outreach/acad...,2024-06-24T21:08Z,,yearly,Academy for Women | Department of Computer Sci...,2024-10-30T14:54:03.031066,8456
...,...,...,...,...,...,...,...
1132,https://www.cs.utexas.edu/news/2024/keeping-ai...,2024-09-25T15:54Z,,monthly,Keeping Up with AI’s Increasingly Complex Netw...,2024-10-30T15:07:56.219656,2251
1133,https://www.cs.utexas.edu/news/2024/rememberin...,2024-10-22T22:59Z,,weekly,Remembering Turing Award Winner E. Allen Emers...,2024-10-30T15:07:56.750438,3642
1134,https://www.cs.utexas.edu/news/2024/ut-program...,2024-10-02T20:06Z,,monthly,UT Programming Team Competes in 48th ICPC Worl...,2024-10-30T15:07:57.053503,2920
1135,https://www.cs.utexas.edu/engage/industry/focs,2024-07-17T21:26Z,,yearly,Friends of Computer Science | Department of Co...,2024-10-30T15:07:58.593352,16674


In [25]:
# Horizontal bar chart: one bar per page URL ('loc' on the y axis) whose
# length is the page's text length in characters.
fig = px.bar(
    df, 
    y='loc', 
    x='text_length', 
    title='Text Content Len per Page',
    labels={
        'loc': 'Page',
        'text_length': 'Text Len (chars)'
    }
)

# Render through the "iframe" renderer.
fig.show(renderer="iframe")

In [26]:
# Histogram of page text lengths, showing how page sizes are distributed
# across the scraped domain. Rebinds `fig`, replacing the bar chart above.
fig = px.histogram(
    df, 
    x='text_length', 
    title='Distribution of Text Content Length',
    labels={'text_length': 'Text Length (chars)'},
    nbins=5000  # Adjust number of bins as needed
)

# Render through the "iframe" renderer.
fig.show(renderer="iframe")