In [None]:
'''
Given a domain, e.g. cse.osu.edu, scrape and generate a domain_data.json
containing all text content of all pages available through
domain/sitemap.xml.

1. Get list of sites (and metadata) from sitemap.
2. Iterate through sites, scrape text content.
3. Store text content, scrape_timestamp 
together with site metadata to output list of site objects.
'''

In [None]:
# Env imports.
from os import getenv
from dotenv import load_dotenv

In [None]:
# Retrieve the API keys from environment variables.
# load_dotenv() runs first so that getenv() can also see values it loads.
load_dotenv()
scraperapi_api_key: str = getenv('SCRAPERAPI_API_KEY')
if not scraperapi_api_key:
    # getenv() returns None for an unset variable. Stop here with a clear
    # message instead of handing a missing key to the client and failing
    # later on the first request.
    raise RuntimeError(
        'SCRAPERAPI_API_KEY is not set; export it or add it to your .env file.'
    )

In [None]:
from pprint import pprint
from scraperapi_sdk import ScraperAPIClient

# Shared ScraperAPI client; the fetch helpers in later cells call client.get(url).
client = ScraperAPIClient(scraperapi_api_key)

In [None]:
from datetime import datetime
from lxml import etree

# Prefix -> namespace URI map for sitemap XML; passed to find()/findall()
# so element paths can be written as 's:url', 's:loc', and so on.
SITEMAP_NAMESPACE = dict(s='http://www.sitemaps.org/schemas/sitemap/0.9')

def get_sitemap_data(domain: str) -> list[dict]:
    """Fetch ``https://{domain}/sitemap.xml`` and return one dict per ``<url>`` entry.

    Args:
        domain: Host name without scheme, e.g. ``'cse.osu.edu'``.

    Returns:
        A list of dicts with the keys ``'loc'``, ``'lastmod'`` and
        ``'priority'``. Each value is the whitespace-stripped text of the
        matching child element, or ``None`` when the ``<url>`` entry has no
        such child (or the child is empty).

    Raises:
        Whatever ``client.get`` raises on a failed fetch, and lxml's parse
        errors when the response is not well-formed XML.
    """
    def child_text(entry, tag: str):
        # Look the child up once; the original searched the tree twice per field.
        node = entry.find(f's:{tag}', SITEMAP_NAMESPACE)
        if node is None or node.text is None:
            return None
        # Pretty-printed sitemaps may wrap the value in newlines/indentation.
        return node.text.strip()

    raw_sitemap = client.get(f'https://{domain}/sitemap.xml')
    try:
        root = etree.fromstring(raw_sitemap)
    except ValueError:
        # lxml rejects str input that carries an XML encoding declaration
        # ('<?xml ... encoding="..."?>'), which sitemaps commonly have.
        # Retry as bytes in that case.
        # NOTE(review): assumes the text round-trips as UTF-8; a sitemap that
        # declares a different encoding would need the original response bytes.
        if not isinstance(raw_sitemap, str):
            raise
        root = etree.fromstring(raw_sitemap.encode('utf-8'))

    return [
        {field: child_text(url, field) for field in ('loc', 'lastmod', 'priority')}
        for url in root.findall('.//s:url', SITEMAP_NAMESPACE)
    ]

# Smoke test: fetch the cse.osu.edu sitemap through `client` and print the parsed entries.
pprint(get_sitemap_data('cse.osu.edu'))

In [None]:
from bs4 import BeautifulSoup

def get_text_content(url: str) -> str:
    """Fetch ``url`` through ``client`` and return its text as one normalized line.

    The first ``<header>`` and the first ``<footer>`` element of the page, if
    present, are removed before the text is extracted. All runs of
    whitespace in the remaining text are collapsed to single spaces.
    """
    page = BeautifulSoup(client.get(url), 'html.parser')

    # Drop page chrome first so it does not end up in the extracted text.
    for tag_name in ('header', 'footer'):
        chrome = page.find(tag_name)
        if chrome is not None:
            chrome.decompose()

    words = page.get_text().split()
    return ' '.join(words)

# Smoke test: scrape a single known page through `client` and print its cleaned text.
pprint(get_text_content('http://cse.osu.edu/events/guest-speaker-deepayan-chakrabarti'))

In [None]:
def scrape_domain(sitemap_data: list[dict]) -> list[dict]:
    """Scrape the text of every page listed in ``sitemap_data``.

    Args:
        sitemap_data: Sitemap entries as dicts, each carrying the page URL
            under ``'loc'`` (the shape ``get_sitemap_data`` returns).

    Returns:
        A new list with one dict per scraped entry: the entry's own keys plus
        ``'text_content'`` (the result of ``get_text_content``) and
        ``'scrape_timestamp'`` (``datetime.now()`` in ISO 8601 format).
        Entries without a usable ``'loc'`` are skipped. The input dicts are
        not modified.
    """
    domain_data = []
    for data in sitemap_data:
        loc = data.get('loc')
        if not loc:
            # 'loc' can be None when the sitemap entry had no <loc> child;
            # there is nothing to fetch for such an entry.
            continue
        domain_data.append({
            **data,
            'text_content': get_text_content(loc),
            # Timestamp is taken after the fetch. The key was previously
            # misspelled 'scrape_timestimp'.
            'scrape_timestamp': datetime.now().isoformat(),
        })
    return domain_data

# Full run: list the pages from the sitemap, scrape each one, and print every record.
# Each page is fetched through `client`, so this cell issues one request per sitemap entry.
domain = 'cse.osu.edu'
sitemap_data = get_sitemap_data(domain)
domain_data = scrape_domain(sitemap_data)
pprint(domain_data)

In [None]:
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode
# Set up plotly for in-notebook rendering (see plotly.offline.init_notebook_mode
# for what connected=True selects).
init_notebook_mode(connected=True)

In [None]:
# One row per scraped page, plus a column holding the character count of its text.
df = pd.DataFrame(domain_data).assign(
    text_length=lambda pages: pages['text_content'].map(len)
)

In [None]:
# Display the DataFrame built in the previous cell.
df

In [None]:
# Bar chart of text length per page: page URL ('loc') on the y-axis,
# character count ('text_length') on the x-axis.
fig = px.bar(
    df,
    x='text_length',
    y='loc',
    labels=dict(loc='Page', text_length='Text Len (chars)'),
    title='Text Content Len per Page',
)
fig.show(renderer="iframe")

In [None]:
# Histogram of per-page text lengths across the whole scrape.
fig = px.histogram(
    df,
    x='text_length',
    nbins=5000,  # Adjust number of bins as needed
    labels=dict(text_length='Text Length (chars)'),
    title='Distribution of Text Content Length',
)
fig.show(renderer="iframe")

In [None]:
import json
# Persist the scraped records as pretty-printed JSON (2-space indent).
with open('domain_data.json', mode='w') as out_file:
    json.dump(domain_data, out_file, indent=2)

In [None]:
# Number of records in domain_data.
pprint(len(domain_data))