# In [1]:
# Data Collection

import requests, os, re, json
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor

# Display handle refreshed in place with the running success/failure counts.
# NOTE(review): `display` is not imported here; presumably the IPython
# notebook built-in -- confirm before running outside a notebook.
progress = display(display_id=True)

# Crawl counters.
successes = 0
failures = 0

# Shared crawl state.
visited = set()   # URLs already handed to the scraper
to_visit = set()  # URLs discovered on scraped pages
dataset = []      # one {'url', 'title', 'text'} record per scraped page

# Start page of the crawl; also the base used to resolve root-relative links.
DOMAIN = "https://charlotte.edu"

# A link is skipped when its href contains ANY of these substrings.
# ('tel:' and 'mailto:' each appear twice; harmless for a substring filter.)
EXCLUDE = [
    '?', 'page', 'gateway', 'illiad',
    'news-articles', 'news-events', 'news-media',
    'linkedin', 'facebook', 'twitter', 'instagram',
    'youtube', 'flickr', 'pinterest',
    '.com', '.org', '.net', '.gov',
    '.pdf', '.doc', 'xml', 'php',
    'mailto:', '@', 'tel:', 'javascript:', 'tel:', 'sms:', 'mailto:',
    'angular', 'react', '.js',
    'event', 'corporate', '#', 'image', 'gallery',
    'taskstream-student-handbook',
]

# Upper bound on concurrent scraper threads.
WORKERS = 50

# Seconds to wait on a single request; without a timeout one stalled server
# would block a worker thread indefinitely.
_REQUEST_TIMEOUT = 30


def _scrape_page(url):
    """Fetch one page, collect its links and record its text.

    Returns:
        True  -- the page was fetched and processed.
        False -- fetching or parsing raised (the error is printed).
        None  -- the server answered with a non-200 status; the page is
                 skipped and counted as neither success nor failure.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        get_urls(soup)
        extract_data(soup, url)
    except Exception as e:
        # Best-effort crawl: report the problem and carry on with other pages.
        print(f'Error {e}')
        return False
    return True


def recursive_scrape(domain):
    """Crawl outward from ``domain`` until no unvisited URLs remain.

    Pages are processed in waves: every URL of the current wave is fetched
    concurrently on one shared thread pool, and the links those pages add to
    ``to_visit`` form the next wave. Results are appended to ``dataset`` by
    ``extract_data``; the ``successes`` / ``failures`` counters and the
    ``progress`` display are updated as each page finishes.

    A single pool is used (rather than a new pool per page) so the number of
    threads stays bounded by ``WORKERS``, and ``to_visit`` is only read
    between waves, when no worker is adding to it.

    Args:
        domain: URL of the first page to scrape. Nothing happens if it has
            already been visited.
    """
    global successes, failures
    if domain in visited:
        return

    batch = [domain]
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        while batch:
            # Mark the whole wave up front so get_urls does not re-queue
            # these URLs while they are in flight.
            visited.update(batch)
            # Counters are updated only here, in the coordinating thread,
            # so no lock is needed.
            for outcome in executor.map(_scrape_page, batch):
                if outcome is True:
                    successes += 1
                elif outcome is False:
                    failures += 1
                progress.update(f'{successes} successes, {failures} failures')
            # map() has been fully consumed, so every worker of this wave is
            # done and to_visit is stable while we read it.
            batch = list(to_visit - visited)
        
def get_urls(soup):
    """Queue the in-scope links found on a parsed page.

    Every ``<a href>`` in ``soup`` is considered. A link is dropped when its
    href contains any ``EXCLUDE`` substring, when it is not an http(s) URL on
    ``charlotte.edu`` (or one of its subdomains), or when it is already in
    ``visited``. Root-relative hrefs (starting with ``/``) are resolved
    against ``DOMAIN`` first. Surviving URLs are added to ``to_visit``.

    Args:
        soup: Parsed page; must provide ``find_all('a')`` yielding tags with
            a ``get('href')`` accessor (a BeautifulSoup document).
    """
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        if any(exclude in href for exclude in EXCLUDE):
            continue
        try:
            if href.startswith('/'):
                href = urljoin(DOMAIN, href)
            parts = urlparse(href)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets); nothing to fetch.
            continue
        # Compare the parsed hostname instead of searching the whole string,
        # so look-alike hosts ("notcharlotte.edu") and paths that merely
        # mention the name are not treated as part of the site.
        host = parts.hostname or ''
        on_site = host == 'charlotte.edu' or host.endswith('.charlotte.edu')
        if parts.scheme in ('http', 'https') and on_site and href not in visited:
            to_visit.add(href)

def extract_data(soup, url):
    """Append one ``{'url', 'title', 'text'}`` record for a page to ``dataset``.

    The text is taken from the first element found whose id is "main",
    "main-content" or "body" (tried in that order); for "main-content" the
    element's parent is used instead. If none is found the text is ''.

    Args:
        soup: Parsed page (BeautifulSoup document).
        url: URL the page was fetched from; stored verbatim in the record.
    """
    # .string is None when <title> is empty or wraps nested tags, so fall
    # back to ''. str() detaches the value from the parse tree, which a
    # NavigableString would otherwise keep alive for as long as it is stored.
    title_tag = soup.title
    title = str(title_tag.string or '') if title_tag else ''

    # Locate the content container by id.
    element = None
    for elem_id in ("main", "main-content", "body"):
        element = soup.find(id=elem_id)
        if element:
            if elem_id == "main-content":
                element = element.parent
            break

    # Extract all visible text in the element and its child elements.
    text = element.get_text(strip=True, separator=' ') if element else ''

    # Clean up the text: unify quotes, then collapse every run of whitespace
    # (spaces, newlines, tabs) into one space. Newlines and tabs must not be
    # deleted outright, or the words on either side would be glued together.
    text = text.replace('"', "'")
    text = ' '.join(text.split())

    dataset.append({'url': url, 'title': title, 'text': text})

# Crawl the site starting from its home page.
recursive_scrape(DOMAIN)

# Make sure the output directory exists before writing, so a finished crawl
# is not lost to a missing ./data folder.
os.makedirs('./data', exist_ok=True)
with open('./data/dataset.json', 'w', encoding='utf-8') as f:
    json.dump(dataset, f)

# Output: '574 successes, 0 failures'