In [1]:
# Data Collection

import requests, csv, queue
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

progress = display(display_id=True)
successes, failures = 0, 0
visited = set()
to_visit = queue.Queue()
dataset = []
visited_lock = Lock()
to_visit_lock = Lock()
session = requests.Session()

DOMAIN = "https://charlotte.edu"
EXCLUDE = ['?', 'directory', 'article', 'journals', 'coefs', 'calendar', 'day', 'xml', 'jpg', 'png', 'jpeg', 'sites', 'page', 'gateway', 'illiad', 'news-articles', 'news-events', 'news-media', 'linkedin', 'facebook', 'twitter', 'instagram', 'youtube', 'flickr', 'pinterest', '.com', '.org', '.net', '.gov', '.pdf', '.doc', 'xml', 'php', 'mailto:', '@', 'tel:', 'javascript:', 'tel:', 'sms:', 'mailto:', 'angular', 'react', '.js', 'event', 'corporate', '#', 'image', 'gallery', 'taskstream-student-handbook']    
PATH_LIMIT = 2
WORKERS = 50

def scrape():
    global successes, failures, visited, to_visit, dataset

    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        futures = {executor.submit(process_domain, domain) for domain in iter(to_visit.get, None)}
        for future in as_completed(futures):
            domain = future.result()
            if domain is not None:
                visited.add(domain)

def worker():
    global successes, failures, visited, to_visit, dataset
    while True:
        with to_visit_lock:
            if to_visit.empty():
                break
            domain = to_visit.get()
        if domain in visited:
            continue
        visited.add(domain)
        process_domain(domain)

def process_domain(domain):
    global successes, failures, visited, to_visit, dataset, session
    try:
        response = session.get(domain)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        get_urls(soup)
        extract_data(soup, domain)
    except Exception as e:
        failures += 1
    successes += 1

    progress.update(f'{successes} successes, {failures} failures, domain: {domain}')

def get_urls(soup):
    global visited, to_visit, visited_lock
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        if any(exclude in href for exclude in EXCLUDE):
            continue
        if href.startswith('/'):
            href = urljoin(DOMAIN, href)
        with visited_lock:
            if ("charlotte.edu" in href) and (href not in visited):
                visited.add(href)
                to_visit.put(href)

def extract_data(soup, url):
    global dataset
    title = soup.title.string if soup.title else ''
    
    # Find the "main", "main-content", or "body" element
    element_ids = ["main", "main-content", "body"]
    element = None

    for elem_id in element_ids:
        element = soup.find(id=elem_id)
        if element:
            if elem_id == "main-content":
                element = element.parent
            break

    # Extract all visible text in the element and its child elements
    text = element.get_text(strip=True, separator=' ') if element else ''

    # Clean up the text
    text = text.replace('"', "'")    
    text = text.replace('\n', '')
    text = text.replace('\t', '')    
    text = ' '.join(text.split())
    
    dataset.append({'url': url, 'title': title, 'text': text})

to_visit.put(DOMAIN)
scrape()

# Save to CSV
with open('./data/dataset.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'title', 'text'])  # write the header
    for data in dataset:
        writer.writerow([data['url'], data['title'], data['text']])

'23266 successes, 339 failures, domain: https://cci.charlotte.edu/people/li-yang/'

  k = self.parse_starttag(i)


In [1]:
import requests, csv
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Event
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Variables for progress tracking and data storage
successes, failures = 0, 0
visited = set()
to_visit = []
dataset = []
visited_lock = Lock()
event = Event()

session = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount('http://', adapter)
session.mount('https://', adapter)

DOMAIN = "https://charlotte.edu"
EXCLUDE = ['?', 'directory', 'article', 'journals', 'coefs', 'calendar', 'day', 'xml', 'jpg', 'png', 'jpeg', 'sites', 'page', 'gateway', 'illiad', 'news-articles', 'news-events', 'news-media', 'linkedin', 'facebook', 'twitter', 'instagram', 'youtube', 'flickr', 'pinterest', '.com', '.org', '.net', '.gov', '.pdf', '.doc', 'xml', 'php', 'mailto:', '@', 'tel:', 'javascript:', 'tel:', 'sms:', 'mailto:', 'angular', 'react', '.js', 'event', 'corporate', '#', 'image', 'gallery', 'taskstream-student-handbook']    

WORKERS = 50

def scrape(domain):
    global successes, failures
    try:
        response = session.get(domain)
        if response.status_code != 200:
            logging.info(f"Failed to process {domain}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        get_urls(soup)
        extract_data(soup, domain)
        successes += 1
    except Exception as e:
        logging.error(f"Error processing {domain}: {e}")
        failures += 1
    finally:
        logging.info(f'{successes} successes, {failures} failures, domain: {domain}')

def get_urls(soup):
    global visited
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not any(exclude in href for exclude in EXCLUDE):
            if href.startswith('/'):
                href = urljoin(DOMAIN, href)
            with visited_lock:
                if href not in visited:
                    visited.add(href)
                    to_visit.append(href)

def extract_data(soup, url):
    global dataset
    title = soup.title.string if soup.title else ''
    element_ids = ["main", "main-content", "body"]
    text = ''
    for elem_id in element_ids:
        element = soup.find(id=elem_id)
        if element:
            text = element.get_text(strip=True, separator=' ')
            break
    text = ' '.join(text.replace('"', "'").replace('\n', '').replace('\t', '').split())
    dataset.append({'url': url, 'title': title, 'text': text})

def main():
    to_visit.append(DOMAIN)
    while to_visit or not event.is_set():
        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = [executor.submit(scrape, domain) for domain in to_visit.copy()]
            to_visit.clear()
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    logging.error(f"Future execution error: {e}")
        if not to_visit:
            event.set()

    # Save to CSV
    with open('./data/dataset.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'title', 'text'])
        for data in dataset:
            writer.writerow([data['url'], data['title'], data['text']])
    logging.info("Scraping completed.")

if __name__ == "__main__":
    main()


2024-04-09 19:58:15,647 - INFO - 1 successes, 0 failures, domain: https://charlotte.edu
2024-04-09 19:58:15,665 - ERROR - Error processing landing/campus-life: Invalid URL 'landing/campus-life': No scheme supplied. Perhaps you meant https://landing/campus-life?
2024-04-09 19:58:15,668 - INFO - 1 successes, 1 failures, domain: landing/campus-life
2024-04-09 19:58:15,695 - INFO - 2 successes, 1 failures, domain: https://www.charlotte.edu/academics
2024-04-09 19:58:16,940 - INFO - 3 successes, 1 failures, domain: https://charlotte.edu/research
2024-04-09 19:58:16,962 - INFO - 4 successes, 1 failures, domain: https://www.charlotte.edu/landing/about-us
2024-04-09 19:58:16,986 - INFO - 5 successes, 1 failures, domain: https://www.charlotte.edu/landing/admissions-financial-aid
2024-04-09 19:58:17,010 - INFO - 6 successes, 1 failures, domain: https://inside.charlotte.edu/news-features/2024-03-29/inaugural-million-dollar-research-circle-celebrates-research-success
2024-04-09 19:58:17,055 - INFO