In [1]:
# Imports
import re
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
from urllib.parse import urlparse
import os
import sys
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from IPython.display import clear_output, display
import csv
csv.field_size_limit(2**31 - 1)

131072

In [2]:
# Path of the CSV file the crawl results are written to (see write_to_csv).
FILENAME = './data/endpoints.csv'
# Root URL that is passed to crawl_domain() as the starting point.
DOMAIN = 'https://charlotte.edu'

In [3]:
# Substrings that disqualify a URL from being crawled.
# is_valid_url() rejects any URL that contains at least one of these entries
# anywhere in the string, so short entries match broadly (for example 'event'
# also matches 'news-events', and '.com' matches any URL containing '.com').
EXCLUDE = [
    # News sections
    'news-articles',
    'news-events',
    'news-media',
    # Social media
    'linkedin',
    'facebook',
    'twitter',
    'instagram',
    'youtube',
    'flickr',
    'pinterest',
    # Other top-level domains
    '.com',
    '.org',
    '.net',
    '.gov',
    # Documents, feeds and scripts
    '.pdf',
    '.doc',
    'xml',
    'php',
    '.js',
    # Non-HTTP link schemes and e-mail addresses
    'mailto:',
    '@',
    'tel:',
    'sms:',
    'javascript:',
    # Miscellaneous
    'angular',
    'react',
    'event',
    'corporate',
    '#',
    'image',
    'gallery',
    'taskstream-student-handbook',
]

In [4]:
# Display the progress of the web scraping
# Two separate output areas are created up front; print_status() later calls
# .update() on these handles so the text is replaced in place instead of
# appending a new line for every URL.
current_display = display('Starting...', display_id=True)
progress_display = display('Starting...', display_id=True)
# Running totals, incremented by print_status().
success_count = 0
failure_count = 0

def print_status(url, status):
    """Record the outcome for one URL and refresh both progress displays.

    'Success' bumps the success counter; 'Failed' and 'Exception' both bump
    the failure counter. Any other status leaves the counters unchanged but
    is still shown as the most recent status.
    """
    global success_count, failure_count

    if status == 'Success':
        success_count += 1
    elif status in ('Failed', 'Exception'):
        failure_count += 1

    # Replace the text of the two display areas with the latest state
    latest = f'Most recent URL: {url} \nStatus: {status}'
    totals = f'Successes: {success_count}, Failures: {failure_count}'
    current_display.update(latest)
    progress_display.update(totals)
    

def remove_url_prefix(url):
    """Return ``url`` lower-cased, without a leading scheme and ``www.``.

    Only a prefix is removed: an optional ``http://`` or ``https://`` at the
    very start, optionally followed by ``www.``. Occurrences further inside
    the URL (for example a URL embedded in a query string, or a path that
    happens to contain ``www.``) are left untouched.

    The URL is lower-cased before the prefix is stripped so that an
    upper-case scheme or host such as ``HTTPS://WWW.`` is handled as well.
    """
    return re.sub(r'^(?:https?://)?(?:www\.)?', '', url.lower(), count=1)

def is_valid_url(url):
    """Decide whether ``url`` should be crawled.

    A URL is rejected when it contains any EXCLUDE substring or when its
    length is outside the range 8..100 characters. Otherwise it is accepted
    only if its host is ``charlotte.edu`` or a subdomain of it. The scheme is
    optional, so both 'https://charlotte.edu/x' and 'charlotte.edu/x' work.

    The host itself is compared instead of searching the whole URL for the
    domain name, so look-alike hosts ('notcharlotte.edu',
    'charlotte.edu.example.io') and off-site URLs that merely mention the
    domain in their path or query string are rejected.

    Returns True or False. If the check raises unexpectedly, the URL is
    reported through print_status() with status 'Exception' and False is
    returned.
    """
    if any(ex in url for ex in EXCLUDE) or len(url) < 8 or len(url) > 100:
        return False
    try:
        # Drop an optional scheme, then keep everything up to the first
        # path, query, fragment or port delimiter: that is the host.
        without_scheme = re.sub(r'^https?://', '', url, flags=re.IGNORECASE)
        host = re.split(r'[/?#:]', without_scheme, maxsplit=1)[0].lower()
        # '@' in the host part means userinfo or an e-mail address, never a page.
        if '@' in host:
            return False
        return host == 'charlotte.edu' or host.endswith('.charlotte.edu')
    except Exception:
        print_status(url, 'Exception')
        return False
    
def write_to_csv(valid_endpoints, failed_endpoints):
    """Export the crawled pages to FILENAME as a two-column CSV (URL, Text).

    Only ``valid_endpoints`` is written; ``failed_endpoints`` is accepted but
    not exported. The data is flushed and fsync'ed before the file is closed.
    Any error is reported via print_status(None, 'Failed') instead of being
    raised to the caller.
    """
    column_labels = ['URL', 'Text']
    try:
        with open(FILENAME, 'w', newline='', encoding="utf-8") as out:
            rows = csv.writer(out)
            rows.writerow(column_labels)
            rows.writerows(valid_endpoints)
            # Push the data through Python's buffer and the OS cache
            out.flush()
            os.fsync(out.fileno())
    except Exception:
        print_status(None, 'Failed')

def fetch_url(url):
    """Download ``url`` and report the outcome as a (status, content) tuple.

    Returns ('Success', response.content) when the server answers with HTTP
    200 and ('Failed', None) for any other status code. Request errors,
    timeouts (the request is limited to 10 seconds) and ValueError are
    reported through print_status() and also yield ('Failed', None).
    """
    try:
        response = requests.get(url, timeout=10)
        succeeded = response.status_code == 200
        return ('Success', response.content) if succeeded else ('Failed', None)
    except (requests.exceptions.RequestException, requests.exceptions.Timeout, ValueError):
        print_status(url, 'Failed')
        return ('Failed', None)
def _extract_page_text(soup):
    """Strip non-content markup from ``soup`` in place and return its text on one line."""
    # Remove JavaScript and CSS blocks
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Remove header, footer and nav sections
    for tag in soup(["header", "footer", "nav"]):
        tag.decompose()
    # Remove divs with class "sidebar" or "ad"
    for div in soup.find_all("div", class_=["sidebar", "ad"]):
        div.decompose()
    # Collapse every run of whitespace into a single space
    return ' '.join(soup.get_text().split())


def crawl_domain(domain, max_workers=50, max_depth=2):
    """Crawl ``domain`` concurrently and export the text of every page found.

    Starting from ``domain``, pages are downloaded with fetch_url() on a
    thread pool. Every ``<a href>`` of a successfully fetched page is
    resolved, normalized with remove_url_prefix() and queued when it passes
    is_valid_url(), is shallow enough and has not been queued before. The
    cleaned text of each page is collected and finally written with
    write_to_csv().

    Args:
        domain: Root URL to start from, e.g. 'https://charlotte.edu'.
        max_workers: Size of the thread pool used for downloads.
        max_depth: Maximum number of '/' separators allowed after the host
            in a normalized URL; deeper links are not followed.

    Returns:
        None. Results are written to the CSV file by write_to_csv().
    """
    # Normalized URLs that have already been queued, plus the result lists
    visited = set()
    to_visit = [domain.rstrip('/')]
    valid_endpoints = []
    failed_endpoints = []

    # Use ThreadPoolExecutor to parallelize the web scraping
    with ThreadPoolExecutor(max_workers=max_workers) as executor:

        # Queue the valid start URLs. They are recorded in `visited` in the
        # same normalized form that discovered links are checked against;
        # otherwise the first link back to the start page would not be
        # recognised and the page would be fetched and exported twice.
        seeds = [url for url in to_visit if is_valid_url(url)]
        futures = {executor.submit(fetch_url, url): url for url in seeds}
        visited.update(remove_url_prefix(url) for url in seeds)

        # Continue until all futures are done
        while futures:
            # Wait for the first future to complete
            done, _ = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)

            # Process each completed future
            for future in done:
                url = futures.pop(future)

                try:
                    data = future.result()
                except Exception:
                    # The download itself raised: report it and move on
                    print_status(url, 'Failed')
                    continue

                # Unpack the status and content from the data
                status, content = data

                if status != 'Success':
                    # Remember the failure; the URL was already marked as
                    # visited when it was queued.
                    failed_endpoints.append([url, ''])
                    continue

                try:
                    # Parse the HTML content
                    soup = BeautifulSoup(content, 'html.parser')

                    # Link extraction
                    for link in soup.find_all('a'):
                        href = link.get('href')
                        if href is None:
                            continue
                        # NOTE(review): relative hrefs are resolved against the
                        # crawl root `domain`, not against the page they were
                        # found on - confirm this is intended.
                        full_url = urljoin(domain, href).rstrip('/')
                        clean_url = remove_url_prefix(full_url)
                        # clean_url carries no scheme, so urlparse() reports
                        # host and path together in .path; the count is the
                        # number of '/' separators after the host.
                        slash_count = urlparse(clean_url).path.count('/')
                        # Queue the link if it is valid, shallow enough and new
                        if is_valid_url(clean_url) and slash_count <= max_depth and clean_url not in visited:
                            futures[executor.submit(fetch_url, full_url)] = full_url
                            visited.add(clean_url)

                    # Text extraction (runs after link extraction because it
                    # removes header/footer/nav elements from the soup)
                    valid_endpoints.append([url, _extract_page_text(soup)])
                    print_status(url, status)
                except Exception:
                    # Parsing or processing the page failed: report and move on
                    print_status(url, 'Failed')

    # Write the valid and failed endpoints to a CSV file
    write_to_csv(valid_endpoints, failed_endpoints)
        
# Make sure the output directory exists (a fresh checkout may not have it),
# then clear the csv file so a failed run does not leave stale results behind
output_dir = os.path.dirname(FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(FILENAME, 'w'):
    pass

# Crawl the domain
crawl_domain(DOMAIN)

'Most recent URL: https://sites.charlotte.edu/harwood/?sfid=9267&sf_paged=24 \nStatus: Success'

'Successes: 14897, Failures: 492'

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
