In [1]:
import re
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
from urllib.parse import urlparse
import os
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

# Path of the CSV file the crawl results are written to.
FILENAME = './data/endpoints-v4.csv'
# Seed URL the crawl starts from.
DOMAIN = 'https://charlotte.edu'

# Substrings that disqualify a URL. `is_valid_url` rejects any URL that
# contains one of these anywhere in the string (plain substring test), so
# short entries such as 'php', 'event' or '#' match very broadly.
EXCLUDE = [
    # Site sections
    'news-articles',
    'news-events',
    'news-media',
    # Named external services
    'linkedin',
    'facebook',
    'twitter',
    'instagram',
    'youtube',
    'flickr',
    'pinterest',
    # Other top-level domains
    '.com',
    '.org',
    '.net',
    '.gov',
    # File types / script endpoints
    '.pdf',
    '.doc',
    'php',
    # Non-HTTP link schemes and e-mail addresses
    'mailto:',
    '@',
    'tel:',
    'javascript:',
    'sms:',
    # Script assets
    'angular',
    'react',
    '.js',
    # Miscellaneous
    'event',
    'corporate',
    '#',
    'image',
    'gallery',
]

def remove_url_prefix(url):
    """Return ``url`` lowercased, without a leading scheme and ``www.``.

    Only a *leading* ``http://`` / ``https://`` and a ``www.`` that directly
    follows it (or starts the string) are removed; the same text appearing
    later in the path or query is left alone. The URL is lowercased before
    matching so ``HTTPS://WWW.`` is stripped as well.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The lowercased URL without its scheme / ``www.`` prefix.
    """
    # Lowercase first so the prefix pattern also matches upper-case input.
    return re.sub(r'^(?:https?://)?(?:www\.)?', '', url.lower(), count=1)

def is_valid_url(url, allowed_domain='charlotte.edu'):
    """Decide whether ``url`` is a crawlable page on ``allowed_domain``.

    A URL is accepted when all of the following hold:

    * its length is between 8 and 100 characters inclusive;
    * it contains none of the substrings listed in ``EXCLUDE``;
    * its host is ``allowed_domain`` itself or one of its subdomains.

    The host is compared exactly rather than by substring, so hosts such as
    ``notcharlotte.edu`` or URLs that only mention the domain in their path
    or query are rejected.

    Args:
        url: URL string, with or without an ``http(s)://`` scheme.
        allowed_domain: Registered domain the crawl is restricted to.

    Returns:
        ``True`` if the URL should be crawled, ``False`` otherwise.
    """
    if not 8 <= len(url) <= 100:
        return False
    if any(ex in url for ex in EXCLUDE):
        return False

    # Drop an optional scheme, then cut at the first path/query/fragment
    # delimiter so only the authority part remains.
    without_scheme = re.sub(r'^https?://', '', url, count=1, flags=re.IGNORECASE)
    host = re.split(r'[/?#]', without_scheme, maxsplit=1)[0].lower()
    host = host.partition(':')[0]  # ignore an explicit port

    allowed_domain = allowed_domain.lower()
    return host == allowed_domain or host.endswith('.' + allowed_domain)
    
def write_to_csv(valid_endpoints, failed_endpoints, filename=None):
    """Export crawl results to a CSV file.

    The file is overwritten. It starts with the header row
    ``URL, Status, Text``, followed by every row of ``valid_endpoints`` and
    then every row of ``failed_endpoints``. Missing parent directories of
    the target file are created first. Write errors are reported on stdout
    rather than raised.

    Args:
        valid_endpoints: Iterable of ``[url, status, text]`` rows.
        failed_endpoints: Iterable of ``[url, status, text]`` rows.
        filename: Target path; defaults to the module-level ``FILENAME``.
    """
    path = FILENAME if filename is None else filename
    try:
        parent = os.path.dirname(path)
        if parent:
            # Without this, open() fails when the output directory is missing.
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', newline='', encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(['URL', 'Status', 'Text'])  # Column labels
            writer.writerows(valid_endpoints)
            writer.writerows(failed_endpoints)
            # Push the data through Python's and the OS's buffers to disk.
            file.flush()
            os.fsync(file.fileno())
    except (OSError, csv.Error, UnicodeError) as e:
        print(f'Exception during file write: {e}')

def fetch_url(url):
    """Download ``url`` and report the outcome as a ``(status, body)`` pair.

    Returns ``('Success', <raw response bytes>)`` when the server answers
    with HTTP 200. Any other status code yields ``('Failed', None)``. A
    request error, timeout (10 seconds) or ``ValueError`` also yields
    ``('Failed', None)`` after a message is printed.
    """
    failure = ('Failed', None)
    try:
        reply = requests.get(url, timeout=10)
        return ('Success', reply.content) if reply.status_code == 200 else failure
    except (requests.exceptions.RequestException, requests.exceptions.Timeout, ValueError):
        print(f'An error occurred while fetching {url}')
        return failure

def _extract_text_and_hrefs(content):
    """Parse an HTML payload into its visible text and its anchor hrefs."""
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup(["script", "style"]):  # Remove JavaScript and CSS blocks
        tag.decompose()
    text = ' '.join(soup.get_text().split())  # Collapse whitespace runs
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
    return text, [href for href in hrefs if href is not None]


def crawl_domain(domain, max_workers=50, max_slashes=2):
    """Crawl ``domain`` concurrently and write the results to CSV.

    Starting from ``domain``, pages are fetched with ``fetch_url`` on a
    thread pool. For every successfully fetched page the visible text is
    stored and each ``<a href>`` is resolved against the URL of the page it
    was found on. A link is scheduled when ``is_valid_url`` accepts its
    normalised form, it has at most ``max_slashes`` ``/`` characters after
    normalisation, and it has not been scheduled before. When no work is
    left, all results are handed to ``write_to_csv``.

    Args:
        domain: Seed URL; a trailing ``/`` is ignored.
        max_workers: Size of the thread pool.
        max_slashes: Maximum number of ``/`` characters allowed in the
            normalised (scheme-less) URL.
    """
    visited = set()  # Normalised URLs (see remove_url_prefix) already scheduled
    valid_endpoints = []
    failed_endpoints = []
    seed = domain.rstrip('/')

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}  # Future -> URL it is fetching
        if is_valid_url(seed):
            futures[executor.submit(fetch_url, seed)] = seed
            # Store the normalised form, matching discovered links, so a
            # link back to the seed page is not fetched a second time.
            visited.add(remove_url_prefix(seed))

        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED)

            for future in done:
                url = futures.pop(future)

                try:
                    status, content = future.result()
                except Exception as e:
                    print(f'Exception in crawl loop {url}: {e}')
                    # Record the URL so it still shows up in the CSV.
                    failed_endpoints.append([url, 'Failed', ''])
                    continue

                if status != 'Success':
                    failed_endpoints.append([url, status, ''])
                    continue

                try:
                    text, hrefs = _extract_text_and_hrefs(content)
                except Exception as e:
                    print(f'Exception while parsing {url}: {e}')
                    continue

                # Save the text along with the URL and status
                valid_endpoints.append([url, status, text])
                print(f'{url} - Success')

                for href in hrefs:
                    try:
                        # Resolve against the page the link was found on,
                        # not the crawl root, so relative links on other
                        # hosts/sections point to the right place.
                        full_url = urljoin(url, href).rstrip('/')
                        clean_url = remove_url_prefix(full_url)
                        slash_count = urlparse(clean_url).path.count('/')
                    except ValueError:
                        # A malformed href must not discard the page's
                        # remaining links.
                        continue
                    if clean_url in visited or slash_count > max_slashes:
                        continue
                    if not is_valid_url(clean_url):
                        continue
                    futures[executor.submit(fetch_url, full_url)] = full_url
                    visited.add(clean_url)

    write_to_csv(valid_endpoints, failed_endpoints)
# Clear the csv file (create its directory first so open() cannot fail
# with FileNotFoundError when the directory is missing)
os.makedirs(os.path.dirname(FILENAME) or '.', exist_ok=True)
with open(FILENAME, 'w'):
    pass

# Crawl the domain
crawl_domain(DOMAIN)

https://charlotte.edu - Success
https://library.charlotte.edu - Success
https://www.charlotte.edu/academics - Success
https://www.charlotte.edu/landing/about-us - Success
https://legal.charlotte.edu/policies - Success
https://maps.charlotte.edu - Success
https://my.charlotte.edu - Success
https://directory.charlotte.edu - Success
https://emergency.charlotte.edu - Success
https://incidentreport.charlotte.edu - Success
http://inside.charlotte.edu/faculty-spotlights - Success
https://legal.charlotte.edu/termsofuse - Success
https://charlotte.edu/gateway/community - Success
https://charlotte.edu/gateway/alumni-friends - Success
https://charlotte.edu/landing/campus-life - Success
https://charlotte.edu/gateway/faculty-staff - Success
https://charlotte.edu/gateway/prospective-students - Success
https://charlotte.edu/gateway/parents-family - Success
https://charlotte.edu/research - Success
https://charlotte.edu/landing/diversity - Success
https://charlotte.edu/contact - Success
https://charlot

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://edassessment.charlotte.edu/taskstream/taskstream-student-handbook - Success
https://edassessment.charlotte.edu/edtpa-information/edtpa-due-dates - Success
https://education.charlotte.edu/vacant-7 - Success
https://education.charlotte.edu/sarah-johnson-0 - Success
https://ir.charlotte.edu/faculty-data/academic-year-2021-2022 - Success
https://education.charlotte.edu/scott-kissau-0 - Success
https://edassessment.charlotte.edu/access-edtpa-canvas-site - Success
https://education.charlotte.edu/brandi-lewis - Success
https://education.charlotte.edu/kristen-morse - Success
https://spaces.charlotte.edu/pages/viewpage.action?pageId=35656026 - Success
https://edassessment.charlotte.edu/annual-reports-and-strategic-plan/unc-educator-quality-dashboard - Success
https://education.charlotte.edu/funding-opportunities - Success
https://edassessment.charlotte.edu/community-feedback - Success
https://edassessment.charlotte.edu/recent-initiatives/2015-data-day - Success
https://ninerworks.charlo

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://edassessment.charlotte.edu/taskstream/taskstream-evaluator-handbook - Success
https://education.charlotte.edu/rebecca-rippy - Success
https://education.charlotte.edu/judy-pugh - Success
https://employersolutions.charlotte.edu/people/dayle-janus - Success
https://itprojects.charlotte.edu/information/what-project - Success
https://employersolutions.charlotte.edu/people/amy-wartham - Success
https://ir.charlotte.edu/fact-book/academic-year-2003-2004 - Success
https://ir.charlotte.edu/directory/tania-rymer - Success
https://ir.charlotte.edu/jennifer-galecki - Success
https://ir.charlotte.edu/fact-book/guidelines-unc-charlotte-factbook - Success
https://ir.charlotte.edu/david-goins - Success
https://ir.charlotte.edu/directory/derrick-isler - Success
https://ir.charlotte.edu/christine-robinson - Success
https://ir.charlotte.edu/directory/susan-miller - Success
https://education.charlotte.edu/office-of-school-and-community-partnerships - Success
https://ir.charlotte.edu/directory/vaca



https://provost.charlotte.edu/student-success/graduation-initiative - Success
https://provost.charlotte.edu/tag/2022 - Success
http://legal.charlotte.edu/policies/up-311.5 - Success
http://legal.charlotte.edu/policies/up-311.4 - Success
https://research.charlotte.edu/directory/ida-stavenger - Success
http://legal.charlotte.edu/policies/up-311.6 - Success
http://legal.charlotte.edu/policies/up-311.7 - Success
http://auxiliary.charlotte.edu/dining - Success
https://science.charlotte.edu/category/fellowships - Success
http://legal.charlotte.edu/policies/up-311.9 - Success
http://legal.charlotte.edu/policies/up-311.8 - Success
https://science.charlotte.edu/people/ashley-blackwood - Success
https://provost.charlotte.edu/category/faculty-staff - Success
https://science.charlotte.edu/people/vanessa-grimaldi - Success
https://science.charlotte.edu/people/sheila-reaves - Success
https://science.charlotte.edu/people/maisha-cooper - Success
https://science.charlotte.edu/category/alumni-giving - S