In [1]:
# Imports
import re
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
from urllib.parse import urlparse
import os
import sys
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from IPython.display import clear_output, display
import csv
csv.field_size_limit(2**31 - 1)

131072

In [2]:
FILENAME = './results/endpoints.csv'
DOMAIN = 'https://charlotte.edu'

In [3]:
# Url parameters to exclude/skip
EXCLUDE = [
    'news-articles',
    'news-events',
    'news-media',
    'linkedin',
    'facebook',
    'twitter',
    'instagram',
    'youtube',
    'flickr',
    'pinterest',
    '.com',
    '.org',
    '.net',
    '.gov',
    '.pdf',
    '.doc',
    'xml',
    'php',
    'mailto:',
    '@',
    'tel:',
    'javascript:',
    'tel:',
    'sms:',
    'mailto:',
    'angular',
    'react',
    '.js',
    'event',
    'corporate',
    '#',
    'image',
    'gallery',
    'taskstream-student-handbook',
]

In [5]:
# Display the progress of the web scraping
current_display = display('Starting...', display_id=True)
progress_display = display('Starting...', display_id=True)
success_count = 0
failure_count = 0

def print_status(url, status):
    global success_count
    global failure_count
    global current_display
    global progress_display
    
    if status == 'Success':
        success_count += 1
    if status == 'Failed':
        failure_count += 1
    if status == 'Exception':
        failure_count += 1
        
    # Print the most recent URL and status, and the total counts    
    current_display.update(f'Most recent URL: {url} \nStatus: {status}')
    progress_display.update(f'Successes: {success_count}, Failures: {failure_count}')
    

def remove_url_prefix(url):
    url = url.replace('http://', '').replace('https://', '').replace('www.', '')
    return url.lower()

def is_valid_url(url):
    if any(ex in url for ex in EXCLUDE) or len(url) < 8 or len(url) > 100:
        return False
    try:
        split_url = re.split('https?://', url)
        return 'mailto:' not in url and '@' not in url and 'charlotte.edu' in split_url[0] or len(split_url) > 1 and 'charlotte.edu' in split_url[1]
    except Exception as e:
        print_status(url, 'Exception')
        return False
    
def write_to_csv(valid_endpoints, failed_endpoints):
    # Export the endpoints to a CSV file
    try:
        with open(FILENAME, 'w', newline='', encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(['URL', 'Text'])  # Write the column labels
            for endpoint in valid_endpoints:  # Write the valid endpoints
                writer.writerow(endpoint)
            # for endpoint in failed_endpoints:  # Write the failed endpoints
            #     writer.writerow(endpoint)
            file.flush()
            os.fsync(file.fileno())
    except Exception as e:
        print_status(None, 'Failed')

def fetch_url(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return ('Success', response.content)
        else:
            return ('Failed', None)
    except (requests.exceptions.RequestException, requests.exceptions.Timeout, ValueError):
        print_status(url, 'Failed')
        return ('Failed', None)
def crawl_domain(domain):
    # Initialize sets and lists to keep track of visited URLs, URLs to visit, and endpoints
    visited = set()
    to_visit = [domain.rstrip('/')]
    valid_endpoints = []
    failed_endpoints = []

    # Use ThreadPoolExecutor to parallelize the web scraping
    with ThreadPoolExecutor(max_workers=50) as executor:
        
        # Submit tasks to the executor for each URL in to_visit that hasn't been visited yet and is valid
        futures = {executor.submit(fetch_url, url): url for url in to_visit if is_valid_url(url) and url not in visited}
        # Add the URLs that are being visited to the visited set
        visited.update(url for url in to_visit if is_valid_url(url))

        # Continue until all futures are done
        while futures:
            # Wait for the first future to complete
            done, _ = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)

            # Process each completed future
            for future in done:
                url = futures.pop(future)

                try:
                    # Get the result of the future
                    data = future.result()
                except Exception as e:
                    # If an exception occurred while fetching the URL, print the status and continue
                    print_status(url, 'Failed')
                    continue
                
                # Unpack the status and content from the data
                status, content = data
                
                try:
                    if status == 'Success':
                        # Parse the HTML content
                        soup = BeautifulSoup(content, 'html.parser')
                        
                        # Link extraction
                        # Find all links in the HTML content
                        links = soup.find_all('a')
                        for link in links:
                            href = link.get('href')
                            if href is not None:
                                # Resolve relative links to absolute links
                                full_url = urljoin(domain, href).rstrip('/')
                                clean_url = remove_url_prefix(full_url)
                                slash_count = urlparse(clean_url).path.count('/')
                                # If the URL is valid, hasn't been visited yet, and doesn't have too many slashes, add it to the futures
                                if is_valid_url(clean_url) and slash_count <= 2 and clean_url not in visited:
                                    futures[executor.submit(fetch_url, full_url)] = full_url
                                    visited.add(clean_url)
                        
                        # Text extraction
                        # Remove JavaScript and CSS blocks, and common sections like headers, footers, etc.
                        for script in soup(["script", "style"]):  # Remove JavaScript and CSS blocks
                            script.decompose()
                        # Remove header, footer, nav, and aside
                        for tag in soup(["header", "footer", "nav"]):
                            tag.decompose() 
                        # Remove divs with class "sidebar" or "ad"
                        for div in soup.find_all("div", class_=["sidebar", "ad"]):
                            div.decompose()
                        # Extract text from the HTML content
                        text = soup.get_text()  
                        text = ' '.join(text.split())
                        # Save the text along with the URL and status
                        valid_endpoints.append([url, str(text)])  
                        print_status(url, status)
                        
                    else:
                        # If the status is not 'Success', add the URL and status to the failed_endpoints list
                        failed_endpoints.append([url, ''])
                        visited.add(url)
                except Exception as e:
                    # If an exception occurred while processing the HTML content, print the status and continue
                    print_status(url, 'Failed')
                    visited.add(url)
                    continue
        # Write the valid and failed endpoints to a CSV file
        write_to_csv(valid_endpoints, failed_endpoints)

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(FILENAME), exist_ok=True)

# Clear the csv file
open(FILENAME, 'w').close()

# Crawl the domain
crawl_domain(DOMAIN)

'Most recent URL: https://library.charlotte.edu/check-out-request/borrow-laptops \nStatus: Success'

'Successes: 41, Failures: 0'

In [5]:
%pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp311-cp311-win_amd64.whl.metadata (8.5 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   -- ------------------------------------- 1.3/24.0 MB 26.6 MB/s eta 0:00:01
   ----- ---------------------------------- 3.5/24.0 MB 54.8 MB/s eta 0:00:01
   ------------ --------------------------- 7.7/24.0 MB 61.4 MB/s eta 0:00:01
   ------------------ --------------------- 11.1/24.0 MB 65.2 MB/s eta 0:00:01
   ------------------------ --------------- 14.8/24.0 MB 81.8 MB/s eta 0:00:01
   ----------------------------- ---------- 17.6/24.0 MB 65.6 MB/s eta 0:00:01
   ----------------------------------- ---- 21.0/24.0 MB 73.1 MB/s eta 0:00:01
   ---------------------------------------  24.0/24.0 MB 72.6 MB/s eta 0:00:01
   ----------------------------------------

In [9]:
test = [["https://charlotte.edu","The University of North Carolina at Charlotte | UNC Charlotte Skip to main content News & Events News Music students participating in touring education production Tue, 02/06/2024 UNC Charlotte receives Library Excellence in Access and Diversity Award Fri, 02/02/2024 Excellence in Leadership Awards bestowed on 10 outstanding alumni Fri, 02/02/2024 Young alumni advancing in their fields and communities Thu, 01/25/2024 Noted neuroscience researcher Kelly Cartwright named Spangler Distinguished Professor of Early Literacy Wed, 01/24/2024 View All News Events UNC Charlotte Shape What's Next UNC Charlotte Icons 0 doctoral programs UNC Charlotte Icons 0 Living Alumni UNC Charlotte Icons 0 #NinerNation Undergrads to Overachievers Variety is more than the spice of life. It is life! The world offers a broader range of career opportunities than ever before, which is why we offer the way to explore and prepare for so many of them right. Choose from diverse majors in 90 bachelor's degree programs and more than 100 graduate programs. Explore Academic Offerings at UNC Charlotte #1 in Latinx Enrollment UNC Charlotte outpaces North Carolina's other four-year institutions with Latinx enrollment, undergraduate degrees and graduation rates ""It's so important to see other students like me on campus,"" says senior Claudia Martinez. Read More Data Science answers the call How UNC Charlotte is responding to industry demand in Charlotte, the region and beyond. Bringing together brilliant minds through interdisciplinary partnership, the University is bridging the gap between society and technology through hands-on programming and research. Read More Where inquiry is put to the ultimate test. Reality. Go beyond hypotheses and theory. Study in a place where on-campus research comes to life in off-campus applications throughout area communities, businesses and industries. Explore Research At UNC Charlotte Quaint & Quiet Lively & Loud Can't decide between a peaceful, picturesque college campus and an action-packed big-city school? Then don't. Get Involved in Campus Life at UNC Charlotte Clubs & Activities Choose from more than 350 student organizations in and out of the classroom at UNC Charlotte. There's something for everyone here! 49er Sports Niner Nation loves cheering on the 49ers and their 18 NCAA Division I varsity sports. Members of the Football Bowl Subdivision (FBS) American Athletic Conference, the 49ers boast some of the nation’s finest facilities and compete against the NCAA’s top competition. Exploring Charlotte Discover the University that lives on the pulse of the city. From professional sports and polished culture to outdoor adventure and recreation, Charlotte is a top destination. Enhancing student motivation and learning Jennifer Webb, Associate Professor of Psychology Revolutionizing teaching practices to benefit students Oscar Lansen, Teaching Professor of History Providing experiential learning opportunities Thomas Marshall, Lecturer in Risk Management Forging connections with students Jordan Poler, Associate Professor of Chemistry Explore Faculty Inside UNC Charlotte"],
["https://www.charlotte.edu/academics","Academics at the University of North Carolina at Charlotte | UNC Charlotte Skip to main content Academics Apply Now Visit Our Campus UNC Charlotte, North Carolina's urban research university, fuels American innovation in everything from resilient and sustainable architecture and environmental systems, to epidemiological modeling and sustainable energy, to shaping the future of work for greater Charlotte and beyond. Know What You're Looking For? Search Our Programs The academic search requires JavaScript. Visit the University Catalogs site to view all programs available. Undergraduate Programs Majors Minors Certificates Graduate Programs Graduate Degree Programs Graduate Certificates Online & Professional Programs Online/Distance Education School of Professional Studies Executive Education Explore Our Colleges Belk College of Business Generating vital talent for the greater Charlotte economy — the second largest banking center in the United States — and fresh insights through research for emerging companies across North Carolina. Learn more College of Arts + Architecture A diverse community of visionary thinkers, designers, and makers, who seek to create a more beautiful and just world through innovation, research and collaborative engagement. Learn more Cato College of Education Supporting North Carolina schools, teachers, superintendents and policy makers working to advance educational research, equity, excellence and engagement for all students. Learn more College of Computing & Informatics Fostering critical knowledge and talent to speed next-generation research and technological breakthroughs — Artificial Intelligence, Robotics, Big Data Analysis, Computer-Aided Education, Bioinformatics and Cybersecurity — for North Carolina. Learn more College of Health & Human Services Translating clinical and public health research to improve patient outcomes, especially for vulnerable, underinsured and underserved communities. Learn more College of Science Advancing interdisciplinary research and promoting discovery in the fields of math, chemistry, biology and physics, through supportive, experiential learning and state-of-the-art facilities. Learn more College of Humanities & Earth and Social Sciences Enhancing our understanding of complex issues, from climate change and global migration to health disparities and economic inequality, through interdisciplinary research, student-centered learning, and community engagement. Learn more The William States Lee College of Engineering Among the top engineering programs in North Carolina, where ideas become reality through research, study, design, hands-on prototyping and often interdisciplinary collaboration with industry supporters. Learn more Interdisciplinary Studies Where business meets computer science, biology meets the arts and history combines with engineering — integrative thinkers draw from multiple academic disciplines to lead North Carolina’s top roles in data science, business, law and healthcare. Learn more Academic Excellence The Graduate School Honors College University College Additional Resources Academic Advising Adult and Extended Services Career Center Center for Graduate Life Common Reading Experience Disability Services Academic Diversity and Inclusion International Programs Academic Support Services Writing Center"],
["https://library.charlotte.edu","Homepage | J. Murrey Atkins Library Skip to main content Limit To: Articles Peer-reviewed Advanced Search Databases Journals 0 PEOPLE IN ATKINS My Accounts Study Rooms Research Guides Hours Printing Contact Us × Which Account? My Library Account My Interlibrary Loan Account Sign up to receive library news and updates looking for a book that messes with your head?Check out the Psychological Fiction collection in the 2nd Floor Special DisplayRead More Check out our new Board Games, Card Games, and Puzzles CollectionNow available at the Area 49 Desk on the 2nd floor Read More Atkins offers resource guide for Digital HumanitiesRead More A fireside-style discussion about the birth of grassroots activism in CharlotteFebruary 28, 6-7 p.m. Read More Join the Atkins Reading ChallengeGet your bingo card and start READING!Read More Swank Streaming Film CollectionA selection of popular movies for the classroom or at homeRead More 1,500,000 Visits Per Year 3,800,000 Volumes 57 Reservable Study Rooms View More events Digital Humanities Resource Guide UNC Charlotte Receives Library Excellence in DEI Award De-Stress for Success Journal Package Alert Book Presentation, Talk, and Reception De-stress for Success During Exams The Princess Augusta Sophia Collection of Drama Atkins Introduces the Library Mobile App Film Screen With A Dean Focuses on Racial Injustice in the Justice System Exam-time activities planned for students Inaugural Atkins Library Popular Reading Series features Dr. A.J. Hartley Atkins Book Club Discussion Odyssey for Democracy Author, Subject Participate in Panel Discussion The Black Read: Celebrating Black History Month Celebrate the Insulin Centennial Wonderland Poetry Reading and Tea Party Atkins Awarded Second Grant for Mobile Hotspot Lending Atkins Awarded Federal Grant BrowZine Cancellation Film Screen With A Dean: Wilmington on Fire COVID-19 Vaccinations: Science, Politics, Mistrust, and Misinformation Panel Digital Media Literacy Instruction Kate Dickson: A Passion to Protect Atkins Moved Quickly To Keep Services Going During the Pandemic Offsite Storage Move Update Election 2020: How to Verify What You Read, See, and Hear Online Liberry Lager Now Available at Triple C Brewing Atkins Library Reopening Guide Atkins Announces Offsite Storage Location Active Learning Academy Book Published by Atkins Library Dance History II De-stress for Success with Atkins Paywall Film Virtual Panel Discussion Atkins Creating PPE for Healthcare Workers CANCELED: For the Love of Books CANCELED: Author Susan Rivers on ""Keeping It Real"" Special Collections Holds Rare ""Sketches of Charlotte"" Booklets Get Ready to be Counted Atkins Library Unveils New Website Disability Advocate and Author Discusses Hidden Disabilities New Combined Library Services at First Floor Desk Atkins Rare Book Used to Create Smithsonian Exhibit Join Our Book Club! Packaging the Past View More News"]]

from transformers import pipeline

summarize = pipeline("summarization")

for t in test:
    print(summarize(t[1], max_length=200, min_length=30))
    print("\n")


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'summary_text': " The University of North Carolina at Charlotte | UNC Charlotte . The world offers a broader range of career opportunities than ever before . Choose from diverse majors in 90 bachelor's degree programs and more than 100 graduate programs . Explore academic offerings at UNC Charlotte #1 in Latinx Enrollment ."}]


[{'summary_text': " University of North Carolina at Charlotte is North Carolina's urban research university . The Belk College of Business is generating vital talent for the greater Charlotte economy . The Cato College of Education is supporting North Carolina schools, teachers, superintendents and policy makers working to advance educational research, equity, excellence and engagement ."}]


[{'summary_text': ' J. Murrey Atkins Library has 1,500,000 Visits Per Year 3,800,000 Volumes 57 Reservable Study Rooms . Atkins Introduces the Library Mobile App Film Screen With A Dean .'}]


