In [None]:
# %pip install beautifulsoup4

In [None]:
# %pip install unstructured

In [None]:
# %pip install --upgrade selenium

In [1]:
import os
import tempfile
from io import BytesIO
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import SeleniumURLLoader

In [None]:
# URLs the crawler has already processed; checked before following a link
# so the same page is not handled twice.
visited_urls: set = set()

def crawl_and_parse(url, base_url, depth=0, max_depth=2, timeout=10):
    """Recursively crawl ``url`` and print a short preview of each page's text.

    Each page is downloaded once with ``requests``. The downloaded HTML is
    used both for text extraction (``UnstructuredHTMLLoader``) and for link
    discovery (``BeautifulSoup``). A discovered link is followed only when it
    starts with ``base_url`` and is not yet in the module-level
    ``visited_urls`` set.

    Args:
        url: Absolute URL of the page to fetch. Any ``#fragment`` is ignored.
        base_url: Prefix a discovered link must start with to be followed.
        depth: Current recursion depth (0 for the starting page).
        max_depth: Deepest level that is still fetched.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. Output is printed and ``visited_urls`` is updated in place.
    """
    # Base condition to stop recursion after a certain depth
    if depth > max_depth:
        return

    # Drop any "#fragment" so in-page anchors are not crawled as new pages.
    url, _ = urldefrag(url)

    # Mark before fetching so a failing or self-linking page is not retried.
    visited_urls.add(url)

    # Fetch the content of the page
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    # UnstructuredHTMLLoader reads from a file path, not a URL, so the HTML
    # that was already downloaded is written to a temporary file for it.
    html_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    try:
        with html_file:
            html_file.write(response.content)
        docs = UnstructuredHTMLLoader(html_file.name).load()
    finally:
        os.unlink(html_file.name)

    # Display or process the content (here, just printing the content)
    print(f"\n--- Content of {url} ---\n")
    for doc in docs:
        print(doc.page_content[:100])  # Print first 100 characters of the content
        print("\n")

    # Every child would be at depth + 1 > max_depth and return immediately,
    # so there is no point parsing links on the deepest level.
    if depth >= max_depth:
        return

    # Extract all links from the page and recursively process them
    soup = BeautifulSoup(response.content, 'html.parser')
    for link_tag in soup.find_all('a', href=True):
        # Relative hrefs are relative to the page they appear on (after any
        # redirect), not to the crawl's base_url.
        full_url, _ = urldefrag(urljoin(response.url, link_tag['href']))

        if full_url not in visited_urls and full_url.startswith(base_url):
            # Recursively crawl the subpage
            crawl_and_parse(full_url, base_url, depth + 1, max_depth, timeout)

# Example usage: start at the District 4 council page and use that same URL
# as the prefix that followed links must share.
starting_url = (
    "https://www.denvergov.org/Government/Agencies-Departments-Offices/"
    "Agencies-Departments-Offices-Directory/Denver-City-Council/"
    "Council-Members-Websites-Info/District-4"
)
crawl_and_parse(starting_url, starting_url, max_depth=2)


In [7]:
# Start this cell with an empty record of processed URLs, so results from an
# earlier crawl in the same kernel do not carry over.
visited_urls: set = set()

def crawl(url, base_url, depth=0, max_depth=2, timeout=10):
    """Fetch ``url`` and print the content ``<div>`` elements found on it.

    This variant inspects a single page only: it does not follow links, so
    ``base_url`` is accepted for signature parity with ``crawl_and_parse``
    but is not read. The URL is recorded in the module-level
    ``visited_urls`` set once the page has been fetched and parsed.

    Args:
        url: Absolute URL of the page to fetch.
        base_url: Unused here; kept so existing calls keep working.
        depth: Current depth; nothing is fetched when it exceeds ``max_depth``.
        max_depth: Largest ``depth`` for which the page is still fetched.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. The matched ``<div>`` tags are printed.
    """
    # Base condition to stop recursion after a certain depth
    if depth > max_depth:
        return

    # Fetch the content of the page; the timeout keeps a stalled server from
    # hanging the notebook cell indefinitely.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Display or process the content (here, just printing the content)
    print(f"\n--- Content of {url} ---")

    # Mark this URL as visited
    visited_urls.add(url)

    # Collect the divs that hold the page body and the list items. A div is
    # kept when its class matches any of the values given below.
    # Other selectors noted for the main content container:
    #   XPath: //*[@id="main-content"]/div
    #   CSS:   #main-content > div
    divs = soup.find_all('div', class_=('wysiwyg-content body-content', 'list-item-container', 'list-item-title'))
    print(divs)

# Example usage
starting_url = "https://www.denvergov.org/Government/Agencies-Departments-Offices/Agencies-Departments-Offices-Directory/Denver-City-Council/Council-Members-Websites-Info/District-4"
# Site root, passed as the link-scope prefix. Previously this variable was
# assigned but the call passed starting_url instead, leaving it unused.
base_url = "https://www.denvergov.org/"
crawl(starting_url, base_url=base_url, max_depth=2)


--- Content of https://www.denvergov.org/Government/Agencies-Departments-Offices/Agencies-Departments-Offices-Directory/Denver-City-Council/Council-Members-Websites-Info/District-4 ---
[<div class="wysiwyg-content body-content"><p>Council Pro Tem Diana Romero Campbell was elected on April 4th, 2023, as the Denver City Council Member for District 4, Southeast Denver. She was elected to serve as Council Pro Tem on July 15, 2024.</p>
<p>Diana, having grown up and raised her family in Southeast Denver, brings a grounded perspective, deep understanding, and commitment to the unique landscape of the district. She is dedicated to parks and recreation, and preserving the Highline Canal as a community asset. Diana partners with stakeholders to ensure a safe and affordable community for all and engages with residents, from the youngest to older adults, to build a community with respect, joy, and collaboration.</p>
<p>Diana's career is dedicated to ensuring all children, youth, and families have

In [15]:
# Pages handed to the URL loaders below: the Denver District 4 council page
# (addressed with its "#main-content" anchor) and an understandingwar.org
# article.
urls = [
    (
        "https://www.denvergov.org/Government/Agencies-Departments-Offices/"
        "Agencies-Departments-Offices-Directory/Denver-City-Council/"
        "Council-Members-Websites-Info/District-4#main-content"
    ),
    (
        "https://www.understandingwar.org/backgrounder/"
        "russian-offensive-campaign-assessment-february-9-2023"
    ),
]

In [10]:
# Load every entry of `urls` with UnstructuredURLLoader.
# NOTE(review): this cell's execution count (10) is lower than that of the
# cell defining `urls` (15), and the recorded 'source' lacks the
# "#main-content" suffix, so the saved output comes from an earlier `urls`
# value. Re-run top to bottom to refresh it.
loader = UnstructuredURLLoader(urls=urls)

data = loader.load()

# Inspect the first loaded Document. In the recorded output its page_content
# is empty for the Denver page, i.e. this loader extracted no text from it.
data[0]

Document(metadata={'source': 'https://www.denvergov.org/Government/Agencies-Departments-Offices/Agencies-Departments-Offices-Directory/Denver-City-Council/Council-Members-Websites-Info/District-4'}, page_content='')

In [11]:
# Display the whole list of loaded Documents.
# NOTE(review): this prints the full text of every page; consider showing a
# slice of page_content instead to keep the output short.
data

[Document(metadata={'source': 'https://www.denvergov.org/Government/Agencies-Departments-Offices/Agencies-Departments-Offices-Directory/Denver-City-Council/Council-Members-Websites-Info/District-4'}, page_content=''),
 Document(metadata={'source': 'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023'}, page_content='Skip to main content\n\nHome\n\nWho We Are\n\nResearch\n\nPublications\n\nGet Involved\n\nPlanned Giving\n\nDonate\n\nRussian Offensive Campaign Assessment, February 9, 2023\n\nFeb 9, 2023 - ISW Press\n\nDownload the PDF\n\nKarolina Hird, Riley Bailey, George Barros, Nicole Wolkov, and Frederick W. Kagan\n\nFebruary 9, 7:30 pm ET\n\nClick here to see ISW’s interactive map of the Russian invasion of Ukraine. This map is updated daily alongside the static maps present in this report.\n\nWagner Group financier Yevgeny Prigozhin announced on February 9 that the Wagner Group has entirely stopped recruiting prisoners. In a response 

In [16]:


# Alternative sample URLs, left disabled so the `urls` list from the earlier
# cell is the one that gets loaded.
# urls = [
#     "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
#     "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
# ]

# Load the same `urls` again, this time with SeleniumURLLoader (presumably
# rendering each page in a browser; confirm against the loader docs).
# This rebinds `loader` and `data` from the UnstructuredURLLoader cell.
loader = SeleniumURLLoader(urls=urls)

data = loader.load()

# Inspect the second Document (index 1); in the recorded output this is the
# understandingwar.org article, with title/description/language metadata.
data[1]

Document(metadata={'source': 'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023', 'title': 'Russian Offensive Campaign Assessment, February 9, 2023 | Institute for the Study of War', 'description': 'Wagner Group financier Yevgeny Prigozhin announced on February 9 that the Wagner Group has entirely stopped recruiting prisoners. In a response to a press comment, Prigozhin claimed that Wagner’s recruitment of prisoners has "completely stopped" and that ', 'language': 'en'}, page_content='Skip to main content\n\nHome\n\nWho We Are\n\nResearch\n\nPublications\n\nGet Involved\n\nPlanned Giving\n\nDonate\n\nRussian Offensive Campaign Assessment, February 9, 2023\n\nFeb 9, 2023 - ISW Press\n\nDownload the PDF\n\nKarolina Hird, Riley Bailey, George Barros, Nicole Wolkov, and Frederick W. Kagan\n\nFebruary 9, 7:30 pm ET\n\nClick here to see ISW’s interactive map of the Russian invasion of Ukraine. This map is updated daily alongside the static ma

In [17]:
# Inspect the first Document in `data`. In the recorded output this is the
# Denver page, whose page_content is only 'Skip to main content\n\nReady'.
# NOTE(review): which loader produced `data` depends on cell execution order.
data[0]

Document(metadata={'source': 'https://www.denvergov.org/Government/Agencies-Departments-Offices/Agencies-Departments-Offices-Directory/Denver-City-Council/Council-Members-Websites-Info/District-4#main-content', 'title': 'Diana Romero Campbell - City and County of Denver', 'description': 'Diana Romero Campbell is the elected council member for District 4.', 'language': 'en-US'}, page_content='Skip to main content\n\nReady')