In [1]:
import requests
from bs4 import BeautifulSoup

# Starting page of the crawl: get_all_schuze_links() reads the "schuz"/"index"
# links from this page and resolves them against everything before "/index".
# Previously tried source, kept for reference:
# URL = 'https://www.hlidacstatu.cz/data/Hledat/stenozaznamy-psp/'  # Replace with the target URL
URL = 'https://www.psp.cz/eknih/2021ps/stenprot/index.htm'  # Replace with the target URL

In [57]:
# Required Libraries

# Constants

def fetch_website_content(url, timeout=30):
    """
    Downloads the page at ``url`` and parses it with BeautifulSoup.

    Args:
        url (str): The target URL to fetch.
        timeout (float | tuple): Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled server would block
            the whole crawl indefinitely.

    Returns:
        BeautifulSoup: The parsed document (built with the 'html.parser'
        backend), not the raw HTML string.

    Raises:
        ValueError: If the server answers with a status code other than 200.
        requests.RequestException: Propagated from ``requests.get`` on
            connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is treated as a failed fetch
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch the webpage. HTTP Status Code: {response.status_code}")

    return BeautifulSoup(response.text, 'html.parser')

def extract_html_of_a_with_href(soup):
    """
    Collects every hyperlink found beneath the given parsed node.

    Args:
        soup: A parsed BeautifulSoup document or tag (anything offering
            ``find_all``) to search in.

    Returns:
        list: One ``(link text, href value)`` tuple per <a> tag that carries
        an href attribute, in the order ``find_all`` yields them.
    """
    pairs = []
    # href=True restricts the search to anchors that actually have an href
    for anchor in soup.find_all('a', href=True):
        pairs.append((anchor.get_text(), anchor['href']))
    return pairs

def extract_html_of_div_with_id(soup, target_id):
    """
    Looks up the <div> carrying the given id inside a parsed node.

    Args:
        soup: A parsed BeautifulSoup document or tag (anything offering
            ``find``) to search in.
        target_id (str): The id attribute of the wanted div.

    Returns:
        The element handed back by ``soup.find`` (the node itself, not an
        HTML string), or None when no such div exists.
    """
    match = soup.find('div', id=target_id)

    # Normalise any falsy lookup result to None
    if not match:
        return None
    return match

def extract_content_of_p_with_align(html_content, target_align):
    """
    Gathers the text of every <p> tag whose align attribute matches.

    Args:
        html_content: Despite the name, a parsed BeautifulSoup node (it must
            offer ``find_all``), not a raw HTML string.
        target_align (str): The align attribute value to look for.

    Returns:
        list: The ``get_text()`` result of each matching <p> tag, in the
        order ``find_all`` yields them.
    """
    texts = []
    for paragraph in html_content.find_all('p', align=target_align):
        texts.append(paragraph.get_text())
    return texts


In [61]:
def get_all_schuze_links(URL):
    """
    Builds a mapping of session index links found on the page at ``URL``.

    Only anchors inside the div with id "body" whose href contains both
    "schuz" and "index" are kept.

    Args:
        URL (str): Address of the top-level index page; everything before
            "/index" is used as the base for the collected links.

    Returns:
        dict: ``{link text: {"url": absolute link}}``. When two anchors share
        the same text, the later one replaces the earlier one.
    """
    page = fetch_website_content(URL)
    anchors = extract_html_of_a_with_href(extract_html_of_div_with_id(page, "body"))

    base = URL.split("/index")[0]
    sessions = {}
    for title, href in anchors:
        if "schuz" in href and "index" in href:
            sessions[title] = {"url": base + "/" + href}

    return sessions

def get_all_schuze_content(URL, URL_schuze):
    """
    Lists the record-page URLs linked from one session's index page.

    Anchors inside the div with id "main-content" are scanned and every href
    containing "-" is kept.

    Args:
        URL (str): Top-level index URL. No longer used to build the result;
            the parameter is kept so existing calls keep working.
        URL_schuze (str): URL of the session index page that is fetched;
            everything before "/index" is used as the base for the links.

    Returns:
        list: One URL string per kept href, in page order.
    """
    soup = fetch_website_content(URL_schuze)
    body_content = extract_html_of_div_with_id(soup, "main-content")
    hrefs = extract_html_of_a_with_href(body_content)

    # The previous version filtered `hrefs_schuze` and iterated `links_schuze`,
    # neither of which exists in this function, so every call raised NameError.
    links = [href for _, href in hrefs if "-" in href]

    # The hrefs were read from the URL_schuze page, so they are resolved
    # against that page's directory (the same base get_all_speakers uses).
    # NOTE(review): the earlier code used URL as the base - confirm against
    # the live page markup.
    base = URL_schuze.split("/index")[0]
    return [base + "/" + href for href in links]

def get_all_speakers(URL_schuze, URL_zaznam):
    """
    Collects the "#r" anchor links found on one record page.

    Args:
        URL_schuze (str): URL of the session index page; everything before
            "/index" is used as the base for the collected links.
        URL_zaznam (str): URL of the record page that is fetched and scanned.

    Returns:
        list: ``(link text, absolute URL)`` tuples for every anchor inside
        the div with id "body" whose href contains "#r" (presumably one per
        speaker contribution - verify against the page markup).
    """
    soup = fetch_website_content(URL_zaznam)
    # The previous version passed the undefined name `soup_zaznam` here,
    # which raised NameError on every call.
    body_content = extract_html_of_div_with_id(soup, "body")
    hrefs = extract_html_of_a_with_href(body_content)

    base = URL_schuze.split("/index")[0]
    return [(text, base + "/" + href) for text, href in hrefs if "#r" in href]

In [64]:
def get_all_links(URL, max_records=1):
    """
    Crawls the link hierarchy below ``URL``: sessions -> record pages -> "#r" links.

    Args:
        URL (str): Address of the top-level index page.
        max_records (int | None): How many record pages per session are
            scanned for "#r" links. The default of 1 keeps the previous
            hard-coded ``[0:1]`` behaviour; pass None to scan all of them.

    Returns:
        dict: The mapping produced by ``get_all_schuze_links`` where each
        entry additionally carries
            "rec":   the list returned by ``get_all_schuze_content``,
            "speak": the concatenated results of ``get_all_speakers`` for the
                     scanned record pages.
    """
    schuze_all = get_all_schuze_links(URL)

    # Only the inner dicts are modified, so iterating the outer dict is safe.
    for entry in schuze_all.values():
        URL_schuze = entry["url"]
        entry["rec"] = get_all_schuze_content(URL, URL_schuze)
        entry["speak"] = []
        # The previous version looped over the undefined name
        # `all_url_schuze_parts`; the record pages are the ones stored in "rec".
        for URL_zaznam in entry["rec"][0:max_records]:
            entry["speak"] += get_all_speakers(URL_schuze, URL_zaznam)

    # The previous version returned a separate dict that was never filled.
    return schuze_all

In [40]:
def get_all_data(schuze_all):
    """
    Downloads the justified paragraphs behind every collected "speak" link.

    Args:
        schuze_all (dict): Mapping of session name to a dict whose "speak"
            entry is a list of ``(link text, URL)`` tuples.

    Returns:
        dict: ``{session name: {link text: [paragraph text, ...]}}``. A link
        that cannot be processed keeps an empty list; the failure is printed
        and the crawl continues.
    """
    all_data = {}
    for name_schuze, entry in schuze_all.items():
        all_data[name_schuze] = {}
        for label, url in entry["speak"]:
            # Note: a repeated link text resets the list, so only the data of
            # the last link with that text is kept.
            all_data[name_schuze][label] = []
            try:
                soup = fetch_website_content(url)
                body_content = extract_html_of_div_with_id(soup, "main-content")
                p_tags = extract_content_of_p_with_align(body_content, "justify")

                all_data[name_schuze][label] += p_tags
            except Exception as exc:
                # Best effort: report and move on. The previous handler
                # printed an undefined name, so it raised NameError itself;
                # its bare `except:` also swallowed KeyboardInterrupt.
                print("Couldnt get data from: ", url, f"({exc!r})")
    return all_data

In [63]:
# Crawl the link structure first, then download the text behind every
# collected link. Both calls hit the network; the saved output of this cell
# shows a run that ended in a ConnectionError.
all_links = get_all_links(URL)
all_data = get_all_data(all_links)

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [53]:
# NOTE(review): no cell shown here assigns `schuze_all` at notebook level (it
# only exists as a local/parameter inside functions), so this appears to rely
# on leftover kernel state - confirm, or inspect `all_links` instead.
print(schuze_all)

{}


In [58]:
# NOTE(review): the saved output below is a list of tuples, whereas
# get_all_schuze_links as currently defined returns a dict - the output looks
# stale; re-run this cell to refresh it.
print(get_all_schuze_links(URL))

[('1. schůze', '001schuz/index.htm'), ('8.', '001schuz/1-1.html'), ('10.\xa0listopadu\xa02021', '001schuz/1-2.html'), ('2. schůze', '002schuz/index.htm'), ('23.', '002schuz/2-1.html'), ('26.\xa0listopadu\xa02021', '002schuz/2-2.html'), ('3. schůze', '003schuz/index.htm'), ('1.\xa0prosince\xa02021', '003schuz/3-1.html'), ('4. schůze', '004schuz/index.htm'), ('14.\xa0prosince\xa02021', '004schuz/4-1.html'), ('5. schůze', '005schuz/index.htm'), ('15.\xa0prosince\xa02021', '005schuz/5-1.html'), ('6. schůze', '006schuz/index.htm'), ('11.', '006schuz/6-1.html'), ('12.', '006schuz/6-2.html'), ('13.', '006schuz/6-3.html'), ('14.', '006schuz/6-4.html'), ('25.', '006schuz/6-5.html'), ('26.', '006schuz/6-6.html'), ('27.', '006schuz/6-7.html'), ('28.\xa0ledna\xa02022', '006schuz/6-8.html'), ('7. schůze', '007schuz/index.htm'), ('13.\xa0ledna\xa02022', '007schuz/7-1.html'), ('8. schůze', '008schuz/index.htm'), ('1.', '008schuz/8-1.html'), ('2.\xa0února\xa02022', '008schuz/8-2.html'), ('9. schůze', 

In [None]:
# def get_all_links(URL):
#     all_links = []
    
#     soup = fetch_website_content(URL)
#     body_content = extract_html_of_div_with_id(soup, "body")
#     hrefs = extract_html_of_a_with_href(body_content)
#     links = [link for link in hrefs if "schuz" in link and "index" in link]
#     all_url_schuze = [URL.split("/index")[0] + "/" + link for link in links]
#     for URL_schuze in all_url_schuze:
#         soup_schuze = fetch_website_content(URL_schuze)
#         body_content_schuze = extract_html_of_div_with_id(soup, "main-content")
#         hrefs_schuze = extract_html_of_a_with_href(body_content)
#         links_schuze = [link for link in hrefs_schuze if "-" in link]
#         all_url_schuze_parts = [URL.split("/index")[0] + "/" + link_schuze for link_schuze in links_schuze]
#         for URL_zaznam in all_url_schuze_parts[0:2]:
#             soup_zaznam = fetch_website_content(URL_zaznam)
#             body_content_zaznam = extract_html_of_div_with_id(soup_zaznam, "body")
#             hrefs_zaznam = [(a.get_text(), a['href']) for a in body_content_zaznam.find_all('a', href=True)]
#             links_zaznam = [link[1] for link in hrefs_schuze_detail if "#r" in link[1]]
#             all_url_zaznam_parts = [URL_schuze.split("/index")[0] + "/" + link_zaznam for link_zaznam in links_zaznam]
            
#             all_links += all_url_zaznam_parts
            
#         # extract_html_of_div_with_id(html_content_schuze, "body")
#     return all_links


In [None]:
# def get_all_data(all_url):
#     all_data = []
#     for url in all_url:
#         try:
#             soup = fetch_website_content(url)
#             body_content = extract_html_of_div_with_id(soup, "main-content")
#             p_tags = extract_content_of_p_with_align(body_content, "justify")
#             all_data += p_tags
#         except:
#             # print("Couldnt get data from: ", url)
#     return all_data

In [None]:
# all_url = get_all_links(URL)
# all_data = get_all_data(all_url)
# print(all_data)