In [1]:
import requests
from bs4 import BeautifulSoup
import copy
import yaml

# Root index page handed to get_all_links(); the relative session links found on it
# are resolved against everything before "/index" in this URL.
# URL = 'https://www.hlidacstatu.cz/data/Hledat/stenozaznamy-psp/'  # Replace with the target URL
URL = 'https://www.psp.cz/eknih/2021ps/stenprot/index.htm'  # Replace with the target URL

In [2]:
# Helpers for downloading a page and pulling elements out of the parsed HTML

def fetch_website_content(url, timeout=30):
    """
    Downloads the page at the given URL and parses it with BeautifulSoup.

    Args:
        url (str): The target URL to fetch.
        timeout (float | tuple): Seconds to wait for the server, passed straight
            to ``requests.get``. Without a timeout a single stalled connection
            would block the whole scraping run indefinitely.

    Returns:
        BeautifulSoup: The parsed document (built with the 'html.parser' backend).

    Raises:
        ValueError: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than a plain 200 is treated as a failed download
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch the webpage. HTTP Status Code: {response.status_code}")

    return BeautifulSoup(response.text, 'html.parser')

def extract_html_of_a_with_href(soup):
    """
    Collects the text and target of every hyperlink inside the given parsed HTML.

    Args:
        soup: A parsed document or element exposing ``find_all``.

    Returns:
        list: ``(link_text, href)`` tuples, one per <a> tag that carries an
        href attribute, in the order ``find_all`` reports them.
    """
    pairs = []
    # href=True restricts the search to anchors that actually have an href attribute
    for anchor in soup.find_all('a', href=True):
        pairs.append((anchor.get_text(), anchor['href']))
    return pairs

def extract_html_of_div_with_id(soup, target_id):
    """
    Looks up the <div> that carries the given id inside the parsed HTML.

    Args:
        soup: A parsed document or element exposing ``find``.
        target_id (str): The id attribute value of the wanted div.

    Returns:
        The element returned by ``soup.find`` (the element itself, not a
        string of its inner HTML), or None when nothing usable was found.
    """
    match = soup.find('div', id=target_id)
    if not match:
        return None
    return match

def extract_content_of_p_with_align(html_content, target_align):
    """
    Gathers the text of every <p> tag whose align attribute has the given value.

    Args:
        html_content: A parsed document or element exposing ``find_all``
            (despite the name, not a raw HTML string).
        target_align (str): The align attribute value to look for.

    Returns:
        list: The ``get_text()`` result of each matching <p> tag.
    """
    texts = []
    for paragraph in html_content.find_all('p', align=target_align):
        texts.append(paragraph.get_text())
    return texts


In [3]:
def get_all_schuze_links(URL):
    """
    Builds a mapping from link text to session index URL for the given root page.

    The page at ``URL`` is downloaded, and every link inside its div with
    id "body" whose href contains both "schuz" and "index" is kept.

    Args:
        URL (str): Address of the root index page.

    Returns:
        dict: ``{link_text: {"url": absolute_url}}`` where the absolute URL is
        the part of ``URL`` before "/index", a slash, and the link's href.
        A later link with the same text replaces an earlier one.
    """
    page = fetch_website_content(URL)
    anchors = extract_html_of_a_with_href(extract_html_of_div_with_id(page, "body"))
    return {
        text: {"url": URL.split("/index")[0] + "/" + href}
        for text, href in anchors
        if "schuz" in href and "index" in href
    }

def get_all_schuze_content(URL, URL_schuze):
    """
    Lists the record page URLs linked from one session index page.

    Args:
        URL (str): Not used by this function; kept so existing callers
            that pass it keep working.
        URL_schuze (str): Address of the session index page to download.

    Returns:
        list: One absolute URL per link inside the div with id "main-content"
        whose href contains "-". Each is the part of ``URL_schuze`` before
        "/index", a slash, and the href.
    """
    page = fetch_website_content(URL_schuze)
    container = extract_html_of_div_with_id(page, "main-content")
    record_urls = []
    for _text, href in extract_html_of_a_with_href(container):
        if "-" in href:
            record_urls.append(URL_schuze.split("/index")[0] + "/" + href)
    return record_urls

def get_all_speakers(URL_schuze, URL_zaznam):
    """
    Collects the speaker links found on one record page.

    Args:
        URL_schuze (str): Session index URL; the part before "/index" is the
            prefix used to make the speaker links absolute.
        URL_zaznam (str): Address of the record page to download.

    Returns:
        list: ``(link_text, absolute_url)`` tuples for every link inside the
        div with id "body" whose href contains "#r".
    """
    page = fetch_website_content(URL_zaznam)
    container = extract_html_of_div_with_id(page, "body")
    return [
        (text, URL_schuze.split("/index")[0] + "/" + href)
        for text, href in extract_html_of_a_with_href(container)
        if "#r" in href
    ]

In [4]:
def get_all_links(URL, max_records=1):
    """
    Crawls the root index and gathers, per session, its record pages and speaker links.

    Args:
        URL (str): Address of the root index page.
        max_records (int | None): How many record pages per session are scanned
            for speaker links. The default of 1 keeps the previous behaviour
            (only the first record page); pass None to scan all of them.

    Returns:
        dict: The mapping produced by ``get_all_schuze_links`` where every
        entry additionally holds
        "rec"   - list of record page URLs of that session, and
        "speak" - list of ``(speaker_text, url)`` tuples from the scanned pages.
    """
    schuze_all = get_all_schuze_links(URL)
    # Only the values are modified while iterating, never the set of keys
    for entry in schuze_all.values():
        URL_schuze = entry["url"]
        record_urls = get_all_schuze_content(URL, URL_schuze)
        entry["rec"] = record_urls
        entry["speak"] = []
        # A slice bound of None means "no limit", so [:None] covers every page
        for URL_zaznam in record_urls[:max_records]:
            entry["speak"] += get_all_speakers(URL_schuze, URL_zaznam)

    return schuze_all

In [5]:
def save_data(new_data, new_key, file_name):
    """
    Stores ``new_data`` under ``new_key`` in a YAML file, keeping the other keys.

    The file is read, updated in memory and written back in full. A missing
    or empty file is treated as an empty mapping, so no separate
    initialisation step is needed before the first save.

    Args:
        new_data: YAML-serialisable value to store.
        new_key: Top-level key under which the value is stored; an existing
            value for that key is replaced.
        file_name (str): Path of the YAML file.
    """
    try:
        with open(file_name, "r", encoding="utf-8") as file:
            data = yaml.safe_load(file)
    except FileNotFoundError:
        # First run: nothing has been saved yet
        data = None

    # safe_load yields None for an empty document
    if data is None:
        data = {}

    data[new_key] = new_data

    with open(file_name, "w", encoding="utf-8") as file:
        # allow_unicode keeps non-ASCII characters readable instead of escaping them
        yaml.dump(data, file, allow_unicode=True)

In [6]:
def get_all_data(schuze_all, file_name):
    """
    Downloads the text behind every speaker link and saves it session by session.

    For each session, every ``(speaker_text, url)`` entry under "speak" is
    fetched and the text of all justified <p> tags inside the div with id
    "main-content" is collected. Failures are reported and skipped so one
    bad page does not abort the run.

    Args:
        schuze_all (dict): Mapping of session name to a dict with a "speak" list,
            as produced by ``get_all_links``.
        file_name (str): YAML file that receives one top-level key per session.

    Returns:
        dict: ``{session_name: {speaker_text: [paragraph, ...]}}``.
    """
    all_data = {}
    for name_schuze in list(schuze_all.keys()):
        schuze_data = {}
        for link in schuze_all[name_schuze]["speak"]:
            # NOTE(review): a speaker text that occurs more than once starts from an
            # empty list again here, so only the last fetch for it is kept - confirm
            # this is intended.
            schuze_data[link[0]] = []
            try:
                soup = fetch_website_content(link[1])
                body_content = extract_html_of_div_with_id(soup, "main-content")
                p_tags = extract_content_of_p_with_align(body_content, "justify")

                # Normalise the text: non-breaking spaces become plain spaces,
                # "§" is spelled out as "zk." and asterisks are dropped
                schuze_data[link[0]] += [tag.replace("\xa0", " ").replace("§", "zk.").replace("*", "") for tag in p_tags]
            except Exception as exc:
                # Exception (not a bare except) so that a kernel interrupt still stops the run
                print("Couldnt get data from: ", link[1], "-", repr(exc))
        try:
            save_data(schuze_data, name_schuze, file_name)
        except Exception as exc:
            print("Couldnt save data from: ", name_schuze, "-", repr(exc))
            print("\n")
        all_data[name_schuze] = copy.deepcopy(schuze_data)
        print("Done downloading data from: ", name_schuze)
        print("\n")
    return all_data

In [7]:
# Crawl the root index: per session, collect its record page URLs and speaker links
all_links = get_all_links(URL)

In [8]:
# Sanity check: list the session names that were discovered
print(all_links.keys())

dict_keys(['1. schůze', '2. schůze', '3. schůze', '4. schůze', '5. schůze', '6. schůze', '7. schůze', '8. schůze', '9. schůze', '10. schůze', '11. schůze', '12. schůze', '13. schůze', '14. schůze', '15. schůze', '16. schůze', '17. schůze', '18. schůze', '19. schůze', '20. schůze', '21. schůze', '22. schůze', '23. schůze', '24. schůze', '25. schůze', '26. schůze', '27. schůze', '28. schůze', '29. schůze', '30. schůze', '31. schůze', '32. schůze', '33. schůze', '34. schůze', '35. schůze', '36. schůze', '37. schůze', '38. schůze', '39. schůze', '40. schůze', '41. schůze', '42. schůze', '43. schůze', '44. schůze', '45. schůze', '46. schůze', '47. schůze', '48. schůze', '49. schůze', '50. schůze', '51. schůze', '52. schůze', '53. schůze', '54. schůze', '55. schůze', '56. schůze', '57. schůze', '58. schůze', '59. schůze', '60. schůze', '61. schůze', '62. schůze', '63. schůze', '64. schůze', '65. schůze', '66. schůze', '67. schůze', '68. schůze', '69. schůze', '70. schůze', '71. schůze', '72.

In [10]:
# Download the text behind every speaker link; each session is written to the YAML file as it completes
all_data = get_all_data(all_links, "./data/ps_scrappe.yaml")

Done downloading data from:  1. schůze


Done downloading data from:  2. schůze


Done downloading data from:  3. schůze


Done downloading data from:  4. schůze


Done downloading data from:  5. schůze


Done downloading data from:  6. schůze


Done downloading data from:  7. schůze


Done downloading data from:  8. schůze


Done downloading data from:  9. schůze


Done downloading data from:  10. schůze


Done downloading data from:  11. schůze


Done downloading data from:  12. schůze


Done downloading data from:  13. schůze


Done downloading data from:  14. schůze


Done downloading data from:  15. schůze


Done downloading data from:  16. schůze


Done downloading data from:  17. schůze


Done downloading data from:  18. schůze


Done downloading data from:  19. schůze


Done downloading data from:  20. schůze


Done downloading data from:  21. schůze


Done downloading data from:  22. schůze


Done downloading data from:  23. schůze


Done downloading data from:  24. schůze


D

In [15]:
# Number of sessions for which data was collected
print(len(all_data))

80


In [14]:
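# Optional reset cell (kept commented out): uncommenting and running it overwrites
# ./data/ps_scrappe.yaml with an empty mapping, discarding anything saved so far.
# import yaml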

# # Save dictionary to a YAML file with special characters preserved
# with open("./data/ps_scrappe.yaml", "w", encoding="utf-8") as file:
#     yaml.dump({}, file, allow_unicode=True)

# print("Data saved to ps_scrappe.yaml")