In [5]:
import requests
from bs4 import BeautifulSoup
import copy
import yaml
import os

# URL = 'https://www.psp.cz/eknih/2021ps/stenprot/index.htm'  # 2021
URL = 'https://www.psp.cz/eknih/2017ps/stenprot/index.htm'  # 2017

URL_demagog = 'https://demagog.cz/politici/andrej-babis-183'

In [6]:
# Required Libraries

# Constants

def fetch_website_content(url):
    """
    Fetches the content of the website at the given URL.
    
    Args:
        url (str): The target URL to fetch.
    
    Returns:
        str: The raw content of the webpage.
    """
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch the webpage. HTTP Status Code: {response.status_code}")

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup

def extract_html_of_a_with_href(soup):
    """
    Parses the required data from the given HTML content.
    
    Args:
        html_content (str): The raw HTML content.
    
    Returns:
        list: A list of data extracted from the content.
    """
    # Modify this section to extract the specific data you need
    # For instance, to extract all the text inside paragraph tags:
    data = [(a.get_text(), a['href']) for a in soup.find_all('a', href=True)]
    return data

def extract_html_of_div_with_id(soup, target_id):
    """
    Extracts the inner HTML content of a div with the given ID from the provided HTML content.
    
    Args:
        html_content (str): The raw HTML content.
        target_id (str): The ID of the div whose content needs to be extracted.
    
    Returns:
        str: The inner HTML content of the div, or None if the div isn't found.
    """
    # Find the div with the given ID
    div = soup.find('div', id=target_id)
    
    # Return its inner HTML if found
    return div if div else None

def extract_content_of_p_with_align(html_content, target_align):
    """
    Extracts the content of all <p> tags with the specified align attribute.
    
    Args:
        html_content (str): The raw HTML content.
        target_align (str): The align attribute value to look for.
    
    Returns:
        list: A list of content extracted from the <p> tags.
    """
    # Find all <p> tags with the specified align attribute
    p_tags = html_content.find_all('p', align=target_align)
    
    # Extract and return the content of these tags
    return [p.get_text() for p in p_tags]


In [7]:
def get_all_schuze_links(URL):
    url_data = {}
    soup = fetch_website_content(URL)
    body_content = extract_html_of_div_with_id(soup, "body")
    hrefs = extract_html_of_a_with_href(body_content)
    links = [link for link in hrefs if "schuz" in link[1] and "index" in link[1]]
    for link in links:
        url_data[link[0]] = {"url": URL.split("/index")[0] + "/" + link[1]}

    return url_data

def get_all_schuze_content(URL, URL_schuze):
    soup = fetch_website_content(URL_schuze)
    body_content = extract_html_of_div_with_id(soup, "main-content")
    hrefs = extract_html_of_a_with_href(body_content)
    links = [link[1] for link in hrefs if "-" in link[1]]
    all_url_schuze = [URL_schuze.split("/index")[0] + "/" + link_schuze for link_schuze in links]

    return all_url_schuze

def get_all_speakers(URL_schuze, URL_zaznam):
    url_data = []
    soup = fetch_website_content(URL_zaznam)
    body_content = extract_html_of_div_with_id(soup, "body")
    hrefs = extract_html_of_a_with_href(body_content)
    links = [link for link in hrefs if "#r" in link[1]]
    for link in links:
        url_data.append((link[0], URL_schuze.split("/index")[0] + "/" + link[1]))

    return url_data

In [8]:
def get_all_links(URL):    
    schuze_all = get_all_schuze_links(URL)
    for name_schuze in list(schuze_all.keys()):
        URL_schuze = schuze_all[name_schuze]["url"]
        all_url_schuze_parts = get_all_schuze_content(URL, URL_schuze)
        schuze_all[name_schuze]["rec"] = all_url_schuze_parts
        schuze_all[name_schuze]["speak"] = []
        for URL_zaznam in all_url_schuze_parts[0:1]:
            schuze_all[name_schuze]["speak"] += get_all_speakers(URL_schuze, URL_zaznam)
            
    return schuze_all

In [9]:
def save_data(new_data, new_key, file_name):
    if(not os.path.isfile(file_name)):
        with open(file_name, "w", encoding="utf-8") as file:
            yaml.dump({}, file, allow_unicode=True)
        
    with open(file_name, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    
    data[new_key] = new_data
    
    with open(file_name, "w", encoding="utf-8") as file:
        yaml.dump(data, file, allow_unicode=True)

In [10]:
def get_all_data(schuze_all, file_name):
    all_data = {}
    for name_schuze in list(schuze_all.keys()):
        schuze_data = {}
        for link in schuze_all[name_schuze]["speak"]:
            schuze_data[link[0]] = []
            try:
                soup = fetch_website_content(link[1])
                body_content = extract_html_of_div_with_id(soup, "main-content")
                p_tags = extract_content_of_p_with_align(body_content, "justify")

                schuze_data[link[0]] += [tag.replace("\xa0", " ").replace("§", "zk.").replace("*", "") for tag in p_tags]
            except:
                print("Couldnt get data from: ", link[1])
        try:
            save_data(schuze_data, name_schuze, file_name)
        except:
            print("Couldnt save data from: ", name_schuze)
            print("\n")
        all_data[name_schuze] = copy.deepcopy(schuze_data)
        print("Done downloading data from: ", name_schuze)
        print("\n")
    return all_data

In [11]:
all_links = get_all_links(URL)

SSLError: HTTPSConnectionPool(host='www.psp.cz', port=443): Max retries exceeded with url: /eknih/2017ps/stenprot/index.htm (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1007)')))

In [None]:
print(all_links.keys())

In [12]:
all_data = get_all_data(all_links, "./data/ps_scrappe2017.yaml")

NameError: name 'all_links' is not defined

In [47]:
#the second part of the scrapper is used for scraping data from demagog.cz
def get_url_data(URL, politician):
    
    name = {}
    date = {}
    source = {}
    quote = {}
    TrueUntrue = {}
    response = {}

    #create a list of urls for scraping
    page = requests.get(URL)
    if page.status_code != 200:
        raise ValueError(f"Failed to fetch the webpage. HTTP Status Code: {page.status_code}")
    html_content = page.text
    soup = BeautifulSoup(html_content, "html.parser")
    
    

    dates = soup.find_all('span', class_ = "date")    
    sources = soup.find_all('cite' , class_ ='ps-5')
    quotes = soup.find_all('span', class_ = "fs-6 position-relative")
    TrueUntrues = soup.find_all('span', class_ = "fs-5 text-uppercase fs-600 text-red")
    responses = soup.find_all('div', class_ = "content fs-6")

    print(len(dates))
    print(len(sources))
    print(len(quotes))
    print(len(TrueUntrues))
    print(len(responses))

    if(len(dates) != len(sources) != len(quotes) != len(TrueUntrues) != len(responses)):
        raise ValueError("Data lengths on URL: " + URL + " are not equal and do not correspond to each other!")
    
    for idx in len(dates):
        name.append(politician)
        date.append(dates[idx].string)
        source.append(sources[idx].string)
        quote.append(quotes[idx].string)
        TrueUntrue.append(TrueUntrues[idx].string)
        response.append(responses[idx].string)

    data = {name, date, source, quote, TrueUntrue, response}    

    return data


In [48]:

get_url_data(URL_demagog,"Andy")


10
0
10
2
0


ValueError: Data lengths on URL: https://demagog.cz/politici/andrej-babis-183 are not equal and do not correspond to each other!