In [None]:
import requests
from bs4 import BeautifulSoup, NavigableString
import os
import time
import re

def sanitize_filename(name):
    """Return *name* with characters that are unsafe in file names replaced.

    Each of ``/ \\ : * ? " < > |`` is turned into an underscore; every other
    character is kept as-is.
    """
    # One translation table, applied in a single pass over the string.
    replacements = str.maketrans({bad_char: '_' for bad_char in '/\\:*?"<>|'})
    return name.translate(replacements)

def download_pdf_from_beratungsverlauf(beratungsverlauf_url, folder_path, downloaded_pdfs):
    """Download every titled PDF linked from a Beratungsverlauf page.

    The page is fetched and parsed, every ``<a>`` whose ``href`` ends in
    ``.pdf`` is collected together with a title taken from the surrounding
    markup, and each not-yet-seen URL is handed to ``download_pdf``.

    Args:
        beratungsverlauf_url: URL of the page to scan; also the base against
            which relative PDF links are resolved.
        folder_path: Directory the PDFs are written to.
        downloaded_pdfs: Mutable set of PDF URLs that were already handled.
            URLs are added to it before the download is attempted, so a
            failed download is not retried later in the same run.

    Returns:
        None. Failures are reported with ``print`` instead of being raised.
    """
    try:
        # A timeout keeps one unresponsive page from hanging the whole crawl.
        response = requests.get(beratungsverlauf_url, timeout=30)
    except requests.RequestException as exc:
        print(f'Failed to retrieve the Beratungsverlauf page: {beratungsverlauf_url} ({exc})')
        return
    if response.status_code != 200:
        print(f'Failed to retrieve the Beratungsverlauf page: {beratungsverlauf_url}')
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # A dict is used as an ordered set: (url, name) pairs stay unique, but are
    # processed in document order so the "_1", "_2" suffixes that download_pdf
    # assigns to clashing names are reproducible between runs.
    pdf_links = {}
    for pdf_link in soup.find_all('a', href=lambda href: href and href.endswith('.pdf')):
        proposed_pdf_name = _find_pdf_title(pdf_link)
        if not proposed_pdf_name:
            # PDFs without a recognisable title are skipped.
            print(f"Skipping unnamed PDF: {pdf_link['href']}")
            continue

        pdf_name = sanitize_filename(proposed_pdf_name + ".pdf")
        # urljoin returns absolute URLs unchanged and resolves relative ones.
        pdf_url = requests.compat.urljoin(beratungsverlauf_url, pdf_link['href'])
        pdf_links[(pdf_url, pdf_name)] = None

    for pdf_url, pdf_name in pdf_links:
        print(f'Attempting to download PDF: {pdf_name} from {pdf_url}')  # Debug print
        if pdf_url in downloaded_pdfs:
            print(f'Skipping duplicate PDF: {pdf_url}')  # Debug print
            continue

        downloaded_pdfs.add(pdf_url)  # Register the download
        download_pdf(pdf_url, folder_path, pdf_name)
        # Throttle only after a real request; skipped duplicates cost the
        # server nothing and need no pause.
        time.sleep(4)


def _find_pdf_title(pdf_link):
    """Return the title text that belongs to *pdf_link*, or None if none is found."""
    # Walk backwards from the link to the closest preceding piece of text.
    for sibling in pdf_link.previous_siblings:
        if isinstance(sibling, NavigableString):
            text = sibling.strip()
            if text and not text.lower().startswith("download pdf"):
                return text
            # Blank text or the "Download PDF" label: keep looking. Without
            # this `continue` the label would be picked up by the generic
            # `.string` check below and used as the file name.
            continue

        if sibling.name == 'br':
            continue  # Ignore <br> elements

        # Any other tag: use its text if it has exactly one non-empty string.
        if sibling.string and sibling.string.strip():
            return sibling.string.strip()

    # Nothing directly in front of the link: fall back to the first usable
    # text fragment of the enclosing table cell, if there is one.
    parent_td = pdf_link.find_parent('td')
    if parent_td is None:
        return None
    for text in parent_td.get_text(separator="<br>", strip=True).split("<br>"):
        if "Download PDF" not in text and text.strip():
            return text.strip()
    return None

def download_pdf(pdf_url, folder_path, pdf_name):
    """Download one PDF and store it in *folder_path*.

    If a file called *pdf_name* already exists in the folder, a numeric suffix
    (``name_1.pdf``, ``name_2.pdf``, ...) is appended so that nothing is
    overwritten.

    Args:
        pdf_url: Absolute URL of the PDF.
        folder_path: Target directory; created if it does not exist.
        pdf_name: Desired file name including extension.

    Returns:
        None. Network errors and non-200 responses are reported with
        ``print`` and leave no file behind.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder_path, exist_ok=True)

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(pdf_url, timeout=60)
    except requests.RequestException as exc:
        print(f'Failed to download PDF: {pdf_url} ({exc})')
        return
    if response.status_code != 200:
        print(f'Failed to download PDF: {pdf_url} with status code: {response.status_code}')
        return

    # Manage existing filenames using a counter
    base_name, extension = os.path.splitext(pdf_name)
    pdf_file_path = os.path.join(folder_path, pdf_name)
    counter = 1
    while os.path.exists(pdf_file_path):
        pdf_file_path = os.path.join(folder_path, f"{base_name}_{counter}{extension}")
        counter += 1

    with open(pdf_file_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    # Report the name actually used, which may carry a "_N" suffix.
    print(f'Downloaded: {os.path.basename(pdf_file_path)} in {folder_path}')

def extract_pdfs_from_results(search_url, base_folder_path):
    """Walk all search result pages and download the PDFs of each procedure.

    For every result on every page the first link whose ``href`` contains
    ``vorgangsanzeige`` is followed, a folder
    ``<base_folder_path>/<WP_n>/<betreff>_<id>`` is created for it, and
    ``download_pdf_from_beratungsverlauf`` is called for that page. Paging
    stops at the first page that cannot be fetched or contains no results.

    Args:
        search_url: NOTE(review): currently not used. The request URL is
            built from the hard-coded ``base_search_url`` below, which differs
            from the URL ``main`` passes in (indexed vs. unindexed array
            parameters). Confirm which one is intended before wiring this
            parameter in.
        base_folder_path: Root directory for all downloaded material.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    current_page = 1
    base_search_url = 'https://www.bayern.landtag.de/parlament/dokumente/drucksachen?isInitialCheck=0&;q=&dknr=&suchverhalten=AND&dokumentenart=Drucksache&ist_basisdokument=off&sort=date&anzahl_treffer=100&wahlperiodeid%5B0%5D=19&wahlperiodeid%5B1%5D=18&wahlperiodeid%5B2%5D=17&wahlperiodeid%5B3%5D=16&wahlperiodeid%5B4%5D=15&wahlperiodeid%5B5%5D=14&wahlperiodeid%5B6%5D=13&wahlperiodeid%5B7%5D=12&wahlperiodeid%5B8%5D=11&wahlperiodeid%5B9%5D=10&wahlperiodeid%5B10%5D=9&wahlperiodeid%5B11%5D=8&wahlperiodeid%5B12%5D=7&wahlperiodeid%5B13%5D=6&erfassungsdatum%5Bstart%5D=&erfassungsdatum%5Bend%5D=&suchvorgangsarten%5B0%5D=Gesetze%5C%5CGesetzentwurf&suchvorgangsarten%5B1%5D=Gesetze%5C%5CHaushaltsgesetz,%20Nachtragshaushaltsgesetz&suchvorgangsarten%5B2%5D=Gesetze%5C%5CStaatsvertrag&gremium%5B0%5D=Ausschuss%20f%C3%BCr%20Landesentwicklung%20und%20Umweltfragen&gremium%5B1%5D=Ausschuss%20f%C3%BCr%20Umwelt%20und%20Verbraucherschutz&gremium%5B2%5D=Ausschuss%20f%C3%BCr%20Umwelt%20und%20Gesundheit&dlh=null'  # Your base URL here
    downloaded_pdfs = set()  # PDF URLs already handled, shared across all pages

    while True:
        page_url = f"{base_search_url}&page={current_page}"
        print(f'Retrieving page {current_page}:{page_url}')

        try:
            response = requests.get(page_url, timeout=30)
        except requests.RequestException as exc:
            print(f'Failed to retrieve the search results for page {current_page}. ({exc})')
            break
        if response.status_code != 200:
            print(f'Failed to retrieve the search results for page {current_page}.')
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        results = soup.find_all('div', class_='row result')
        if not results:
            print('No more results found or last page reached.')
            break

        print(f'Processing page {current_page}, found {len(results)} results.')

        for result in results:
            for link in result.find_all('a', class_='link-with-icon'):
                # .get() tolerates anchors without an href attribute.
                href = link.get('href', '')
                if "vorgangsanzeige" not in href:
                    continue

                # Resolve relative links against the page they came from.
                beratungsverlauf_url = requests.compat.urljoin(page_url, href)
                print('Found Beratungsverlauf link:', beratungsverlauf_url)

                gesetzentwurf_folder_path = _prepare_gesetzentwurf_folder(
                    beratungsverlauf_url, base_folder_path
                )
                if gesetzentwurf_folder_path is None:
                    continue  # Page not retrievable; try the next link

                download_pdf_from_beratungsverlauf(
                    beratungsverlauf_url, gesetzentwurf_folder_path, downloaded_pdfs
                )
                time.sleep(4)  # Take a 4 second break between downloads
                break  # Exit after finding the first valid link
            else:
                # Reached only when no link in this result led to a download.
                print('No valid Beratungsverlauf link in this result.')

        current_page += 1  # Move to the next page after processing this one


def _prepare_gesetzentwurf_folder(beratungsverlauf_url, base_folder_path):
    """Create and return the download folder for one Beratungsverlauf page, or None if the page cannot be fetched."""
    try:
        beratungsverlauf_response = requests.get(beratungsverlauf_url, timeout=30)
    except requests.RequestException as exc:
        print(f'Failed to retrieve Beratungsverlauf page: {beratungsverlauf_url} ({exc})')
        return None
    if beratungsverlauf_response.status_code != 200:
        print(f'Failed to retrieve Beratungsverlauf page: {beratungsverlauf_url}')
        return None

    beratungsverlauf_soup = BeautifulSoup(beratungsverlauf_response.content, 'html.parser')

    # An absent span is passed on as an empty string, so the "unknown" folder
    # name always comes from extract_legislative_period ('WP_Unknown') rather
    # than from a second, differently spelled fallback.
    basistext_span = beratungsverlauf_soup.find('span', id='basistext')
    basistext = basistext_span.get_text(strip=True) if basistext_span else ''
    legislative_period = extract_legislative_period(basistext)

    # Subfolder for the legislative period (Wahlperiode)
    period_folder_path = os.path.join(base_folder_path, legislative_period)
    os.makedirs(period_folder_path, exist_ok=True)

    # The subject line names the folder; it may be missing on some pages.
    betreff_span = beratungsverlauf_soup.find('span', id='betreff')
    betreff = betreff_span.get_text(strip=True) if betreff_span else 'Unknown_Betreff'
    folder_name = sanitize_filename(betreff)[:75]  # Limit the length of the folder name

    # The last "="-separated piece of the URL keeps folders with identical
    # (truncated) subjects apart; sanitised because it becomes a path component.
    unique_identifier = sanitize_filename(beratungsverlauf_url.split('=')[-1])
    gesetzentwurf_folder_path = os.path.join(period_folder_path, f"{folder_name}_{unique_identifier}")
    os.makedirs(gesetzentwurf_folder_path, exist_ok=True)

    print(f'Creating Gesetzentwurf folder: {gesetzentwurf_folder_path}')  # Debug statement
    return gesetzentwurf_folder_path

def extract_legislative_period(basistext):
    """Derive the legislative-period folder name from *basistext*.

    The first occurrence of ``Nr. <digits>`` supplies the number, giving e.g.
    ``'WP_19'``. When the text contains no such pattern, ``'WP_Unknown'`` is
    returned.
    """
    found = re.search(r'Nr\. (\d+)', basistext)
    if found is None:
        return 'WP_Unknown'  # No number present in the text

    period_number = found.group(1)
    return f'WP_{period_number}'

def main():
    """Run the scraper with the predefined search query and output folder."""
    search_url = 'https://www.bayern.landtag.de/parlament/dokumente/drucksachen?isInitialCheck=0&;q=&dknr=&suchverhalten=AND&dokumentenart=Drucksache&ist_basisdokument=off&sort=date&anzahl_treffer=100&wahlperiodeid%5B%5D=19&wahlperiodeid%5B%5D=18&wahlperiodeid%5B%5D=17&wahlperiodeid%5B%5D=16&wahlperiodeid%5B%5D=15&wahlperiodeid%5B%5D=14&wahlperiodeid%5B%5D=13&wahlperiodeid%5B%5D=12&wahlperiodeid%5B%5D=11&wahlperiodeid%5B%5D=10&wahlperiodeid%5B%5D=9&wahlperiodeid%5B%5D=8&wahlperiodeid%5B%5D=7&wahlperiodeid%5B%5D=6&erfassungsdatum%5Bstart%5D=&erfassungsdatum%5Bend%5D=&dokumentenart=Drucksache&suchvorgangsarten%5B%5D=Gesetze%5C%5CGesetzentwurf&suchvorgangsarten%5B%5D=Gesetze%5C%5CHaushaltsgesetz%2C+Nachtragshaushaltsgesetz&suchvorgangsarten%5B%5D=Gesetze%5C%5CStaatsvertrag&gremium%5B%5D=Ausschuss+f%C3%BCr+Landesentwicklung+und+Umweltfragen&gremium%5B%5D=Ausschuss+f%C3%BCr+Umwelt+und+Verbraucherschutz&gremium%5B%5D=Ausschuss+f%C3%BCr+Umwelt+und+Gesundheit&dlh=null'
    # The folder name previously contained "√§", i.e. a UTF-8 "ä" that had
    # been decoded with the wrong code page; the intended umlaut is restored.
    base_folder_path = 'Beratungsverläufe_Umweltausschuss_Bayern'
    extract_pdfs_from_results(search_url, base_folder_path)

# Start the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()