In [1]:
import os
from dotenv import load_dotenv

In [2]:
# Populate the process environment from a .env file before reading the keys.
load_dotenv()

# Both keys are None when the corresponding variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Scraping/Parsing

In [3]:
import asyncio
import json
import re
from typing import Any, List, Optional, Tuple
from urllib.parse import unquote, urljoin

import aiohttp
from bs4 import BeautifulSoup
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.dataclasses import ByteStream, Document
    

class Scraper():
    """
    A class used to download web content.

    Attributes
    ----------
    fetcher : LinkContentFetcher
        An instance of LinkContentFetcher to fetch the content of URLs.

    Methods
    -------
    scrap_urls(urls: List[str]) -> List[ByteStream]
        Scrapes the given URLs and returns the content as a list of ByteStreams.

    fetch(url: str) -> Optional[str]
        Asynchronously downloads one URL and returns its body as text.
    """

    def __init__(self):
        self.fetcher = LinkContentFetcher()

    def scrap_urls(self, urls: List[str]) -> List[ByteStream]:
        """
        Scrapes the given URLs and returns the content as a list of ByteStreams.

        Parameters
        ----------
        urls : List[str]
            A list of URLs to scrape.

        Returns
        -------
        List[ByteStream]
            The value stored under the "streams" key of the fetcher output.
        """
        result = self.fetcher.run(urls=urls)
        return result["streams"]

    async def fetch(self, url: str) -> Optional[str]:
        """
        Fetches the text content of a given URL (best effort).

        Parameters
        ----------
        url : str
            The URL to fetch content from.

        Returns
        -------
        Optional[str]
            The decoded response body, or None when the request failed
            (connection error, HTTP error status or timeout). Failures are
            printed, not raised, so callers must handle the None case.
        """
        # An explicit ClientTimeout replaces the bare number previously passed
        # as `timeout`; the overall limit is still 10 seconds.
        timeout = aiohttp.ClientTimeout(total=10)
        try:
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.get(url, timeout=timeout) as response:
                    response.raise_for_status()
                    return await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # A timeout is not an aiohttp.ClientError; catch it as well so a
            # slow server is reported like any other failed request.
            print(url, "----", e)
            return None

class Parser():
    """
    A class used to parse HTML pages and to clean, split and convert documents.

    Attributes
    ----------
    cleaner : DocumentCleaner
        An instance of DocumentCleaner to clean documents.

    splitter : DocumentSplitter
        An instance of DocumentSplitter to split documents into chunks.

    Methods
    -------
    remove_empty_documents(documents: List[Any]) -> List[Any]
        Drops documents whose content is None.

    remove_duplicate_links(links)
        Drops tags whose href was already seen.

    clean_documents(documents: List[Document]) -> dict
        Removes docs with None content and cleans the remaining documents.

    split_documents(documents: List[Document]) -> dict
        Removes docs with None content and splits the remaining documents.

    contains_tag(tag) -> bool
        Tells whether a tag is a link to a memento section.

    get_pdf_paths(soup) -> List[str]
        Extracts the hrefs of PDF documents from a page.

    get_pdf_metadata(soup) -> List[dict]
        Extracts title, URL and dates of the PDF documents listed on a page.

    parse_urls(content: str) -> List[str]
        Extracts the unique memento URLs from HTML content.

    convert_to_documents(content: List[Any])
        Converts PDF sources with PyPDFToDocument.
    """

    def __init__(self):
        self.cleaner = DocumentCleaner(
            remove_empty_lines=True,
            remove_extra_whitespaces=True,
            remove_repeated_substrings=False,
        )
        self.splitter = DocumentSplitter(
            split_by="sentence",
            split_length=5,
            split_overlap=1,
            split_threshold=4,
        )

    def remove_empty_documents(self, documents: List[Any]) -> List[Any]:
        """
        Remove documents from the list that have their content attribute set to None.

        Parameters
        ----------
        documents : list
            Document objects to be filtered.

        Returns
        -------
        list
            Document objects where the content attribute is not None.
        """
        return [doc for doc in documents if doc.content is not None]

    def remove_duplicate_links(self, links):
        """
        Removes duplicate links from a list of tags.

        Parameters
        ----------
        links : list of bs4.element.Tag
            The list of tags to remove duplicates from. Every tag must carry
            an "href" attribute.

        Returns
        -------
        list of bs4.element.Tag
            The tags in their original order, keeping only the first tag for
            each distinct href.
        """
        seen_hrefs = set()
        unique_tags = []
        for tag in links:
            href = tag["href"]
            if href not in seen_hrefs:
                seen_hrefs.add(href)
                unique_tags.append(tag)
        return unique_tags

    def clean_documents(self, documents: List[Document]) -> dict:
        """
        Removes docs with None content and cleans the given documents.

        Parameters
        ----------
        documents : List[Document]
            The documents to clean.

        Returns
        -------
        dict
            The raw output of ``DocumentCleaner.run`` (the cleaned documents
            are expected under its "documents" key).
        """
        # The filtering promised by this docstring was previously missing.
        return self.cleaner.run(documents=self.remove_empty_documents(documents))

    def split_documents(self, documents: List[Document]) -> dict:
        """
        Removes docs with None content and splits the given documents into chunks.

        Parameters
        ----------
        documents : List[Document]
            The documents to split into chunks.

        Returns
        -------
        dict
            The raw output of ``DocumentSplitter.run`` (the chunks are
            expected under its "documents" key).
        """
        # The filtering promised by this docstring was previously missing.
        return self.splitter.run(documents=self.remove_empty_documents(documents))

    def contains_tag(self, tag):
        """
        Checks if a tag contains a memento URL.

        Parameters
        ----------
        tag : bs4.element.Tag
            The tag to check.

        Returns
        -------
        bool
            True if the tag is an ``<a>`` whose percent-decoded href contains
            one of the memento section names, False otherwise.
        """
        if tag.name == "a" and "href" in tag.attrs:
            href = tag["href"]
            # hrefs may be percent-encoded ("Merkbl%C3%A4tter"), so decode
            # before comparing with the readable keywords.
            decoded_href = unquote(href)
            keywords = ["Merkblätter/", "Mémentos/", "Opuscoli", "Leaflets/"]
            return any(keyword in decoded_href for keyword in keywords)
        return False

    def get_pdf_paths(self, soup):
        """
        Extracts the paths of PDF documents from a BeautifulSoup object.

        Parameters
        ----------
        soup : BeautifulSoup
            The BeautifulSoup object to extract PDF paths from.

        Returns
        -------
        list of str
            The href of every ``<a class="co-document-content">`` whose href
            contains "/p/".
        """
        pdf_paths = []
        for a in soup.find_all("a", {"class": "co-document-content"}):
            # Anchors without an href are skipped instead of raising KeyError.
            href = a.get("href") or ""
            if "/p/" in href:
                pdf_paths.append(href)
        return pdf_paths

    def get_pdf_metadata(self, soup):
        """
        Extracts the metadata of PDF documents from a BeautifulSoup object.

        Parameters
        ----------
        soup : BeautifulSoup
            The BeautifulSoup object to extract PDF metadata from.

        Returns
        -------
        list of dict
            One dict per PDF element found, with the keys:

            - "title": text of the ``<b>`` inside "co-document-main"
            - "url": "https://ahv-iv.ch" + href when the first link of the
              element contains "/p/", otherwise None
            - "last_modification": date extracted from "co-document-infos"
            - "state": date extracted from "co-document-state"

            A value is None when the corresponding element is missing. Dates
            come from ``extract_date_from_str``, which returns the raw text
            when no dd.mm.yyyy date is present.
        """
        pdfs = soup.find_all("div", {"class": "sc-element co-fileType-PDF published"})

        pdf_metadata = []
        for pdf in pdfs:
            main = pdf.find("div", {"class": "co-document-main"})
            title_tag = main.find("b") if main is not None else None
            link = pdf.find("a")
            href = (link.get("href") or "") if link is not None else ""
            infos = pdf.find("p", {"class": "co-document-infos"})
            state = pdf.find("div", {"class": "co-document-state"})

            pdf_metadata.append(
                {
                    "title": title_tag.text.strip() if title_tag is not None else None,
                    "url": "https://ahv-iv.ch" + href if "/p/" in href else None,
                    "last_modification": (
                        extract_date_from_str(infos.text.strip())
                        if infos is not None
                        else None
                    ),
                    "state": (
                        extract_date_from_str(state.text.strip())
                        if state is not None
                        else None
                    ),
                }
            )
        return pdf_metadata

    def parse_urls(self, content: str) -> List[str]:
        """
        Extracts the unique memento URLs from HTML content.

        Parameters
        ----------
        content : str
            The HTML to parse.

        Returns
        -------
        List[str]
            The hrefs of all links accepted by ``contains_tag``, without
            duplicates and in document order.
        """
        soup = BeautifulSoup(content, features="html.parser")

        # Find all "a" tags with href containing "Merkblätter/Mémentos/Opuscoli/Leaflets" (and subsequent path)
        links = soup.find_all(self.contains_tag)
        links = self.remove_duplicate_links(links)

        return [link["href"] for link in links]

    def convert_to_documents(self, content: List[Any]) -> List[Any]:
        """
        Converts PDF sources to documents with PyPDFToDocument.

        Parameters
        ----------
        content : List[Any]
            The sources handed to the converter (e.g. ByteStreams of PDFs).

        Returns
        -------
        The raw output of ``PyPDFToDocument.run``.
        """
        # PyPDFToDocument was never imported in this file; import it here so
        # the method no longer fails with NameError.
        from haystack.components.converters import PyPDFToDocument

        return PyPDFToDocument().run(sources=content)

# Module-level instances shared by the helper functions and cells below.
scraper = Scraper()
parser = Parser()

In [4]:
async def get_pages_from_sitemap(
    scraper: Any, parser: Any, sitemap_url: str
) -> List[ByteStream]:
    """
    Downloads every memento page referenced by a sitemap.

    Parameters
    ----------
    scraper : Any
        Object providing ``fetch(url)`` (async) and ``scrap_urls(urls)``.
    parser : Any
        Object providing ``parse_urls(html)``.
    sitemap_url : str
        The URL of the sitemap page.

    Returns
    -------
    List[ByteStream]
        The fetched pages; an empty list when the sitemap itself could not
        be downloaded.
    """
    # Get sitemap
    sitemap = await scraper.fetch(sitemap_url)
    if not sitemap:
        # fetch() reports failures by returning None; parsing that would raise.
        return []

    # Extract URLs from sitemap. Relative links are resolved against the
    # sitemap URL; absolute links are unaffected.
    url_list = [urljoin(sitemap_url, href) for href in parser.parse_urls(sitemap)]

    # Get HTML from each URL
    return scraper.scrap_urls(url_list)


async def from_pages_to_content(
    scraper: Any, parser: Any, pages: List[ByteStream]
) -> Tuple[List[Any], List[ByteStream]]:
    """
    Extracts PDF metadata from the given pages and downloads the PDFs.

    Parameters
    ----------
    scraper : Any
        Object providing ``scrap_urls(urls)``.
    parser : Any
        Object providing ``get_pdf_metadata(soup)``.
    pages : List[ByteStream]
        The HTML pages to inspect (their ``data`` is parsed).

    Returns
    -------
    Tuple[List[Any], List[ByteStream]]
        ``(metadata, streams)`` aligned by index: ``streams[i]`` is the
        download for ``metadata[i]``. Metadata without a URL, duplicate URLs
        and PDFs that could not be matched to a download are left out.
    """
    # Collect the metadata of every listed PDF, keyed by its download URL.
    # Entries without a URL cannot be fetched.
    metadata_by_url = {}
    for page in pages:
        soup = BeautifulSoup(page.data, features="html.parser")
        for metadata in parser.get_pdf_metadata(soup):
            url = metadata.get("url")
            if url and url not in metadata_by_url:
                metadata_by_url[url] = metadata

    pdf_urls = list(metadata_by_url)
    if not pdf_urls:
        return [], []

    streams = scraper.scrap_urls(pdf_urls)

    # The fetcher may skip URLs that failed, so pairing by position is only
    # safe when nothing was dropped. Prefer the URL recorded on each stream.
    same_length = len(streams) == len(pdf_urls)
    pdf_metadata = []
    pdf_streams = []
    for index, stream in enumerate(streams):
        source_url = (getattr(stream, "meta", None) or {}).get("url")
        if source_url in metadata_by_url:
            metadata = metadata_by_url[source_url]
        elif same_length:
            metadata = metadata_by_url[pdf_urls[index]]
        else:
            # Cannot tell which PDF this stream belongs to.
            continue
        pdf_metadata.append(metadata)
        pdf_streams.append(stream)

    return pdf_metadata, pdf_streams

In [5]:
def is_pdf(content: bytes) -> bool:
    """
    Tell whether *content* looks like a PDF file.

    Only the leading bytes are inspected: the result is True exactly when
    the content begins with the ``%PDF-`` header, False otherwise.
    """
    pdf_header = b'%PDF-'
    looks_like_pdf = content.startswith(pdf_header)
    return looks_like_pdf

from typing import List, Tuple

def validate_urls_and_contents(metadatas: List[dict], contents: List[Any], lang: str) -> Tuple[List[dict], List[Any]]:
    """
    Keeps the (metadata, content) pairs whose URL ends with the language suffix.

    Parameters
    ----------
    metadatas : List[dict]
        Metadata dicts; the language is read from the text after the last
        "." of ``metadata["url"]`` (e.g. ".../p/1.01.d").
    contents : List[Any]
        The downloaded contents, paired with `metadatas` by index.
    lang : str
        The language suffix to match (e.g., 'd').

    Returns
    -------
    Tuple[List[dict], List[Any]]
        Filtered metadatas and contents, aligned by index. Pairs whose
        metadata has no URL are dropped.
    """
    filtered_metadatas = []
    filtered_contents = []

    for metadata, content in zip(metadatas, contents):
        url = metadata.get("url")
        if not url:
            # A missing or None URL cannot match any language (it used to
            # raise here).
            continue
        if url.rsplit(".", 1)[-1] == lang:
            filtered_metadatas.append(metadata)
            filtered_contents.append(content)

    return filtered_metadatas, filtered_contents

def extract_date_from_str(string: str):
    """
    Returns the first date in format dd.mm.yyyy found in `string`.

    Parameters
    ----------
    string : str
        The text to search.

    Returns
    -------
    The first dd.mm.yyyy match, or `string` itself, unchanged, when it
    contains no such date or is not a str.
    """
    pattern = r'\b\d{2}\.\d{2}\.\d{4}\b'

    # Non-text input is handed back untouched, as before, but without
    # relying on a blanket `except Exception`.
    if not isinstance(string, str):
        return string

    match = re.search(pattern, string)
    return match.group(0) if match else string

In [6]:
# Sitemap URL per language, keyed by the one-letter suffix (d, f, i, e);
# the two-letter code appears in the path and, upper-cased, in the page name.
sitemaps = {
    suffix: f"https://www.ahv-iv.ch/{code}/Sitemap-{code.upper()}"
    for suffix, code in (("d", "de"), ("f", "fr"), ("i", "it"), ("e", "en"))
}

In [7]:
SAVE_PATH = "../pdfs/"

for lang, sitemap in sitemaps.items():
    
    pages = await get_pages_from_sitemap(
        scraper=scraper,
        parser=parser,
        sitemap_url=sitemap
    )

    metadatas, contents = await from_pages_to_content(
        scraper=scraper,
        parser=parser,
        pages=pages
    )

    valid_metadatas, valid_contents = validate_urls_and_contents(metadatas, contents, lang)
    
    for metadata, content in zip(valid_metadatas, valid_contents):

        if is_pdf(content.data):
            filename = metadata["url"].split("/")[-1].replace(".", "_") + ".pdf"
            os.makedirs(os.path.join(SAVE_PATH, lang), exist_ok=True)
            
            with open(os.path.join(SAVE_PATH, lang, filename), "wb") as f:
                f.write(content.data)

    final_metadata = {}
    for meta in valid_metadatas:
        pdf_id = meta["url"].split("/")[-1].replace(".", "_")
        final_metadata[pdf_id] = meta
    
    with open(os.path.join(SAVE_PATH, lang, "metadata.json"), "w") as f:
        json.dump(final_metadata, f)
    