# Iterative Prompt Research

## Data sources and their cache

In [22]:
from typing import List

class Node:
    def __init__(self, name : str, predecessor : 'Node' = None, alternative_name : str = None):
        self.name = name
        self.predecessor = predecessor
        self.alternative_name : str = None
        self.children = []

    def get_child(self, index : int):
        if (index >= len(self.children)):
            assert IndexError
        return self.children[index]
    
    def add_children(self, children : List['Node']):
        for child in children:
            child.predecessor = self
            self.children.append(child)

    def __str__(self) -> str:
        return f"{self.name}, #children: {len(self.children)}"
    
    def __repr__(self) -> str:
        return self.__str__()

class Leaf(Node):
    """Marker subclass of ``Node``; it adds no state or behaviour of its own."""

    def __init__(self, name : str, predecessor : 'Node' = None, alternative_name : str = None):
        # Everything is delegated to Node's constructor unchanged.
        Node.__init__(self, name, predecessor, alternative_name)

### WorldBank - XML

In [23]:
import xml.etree.ElementTree as ET

# Prefix -> namespace URI map passed to ElementTree find/findall calls so that
# the 'nt:' prefix used in the path expressions can be resolved.
worldbank_namespaces = {'nt': 'urn:eu.europa.ec.eurostat.navtree'}

def parse_worldbank_xml(path : str) -> Node:
    """Read the XML file at ``path`` and return a "WorldBank" root node holding its tree."""
    xml_root = ET.parse(path).getroot()
    top : Node = Node("WorldBank")
    # The parsed nodes are attached through add_children, which also sets their predecessor.
    top_level_nodes = get_node_children(xml_root, top)
    top.add_children(top_level_nodes)
    return top

def get_node_children(root : ET.Element, predecessor_node : Node) -> List['Node']:
    """Return the nodes for the ``nt:branch`` and ``nt:leaf`` elements directly under ``root``.

    Each branch becomes a ``Node`` whose children are parsed recursively from
    its ``nt:children`` elements; each leaf becomes a ``Leaf``. Branches are
    listed before leaves. A node is named after the text of its ``nt:title``
    element with ``language="en"``.

    Args:
        root: XML element whose direct branch/leaf children are converted.
        predecessor_node: Node recorded as predecessor of every returned node.

    Raises:
        AttributeError: If a branch or leaf has no English ``nt:title``.
    """
    nodes = []
    for branch in root.findall('nt:branch', worldbank_namespaces):
        title = branch.find('nt:title/[@language="en"]', worldbank_namespaces).text
        branch_node = Node(title, predecessor_node)
        for container in branch.findall('nt:children', worldbank_namespaces):
            # Accumulate instead of assigning: with more than one container,
            # assignment would keep only the last container's nodes.
            branch_node.children.extend(get_node_children(container, branch_node))
        nodes.append(branch_node)

    for leaf in root.findall('nt:leaf', worldbank_namespaces):
        title = leaf.find('nt:title/[@language="en"]', worldbank_namespaces).text
        nodes.append(Leaf(title, predecessor_node))

    return nodes

In [24]:
parse_worldbank_xml("worldBank_content.xml")

WorldBank, #children: 3

### WebPage - HTML

In [41]:
from bs4 import BeautifulSoup
import requests
import re

# Passed as the `timeout` argument of every requests.get call in get_html_children.
TIMEOUT = 10
# Initial remaining_depth that parse_html_webpage hands to get_html_children.
DEPTH = 2

def parse_html_webpage(path : str) -> Node:
    """Build a link tree starting at the page ``path`` and return its root node.

    The root is named "MFF home page" and receives ``path`` as its alternative
    name; links are followed DEPTH levels deep.
    """
    root_node : Node = Node("MFF home page", None, path)
    linked_nodes = get_html_children(root_node, DEPTH)
    root_node.add_children(linked_nodes)
    return root_node

def get_html_children(predecessor_node : Node, remaining_depth : int) -> List['Node']:
    """Download the page of ``predecessor_node`` and return one node per accepted link.

    The page URL is read from ``predecessor_node.alternative_name``. A link is
    accepted when it has an ``href``, has non-blank text and is not
    blacklisted. Each accepted link becomes a ``Node`` named after the
    whitespace-normalised link text, with the absolute URL as its alternative
    name, and is expanded recursively with ``remaining_depth - 1``.

    Args:
        predecessor_node: Node whose page is fetched and which becomes the
            predecessor of the returned nodes.
        remaining_depth: Number of levels still to fetch; nothing is requested
            and an empty list is returned when it is zero or negative.
    """
    datasets = []
    if remaining_depth <= 0:
        return datasets

    page_url = predecessor_node.alternative_name
    print(f"Requesting {page_url}")
    page = requests.get(page_url, timeout=TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')

    for link in soup.find_all('a'):
        if ('href' in link.attrs and link.text.strip() != '' and not url_is_blacklisted(link.attrs['href'])):
            url = url_get_absolute(link.attrs['href'], page_url)
            title = re.sub('[\\n\\s]+',' ',link.text.strip())
            branch_dataset = Node(title, predecessor_node, url)
            # Recurse with this function's actual (node, depth) signature; the
            # former call also passed the <a> tag, which raised a TypeError.
            branch_dataset.children = get_html_children(branch_dataset, remaining_depth - 1)
            datasets.append(branch_dataset)
    return datasets

def url_is_blacklisted(url : str, base_url : str | None = None) -> bool:
    """Checks if the url is blacklisted. Blacklist is very basic."""
    ## keep relatives
    if (url.startswith('./') or (url != "/" and url.startswith('/'))):
        return False
    
    ## filter out some basic stuff
    if (url == "/" 
        or url.startswith('#') 
        or url.startswith('mailto:') 
        or url.startswith('javascript:')
        or url.startswith('tel:')
        or (base_url is not None and not url.startswith(base_url))):
        return True

    ## passed
    return False

def url_get_absolute(input : str, current_url : str | None) -> str:
        """Tries to merge the relative input url with the current url prefix to get the absolute url. 
        If no current url is provided, the input is returned."""
        result = ""
        
        if (current_url is None): return input
        ## remove the query string from the url
        if ('?' in current_url): current_url = current_url.split('?')[0]
        ## remove any anchors from the url
        if ('#' in input): input = input.split('#')[0]

        ## try to cut as much from the current url as possible
        if (input.startswith('/')): 
            current_split = [x for x in current_url.split('/') if x != '']
            input_split = [x for x in input.split('/') if x != '']
            for i in range(len(current_split)):
                if (len(input_split) == 0): return current_url
                if (current_split[i] == input_split[0]): input_split.pop(0)
            input = '/'.join(input_split)

            if (current_url.endswith('/')): result = current_url + input
            else: result = current_url + '/' + input
        ## just append the relative to the current
        elif (input.startswith('./')): result = current_url + input.strip('./')
        ## otherwise legit URL given
        else: result = input
        ## always add the trailing slash if not present
        if (not result.endswith('/') and not "." in result.split('/')[-1]): ## only except if file is given
            result += '/'

        return result

In [42]:
parse_html_webpage("https://www.mff.cuni.cz/")

MFF home page, #children: 170

### FileSystem - HTML

In [None]:
from bs4 import BeautifulSoup

# Depth that parse_html_file passes on when expanding the parsed file.
DEPTH = 2

def parse_html_file(path : str) -> Node:
    """Parse the html file at ``path`` into a tree and return the root node.

    The root is named "MFF home page" and receives ``path`` as its alternative
    name; its children are produced by ``get_html_file_children``.
    """
    with open(path, 'r') as file:
        soup = BeautifulSoup(file, 'html.parser')
        dataset : Node = Node("MFF home page", None, path)
        # Use the file-based helper, whose (soup, node, depth) signature matches
        # these arguments; get_html_children takes only (node, depth) and
        # would raise a TypeError here.
        dataset.add_children(get_html_file_children(soup, dataset, DEPTH))
        return dataset
    
def get_html_file_children(soup : BeautifulSoup, predecessor_node : Node, remaining_depth : int) -> List['Node']:
