From bb2d0d156b9d53fe93c7c1d16951dd3890d23db4 Mon Sep 17 00:00:00 2001
From: Lycea <20510874+Lycea@users.noreply.github.com>
Date: Thu, 24 Feb 2022 17:04:36 +0100
Subject: [PATCH 1/2] Create html_helper.py

---
 html_helper.py | 132 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 html_helper.py
diff --git a/html_helper.py b/html_helper.py
new file mode 100644
index 0000000..79ceb3a
--- /dev/null
+++ b/html_helper.py
@@ -0,0 +1,132 @@
+from html.parser import HTMLParser
+import requests
+
+import queue
+
+
+
+class Item():
+    pass
+
+
+
+class Craftable():
+    def __init__(self):
+        self.components ={}
+        self.name = ""
+
+        self.tags=[]
+
+    def add_component(self,name,amount):
+        self.components[name] = amount
+
+class ItemHandler():
+    def __init__(self):
+        self.recipies={}
+        self.craftables ={}
+        self.items ={}
+        pass
+
+
+
+
+def get_page(page):
+    return requests.get(page,verify=False).text
+
+class Collector(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.tree = {}
+        self._level = 0
+
+        self._in_body=False
+
+        self._tag_stack = []
+
+        self._tag_stack.append({"name":"body","attrs":[],"childs":[],"data":[]})
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "body":
+            self._in_body=True
+
+        if self._in_body:
+            print(self._level*" "+"start:",tag,attrs)
+
+
+            self._tag_stack.append({"name":tag,"atrrs":attrs,"childs":[],"data":[]})
+        self._level += 1
+
+
+
+    def handle_data(self,data):
+        if self._in_body:
+            print(self._level*" "+"data",data)
+
+    def handle_endtag(self, tag):
+        if self._in_body:
+            print(self._level*" "+"end:",tag)
+
+            child=self._tag_stack.pop()
+            self._tag_stack[-1]["childs"].append(child)
+
+            #self.tag_stack[-1]["childs"].append(self.tag_stack.pop())
+            #print(self._tag_stack[-1])
+
+        if tag == "body":
+            self.tree = self._tag_stack
+            self._in_body = False
+
+        self._level -= 1
+
+
+class SearchHelper():
+    def __init__(self,tree):
+        self.tree = tree
+
+        self.__tag_list={} #has all the tags in it  (a ,h1,p,body ... xyz) with the tag ids
+        self.__single_tag =[] # all of the tags broken down
+
+
+
+    @staticmethod
+    def find_all_tags(tree,searched_tag):
+        link_list = []
+        for tag in tree:
+            if tag["name"] == searched_tag:
+                print(tag)
+                link_list.append(tag)
+
+            if len(tag["childs"]) > 0:
+                links = SearchHelper.find_all_tags(tag["childs"],searched_tag)
+                if len(links) > 0:
+                    link_list.extend(links)
+        return link_list
+
+
+    def flatten(self,tree=None):
+        if tree:
+            for tag in self.tree:
+                if len(tag["childs"]) >0:
+                    child_ids = self.flatten(tree=tag["childs"])
+
+                #add copy of self to list
+                #add own id to the tag list
+                #change childs of copy to ids or nothing
+
+
+
+page = "https://idleon.miraheze.org/wiki/Smithing"
+sample_html = "<htm><body><h1 color='ffffff'>my text <b>is here   <a href='abc.com'/></b> ... ahhhh</h1></body></html>"
+
+#print(get_page(page))
+main_parser = Collector()
+main_parser.feed(get_page(page))
+
+
+#main_parser.feed("<htm><body><h1 color='ffffff'>my text <b>is here   <a href='abc.com'/></b> ... ahhhh</h1></body></html>")
+print(main_parser.tree)
+
+links = SearchHelper.find_all_tags(main_parser.tree,"table")
+
+for link in links:
+    print(link)

From f94d53d2ee7c3344f656a3de8dd239a647863ef7 Mon Sep 17 00:00:00 2001
From: Lycea <20510874+Lycea@users.noreply.github.com>
Date: Fri, 25 Feb 2022 00:26:14 +0100
Subject: [PATCH 2/2] Added flattening of tree and easy dynamic search

- added flattening
  Flattening is a separate function as of now,
  which converts the parsed tag tree into a flat list + a tag-name
  lookup list for now.
  NOTE: Root elements are not marked yet, so there is no check for base
  elements

- added basic dynamic search
  added very simple dynamic querying
  of results, for now only for simple syntaxes like
  table/a/tr
---
 html_helper.py | 158 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 132 insertions(+), 26 deletions(-)

diff --git a/html_helper.py b/html_helper.py
index 79ceb3a..0202bed 100644
--- a/html_helper.py
+++ b/html_helper.py
@@ -51,16 +51,13 @@ def handle_starttag(self, tag, attrs):
 
         if self._in_body:
             print(self._level*" "+"start:",tag,attrs)
-
-
             self._tag_stack.append({"name":tag,"atrrs":attrs,"childs":[],"data":[]})
         self._level += 1
 
-
-
     def handle_data(self,data):
         if self._in_body:
-            print(self._level*" "+"data",data)
+            #print(self._level*" "+"data",data)
+            self._tag_stack[-1]["data"].append(data)
 
     def handle_endtag(self, tag):
         if self._in_body:
@@ -79,6 +76,7 @@ def handle_endtag(self, tag):
         self._level -= 1
 
 
+
 class SearchHelper():
     def __init__(self,tree):
         self.tree = tree
@@ -86,14 +84,13 @@ def __init__(self,tree):
         self.__tag_list={} #has all the tags in it  (a ,h1,p,body ... xyz) with the tag ids
         self.__single_tag =[] # all of the tags broken down
 
-
-
     @staticmethod
     def find_all_tags(tree,searched_tag):
         link_list = []
+
         for tag in tree:
             if tag["name"] == searched_tag:
-                print(tag)
+                #print(tag)
                 link_list.append(tag)
 
             if len(tag["childs"]) > 0:
@@ -103,30 +100,139 @@ def find_all_tags(tree,searched_tag):
         return link_list
 
 
-    def flatten(self,tree=None):
-        if tree:
-            for tag in self.tree:
-                if len(tag["childs"]) >0:
-                    child_ids = self.flatten(tree=tag["childs"])
 
-                #add copy of self to list
-                #add own id to the tag list
-                #change childs of copy to ids or nothing
+    def __process_tag(self,tag):
+        child_ids = []
+        own_id = len(self.__single_tag)
 
+        #append the full tag to list
+        self.__single_tag.append( tag.copy())
 
+        if not tag["name"] in self.__tag_list: self.__tag_list[tag["name"]] = []
+        self.__tag_list[tag["name"]].append(own_id)
 
-page = "https://idleon.miraheze.org/wiki/Smithing"
-sample_html = "<htm><body><h1 color='ffffff'>my text <b>is here   <a href='abc.com'/></b> ... ahhhh</h1></body></html>"
 
-#print(get_page(page))
-main_parser = Collector()
-main_parser.feed(get_page(page))
+        if len(tag["childs"]) > 0:
+            child_ids = self.flatten(tree=tag["childs"])
 
+        self.__single_tag[own_id]["childs"] = child_ids
+        # add copy of self to list
+        # add own id to the tag list
+        # change childs of copy to ids or nothing
 
-#main_parser.feed("<htm><body><h1 color='ffffff'>my text <b>is here   <a href='abc.com'/></b> ... ahhhh</h1></body></html>")
-print(main_parser.tree)
+        return own_id
 
-links = SearchHelper.find_all_tags(main_parser.tree,"table")
+    def flatten(self,tree=None,parent=-1):
+        child_ids =[]
+        if not tree:
+            for tag in self.tree:
+                child_ids.append(self.__process_tag(tag))
+        else:
+            for tag in tree:
+                child_ids.append(self.__process_tag(tag))
+
+        return child_ids
 
-for link in links:
-    print(link)
+    def _get_child_ids(self,ids):
+        child_elements = []
+
+        for id in ids:
+            child_elements.extend(self.__single_tag[id]["childs"])
+        return set(child_elements)
+
+    def search_dyn(self,syntax):
+        fit_elements = []
+
+        fitting_element_ids =[]
+        subseqent_elements = syntax.split("/")
+
+        #iterate all subelements
+        for element in subseqent_elements:
+            #check if searched tag exists at all
+            if element in self.__tag_list:
+                all_element_ids = self.__tag_list[element]
+
+                #diff the own list and the set list and return the ones from own
+                #list which are the same
+                #(only do if not the first element
+                if len(fitting_element_ids) != 0:
+                    print(" all tag ids :",all_element_ids)
+                    fitting_element_ids = self._get_child_ids(fitting_element_ids)
+                    print(" checking ids:",fitting_element_ids)
+                    fitting_element_ids = fitting_element_ids.intersection(all_element_ids)
+                    print(" Matching:",fitting_element_ids)
+                else:
+                    fitting_element_ids = self.__tag_list[element]
+
+                #if no fits , just stop
+                if len(fitting_element_ids) == 0:
+                    print("No results found!")
+                    break
+
+            else:
+                print("No result found for querry",syntax)
+                break
+
+        print("Ids",fitting_element_ids)
+
+        #element ids to full tags
+        for fit_id in fitting_element_ids:
+            fit_elements.append(self.__single_tag[fit_id])
+
+        return fit_elements
+
+
+
+
+
+
+if __name__ == "__main__":
+
+    page = "https://idleon.miraheze.org/wiki/Smithing"
+    sample_html = """<htm>
+                        <body>
+                            <h1 color='ffffff'>my text <b>is here   <a href='abc.com'/></b> ... ahhhh</h1>
+                            <p>
+                                Hello there this is a <b> text </b> !!!
+                            </p>
+                            
+                            <h1>test other </>
+                            <p>
+                                <h1>something</h1>
+                                Something different
+                            </p>
+                        </body>
+                    </html>"""
+
+    #print(get_page(page))
+    main_parser = Collector()
+    #main_parser.feed(get_page(page))
+    main_parser.feed(sample_html)
+    #print(main_parser.tree)
+
+    links = SearchHelper.find_all_tags(main_parser.tree,"table")
+
+    searcher = SearchHelper(main_parser.tree)
+    searcher.flatten()
+
+    search_strings =[
+        "p",
+        "body",
+        "body/p",
+        "body/h1",
+        "body/p/h1",
+        "body/h1&color=ffffff"
+    ]
+
+    for querry_string in search_strings:
+        print("\nStart searching for:",querry_string)
+        results = searcher.search_dyn(querry_string)
+        print("Search results:")
+
+        for result in results:
+            print(result)
+
+    #txt="table/caption=Anvil Table/"
+    for link in links:
+        print("")
+        print(link)