From bb2d0d156b9d53fe93c7c1d16951dd3890d23db4 Mon Sep 17 00:00:00 2001 From: Lycea <20510874+Lycea@users.noreply.github.com> Date: Thu, 24 Feb 2022 17:04:36 +0100 Subject: [PATCH 1/2] Create html_helper.py --- html_helper.py | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 html_helper.py diff --git a/html_helper.py b/html_helper.py new file mode 100644 index 0000000..79ceb3a --- /dev/null +++ b/html_helper.py @@ -0,0 +1,132 @@ +from html.parser import HTMLParser +import requests + +import queue + + + +class Item(): + pass + + + +class Craftable(): + def __init__(self): + self.components ={} + self.name = "" + + self.tags=[] + + def add_component(self,name,amount): + self.components[name] = amount + +class ItemHandler(): + def __init__(self): + self.recipies={} + self.craftables ={} + self.items ={} + pass + + + + +def get_page(page): + return requests.get(page,verify=False).text + +class Collector(HTMLParser): + def __init__(self): + super().__init__() + self.tree = {} + self._level = 0 + + self._in_body=False + + self._tag_stack = [] + + self._tag_stack.append({"name":"body","attrs":[],"childs":[],"data":[]}) + + def handle_starttag(self, tag, attrs): + if tag == "body": + self._in_body=True + + if self._in_body: + print(self._level*" "+"start:",tag,attrs) + + + self._tag_stack.append({"name":tag,"atrrs":attrs,"childs":[],"data":[]}) + self._level += 1 + + + + def handle_data(self,data): + if self._in_body: + print(self._level*" "+"data",data) + + def handle_endtag(self, tag): + if self._in_body: + print(self._level*" "+"end:",tag) + + child=self._tag_stack.pop() + self._tag_stack[-1]["childs"].append(child) + + #self.tag_stack[-1]["childs"].append(self.tag_stack.pop()) + #print(self._tag_stack[-1]) + + if tag == "body": + self.tree = self._tag_stack + self._in_body = False + + self._level -= 1 + + +class SearchHelper(): + def __init__(self,tree): + self.tree = tree + + self.__tag_list={} #has all the tags in it (a ,h1,p,body ... xyz) with the tag ids + self.__single_tag =[] # all of the tags broken down + + + + @staticmethod + def find_all_tags(tree,searched_tag): + link_list = [] + for tag in tree: + if tag["name"] == searched_tag: + print(tag) + link_list.append(tag) + + if len(tag["childs"]) > 0: + links = SearchHelper.find_all_tags(tag["childs"],searched_tag) + if len(links) > 0: + link_list.extend(links) + return link_list + + + def flatten(self,tree=None): + if tree: + for tag in self.tree: + if len(tag["childs"]) >0: + child_ids = self.flatten(tree=tag["childs"]) + + #add copy of self to list + #add own id to the tag list + #change childs of copy to ids or nothing + + + +page = "https://idleon.miraheze.org/wiki/Smithing" +sample_html = "

my text is here ... ahhhh

" + +#print(get_page(page)) +main_parser = Collector() +main_parser.feed(get_page(page)) + + +#main_parser.feed("

my text is here ... ahhhh

") +print(main_parser.tree) + +links = SearchHelper.find_all_tags(main_parser.tree,"table") + +for link in links: + print(link) From f94d53d2ee7c3344f656a3de8dd239a647863ef7 Mon Sep 17 00:00:00 2001 From: Lycea <20510874+Lycea@users.noreply.github.com> Date: Fri, 25 Feb 2022 00:26:14 +0100 Subject: [PATCH 2/2] Added flattening of tree and easy dynamic search - added flatting Flatting is a seperate function as of now which converts a file tree into a flat list + topic lookup list for now. NOTE: Root elements are not marked yet so no check for base elements - added basic dynamic search added a verry simple dynamic querrying of results for simple syntaxes for now like table/a/tr --- html_helper.py | 158 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 132 insertions(+), 26 deletions(-) diff --git a/html_helper.py b/html_helper.py index 79ceb3a..0202bed 100644 --- a/html_helper.py +++ b/html_helper.py @@ -51,16 +51,13 @@ def handle_starttag(self, tag, attrs): if self._in_body: print(self._level*" "+"start:",tag,attrs) - - self._tag_stack.append({"name":tag,"atrrs":attrs,"childs":[],"data":[]}) self._level += 1 - - def handle_data(self,data): if self._in_body: - print(self._level*" "+"data",data) + #print(self._level*" "+"data",data) + self._tag_stack[-1]["data"].append(data) def handle_endtag(self, tag): if self._in_body: @@ -79,6 +76,7 @@ def handle_endtag(self, tag): self._level -= 1 + class SearchHelper(): def __init__(self,tree): self.tree = tree @@ -86,14 +84,13 @@ def __init__(self,tree): self.__tag_list={} #has all the tags in it (a ,h1,p,body ... xyz) with the tag ids self.__single_tag =[] # all of the tags broken down - - @staticmethod def find_all_tags(tree,searched_tag): link_list = [] + for tag in tree: if tag["name"] == searched_tag: - print(tag) + #print(tag) link_list.append(tag) if len(tag["childs"]) > 0: @@ -103,30 +100,139 @@ def find_all_tags(tree,searched_tag): return link_list - def flatten(self,tree=None): - if tree: - for tag in self.tree: - if len(tag["childs"]) >0: - child_ids = self.flatten(tree=tag["childs"]) - #add copy of self to list - #add own id to the tag list - #change childs of copy to ids or nothing + def __process_tag(self,tag): + child_ids = [] + own_id = len(self.__single_tag) + #append the full tag to list + self.__single_tag.append( tag.copy()) + if not tag["name"] in self.__tag_list: self.__tag_list[tag["name"]] = [] + self.__tag_list[tag["name"]].append(own_id) -page = "https://idleon.miraheze.org/wiki/Smithing" -sample_html = "

my text is here ... ahhhh

" -#print(get_page(page)) -main_parser = Collector() -main_parser.feed(get_page(page)) + if len(tag["childs"]) > 0: + child_ids = self.flatten(tree=tag["childs"]) + self.__single_tag[own_id]["childs"] = child_ids + # add copy of self to list + # add own id to the tag list + # change childs of copy to ids or nothing -#main_parser.feed("

my text is here ... ahhhh

") -print(main_parser.tree) + return own_id -links = SearchHelper.find_all_tags(main_parser.tree,"table") + def flatten(self,tree=None,parent=-1): + child_ids =[] + if not tree: + for tag in self.tree: + child_ids.append(self.__process_tag(tag)) + else: + for tag in tree: + child_ids.append(self.__process_tag(tag)) + + return child_ids -for link in links: - print(link) + def _get_child_ids(self,ids): + child_elements = [] + + for id in ids: + child_elements.extend(self.__single_tag[id]["childs"]) + return set(child_elements) + + def search_dyn(self,syntax): + fit_elements = [] + + fitting_element_ids =[] + subseqent_elements = syntax.split("/") + + #iterate all subelements + for element in subseqent_elements: + #check if searched tag exists at all + if element in self.__tag_list: + all_element_ids = self.__tag_list[element] + + #diff the own list and the set list and return the ones from own + #list which are the same + #(only do if not the first element + if len(fitting_element_ids) != 0: + print(" all tag ids :",all_element_ids) + fitting_element_ids = self._get_child_ids(fitting_element_ids) + print(" checking ids:",fitting_element_ids) + fitting_element_ids = fitting_element_ids.intersection(all_element_ids) + print(" Matching:",fitting_element_ids) + else: + fitting_element_ids = self.__tag_list[element] + + #if no fits , just stop + if len(fitting_element_ids) == 0: + print("No results found!") + break + + else: + print("No result found for querry",syntax) + break + + print("Ids",fitting_element_ids) + + #element ids to full tags + for fit_id in fitting_element_ids: + fit_elements.append(self.__single_tag[fit_id]) + + return fit_elements + + + + + + +if __name__ == "__main__": + + page = "https://idleon.miraheze.org/wiki/Smithing" + sample_html = """ + +

my text is here ... ahhhh

+

+ Hello there this is a text !!! +

+ +

test other +

+

something

+ Something different +

+ + """ + + #print(get_page(page)) + main_parser = Collector() + #main_parser.feed(get_page(page)) + main_parser.feed(sample_html) + #print(main_parser.tree) + + links = SearchHelper.find_all_tags(main_parser.tree,"table") + + searcher = SearchHelper(main_parser.tree) + searcher.flatten() + + search_strings =[ + "p", + "body", + "body/p", + "body/h1", + "body/p/h1", + "body/h1&color=ffffff" + ] + + for querry_string in search_strings: + print("\nStart searching for:",querry_string) + results = searcher.search_dyn(querry_string) + print("Search results:") + + for result in results: + print(result) + + #txt="table/caption=Anvil Table/" + for link in links: + print("") + print(link)