In [1]:
from ipytree import Tree
import json
from pathlib import Path
from pdf_annotation import App

In [3]:
# Two candidate input files (the names suggest a small and a large document);
# only `lg` is actually loaded below.
sm = "../../pdfs/paper.json"
lg = "../../pdfs/2020jagm.json"
# Pin the encoding so decoding does not depend on the platform's locale default.
with open(lg, "r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
def clean(node):
    """Normalize a parsed-document node in place, then recurse into its children.

    Steps applied to ``node`` (a dict):

    * Guarantee a ``"children"`` list exists.
    * If the node has a ``"label"`` key it becomes a section
      (``node["type"] = "section"``). When one of its children has type
      ``"label"``, that child's ``value``/``coords``/``page`` are copied onto
      the node and every ``"label"``-typed child is removed; otherwise the
      node's ``"value"`` falls back to ``node["label"]``.
    * If the node has a ``"content"`` list, it is prepended to the children.
      The ``"content"`` key itself is left in place.

    The function returns nothing; ``node`` and its descendants are mutated.
    It is not idempotent: calling it again on the same node prepends
    ``"content"`` once more.

    Raises:
        KeyError: if a ``"label"``-typed child lacks ``"value"``, ``"coords"``
            or ``"page"``.
    """
    # Default the child list *before* touching it, so a labelled node that
    # carries no "children" key no longer raises KeyError in the loop below.
    children = node.setdefault("children", [])

    if "label" in node:
        node["type"] = "section"
        for child in children:
            if child.get("type") == "label":
                # The first label child supplies the section's heading data.
                node["value"] = child["value"]
                node["coords"] = child["coords"]
                node["page"] = child["page"]
                # Drop every label child, not just the one that matched.
                node["children"] = [
                    c for c in children if c.get("type") != "label"
                ]
                break
        else:
            # No label child found: use the node's own label as its value.
            node["value"] = node["label"]

    if "content" in node:
        node["children"] = node["content"] + node["children"]

    for child in node["children"]:
        clean(child)
    

In [5]:

# Normalize the loaded document in place. Run this once per load: clean() is
# not idempotent (a second call prepends each node's "content" to its
# children again).
clean(data)

In [6]:
# Placeholder so the name exists before the node classes are defined; it is
# rebound to the real type -> class mapping after those classes.
NODE_TYPES = {}
# Maximum number of characters of a node's value shown in its tree label
# before the label is cut off and "..." is appended.
MAX_LEN = 20

In [7]:
from ipytree import Node


class MyNode(Node):
    """Base tree node that builds its child nodes from a nested document dict.

    Adds recursive expand/collapse helpers on top of ``ipytree.Node``.
    """

    def __init__(self, data=None):
        """Create the node and, when ``data`` is truthy, populate it via ``load``."""
        super().__init__()
        if data:
            self.load(data)

    def load(self, data):
        """Append one child node per entry of ``data["children"]``.

        Entries whose ``"type"`` is exactly ``"label"`` or ``"table"`` are
        skipped. Every other entry is wrapped in the class registered for its
        type in ``NODE_TYPES``.

        Raises:
            KeyError: if an entry has no ``"type"`` or its type is not
                registered in ``NODE_TYPES``.
        """
        for child in data["children"]:
            # Compare against the two type names explicitly. The previous
            # `in "labeltable"` was a substring test and would also have
            # silently skipped types such as "lab" or "able".
            if child["type"] not in ("label", "table"):
                self.add_node(NODE_TYPES[child["type"]](child))

    def collapse(self):
        """Close this node only."""
        self.opened = False

    def expand(self):
        """Open this node only."""
        self.opened = True

    def collapse_all(self):
        """Close this node and, recursively, every descendant."""
        self.collapse()
        for n in self.nodes:
            n.collapse_all()

    def collapse_to(self, level):
        """Open the tree down to ``level`` levels below this node, close the rest.

        ``level == 0`` closes this node and all descendants; otherwise this
        node is opened and each child is processed with ``level - 1``.
        """
        if level == 0:
            self.collapse_all()
        else:
            self.expand()
            for n in self.nodes:
                n.collapse_to(level-1)


class SectionNode(MyNode):
    """Tree node for a section; labelled with its value truncated to MAX_LEN."""

    def __init__(self, data=None):
        """Build the node (and its children) from ``data``.

        ``data`` may be omitted, in which case the node has no children and an
        empty value/name.
        """
        # The signature allows data=None; fall back to an empty mapping so the
        # .get() below does not raise AttributeError on None.
        data = data or {}
        super().__init__(data)
        self.value = data.get("value","")
        # Only append the ellipsis when something was actually cut off.
        ellipsis = "..." if len(self.value)>MAX_LEN else ""
        self.name = self.value[:MAX_LEN] + ellipsis

class TextNode(MyNode):
    """Tree node for a text entry; labelled with its value truncated to MAX_LEN."""

    def __init__(self, data=None):
        """Build the node (and its children) from ``data``.

        ``data`` may be omitted, in which case the node has no children and an
        empty value/name.
        """
        # The signature allows data=None; fall back to an empty mapping so the
        # .get() below does not raise AttributeError on None.
        data = data or {}
        super().__init__(data)
        self.value = data.get("value","")
        # Only append the ellipsis when something was actually cut off.
        ellipsis = "..." if len(self.value)>MAX_LEN else ""
        self.name = self.value[:MAX_LEN] + ellipsis
        self.icon = "align-left"

class ImageNode(MyNode):
    """Tree node for an image entry; always labelled "img" with an image icon."""

    def __init__(self, data=None):
        """Build the node (and its children) from ``data``.

        ``data`` may be omitted, in which case the node has no children and an
        empty value.
        """
        # The signature allows data=None; fall back to an empty mapping so the
        # .get() below does not raise AttributeError on None.
        data = data or {}
        super().__init__(data)
        self.value = data.get("value","")
        self.name = "img"
        self.icon = "image"

# Registry mapping a document entry's "type" string to the node class that
# renders it; MyNode.load looks entries up here when building children.
NODE_TYPES = dict(
    section=SectionNode,
    text=TextNode,
    image=ImageNode,
)
    

In [8]:
# Build a striped tree widget with the whole document as its single root node.
tree = Tree(stripes=True)
my_node = SectionNode(data)
tree.add_node(my_node)
# Open the root and its direct children; close everything deeper.
my_node.collapse_to(2)
# Last expression of the cell, so the widget is rendered as the cell output.
tree

Tree(nodes=(SectionNode(name='../pdfs/ARMY/2020jag...', nodes=(SectionNode(name='Joint Air-to-Ground ...', nod…

In [10]:
# Inspect the nodes currently selected in the tree widget. The result depends
# on what was clicked interactively, so it is not reproducible from code alone.
tree.selected_nodes

(SectionNode(name='Mission', nodes=(TextNode(icon='align-left', name='Army and Marine Corp...', opened=False),), opened=False, selected=True),)

In [9]:
# Reset the tree to the depth used when it was built: root and its direct
# children open, deeper levels closed.
# NOTE(review): this cell's execution count (9) is lower than the preceding
# cell's (10), i.e. the cells were run out of order.
my_node.collapse_to(2)

In [None]:
# Join the values of every "text" child of one hard-coded node into a single
# space-separated string: child 22 of the last child of the document's first
# top-level node.
text = " ".join(
    part["value"]
    for part in data["children"][0]["children"][-1]["children"][22]["children"]
    if part["type"] == "text"
)

In [None]:
import spacy

# Load the large English pipeline and parse the passage already assembled in
# `text`, rather than repeating the same hard-coded index chain a second time.
nlp = spacy.load("en_core_web_lg")
doc = nlp(text)
# Print each named entity found in the passage.
for ent in doc.ents:
    print(ent)

In [None]:
# Print every noun chunk of the parsed passage. The loop variable is named
# `ent`, but the items here are noun chunks rather than entities.
for ent in doc.noun_chunks:
    print(ent)

In [None]:
# All three comprehensions walk the same slice: the children of the document's
# first top-level node, skipping its first two entries.
# NOTE(review): clean() removes children of type "label" from any node that
# has a "label" key. If this runs after clean(data), the label lookups below
# may come back empty and the `[0]` in `sections` would raise IndexError.
# Confirm against the data.

# Per section, one string per entry: that entry's "text" values joined by spaces.
recipies = [
    [
        " ".join(part["value"] for part in entry["children"] if part["type"] == "text")
        for entry in sec["children"]
    ]
    for sec in data["children"][0]["children"][2:]
]

# Per section, one string per entry: that entry's "label" values joined by
# spaces, with surrounding whitespace stripped.
names = [
    [
        " ".join(part["value"] for part in entry["children"] if part["type"] == "label").strip()
        for entry in sec["children"]
    ]
    for sec in data["children"][0]["children"][2:]
]

# Per section, the value of its first direct child of type "label".
sections = [
    [item["value"] for item in sec["children"] if item["type"] == "label"][0]
    for sec in data["children"][0]["children"][2:]
]

In [None]:
# Display the per-section label values collected above.
sections

In [None]:
# Prefix every name with its section's label, giving "<section>><name>".
# This rewrites `names` in place, so running the cell twice prefixes twice.
for i,s in enumerate(names):
    names[i] = [f"{sections[i]}>{n}" for n in s]

In [None]:
# Flatten the per-section lists in a single pass. sum(lists, []) builds a new,
# longer list for every section, which is quadratic in the total length.
# Both names are rebound to a different shape here (nested -> flat, and for
# `recipies` strings -> parsed docs), so re-running this cell would operate on
# the already-flattened values.
recipies = [nlp(recipe) for group in recipies for recipe in group]
names = [name for group in names for name in group]