### Try TreeIndex

In [74]:
from llama_index.core import TreeIndex

In [None]:
index=TreeIndex.from_documents

### Load data

In [82]:
import markdown
with open('../Data/Investor Reports/2012-02.md', 'r', encoding='utf-8') as file:
    content = file.read()

# html = markdown.markdown(content)

In [83]:
import re

def parse_markdown_to_tree(markdown):
    # Split the markdown into lines
    lines = markdown.strip().split('\n')
    
    tree = {}
    current_section = None
    current_subsection = None

    for line in lines:
        # Check for section headers (first level, "=")
        if re.match(r'^=+$', line.strip()):
            current_section = previous_line
            tree[current_section] = {}
        # Check for subsection headers (second level, "-")
        elif re.match(r'^-+$', line.strip()):
            if current_subsection != None:
                tree[current_section][current_subsection].pop()
            current_subsection = previous_line
            tree[current_section][current_subsection] = []
        else:
            # Normal content lines
            if current_subsection:
                tree[current_section][current_subsection].append(line)
            previous_line = line.strip()

    return tree

# Parse the provided markdown content
tree_structure = parse_markdown_to_tree(content)


In [84]:
a=tree_structure['Everpix February 2012 Report']
a

{'High Level': ['',
  '* 2 new hires on the Everpix team',
  '* Good engineering progress on all fronts',
  '* Surprisingly as many photos imported in February than January (15 millions)',
  ''],
 'Hiring Update': ['',
  'We finally found a couple great candidates:',
  '',
  '* Sameer Sundresh as Cloud Infrastructure Lead Engineer:',
  '  * http://www.linkedin.com/in/sameersundresh',
  '  * PhD in computer science, MemSQL, Rdio...',
  '  * Hired on February 13th',
  '* George Leontiev as iOS Lead Engineer:',
  '  * http://www.linkedin.com/pub/george-leontiev/43/a28/194',
  '  * Apple engineer on iWork team',
  '  * Hired on March 5th',
  '',
  'Ideally, we would only hire people who are clearly more talented and experienced than us in specific areas: that’s what we did for Jason Eberle, our web engineer, and our new hire Sameer Sundresh. But when it comes to iOS development, we already have a very strong in-house experience, so it would be really difficult, very expensive and would tak

In [85]:
def merge_and_split_tree(tree):
    def merge_nodes(nodes):
        merged_nodes = []
        temp_node = []
        for node in nodes:
            if node.startswith('  *'):
                temp_node.append(node)
            else:
                if temp_node:
                    merged_nodes.append('\n'.join(temp_node))
                    merged_nodes.append('')
                    temp_node = []
                merged_nodes.append(node)
        if temp_node:
            merged_nodes.append('\n'.join(temp_node))
            merged_nodes.append('')
        return merged_nodes

    def split_long_nodes(nodes, max_length=256):
        split_nodes = []
        for node in nodes:
            while len(node) > max_length:
                split_point = node[:max_length].rfind('. ')
                if split_point == -1:
                    split_point = max_length
                split_nodes.append(node[:split_point+1])
                node = node[split_point+1:].strip()
            split_nodes.append(node)
        return split_nodes

    for section, subsections in tree.items():
        for subsection, content in subsections.items():
            # Step 1: Merge '* ' and '  *' nodes
            merged_content = merge_nodes(content)
            # if subsection =='Infrastructure': print(merged_content)
            # Step 2: Merge nodes between empty nodes
            cleaned_content = []
            temp_content = []
            for node in merged_content:
                if node == '':
                    if temp_content:
                        cleaned_content.append(' '.join(temp_content))
                        temp_content = []
                    cleaned_content.append(node)
                else:
                    temp_content.append(node)
            if temp_content:
                cleaned_content.append(' '.join(temp_content))
            # if subsection =='Infrastructure': print(cleaned_content)
            # Step 3: Split long nodes
            final_content = split_long_nodes(cleaned_content)
            tree[section][subsection] = final_content
            # if subsection =='Infrastructure': print(final_content)

    return tree


In [86]:
# Process the tree structure with the merging and splitting rules
processed_tree = merge_and_split_tree(tree_structure)

['', '* We have fully terminated the Everpix Alpha infrastructure based on Google App Engine and deleted the corresponding 10+ millions photos.', '* We are now fully based on Amazon Web Services (AWS).', '* The estimated import capacity of 1 to 2 millions photos / day of the current infrastructure has been verified in practice with a few days in February successfully handled in this range.', '* With our new dedicated infrastructure engineer, we started designing and building the version 3 of our infrastructure to lay the foundation for the new features we need and also allow us to handle our real launch later this summer.', '']
['', '* We have fully terminated the Everpix Alpha infrastructure based on Google App Engine and deleted the corresponding 10+ millions photos. * We are now fully based on Amazon Web Services (AWS). * The estimated import capacity of 1 to 2 millions photos / day of the current infrastructure has been verified in practice with a few days in February successfully 

In [69]:
b=processed_tree['Everpix February 2012 Report']
b

{'High Level': ['',
  '* 2 new hires on the Everpix team * Good engineering progress on all fronts * Surprisingly as many photos imported in February than January (15 millions)',
  ''],
 'Hiring Update': ['',
  'We finally found a couple great candidates:',
  '',
  '* Sameer Sundresh as Cloud Infrastructure Lead Engineer:   * http://www.linkedin.com/in/sameersundresh\n  * PhD in computer science, MemSQL, Rdio...\n  * Hired on February 13th',
  '',
  '* George Leontiev as iOS Lead Engineer:   * http://www.linkedin.com/pub/george-leontiev/43/a28/194\n  * Apple engineer on iWork team\n  * Hired on March 5th',
  '',
  '',
  'Ideally, we would only hire people who are clearly more talented and experienced than us in specific areas: that’s what we did for Jason Eberle, our web engineer, and our new hire Sameer Sundresh.',
  'But when it comes to iOS development, we already have a very strong in-house experience, so it would be really difficult, very expensive and would take a long time to fi

In [66]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [74]:
a='''Ideally, we would only hire people who are clearly more talented and experienced than us in specific areas: that’s what we did for Jason Eberle, our web engineer, and our new hire Sameer Sundresh. But when it comes to iOS development, we already have a very strong in-house experience, so it would be really difficult, very expensive and would take a long time to find such people. So instead, considering the core of the iOS app has been built, and most of the work is now focused on the UX layer (and later build the iPad version), we decided to look for a good iOS engineer for these specific tasks. In this very competitive market, we were fortunate enough to get George Leontiev before he really hit the market. His addition to the team will allow us to both continue building the iOS app at a good pace and also leave enough bandwidth to work on our image analysis research.</p>
'''

# embeddings = model.encode(a)
# print(embeddings)

In [75]:
len(a)

885

### KnowledgeGraph

In [12]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_entities_relations(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    relations = []
    for ent in doc.ents:
        for token in ent.root.head.subtree:
            if token.dep_ in ["nsubj", "dobj"]:
                relations.append((ent.text, token.dep_, token.text))
    return entities, relations


In [13]:
entities, relations = extract_entities_relations(html)

In [54]:
def create_triples(relations):
    triples = []
    for ent1, dep, ent2 in relations:
        triples.append((ent1, dep, ent2))
    return triples

triples = create_triples(relations)

In [56]:
from py2neo import Graph
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Th503221."))

In [57]:
from py2neo import Node, Relationship

for subj, rel, obj in triples:
    subj_node = Node("Entity", name=subj)
    obj_node = Node("Entity", name=obj)
    relationship = Relationship(subj_node, rel, obj_node)
    graph.merge(subj_node, "Entity", "name")
    graph.merge(obj_node, "Entity", "name")
    graph.create(relationship)


In [58]:
# 查询所有节点
nodes = graph.run("MATCH (n) RETURN n")
for record in nodes:
    print(record)

# 查询特定关系
relationships = graph.run("MATCH (a)-[r:KNOWS]->(b) RETURN a, r, b")
for record in relationships:
    print(record)


Node('Entity', name='Alice')
Node('Entity', name='Bob')
Node('Entity', name='Acme Corp')
Node('Entity', name='Wonderland')
Node('Entity', name='Dreamland')
Node('Entity', name='February than January')
Node('Entity', name='photos')
Node('Entity', name='Sameer')
Node('Entity', name='Sundresh')
Node('Entity', name='George Leontiev')
Node('Entity', name='>')
Node('Entity', name='Leontiev')
Node('Entity', name='Apple')
Node('Entity', name='engineer')
Node('Entity', name='iPad')
Node('Entity', name='version')
Node('Entity', name='he')
Node('Entity', name='market')
Node('Entity', name='Everpix Alpha')
Node('Entity', name='infrastructure')
Node('Entity', name='10+ millions')
Node('Entity', name='10')
Node('Entity', name='photos.</li')
Node('Entity', name='<')
Node('Entity', name='li')
Node('Entity', name='3')
Node('Entity', name='2 months')
Node('Entity', name='website')
Node('Entity', name='which')
Node('Entity', name='month.</li')
Node('Entity', name='primary')
Node('Entity', name='goal')
No

In [59]:
entities = graph.nodes.match("Entity")
texts = []
for entity in entities:
    relationships = list(graph.match((entity,), r_type=None))
    for rel in relationships:
        relationship_text = f"{rel.start_node['name']} {rel.type} {rel.end_node['name']}"
        texts.append(relationship_text)

texts

['Alice <function Relationship.type at 0x1335b8fe0> Wonderland',
 'Alice <function Relationship.type at 0x1335b8fe0> Bob',
 'Bob <function Relationship.type at 0x1335b8fe0> Acme Corp',
 'Wonderland <function Relationship.type at 0x1335b8fe0> Dreamland',
 'February than January <function Relationship.type at 0x1335b8fe0> photos',
 'Sameer <function Relationship.type at 0x1335b8fe0> Sundresh',
 'George Leontiev <function Relationship.type at 0x1335b8fe0> market',
 'George Leontiev <function Relationship.type at 0x1335b8fe0> he',
 'George Leontiev <function Relationship.type at 0x1335b8fe0> Leontiev',
 'George Leontiev <function Relationship.type at 0x1335b8fe0> >',
 'Apple <function Relationship.type at 0x1335b8fe0> accounts',
 'Apple <function Relationship.type at 0x1335b8fe0> users',
 'Apple <function Relationship.type at 0x1335b8fe0> Apple',
 'Apple <function Relationship.type at 0x1335b8fe0> which',
 'Apple <function Relationship.type at 0x1335b8fe0> >',
 'Apple <function Relationshi

### Tree index

In [1]:
import markdown
from bs4 import BeautifulSoup

def parse_markdown(md_content):
    html = markdown.markdown(md_content)
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    current_h1 = None
    current_h2 = None
    for element in soup.find_all(['h1', 'h2', 'p']):
        if element.name == 'h1':
            current_h1 = element.get_text()
        elif element.name == 'h2':
            current_h2 = element.get_text()
        elif element.name == 'p':
            data.append({
                'h1': current_h1,
                'h2': current_h2,
                'text': element.get_text()
            })
    return data


In [9]:
from llama_index.core import Node


ImportError: cannot import name 'Node' from 'llama_index' (unknown location)

In [4]:
from llama_index import TreeIndex, Node

def create_tree_index(parsed_data):
    root = Node("Root Node")
    h1_nodes = {}

    for entry in parsed_data:
        h1_text = entry['h1']
        h2_text = entry['h2']
        text = entry['text']

        if h1_text not in h1_nodes:
            h1_nodes[h1_text] = Node(h1_text, parent=root)
        h1_node = h1_nodes[h1_text]

        h2_node = Node(h2_text, parent=h1_node)
        Node(text, parent=h2_node)

    return TreeIndex(root)


ImportError: cannot import name 'GPTTreeIndex' from 'llama_index' (unknown location)