In [1]:
import os
import json
import torch

from llama_index.core import Settings
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    KnowledgeGraphIndex,
    Settings,
    load_index_from_storage,
    StorageContext,
)

from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding

from llama_index.graph_stores.neo4j import Neo4jGraphStore
# from llama_index.graph_stores import NebulaGraphStore
# from llama_index.vector_stores.faiss import FaissVectorStore

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import HTMLNodeParser

from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.prompts.prompt_type import PromptType

from llama_index.llms.openai_like import OpenAILike
from llama_index.llms.openai import OpenAI

from document_reader import HTMLDocsReader

from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding


import logging
import sys

from IPython.display import Markdown, display

# Send log records to the notebook's stdout at INFO level.
# basicConfig() installs a stdout StreamHandler on the root logger when the root
# logger has no handlers yet. Attaching a second stdout handler on top of it makes
# every record print twice (once formatted, once raw), so none is added here.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output


  from .autonotebook import tqdm as notebook_tqdm



In [None]:
import pandas as pd
import os
import json
# Load the dev split (data/dev.json) into a DataFrame; only its 'url' column is used here.
with open('data/dev.json', 'r') as f_dev:
    dev_data = pd.read_json(f_dev)
# Distinct URLs referenced by the dev split; they select which documents get exported.
filter_urls = dev_data['url'].unique().tolist()
# Load documents.json once, as plain Python objects: process_documents() calls
# .get() on every entry, so a DataFrame copy of the same file is not needed.
with open("data/documents.json", "r") as file:
    documents_data = json.load(file)
def process_documents(documents_data, urls, output_dir="data/docs_dev"):
    """Write each document whose URL is in ``urls`` to ``<output_dir>/<title>.txt``.

    Every output file holds the document title on its first line, followed by
    the entries of the document's ``contents`` list, one per line.

    Args:
        documents_data: Iterable of dict-like records; the keys ``"title"``,
            ``"url"`` and ``"contents"`` are read with ``.get`` and default to
            ``""``, ``""`` and ``[]`` respectively.
        urls: Collection of URLs to export; records with any other URL are skipped.
        output_dir: Directory that receives the text files. It is created when
            the first matching document is written.
    """
    # A set makes the per-document membership test O(1) instead of scanning a list.
    wanted_urls = set(urls)
    # Path separators inside a title would otherwise be interpreted as sub-directories.
    separators = [sep for sep in (os.sep, os.altsep) if sep]

    for document in documents_data:
        title = document.get("title", "")
        url = document.get("url", "")
        contents = document.get("contents", [])
        if url not in wanted_urls:
            continue

        # exist_ok avoids the separate exists() check and is a no-op after the first call.
        os.makedirs(output_dir, exist_ok=True)

        safe_title = title
        for sep in separators:
            safe_title = safe_title.replace(sep, "_")
        file_name = os.path.join(output_dir, f"{safe_title}.txt")

        # Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.
        with open(file_name, 'w', encoding="utf-8") as f:
            f.write(title + '\n')
            f.write("\n".join(contents) + '\n')
        print(f"Document '{title}' saved successfully.")
# Export the documents whose URL appears in the dev split as individual text files.
process_documents(documents_data, filter_urls)

## LLM and Embedding Model

In [None]:
# Selects which LLM backend the next cell builds: 'openai', 'mistralai' or 'OpenAILike'.
llm_package = "openai"

In [None]:
# Build the LLM selected by `llm_package`.
# API keys are read from environment variables so that secrets are never saved
# inside the notebook; an unset variable yields "" exactly like the old placeholder.
if llm_package == 'openai':  # https://docs.llamaindex.ai/en/stable/examples/llm/openai/
    llmmodel_name = ""  # TODO: set to the name of the OpenAI model to use
    key = os.environ.get("OPENAI_API_KEY", "")
    llm = OpenAI(model=llmmodel_name, api_key=key)

elif llm_package == 'mistralai':  # https://docs.llamaindex.ai/en/stable/examples/llm/mistralai/
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    llm = MistralAI(api_key=api_key, model="open-mixtral-8x7b")

elif llm_package == 'OpenAILike':  # https://docs.llamaindex.ai/en/stable/api_reference/llms/openai_like/   # https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
    inference_server_url = "http://0.0.0.0:8000/v1"
    llm = OpenAILike(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        api_key="token-abc123",
        api_base=inference_server_url,
        max_tokens=2048,
        temperature=0,
    )

else:
    # Fail here rather than with a NameError on `llm` in a later cell.
    raise ValueError(
        f"Unsupported llm_package {llm_package!r}; "
        "expected 'openai', 'mistralai' or 'OpenAILike'."
    )

# The embedding model is the same regardless of which LLM backend was chosen.
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5") # https://docs.llamaindex.ai/en/stable/examples/embeddings/huggingface/

In [3]:
# Register the chosen LLM and embedding model on llama_index's global Settings object.
Settings.llm = llm
Settings.embed_model = embedding_llm

# Directory scanned by SimpleDirectoryReader in the next cell.
# NOTE(review): the export cell writes its files to "data/docs_dev"; confirm that
# "data/docs1/" is really the intended input directory.
FILEPATH = "data/docs1/"

In [None]:

# Read the files under FILEPATH (recursively), skipping the excluded patterns.
# Files ending in .txt are parsed with the project's HTMLDocsReader, configured with tags=["h1"].
loader = SimpleDirectoryReader(
    input_dir=FILEPATH, 
    exclude=["*.rst", "*.ipynb", "*.py", "*.bat", "*.png", "*.jpg", "*.jpeg", "*.csv", "*.html", "*.js", "*.css", "*.pdf", "*.json"],
    file_extractor={".txt": HTMLDocsReader(tags=["h1"])},
    recursive=True
)
# NOTE(review): `nodes` holds whatever load_data() returns — presumably Document
# objects rather than parsed nodes; confirm before relying on the name.
nodes = loader.load_data()

In [6]:
# Sanity check: how many items the loader returned.
len(nodes)

15

## Neo4j graph store

In [7]:
# Neo4j connection settings.
# Each value can be overridden with an environment variable. The password has no
# in-notebook default so the credential is not stored in the saved file: it is taken
# from NEO4J_PASSWORD, or prompted for interactively when that variable is unset/empty.
from getpass import getpass

username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
url = os.environ.get("NEO4J_URI", "bolt://localhost:7688")
database = os.environ.get("NEO4J_DATABASE", "neo4j")

# Graph store handle used for the storage context and for raw Cypher queries below.
graph_store = Neo4jGraphStore(
    username=username,
    password=password,
    url=url,
    database=database
)

INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT constraint_1ed05907 FOR (e:Entity) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: '\n                CREATE CONSTRAINT IF NOT EXISTS FOR (n:Entity) REQUIRE n.id IS UNIQUE;\n                '
Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT constraint_1ed05907 FOR (e:Entity) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: '\n                CREATE CONSTRAINT IF NOT EXISTS FOR (n:Entity) REQUIRE n.id IS UNIQUE;\n           

In [15]:
############################################
# Knowledge-Graph Triplet Extraction Prompt
############################################

# Separator line that fences the few-shot examples off from the instructions above
# them and from the real input below them.
_KG_SECTION_SEPARATOR = "---------------------\n"

# Few-shot examples shared by DEFAULT_KG_TRIPLET_EXTRACT_TMPL_1 and
# DEFAULT_KG_TRIPLET_EXTRACT_TMPL_2 (previously duplicated verbatim in both).
_KG_FEW_SHOT_EXAMPLES = (
    "Example:\n"
    "Text: <p>Apply by the overseas route if your acquired gender has been legally accepted in an 'approved country or territory' and you have documents to prove it.</p>\n"
    "Triplets:\n(acquired gender, accepted in, approved country or territory)\n"
    "Text: <p>You must be 18 or over.</p>\n"
    "Triplets:\n(you, must be, 18 or over)\n"
    "Text: <tr>Overseas route | Form T453 | Leaflet T454</tr>\n"
    "Triplets:\n(apply, using, overseas route)\n"
    "Text: <p>If you’re applying using the overseas route, you must prove that your gender has been legally recognised in an 'approved country or territory'. Send original or certified copies of the following (if you have them):</p>\n"
    "Triplets:\n"
    "(gender, recognised in, approved country or territory)\n"
    "(send, copies of, documents)\n"
    "Text: <p>Apply by the standard route if all the following are true:</p>\n"
    "Triplets:\n(apply, by, standard route)\n"
    "Text: <li>you’re 18 or over</li>\n"
    "Triplets:\n(you, must be, 18 or over)\n"
    "Text: <p>You’ll get an 'interim certificate' if you or your spouse do not want to remain married, or if your spouse does not fill in a statutory declaration. You can use the interim certificate as grounds to end the marriage.</p>\n"
    "Triplets:\n"
    "(you, get, interim certificate)\n"
    "(spouse, fill in, statutory declaration)\n"
    "(use, interim certificate, end marriage)\n"
    "Text: <p>You and your spouse must fill in a statutory declaration saying you both agree to stay married.</p>\n"
    "Triplets:\n"
    "(you, spouse, fill in statutory declaration)\n"
    "(agree, stay, married)\n"
    "Text: <p>You can stay married if you apply for a Gender Recognition Certificate.</p>\n"
    "Triplets:\n(apply for, Gender Recognition Certificate, stay married)\n"
)

# Slot for the chunk being processed; the model's answer continues after "Triplets:".
_KG_QUERY_SUFFIX = (
    "Text: {text}\n"
    "Triplets:\n"
)

# Variant 1: "expert" persona, no cap on the number of triplets
# (the template has no {max_knowledge_triplets} placeholder, only {text}).
DEFAULT_KG_TRIPLET_EXTRACT_TMPL_1 = (
    "You are a Knowledge Graph creation expert. Some text is provided below. Given the text, extract all the relevant knowledge graph triplets in the form of (subject, predicate, object) that contain information relevant to answering questions. Avoid stopwords.\n"
    + _KG_SECTION_SEPARATOR
    + _KG_FEW_SHOT_EXAMPLES
    + _KG_SECTION_SEPARATOR
    + _KG_QUERY_SUFFIX
)

# Variant 2: same examples, with the triplet count capped through the
# {max_knowledge_triplets} placeholder. The separator before the real input is
# included so the chunk to process is fenced off from the examples.
DEFAULT_KG_TRIPLET_EXTRACT_TMPL_2 = (
    "Some text is provided below. Given the text, extract up to "
    "{max_knowledge_triplets} "
    "knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n"
    + _KG_SECTION_SEPARATOR
    + _KG_FEW_SHOT_EXAMPLES
    + _KG_SECTION_SEPARATOR
    + _KG_QUERY_SUFFIX
)

# Generic triplet-extraction prompt with two small examples.
# Placeholders: {max_knowledge_triplets} (upper bound on extracted triplets) and
# {text} (the chunk to process).
# "Example:" and the first example sentence end with "\n" so that "Example:",
# "Text: ..." and "Triplets:" render on separate lines instead of running together.
DEFAULT_KG_TRIPLET_EXTRACT_TMPL = (
    "Some text is provided below. Given the text, extract up to "
    "{max_knowledge_triplets} "
    "knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n"
    "---------------------\n"
    "Example:\n"
    "Text: Alice is Bob's mother.\n"
    "Triplets:\n(Alice, is mother of, Bob)\n"
    "Text: Philz is a coffee shop founded in Berkeley in 1982.\n"
    "Triplets:\n"
    "(Philz, is, coffee shop)\n"
    "(Philz, founded in, Berkeley)\n"
    "(Philz, founded in, 1982)\n"
    "---------------------\n"
    "Text: {text}\n"
    "Triplets:\n"
)

In [None]:
# Wrap the raw template string in a PromptTemplate tagged as a knowledge-triplet
# extraction prompt; it is passed to KnowledgeGraphIndex when the index is built.
KG_TRIPLET_EXTRACT_PROMPT = PromptTemplate(
    DEFAULT_KG_TRIPLET_EXTRACT_TMPL,    # prompt template
    prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT,
)

max_triplets_per_chunk = 25     # maximum number of triplets to extract per chunk

# NOTE(review): the directory name says "kg_prompt_1", but the prompt above is built
# from DEFAULT_KG_TRIPLET_EXTRACT_TMPL, not DEFAULT_KG_TRIPLET_EXTRACT_TMPL_1 —
# confirm which template this persisted index is meant to correspond to.
PERSIST_DIR = "./storage/storage_graph_kg_prompt_1"   # directory to store the index

In [16]:
# Storage context backed by the Neo4j graph store; used as-is when building from scratch.
graph_storage_context = StorageContext.from_defaults(graph_store=graph_store)

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: rebuild the storage context from
    # PERSIST_DIR and load the index by the id it was saved under.
    graph_storage_context = StorageContext.from_defaults(
        graph_store=graph_store,
        persist_dir=PERSIST_DIR,
    )
    kg_index = load_index_from_storage(graph_storage_context, index_id="kg_index")
else:
    # Nothing persisted yet: build the index from the loaded nodes using the custom
    # extraction prompt, then save it under a fixed id so the branch above can find it.
    kg_index = KnowledgeGraphIndex(
        nodes,
        storage_context=graph_storage_context,
        max_triplets_per_chunk=max_triplets_per_chunk,
        show_progress=True,
        kg_triple_extract_template=KG_TRIPLET_EXTRACT_PROMPT,
        include_embeddings=True,
    )
    kg_index.set_index_id("kg_index")
    kg_index.storage_context.persist(persist_dir=PERSIST_DIR)

Processing nodes:   0%|          | 0/15 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:   7%|▋         | 1/15 [00:05<01:18,  5.62s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  13%|█▎        | 2/15 [00:07<00:47,  3.64s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  20%|██        | 3/15 [00:10<00:35,  2.98s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  27%|██▋       | 4/15 [00:15<00:42,  3.85s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  33%|███▎      | 5/15 [00:17<00:32,  3.29s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  40%|████      | 6/15 [00:18<00:23,  2.61s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  47%|████▋     | 7/15 [00:19<00:16,  2.06s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  53%|█████▎    | 8/15 [00:25<00:21,  3.11s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  60%|██████    | 9/15 [00:27<00:17,  2.85s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  67%|██████▋   | 10/15 [00:29<00:13,  2.68s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  73%|███████▎  | 11/15 [00:30<00:08,  2.14s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  80%|████████  | 12/15 [00:32<00:06,  2.12s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  87%|████████▋ | 13/15 [00:35<00:04,  2.19s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes:  93%|█████████▎| 14/15 [00:36<00:01,  1.96s/it]

INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Processing nodes: 100%|██████████| 15/15 [00:38<00:00,  2.55s/it]


In [17]:
# Count every directed relationship (m)-[e]->(n) currently stored in the graph.
graph_store.query(
    """
MATCH (m)-[e]->(n) RETURN COUNT(*)
"""
)

[{'COUNT(*)': 125}]

In [11]:
# graph_store.query(
#     """
# MATCH (n)
# DETACH DELETE n
# """
# )

[]