In [6]:
import os
from typing import List, Tuple
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_neo4j.vectorstores.neo4j_vector import Neo4jVector
from langchain_neo4j.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_neo4j import Neo4jGraph
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch, RunnableParallel
from langchain_core.output_parsers.string import StrOutputParser


In [7]:
# Read all credentials and connection settings from environment variables so
# that no secret is hard-coded in the notebook. `os.environ.get` returns None
# for a variable that is not set, so a missing setting does not fail here.
# NOTE(review): OPENAI_API_KEY is not referenced by any other cell visible in
# this notebook — confirm whether the OpenAI clients pick the key up themselves.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# The Neo4j connection URI is read from the NEO4J_AURA variable.
neo4j_uri = os.environ.get("NEO4J_AURA")
neo4j_username = os.environ.get("NEO4J_USERNAME")
neo4j_password = os.environ.get("NEO4J_PASSWORD")
neo4j_database = os.environ.get("NEO4J_DATABASE")

# Show which database name was picked up (prints None when the variable is unset).
print(neo4j_database)

neo4j


In [8]:
# Graph connection used for the Cypher queries issued by the structured retriever.
graph = Neo4jGraph(
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password,
    database=neo4j_database
)
# Chat model used for entity extraction and for the final answer chain.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
# Embedding model handed to the Neo4j vector index; constructed with its defaults.
embeddings = OpenAIEmbeddings()

In [9]:
# Print the graph schema to see which labels and properties are available.
# `get_schema` is read as an attribute here, not called.
print(graph.get_schema)

Node properties:
Resource {uri: STRING, ns0__msg: STRING, ns0__contain: STRING, ns0__pid: STRING, ns0__pname: STRING, ns0__time: STRING, embedding: LIST, ns0__user.name: STRING, rdfs__label: STRING}
_GraphConfig {_classLabel: STRING, _handleRDFTypes: INTEGER, _subClassOfRel: STRING, _handleMultival: INTEGER, _objectPropertyLabel: STRING, _rangeRel: STRING, _domainRel: STRING, _keepLangTag: BOOLEAN, _keepCustomDataTypes: BOOLEAN, _classNamePropName: STRING, _handleVocabUris: INTEGER, _applyNeo4jNaming: BOOLEAN, _relNamePropName: STRING, _dataTypePropertyLabel: STRING, _subPropertyOfRel: STRING}
_NsPrefDef {ns0: STRING, rdfs: STRING, ns1: STRING}
ns0__Event {uri: STRING, ns0__msg: STRING, ns0__contain: STRING, ns0__pid: STRING, ns0__pname: STRING, ns0__time: STRING, embedding: LIST}
ns0__Host {uri: STRING, ns0__host: STRING}
ns0__User {uri: STRING, ns0__user.name: STRING, embedding: LIST}
ns0__Source {uri: STRING, rdfs__label: STRING, embedding: LIST}
ns0__SourceType {uri: STRING, rdfs__

In [10]:
# Build a hybrid (vector + keyword) index over the existing `Resource` nodes.
# The listed text properties are the ones embedded; the resulting vector is
# stored on each node under `embedding`.
#
# The connection settings are passed explicitly and mirror the ones used for
# `graph`, so both clients talk to the same Neo4j instance and database instead
# of the vector store silently falling back to environment defaults.
#
# NOTE: this call reaches the OpenAI embeddings API; it fails with a
# RateLimitError (HTTP 429, `insufficient_quota`) when the account has no quota.
vector_index = Neo4jVector.from_existing_graph(
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password,
    database=neo4j_database,
    embedding=embeddings,
    node_label="Resource",
    text_node_properties=[
        "rdfs__label",
        "ns0__pname",
        "ns0__msg",
        "ns0__user.name",
    ],
    embedding_node_property="embedding",
    search_type="hybrid",
)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [14]:
class Entities(BaseModel):
    """Identifying information about resources."""

    # Structured-output schema for entity extraction: the model fills `names`
    # with every entity it finds in the question.
    # The field description is part of the schema given to the model, so it has
    # to name the same entity kinds as the extraction system prompt (user, host
    # and process names) rather than unrelated categories.
    names: List[str] = Field(
        ...,
        description="All the user names, host names, or process names that "
        "appear in the text",
    )
    
# Prompt for entity extraction: the system message states which kinds of names
# to pull out, and the human message carries the user's question.
entity_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting user names, host names, or process names from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

# Entity-extraction chain: the prompt is piped into the LLM, whose output is
# constrained to the `Entities` schema.
entity_chain = entity_prompt | llm.with_structured_output(Entities)  # used for entity extraction

In [57]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is stripped of Lucene special characters and split on
    whitespace. Each remaining word gets a fuzzy-match suffix (``~2``, i.e. up
    to two changed characters) and the words are combined with the AND
    operator. Useful for mapping entities from user questions to database
    values while tolerating some misspellings.

    Args:
        input: Free text to turn into a full-text query.

    Returns:
        The query string, e.g. ``"john~2 AND doe~2"``. An empty string is
        returned when the input contains no searchable words (empty input or
        only special characters) instead of raising ``IndexError``.
    """
    # str.split() with no arguments never yields empty strings, so no extra
    # filtering is needed.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)


def structured_retriever(question: str) -> str:
    """
    Collect the graph neighborhood of the resources mentioned in the question.

    Entities are extracted from the question with ``entity_chain``. For each
    entity, the graph is searched for ``ns0__User`` nodes whose user name
    contains the entity (case-insensitive), and up to 20 events linked to such
    a user via ``ns0__hasUser`` are rendered as one sentence each.

    Args:
        question: The user's natural-language question.

    Returns:
        One line per matching event, newline-separated across all entities.
        An empty string when nothing matches.
    """
    entities = entity_chain.invoke({"question": question})

    print(f"\n--- Extracted Entities: {entities.names} ---")

    lines = []
    for entity in entities.names:
        # A blank entity would make `CONTAINS` match every user; skip it.
        if not entity.strip():
            continue
        response = graph.query(
            # coalesce() keeps a row usable when an event property is missing:
            # in Cypher, concatenating a string with null yields null.
            """
            MATCH (user:ns0__User)
            WHERE toLower(user.`ns0__user.name`) CONTAINS toLower($entity)
            MATCH (user)<-[:ns0__hasUser]-(event:ns0__Event)
            RETURN "User '" + user.`ns0__user.name` + "' performed action '" + coalesce(event.ns0__msg, '') +
                   "' with process '" + coalesce(event.ns0__pname, '') + "' (PID: " + coalesce(event.ns0__pid, '') + ") at time " + coalesce(event.ns0__time, '') AS output
            LIMIT 20
            """,
            {"entity": entity},
        )
        # Guard against a null output so the final join cannot fail.
        lines.extend(row["output"] for row in response if row["output"] is not None)
    # Join once at the end so results of different entities are also separated
    # by a newline (they used to be glued together without one).
    return "\n".join(lines)

In [40]:
def hybrid_retriever(question: str) -> str:
    """
    Combine structured and unstructured retrieval for a question.

    Structured data comes from ``structured_retriever`` (Cypher over the
    graph); unstructured data is the page content of the documents returned by
    ``vector_index.similarity_search``. Both are printed for inspection and
    then merged into a single context string.

    Args:
        question: The (standalone) search question.

    Returns:
        A context string with a "Structured data:" section followed by an
        "Unstructured data:" section in which every retrieved document sits on
        its own line prefixed with ``#Resource``.
    """
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)

    print("\n--- Structured Data Retrieved ---")
    print(structured_data)
    print("---------------------------------\n")

    unstructured_data = [
        el.page_content for el in vector_index.similarity_search(question)
    ]

    print("\n--- Unstructured Data Retrieved ---")
    print(unstructured_data)
    print("-----------------------------------\n")

    # Prefix every document (str.join only inserts the marker *between* items,
    # leaving the first one unmarked) and keep one document per line.
    unstructured_block = "\n".join(f"#Resource {doc}" for doc in unstructured_data)

    # Assemble without source-code indentation leaking into the prompt text.
    return (
        "Structured data:\n"
        f"{structured_data}\n"
        "Unstructured data:\n"
        f"{unstructured_block}\n"
    )

In [41]:
# Prompt that rewrites a follow-up question into a standalone question using
# the chat history. Placeholders: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  
# Compiled prompt used by the chat-history branch of the search-query chain.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [28]:
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

In [29]:
# Produces the search query for retrieval from an input dict that has a
# "question" key and, optionally, a "chat_history" key.
_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        # Condition: true only when "chat_history" is present and non-empty.
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        # Replace the raw (human, ai) tuples with message objects before
        # they are rendered into the condense prompt.
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # NOTE(review): this builds a separate ChatOpenAI client without a
        # model name, unlike the shared `llm` (model_name="gpt-4o") — confirm
        # this is intentional.
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [30]:
# Final answer prompt: the model is told to answer using only the retrieved
# context. Placeholders: {context} and {question}.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [31]:
# End-to-end question-answering chain. The input is a dict with a "question"
# key (and optionally "chat_history"). Two values are computed from it:
#   - "context":  the search query is derived and fed to the hybrid retriever;
#   - "question": the question text itself.
# Both fill the answer prompt, which is sent to the LLM and parsed to a string.
final_chain = (
    RunnableParallel(
        {
            "context": _search_query | hybrid_retriever,
            # Extract the question string. A bare RunnablePassthrough() would
            # forward the whole input dict, so the prompt would render
            # "Question: {'question': '...'}" instead of the question text.
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [32]:
def query_agent(question: str):
    """
    Answer a question with the hybrid (graph + vector index) chain.

    The question is wrapped in the input dict expected by ``final_chain`` and
    the chain's result is returned unchanged.
    """
    chain_input = {"question": question}
    return final_chain.invoke(chain_input)

In [56]:
# Example question about a single user; passed to query_agent in the next cell.
query1 = "Who is daryl, and what does he do in system?"

In [58]:
# Run the full chain on the example question. The retrievers print their
# intermediate data first; the final answer is printed last.
response1 = query_agent(query1)
print("Answer 1:\n")
print(response1)

Search query: Who is daryl, and what does he do in system?

--- Extracted Entities: ['daryl'] ---

--- Structured Data Retrieved ---
User 'daryl' performed action 'pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=daryl rhost=127.0.0.1  user=daryl' with process 'auth' (PID: ) at time 2021-03-05T07:28:23
User 'daryl' performed action 'pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=daryl rhost=127.0.0.1  user=daryl' with process 'auth' (PID: ) at time 2021-03-05T07:26:39
User 'daryl' performed action 'pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=daryl rhost=127.0.0.1  user=daryl' with process 'auth' (PID: ) at time 2021-03-05T07:27:44
User 'daryl' performed action 'pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=daryl rhost=127.0.0.1  user=daryl' with process 'auth' (PID: ) at time 2021-03-05T07:29:28
User 'daryl' performed acti