In [1]:
#log cell code 1
import os
from typing import List, Tuple
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_neo4j.vectorstores.neo4j_vector import Neo4jVector
from langchain_neo4j.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_neo4j import Neo4jGraph
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch, RunnableParallel
from langchain_core.output_parsers.string import StrOutputParser
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
#log cell code 2
# Connection settings are read from the environment so that no credential is
# hard-coded in the notebook. Each value is None when its variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
neo4j_uri = os.environ.get("NEO4J_AURA")
neo4j_username = os.environ.get("NEO4J_AURA_USERNAME")
neo4j_password = os.environ.get("NEO4J_AURA_PASSWORD")
neo4j_database = os.environ.get("NEO4J_AURA_DATABASE")


def describe_neo4j_config(uri, username, database, password) -> str:
    """Return a one-line summary of the Neo4j settings that is safe to display.

    The password itself is never included; only whether one is present is
    reported, so the cell output can be saved or shared without leaking it.
    """
    password_state = "<set>" if password else "<missing>"
    return (
        f"database={database} uri={uri} "
        f"username={username} password={password_state}"
    )


# Sanity-check the configuration without echoing the secret into the output.
print(describe_neo4j_config(neo4j_uri, neo4j_username, neo4j_database, neo4j_password))

neo4j neo4j+s://79145028.databases.neo4j.io neo4j <REDACTED> neo4j


In [3]:
#log cell code 3
# Graph handle built from the environment-derived connection settings; used
# below for schema inspection and Cypher queries.
graph = Neo4jGraph(
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password,
    database=neo4j_database
)
# Chat model shared by entity extraction and final answer generation.
# temperature=0 is presumably chosen for repeatable output — confirm.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

# Names of the existing Neo4j indexes used for hybrid (vector + keyword) search.
VECTOR_INDEX_NAME = "vector"
KEYWORD_INDEX_NAME = "keyword"

# Hugging Face model identifier used to embed queries.
# NOTE(review): presumably must match the model that produced the stored
# embeddings — confirm against the ingestion pipeline.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=model_name)

In [4]:
# Print the graph schema (node labels and their properties) as a quick check
# of what the database contains before querying it.
print(graph.get_schema)

Node properties:
Document {fileName: STRING, fileSize: INTEGER, fileType: STRING, status: STRING, fileSource: STRING, createdAt: LOCAL_DATE_TIME, updatedAt: LOCAL_DATE_TIME, processingTime: FLOAT, errorMessage: STRING, nodeCount: INTEGER, relationshipCount: INTEGER, model: STRING, is_cancelled: BOOLEAN, total_chunks: INTEGER, processed_chunk: INTEGER, chunkNodeCount: INTEGER, chunkRelCount: INTEGER, entityNodeCount: INTEGER, entityEntityRelCount: INTEGER, communityNodeCount: INTEGER, communityRelCount: INTEGER}
Chunk {fileName: STRING, embedding: LIST, id: STRING, text: STRING, position: INTEGER, length: INTEGER, content_offset: INTEGER}
Service {embedding: LIST, id: STRING}
Server {embedding: LIST, id: STRING}
User {embedding: LIST, id: STRING}
Host {embedding: LIST, id: STRING}
Module {embedding: LIST, id: STRING}
Person {embedding: LIST, id: STRING}
System {embedding: LIST, id: STRING}
Device {embedding: LIST, id: STRING}
Process {embedding: LIST, id: STRING}
Software {embedding: LI

In [5]:
#log cell code 4
# Vector store over the existing indexes. search_type="hybrid" pairs the vector
# index with the keyword (full-text) index named below.
vector_index = Neo4jVector.from_existing_index(
    embedding=embeddings,
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password,
    # Target the same database as `graph`; otherwise the vector store uses its
    # own default database setting and the two handles could diverge.
    database=neo4j_database,
    index_name=VECTOR_INDEX_NAME,
    keyword_index_name=KEYWORD_INDEX_NAME,
    search_type="hybrid"
)

  return forward_call(*args, **kwargs)


In [6]:
#log cell code 5
# Structured-output schema: this class is handed to llm.with_structured_output()
# when the entity-extraction chain is built.
# NOTE(review): the class docstring and the Field description are presumably
# surfaced to the model as part of the schema, so treat them as prompt text,
# not as ordinary documentation — confirm before rewording.
class LogEntities(BaseModel):
    """Identifies information about resources in the log."""

    # CHANGED: Description changed to match log entities
    # Required field (the `...` default): one string per entity mentioned.
    entity_values: List[str] = Field(
        ...,
        description="All entities such as User, Server, Service, Host, "
        "System, Software, Device, Process, Machine, Session, or Document file names that appear in the text.",
    )
    
# Prompt asking the model to extract entity names/IDs from the user's question.
entity_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at extracting entities from text related to system logs. "
            "Extract the name or ID of entities such as User, Server, Service, "
            "Host, System, Software, Device, Process, Machine, Session, and the filename of the Document.",
        ),
        (
            "human",
            # Adjacent string literals are concatenated verbatim, so the first
            # piece must end with a space to avoid rendering "fromthe".
            "Use the given format to extract information from "
            "the following input: {question}",
        ),
    ]
)

# Entity-extraction chain: the prompt feeds the LLM, whose reply is requested
# in the LogEntities structure.
entity_chain = entity_prompt | llm.with_structured_output(LogEntities)

In [7]:
#log cell code 6
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is stripped of Lucene special characters and split on
    whitespace. Each remaining word gets a fuzzy-match suffix (``~2``, i.e. up
    to two changed characters) and the words are combined with ``AND``. This
    is useful for mapping entities from user questions to database values
    while tolerating some misspellings.

    Args:
        input: Raw entity text, e.g. a name extracted from a question.

    Returns:
        The query string, e.g. ``"mail~2 AND server~2"``. An empty string is
        returned when no searchable word remains (empty input, or input made
        up only of Lucene special characters), so callers can skip the lookup.
    """
    # str.split() with no argument never yields empty strings, so no extra
    # filtering is needed; an empty word list simply joins to "".
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)


def structured_retriever(question: str) -> str:
    """
    Collect the graph neighborhood of the resources mentioned in the question.

    Entities are extracted from the question with ``entity_chain``. Each one is
    turned into a fuzzy full-text query and looked up in the ``log_entities``
    index; for every match, the chunks linked to it through ``HAS_ENTITY`` (and
    their document, when one is linked through ``PART_OF``) are rendered as one
    descriptive line containing the first 250 characters of the chunk text.

    Args:
        question: The user's natural-language question.

    Returns:
        All descriptive lines joined by newlines, or an empty string when no
        entity was extracted or nothing matched.
    """
    entities = entity_chain.invoke({"question": question})
    # Treat a missing extraction result as "no entities" instead of crashing.
    entity_values = entities.entity_values if entities else []
    print(f"\n--- Extracted Entities: {entity_values} ---")

    lines = []
    for entity_value in entity_values:
        query = generate_full_text_query(entity_value)
        if not query:
            continue

        response = graph.query(
            """
            CALL db.index.fulltext.queryNodes('log_entities', $query, {limit: 10})
            YIELD node AS entity
            
            MATCH (chunk:Chunk)-[:HAS_ENTITY]->(entity)
            
            OPTIONAL MATCH (chunk)-[:PART_OF]->(doc:Document)
            
            WITH entity, chunk, doc,
                 CASE WHEN 'Document' IN labels(entity) 
                      THEN entity.fileName 
                      ELSE entity.id 
                 END AS entity_name
                 
            RETURN "Entity '" + entity_name + "' found in document '" + coalesce(doc.fileName, 'N/A') +
                   "'. The context of the text is: '" + left(chunk.text, 250) + "...'"
                   AS output
            LIMIT 10
            """,
            {"query": query},
        )
        # Skip rows without usable text: a None output would break str.join.
        lines.extend(row["output"] for row in (response or []) if row.get("output"))

    # Join once at the end so results for different entities are separated by
    # a newline rather than being glued together.
    return "\n".join(lines)


In [8]:
#log cell code 7
def hybrid_retriever(question: str, k: int = 10):
    """Combine structured and unstructured data retrieval for a question.

    Args:
        question: The (standalone) question to retrieve context for.
        k: Number of results requested from the vector index's similarity
            search. Defaults to 10.

    Returns:
        A single context string with a "Structured data" section (graph
        lookups) followed by an "Unstructured data" section in which every
        retrieved text starts on its own line with a ``#Resource`` header.
    """
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)

    print("\n--- Structured Data Retrieved ---")
    print(structured_data)
    print("---------------------------------\n")

    unstructured_data = [
        el.page_content for el in vector_index.similarity_search(question, k=k)
    ]

    print("\n--- Unstructured Data Retrieved ---")
    print(unstructured_data)
    print("-----------------------------------\n")

    # Prefix every resource and separate them with newlines. Using the header
    # as the join separator left the first resource unlabeled and attached each
    # header to the end of the preceding text.
    resources = "\n".join(f"#Resource {text}" for text in unstructured_data)

    final_data = f"""Structured data:
    {structured_data}
    Unstructured data:
    {resources}
    """
    return final_data

In [9]:
#log cell code 8
# Prompt text used to rewrite a follow-up question as a standalone question.
# It has two placeholders: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  
# Prompt object built from the text above.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [10]:
#log cell code 9
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Turn (human, ai) text pairs into an alternating list of message objects.

    Each pair contributes a HumanMessage followed by an AIMessage, in the same
    order as the pairs appear in ``chat_history``.
    """
    messages = []
    for user_text, assistant_text in chat_history:
        messages.extend(
            (HumanMessage(content=user_text), AIMessage(content=assistant_text))
        )
    return messages

In [11]:
#log cell code 10
# Produces the search query text from the chain input (a dict with "question"
# and, optionally, "chat_history").
_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        # Condition: true only when "chat_history" is present and non-empty.
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # NOTE(review): this is a separate ChatOpenAI instance with no explicit
        # model name, unlike the gpt-4o `llm` defined earlier — confirm that
        # using the library default model here is intentional.
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [12]:
#log cell code 11
# Answer-generation prompt text. {context} receives the retrieved data and
# {question} the user's question; the model is told to rely on the context only.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
# Chat prompt built from the text above.
prompt = ChatPromptTemplate.from_template(template)

In [13]:
#log cell code 12
# End-to-end pipeline: build the search query, retrieve hybrid context, fill the
# answer prompt, call the LLM and return its reply as plain text.
# Expected input: a dict with "question" and, optionally, "chat_history".
final_chain = (
    RunnableParallel(
        {
            "context": _search_query | hybrid_retriever,
            # Forward only the question text. A bare RunnablePassthrough() would
            # pass the entire input dict, which the prompt would then render
            # as "{'question': ...}" in its Question line.
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
#log cell code 13
def query_agent(question: str):
    """
    Answer a question with the hybrid (graph + vector index) pipeline.

    The question is wrapped in the input dict expected by ``final_chain`` and
    the chain's result is returned unchanged.
    """
    chain_input = {"question": question}
    answer = final_chain.invoke(chain_input)
    return answer

In [15]:
#log cell code 14
# Sample question used to exercise the pipeline end to end.
query1 = "who is sunni??"

In [16]:
#log cell code 15
# Run the sample question through the pipeline. The retrievers print their
# intermediate results before the final answer is shown.
response1 = query_agent(query1)
print("Answer:\n")
print(response1)

Search query: who is sunni??

--- Extracted Entities: ['sunni'] ---

--- Structured Data Retrieved ---
Entity 'sunni' found in document 'mail.cup.com.auth.txt'. The context of the text is: 'ix(cron:session): session opened for user root by (uid=0) Feb 29 07:17:01 mail-0 CRON[2457]: pam_unix(cron:session): session closed for user root Feb 29 07:22:00 mail-0 auth: pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=d...'
Entity 'sunni' found in document 'mail.cup.com.auth.txt'. The context of the text is: ':session): session closed for user root Feb 29 18:39:01 mail-0 CRON[21158]: pam_unix(cron:session): session opened for user root by (uid=0) Feb 29 18:39:01 mail-0 CRON[21158]: pam_unix(cron:session): session closed for user root Feb 29 19:08:29 mail-...'
Entity 'sunni' found in document 'mail.cup.com.auth.txt'. The context of the text is: '=0 tty=dovecot ruser=sunni rhost=127.0.0.1  user=sunni Feb 29 19:09:01 mail-0 CRON[21608]: pam_unix(cron:session): session ope

  return forward_call(*args, **kwargs)



--- Unstructured Data Retrieved ---
[':session): session closed for user root Feb 29 18:39:01 mail-0 CRON[21158]: pam_unix(cron:session): session opened for user root by (uid=0) Feb 29 18:39:01 mail-0 CRON[21158]: pam_unix(cron:session): session closed for user root Feb 29 19:08:29 mail-0 auth: pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=sunni rhost=127.0.0.1  user=sunni Feb 29 19:08:37 mail-0 auth: pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=sunni rhost=127.0.', 'ix(cron:session): session opened for user root by (uid=0) Feb 29 07:17:01 mail-0 CRON[2457]: pam_unix(cron:session): session closed for user root Feb 29 07:22:00 mail-0 auth: pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=sunni rhost=127.0.0.1  user=sunni Feb 29 07:22:09 mail-0 auth: pam_unix(dovecot:auth): authentication failure; logname= uid=0 euid=0 tty=dovecot ruser=sunni rhost=127.0.0.1  user=