## Sample KG

In [3]:
from neo4j import GraphDatabase
import json
from dotenv import load_dotenv,find_dotenv
import os
from tqdm import tqdm  
from userlib.user_logger import log_message
from userlib.manualcheck import *
import uuid
from time import *

from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings


## KG (not used)

In [23]:
# Load settings from a .env file into the environment; the NEO4J_* values
# are read back with os.getenv() further down in this cell.
load_dotenv()

class KnowledgeGraph:
    """
    Minimal Neo4j helper that builds a Game -> Category -> Title -> Subtitle -> Chunk
    hierarchy. All values are sent as Cypher parameters.
    """

    def __init__(self, uri, username, password):
        """
        Initialize connection to Neo4j.

        :param uri: Connection URI handed to ``GraphDatabase.driver``.
        :param username: User name used for authentication.
        :param password: Password used for authentication.
        """
        self.driver = GraphDatabase.driver(uri, auth=(username, password), database="neo4j")

    def close(self):
        """
        Close the Neo4j connection.
        """
        self.driver.close()

    def _execute(self, query, **parameters):
        """
        Run one write query in its own session and consume the result.

        Consuming the result here (instead of leaving it to the session's
        close) makes every method wait for the statement in the same way.
        """
        with self.driver.session() as session:
            session.run(query, parameters).consume()

    def create_game_and_category(self, game_name, categories):
        """
        Create the main game node and its categories.

        :param game_name: Name of the Game node to merge.
        :param categories: List of category names; each is merged and linked
            to the game with HAS_CATEGORY.
        """
        query = """
        MERGE (g:Game {name: $game_name})
        WITH g
        UNWIND $categories AS category_name
        MERGE (c:Category {name: category_name})
        MERGE (g)-[:HAS_CATEGORY]->(c)
        """
        self._execute(query, game_name=game_name, categories=categories)

    def create_title_and_subtitles(self, category_name, title_name, subtitles, page_url=None):
        """
        Create title node and its subtitles under a specific category.

        Nothing is written when no Category named ``category_name`` exists,
        because the query starts with a MATCH on it.

        :param category_name: Name of an existing Category node.
        :param title_name: Name of the Title node to merge.
        :param subtitles: List of maps with ``name`` and ``page_url`` keys.
        :param page_url: Value stored on the Title node's ``page_url`` property.
        """
        query = """
        MATCH (c:Category {name: $category_name})
        MERGE (t:Title {name: $title_name})
        SET t.page_url = $page_url
        MERGE (c)-[:HAS_TITLE]->(t)
        WITH t
        UNWIND $subtitles AS subtitle
        MERGE (st:Subtitle {name: subtitle.name})
        SET st.page_url = subtitle.page_url
        MERGE (t)-[:HAS_SUBTITLE]->(st)
        """
        self._execute(
            query,
            category_name=category_name,
            title_name=title_name,
            subtitles=subtitles,
            page_url=page_url,
        )

    def create_chunks(self, subtitle_name, chunks):
        """
        Create text and image chunks under a specific subtitle and chain them.

        Each chunk is merged on (content, type) and linked from the subtitle
        with HAS_CONTENT. Consecutive chunks, in the order given, are then
        linked with NEXT.

        The previous query dropped ``st`` at ``WITH c`` and matched
        HAS_CONTENT in the reverse direction, so it could never find a
        sibling chunk and no NEXT relationship was ever created.

        :param subtitle_name: Name of an existing Subtitle node.
        :param chunks: Ordered list of maps with ``content`` and ``type`` keys.
        """
        query = """
        MATCH (st:Subtitle {name: $subtitle_name})
        UNWIND range(0, size($chunks) - 1) AS idx
        WITH st, idx, $chunks[idx] AS chunk
        MERGE (c:Chunk {content: chunk.content, type: chunk.type})
        SET c.label = chunk.content
        MERGE (st)-[:HAS_CONTENT]->(c)
        WITH idx, c ORDER BY idx
        WITH collect(c) AS ordered
        UNWIND range(0, size(ordered) - 2) AS i
        WITH ordered[i] AS prev_chunk, ordered[i + 1] AS next_chunk
        WHERE prev_chunk <> next_chunk
        MERGE (prev_chunk)-[:NEXT]->(next_chunk)
        """
        self._execute(query, subtitle_name=subtitle_name, chunks=chunks)

    def delete_all_nodes_and_relations(self):
        """
        Delete all nodes and relationships in the database.
        """
        query = """
        MATCH (n)
        DETACH DELETE n
        """
        self._execute(query)

# Function to determine category based on file name
def determine_category(file_name):
    """
    Map a file name to its category label by keyword.

    Keywords are tried in priority order ("handbook", "down", then
    "news"/"tech"); the first one contained in ``file_name`` decides the
    category. "其它" is returned when none of them occurs.
    """
    keyword_categories = (
        ("handbook", "攻略"),
        ("down", "下载"),
        ("news", "新闻"),
        ("tech", "新闻"),
    )
    for keyword, category in keyword_categories:
        if keyword in file_name:
            return category
    return "其它"

# Function to process .html files and add nodes/relationships
def process_files_and_create_nodes(directory, kg):
    """
    Processes all .html files in the directory and creates nodes/relationships in the Knowledge Graph.

    For every file the first line supplies the title, and the second line
    supplies a subtitle when it starts with '第'. The file name is used as the
    page_url of both. Empty files are skipped.

    :param directory: Directory that is scanned (non-recursively) for .html files.
    :param kg: Object providing ``create_title_and_subtitles``.
    """
    title_prefix = "Title: "
    html_files = [name for name in os.listdir(directory) if name.endswith(".html")]

    with tqdm(total=len(html_files), desc="Processing HTML Files", unit="file") as pbar:
        for file_name in html_files:
            # Only the first two lines are used, so don't load the whole file.
            with open(os.path.join(directory, file_name), "r", encoding="utf-8") as file:
                first_line = file.readline()
                second_line = file.readline().strip()

            if not first_line:
                pbar.update(1)
                continue  # Skip empty files

            # Strip the leading marker only; a later "Title: " inside the
            # title text is part of the title and must be kept.
            title_line = first_line.strip()
            if title_line.startswith(title_prefix):
                title = (
                    title_line[len(title_prefix):]
                    .replace("-游民星空 GamerSky.com", "")
                    .split("_")[0]
                )
            else:
                title = title_line

            page_url = file_name
            subtitles = []
            if second_line.startswith("第"):
                subtitles.append({"name": second_line, "page_url": page_url})

            kg.create_title_and_subtitles(
                category_name=determine_category(file_name),
                title_name=title,
                subtitles=subtitles,
                page_url=page_url,
            )

            pbar.update(1)


# Initialize knowledge graph connection from the NEO4J_* environment variables
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
kg = KnowledgeGraph(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

# try/finally so the driver is closed even when the (long) import is
# interrupted or fails part-way through.
try:
    kg.create_game_and_category(game_name="黑神话悟空", categories=["攻略", "新闻", "下载", "其它"])

    # Process all files in the docs/rawdata directory
    process_files_and_create_nodes("docs/rawdata", kg)
finally:
    # Close the knowledge graph connection
    kg.close()

Processing HTML Files:   2%|▏         | 83/5430 [00:00<00:54, 97.46file/s] 


KeyboardInterrupt: 

## Build KG with neo4j's lib

In [4]:


# Load environment variables from a .env file; the NEO4J_* settings are read
# back with os.getenv() later in this notebook.
load_dotenv()

class KnowledgeGraph:
    """
    Helper around a Neo4j driver for building and querying the knowledge graph.

    Node labels, relationship types and property *names* are interpolated into
    the Cypher text, so they must come from trusted code. Property *values*
    are always sent as query parameters.
    """

    def __init__(self, uri, username, password, database="neo4j"):
        """
        Initialize the connection to Neo4j.

        :param uri: Connection URI handed to ``GraphDatabase.driver``.
        :param username: User name used for authentication.
        :param password: Password used for authentication.
        :param database: Database name handed to the driver.
        """
        self.driver = GraphDatabase.driver(uri, auth=(username, password), database=database)

    def close(self):
        """
        Close the connection to Neo4j.
        """
        self.driver.close()

    def add_node(self, label, properties, unique_keys=('name',)):
        """
        Add a node with a UUID. If the node already exists based on unique_keys, it will not be duplicated.

        The node is merged on ``unique_keys``. A new node gets ``uuid`` (taken
        from ``properties`` or freshly generated) plus the remaining
        properties. An existing node keeps its uuid and only has the
        remaining properties updated. ``properties`` is not modified.

        :param label: The label (type) of the node.
        :param properties: The properties of the node (dictionary).
        :param unique_keys: Property keys to use for uniqueness.
        :return: The UUID stored on the node, or None if the query failed.
        :raises ValueError: If a unique key is missing from ``properties``.
        """
        for key in unique_keys:
            if key not in properties:
                raise ValueError(f"Unique key '{key}' must be present in properties.")

        key_values = {key: properties[key] for key in unique_keys}
        # Work on local values so the caller's dict is left untouched.
        node_uuid = properties['uuid'] if 'uuid' in properties else str(uuid.uuid4())
        extra_properties = {
            k: v for k, v in properties.items() if k not in key_values and k != 'uuid'
        }

        # Values live in nested maps ($keys / $properties), so a property
        # name can never collide with another parameter name.
        merge_map = ", ".join(f"{key}: $keys.{key}" for key in unique_keys)
        query = f"""
        MERGE (n:{label} {{ {merge_map} }})
        ON CREATE SET n.uuid = $uuid, n += $properties
        ON MATCH SET n += $properties
        RETURN n.uuid as uuid
        """
        params = {"keys": key_values, "uuid": node_uuid, "properties": extra_properties}

        try:
            with self.driver.session() as session:
                # Passed as one dict: keyword expansion would let a property
                # called e.g. "parameters" be taken for run()'s own argument.
                record = session.run(query, params).single()
                return record["uuid"] if record else None
        except Exception as e:
            # Best effort by design: report and carry on with the import.
            print(f"Error adding node {label} with properties {properties}: {e}")
        return None

    def add_relationship(self, label1, prop1, relationship, label2, prop2, rel_properties=None, unique_keys1=('name',), unique_keys2=('name',)):
        """
        Link two existing nodes. Both nodes are looked up with MATCH; if either
        is missing nothing is created and a message is printed.

        :param label1: The label of the first node.
        :param prop1: The properties of the first node (dictionary, used for matching).
        :param relationship: The type of the relationship.
        :param label2: The label of the second node.
        :param prop2: The properties of the second node (dictionary, used for matching).
        :param rel_properties: The properties of the relationship (dictionary, optional).
        :param unique_keys1: Property keys to use for uniqueness for the first node.
        :param unique_keys2: Property keys to use for uniqueness for the second node.
        :raises ValueError: If a unique key is missing from ``prop1`` / ``prop2``.
        """
        for key in unique_keys1:
            if key not in prop1:
                raise ValueError(f"Unique key '{key}' must be present in prop1.")
        for key in unique_keys2:
            if key not in prop2:
                raise ValueError(f"Unique key '{key}' must be present in prop2.")

        # One nested map per endpoint keeps the two key sets (and the
        # relationship properties) from overwriting each other.
        match1 = ", ".join(f"{key}: $a.{key}" for key in unique_keys1)
        match2 = ", ".join(f"{key}: $b.{key}" for key in unique_keys2)
        params = {
            "a": {key: prop1[key] for key in unique_keys1},
            "b": {key: prop2[key] for key in unique_keys2},
        }

        if rel_properties:
            set_clause = "SET r += $rel"
            params["rel"] = dict(rel_properties)
        else:
            set_clause = ""

        # count(r) is 0 when either MATCH found nothing, which is how a
        # missing endpoint is detected below.
        query = f"""
        MATCH (a:{label1} {{ {match1} }})
        MATCH (b:{label2} {{ {match2} }})
        MERGE (a)-[r:{relationship}]->(b)
        {set_clause}
        RETURN count(r) AS merged
        """

        try:
            with self.driver.session() as session:
                record = session.run(query, params).single()
            if not record or not record["merged"]:
                print(
                    f"Relationship {relationship} not created: no {label1} node matching "
                    f"{params['a']} or no {label2} node matching {params['b']}."
                )
        except Exception as e:
            print(f"Error creating relationship {relationship} between {prop1} and {prop2}: {e}")

    def delete_all(self):
        """
        Delete all nodes and relationships in the knowledge graph.
        """
        query = "MATCH (n) DETACH DELETE n"
        with self.driver.session() as session:
            session.run(query).consume()

    def find_node(self, label, property_key, property_value):
        """
        Find a node with a specific label and property.

        :param label: The label of the node.
        :param property_key: The property key to match.
        :param property_value: The value of the property to match.
        :return: The node's UUID or None if not found.
        """
        query = f"""
        MATCH (n:{label} {{{property_key}: $value}})
        RETURN n.uuid as uuid
        LIMIT 1
        """
        with self.driver.session() as session:
            record = session.run(query, {"value": property_value}).single()
            return record["uuid"] if record else None

    def get_node_properties_by_uuid(self, label, uuid):
        """
        Find a node by its UUID and get all its properties.

        :param label: The label of the node.
        :param uuid: The UUID of the node.
        :return: A dictionary of the node's properties, or None if not found
            or if the query failed.
        """
        query = f"""
        MATCH (n:{label} {{uuid: $uuid}})
        RETURN properties(n) AS node_properties
        """
        try:
            with self.driver.session() as session:
                record = session.run(query, {"uuid": uuid}).single()
                return record["node_properties"] if record else None
        except Exception as e:
            print(f"Error retrieving properties for node with UUID {uuid}: {e}")
            return None

    def get_related_nodes(self, label, uuid, relationship, direction="OUTGOING"):
        """
        Find the nodes related to the given node by a specific relationship.

        :param label: The label of the starting node.
        :param uuid: The UUID of the starting node.
        :param relationship: The type of the relationship to follow.
        :param direction: The direction of the relationship ("OUTGOING", "INCOMING", "BOTH").
        :return: A list of related nodes (as dictionaries of properties);
            empty if the query failed.
        :raises ValueError: If ``direction`` is not one of the three options.
        """
        patterns = {
            "OUTGOING": f"-[:{relationship}]->",
            "INCOMING": f"<-[:{relationship}]-",
            "BOTH": f"-[:{relationship}]-",
        }
        rel_pattern = patterns.get(direction)
        if rel_pattern is None:
            raise ValueError("Invalid direction. Use 'OUTGOING', 'INCOMING', or 'BOTH'.")

        query = f"""
        MATCH (n:{label} {{uuid: $uuid}}){rel_pattern}(related)
        RETURN properties(related) AS related_properties
        """
        try:
            with self.driver.session() as session:
                result = session.run(query, {"uuid": uuid})
                return [record["related_properties"] for record in result]
        except Exception as e:
            print(f"Error retrieving related nodes for node with UUID {uuid} via relationship {relationship}: {e}")
            return []

def revert_url(safe_url):
    """
    Turn a file-system-safe page name back into a URL.

    Drops every "_text_with_images.html" occurrence, then maps "=" back to
    ":" and "|" back to "/". Any "=" or "|" that was part of the original
    URL is rewritten as well, so the mapping is not a strict inverse.
    """
    without_suffix = safe_url.replace("_text_with_images.html", "")
    return without_suffix.translate(str.maketrans({"=": ":", "|": "/"}))

def convert_url(url):
    """
    Build the file-system-safe page name for a URL.

    ":" becomes "=", "/" becomes "|", and "_text_with_images.html" is
    appended.
    """
    escaped = url.translate(str.maketrans({":": "=", "/": "|"}))
    return f"{escaped}_text_with_images.html"

def determine_category(file_name):
    """
    Pick the category label for a file from keywords in its name.

    :param file_name: The name of the file.
    :return: "攻略" for "handbook", "下载" for "down", "新闻" for "news" or
        "tech" (checked in that order), otherwise "其它".
    """
    if "handbook" in file_name:
        return "攻略"
    if "down" in file_name:
        return "下载"
    if any(keyword in file_name for keyword in ("news", "tech")):
        return "新闻"
    return "其它"

def _add_page_to_graph(directory, file_name, kg):
    """Create the Category, Title, Subtitle and txt nodes (and their links) for one page file."""
    page_url = file_name  # the file name doubles as the page's unique key
    category = determine_category(file_name)

    with open(os.path.join(directory, file_name), "r", encoding="utf-8") as file:
        file_content = file.read()
    lines = file_content.splitlines()
    if not lines:
        return  # Skip empty files

    # Title: first line, without the leading marker, the site suffix and
    # anything after the first "_". Only the prefix is stripped so that a
    # later "Title: " inside the text is kept.
    title_prefix = "Title: "
    title_line = lines[0].strip()
    if title_line.startswith(title_prefix):
        title = (
            title_line[len(title_prefix):]
            .replace("-游民星空 GamerSky.com", "")
            .split("_")[0]
        )
    else:
        title = title_line

    # Subtitle: second line when it starts with '第', else a default name.
    second_line = lines[1].strip() if len(lines) > 1 else ""
    subtitle_name = second_line if second_line.startswith("第") else "第一页"

    # Category -> Title
    kg.add_node("Category", {"name": category}, unique_keys=['name'])
    kg.add_node("Title", {"name": title}, unique_keys=['name'])
    kg.add_relationship(
        label1="Category",
        prop1={"name": category},
        relationship="HAS_TITLE",
        label2="Title",
        prop2={"name": title}
    )

    # Title -> Subtitle (subtitles are unique per page, not per name)
    kg.add_node("Subtitle", {"name": subtitle_name, "page_url": page_url}, unique_keys=['page_url'])
    kg.add_relationship(
        label1="Title",
        prop1={"name": title},
        relationship="HAS_SUBTITLE",
        label2="Subtitle",
        prop2={"page_url": page_url},
        rel_properties=None,
        unique_keys1=["name"],
        unique_keys2=["page_url"]
    )

    # Subtitle -> txt node holding the full content of the file
    kg.add_node("txt", {"content": file_content, "page_url": page_url}, unique_keys=['page_url'])
    kg.add_relationship(
        label1="Subtitle",
        prop1={"page_url": page_url},
        relationship="HAS_TXT",
        label2="txt",
        prop2={"page_url": page_url},
        rel_properties=None,
        unique_keys1=["page_url"],
        unique_keys2=["page_url"]
    )


def process_files_and_create_nodes(directory, kg):
    """
    Processes all .html files in the directory and creates nodes/relationships in the Knowledge Graph.

    A file that cannot be processed is reported and skipped, so a single bad
    file no longer aborts the rest of the batch. Errors outside the per-file
    work (such as listing the directory) are reported and end the run.

    :param directory: The directory containing .html files.
    :param kg: An instance of KnowledgeGraph.
    """
    try:
        html_files = [name for name in os.listdir(directory) if name.endswith(".html")]

        with tqdm(total=len(html_files), desc="Processing HTML Files", unit="file") as pbar:
            for file_name in html_files:
                try:
                    _add_page_to_graph(directory, file_name, kg)
                except Exception as e:
                    print(f"Error processing {file_name}: {e}")
                finally:
                    # Advance for processed, skipped and failed files alike.
                    pbar.update(1)
    except Exception as e:
        print(f"Error: {e}")


def process_mmimg_items_with_progress(mmimg_json_path, kg):
    """
    Attach image descriptions from mmimg.json to their Subtitle nodes.

    For each item, the URL (the part before '?') is converted to the safe page
    name with ``convert_url`` and the Subtitle node with that ``page_url`` is
    looked up. When found, content_before_image, image_description and
    content_after_image are combined into one string, an Img node with that
    content, ``src`` and the original ``url`` is merged on ``src``, and a
    HAS_IMG relationship from the Subtitle is created. Items without a URL,
    without a matching Subtitle, or with all three text fields empty are
    skipped.

    :param mmimg_json_path: Path to the mmimg.json file.
    :param kg: An instance of KnowledgeGraph.
    """
    try:
        with open(mmimg_json_path, 'r', encoding='utf-8') as json_file:
            mmimg_data = json.load(json_file)
        print(f"Successfully loaded {mmimg_json_path}")
    except Exception as e:
        print(f"Error loading {mmimg_json_path}: {e}")
        return

    error_count = 0
    for item in tqdm(mmimg_data, desc="Processing mmimg.json items"):
        url = item.get('url', '')
        if not url:
            continue

        # Same naming scheme as the page files: drop the query string first.
        safe_url = convert_url(url.split('?')[0])

        if not kg.find_node("Subtitle", "page_url", safe_url):
            error_count += 1
            print(f"No matching Subtitle node found: {safe_url} (error_count: {error_count})")
            continue

        content_before = item.get('content_before_image', '')
        image_description = item.get('image_description', '')
        content_after = item.get('content_after_image', '')

        # Test the parts, not the combined string: the fixed field labels
        # make the combined string non-empty even when every part is empty.
        if not (content_before or image_description or content_after):
            continue

        aggregated_content = f"content_before_image: {content_before}\nimage_description: {image_description}\ncontent_after_image: {content_after}".strip()

        # NOTE(review): items without 'src' all merge into one Img node with
        # src == '' - confirm whether such items should be skipped instead.
        img_properties = {
            "aggregated_content": aggregated_content,
            "src": item.get('src', ''),
            "url": url  # original URL
        }
        kg.add_node("Img", img_properties, unique_keys=['src'])

        kg.add_relationship(
            label1="Subtitle",
            prop1={"page_url": safe_url},
            relationship="HAS_IMG",
            label2="Img",
            prop2={"src": img_properties["src"]},
            rel_properties=None,
            unique_keys1=["page_url"],
            unique_keys2=["src"]
        )



def create_constraints(kg):
    """
    Ensure a uniqueness constraint on ``uuid`` exists for the Category, Game,
    Title and Subtitle labels.

    Each statement uses IF NOT EXISTS, and all of them are run in one session.

    :param kg: An instance of KnowledgeGraph.
    """
    labelled_vars = (("c", "Category"), ("g", "Game"), ("t", "Title"), ("s", "Subtitle"))
    with kg.driver.session() as session:
        for var, label in labelled_vars:
            session.run(
                f"CREATE CONSTRAINT IF NOT EXISTS FOR ({var}:{label}) REQUIRE {var}.uuid IS UNIQUE;"
            )


def create_knowledge_graph():
    """
    Rebuild the whole knowledge graph using the module-level ``kg``.

    Steps, in order: create the uuid constraints, delete every existing node
    and relationship, add the Game node with its four categories, import the
    page files from docs/rawdata, then import docs/mmimg.json. The connection
    is closed afterwards, whether or not the build succeeded.
    """
    game_name = "黑神话悟空"
    category_names = ["攻略", "新闻", "下载", "其它"]
    try:
        create_constraints(kg)

        # Start from an empty graph: this wipes everything in the database.
        kg.delete_all()

        kg.add_node("Game", {"name": game_name}, unique_keys=['name'])
        for category_name in category_names:
            kg.add_node("Category", {"name": category_name}, unique_keys=['name'])
            kg.add_relationship(
                label1="Game",
                prop1={"name": game_name},
                relationship="HAS_CATEGORY",
                label2="Category",
                prop2={"name": category_name}
            )

        # Pages first, so the Subtitle nodes exist when images are attached.
        process_files_and_create_nodes("docs/rawdata", kg)
        process_mmimg_items_with_progress("docs/mmimg.json", kg)

        print("Knowledge Graph construction completed.")
    finally:
        kg.close()

# Example usage
if __name__ == "__main__":
    # Open the connection from the NEO4J_* environment variables. The
    # module-level `kg` is the instance create_knowledge_graph() works on.
    NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = map(
        os.getenv, ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
    )
    kg = KnowledgeGraph(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)
    # Uncomment to rebuild the graph (deletes all existing data first):
    # create_knowledge_graph()


## 
# Reference Cypher kept as a string; nothing in the visible code executes it.
# As written it takes up to 1000 directed variable-length paths, orders them
# randomly and returns at most 500 of them.
FYI = '''
MATCH p = (n)-[*]->(m)
WITH p LIMIT 1000
RETURN p
ORDER BY rand()
LIMIT 500
'''

## Neo4j vector index

In [5]:
# Connection settings for Neo4j, read from the environment
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Set up the 'KG_Retrieve_Task' index over the `txt` nodes: 'content' and
# 'page_url' are the text properties and 'embedding' is the vector property.
# Author's note: initializing it took about 3 minutes and about $1.20
# (presumably the embedding API cost).
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(model="text-embedding-3-large"),
    node_label="txt",
    text_node_properties=['content', 'page_url'],
    embedding_node_property='embedding',
    index_name='KG_Retrieve_Task',
    search_type="hybrid",
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)


In [9]:

# Look up the 3 best-matching `txt` nodes, then walk the graph upwards
# (txt -> Subtitle -> Title) and list every page of each matching title.
response = vector_index.similarity_search_with_relevance_scores(
    "全丹方收集指南", k=3
)

output = []
for document, score in response:
    # Not named `uuid`: a module-level variable of that name would replace
    # the imported uuid module and break KnowledgeGraph.add_node afterwards.
    txt_uuid = document.metadata.get('uuid')
    if not txt_uuid:
        continue  # result carries no uuid, so it cannot be located in the graph

    subtitles = kg.get_related_nodes('txt', txt_uuid, 'HAS_TXT', 'INCOMING')
    if not subtitles:
        continue  # txt node is not attached to any Subtitle
    titles = kg.get_related_nodes('Subtitle', subtitles[0].get('uuid'), 'HAS_SUBTITLE', 'INCOMING')
    if not titles:
        continue  # Subtitle is not attached to any Title

    output.append('Title: ' + titles[0].get('name'))
    subtitles = kg.get_related_nodes('Title', titles[0].get('uuid'), 'HAS_SUBTITLE', 'OUTGOING')
    for subtitle in subtitles:
        output.append(' SubTitle: ' + subtitle.get('name'))
        subtitle_txt = kg.get_related_nodes('Subtitle', subtitle.get('uuid'), 'HAS_TXT', 'OUTGOING')
        if not subtitle_txt:
            continue  # page without stored text: list the name only
        output.append(' Subtitle_page_url: ' + revert_url(subtitle_txt[0].get('page_url')))
        output.append(' Subtitle_content: ' + subtitle_txt[0].get('content')[:50] +'\n')
    output.append('-------------')
output_str = '\n'.join(output)
print(output_str)
        

    




Title: 《黑神话悟空》全丹方收集指南 全丹方获取方法
 SubTitle: 第2页：延寿膏
 Subtitle_page_url: https://www.gamersky.com/handbook/202408/1807607_2.shtml
 Subtitle_content: Title: 《黑神话悟空》全丹方收集指南 全丹方获取方法_延寿膏-游民星空 GamerSky.co

 SubTitle: 第13页：七返火丹
 Subtitle_page_url: https://www.gamersky.com/handbook/202408/1807607_13.shtml
 Subtitle_content: Title: 《黑神话悟空》全丹方收集指南 全丹方获取方法_七返火丹-游民星空 GamerSky.c

 SubTitle: 第8页：加味参势丸
 Subtitle_page_url: https://www.gamersky.com/handbook/202408/1807607_8.shtml
 Subtitle_content: Title: 《黑神话悟空》全丹方收集指南 全丹方获取方法_加味参势丸-游民星空 GamerSky.

 SubTitle: 第15页：丹方查缺补漏表
 Subtitle_page_url: https://www.gamersky.com/handbook/202408/1807607_15.shtml
 Subtitle_content: Title: 《黑神话悟空》全丹方收集指南 全丹方获取方法_丹方查缺补漏表-游民星空 GamerSk

 SubTitle: 第1页：避凶药
 Subtitle_page_url: https://www.gamersky.com/handbook/202408/1807607.shtml
 Subtitle_content: Title: 《黑神话悟空》全丹方收集指南 全丹方获取方法_避凶药-游民星空 GamerSky.co

 SubTitle: 第4页：益气膏
 Subtitle_page_url: https://www.gamersky.com/handbook/202408/1807607_4.shtml
 Subtitle_content: Title: 《黑神话

## 