1. Defining nodes and relationships. Creating nodes

In [31]:
import ast
import os

import numpy as np
import psycopg2
from neo4j import GraphDatabase
from sklearn.metrics.pairwise import cosine_similarity

# Connection settings. Each value can be overridden through an environment
# variable of the same name; the literals below are only fallbacks so the
# notebook keeps running unchanged.
# NOTE(security): the fallback passwords are committed in plain text. Rotate
# them and remove the literals once the environment variables are in place.

# Neo4j connection details
NEO4J_URI = os.environ.get('NEO4J_URI', 'neo4j+s://77208ee0.databases.neo4j.io')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'DGgFsTdjIGI9UKE-QcR8RNmdl6TbgSI4NS-rHegOh_s')

# PostgreSQL connection parameters
DB_HOST = os.environ.get('DB_HOST', "158.143.74.10")
DB_PORT = int(os.environ.get('DB_PORT', 5432))  # environment values arrive as text
DB_NAME = os.environ.get('DB_NAME', "chatlse")
DB_USER = os.environ.get('DB_USER', "chatlse")
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'chatlse')

# Function to parse embedding strings
def parse_embedding(embedding_str, dim=1024):
    """
    Parse an embedding into a 1-D NumPy float array.

    Handles strings like "[0.1, 0.2, -0.3]" or "{0.1, 0.2, -0.3}", with
    optional surrounding whitespace, and also accepts values that are already
    a list, tuple or NumPy array of numbers.

    Args:
        embedding_str: Embedding text, or an already-decoded numeric sequence.
        dim: Length of the zero vector returned for missing or invalid input.

    Returns:
        A NumPy array of floats. A zero vector of length ``dim`` is returned
        when the input is None, blank, empty, or cannot be parsed.
    """
    if embedding_str is None:
        return np.zeros(dim)

    if not isinstance(embedding_str, str):
        # Already-decoded sequence: convert directly instead of calling
        # string methods that such a value does not have.
        try:
            values = np.asarray(embedding_str, dtype=float).ravel()
        except (TypeError, ValueError) as e:
            print(f"Error parsing embedding: {embedding_str}. Error: {e}")
            return np.zeros(dim)
        return values if values.size else np.zeros(dim)

    # Trim whitespace before the brackets; otherwise " [1, 2]\n" keeps its
    # brackets and float() fails on "[1".
    body = embedding_str.strip().strip('{}[]').strip()
    if not body:
        return np.zeros(dim)

    try:
        return np.array([float(x) for x in body.split(',')])
    except ValueError as e:
        print(f"Error parsing embedding: {embedding_str}. Error: {e}")
        return np.zeros(dim)  # Default zero vector on failure


# Neo4j database class
class Neo4jDatabase:
    """
    Small wrapper around a Neo4j driver that writes the chunk graph.

    All node and relationship writes use MERGE, so running the same load
    twice updates the existing graph instead of duplicating relationships.
    The instance can be used as a context manager to guarantee the driver
    is closed.
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def _run(self, query, **params):
        """Run one Cypher statement in its own session with the given parameters."""
        with self.driver.session() as session:
            session.run(query, **params)

    def create_chunk_node(self, doc_id, chunk_id, title, url, content, context_embedding, chunk_type):
        """
        Create or update the Chunk node identified by (doc_id, chunk_id) and
        set its type, title, url, content and context_embedding properties.
        """
        query = """
        MERGE (c:Chunk {
            doc_id: $doc_id,
            chunk_id: $chunk_id
        })
        SET c.type = $chunk_type,
            c.title = $title,
            c.url = $url,
            c.content = $content,
            c.context_embedding = $context_embedding
        """
        self._run(
            query,
            chunk_type=chunk_type,
            doc_id=doc_id,
            chunk_id=chunk_id,
            title=title,
            url=url,
            content=content,
            context_embedding=context_embedding,
        )

    def create_summary_chunk_node(self, doc_id, content, summary):
        """
        Create or update the SummaryChunk node for ``doc_id`` and set its
        content, summary and type ("summary_chunk") properties.
        """
        query = """
        MERGE (sc:SummaryChunk {
            doc_id: $doc_id
        })
        SET sc.content = $content,
            sc.summary = $summary,
            sc.type = "summary_chunk"
        """
        self._run(query, doc_id=doc_id, content=content, summary=summary)

    def create_similar_relationship(self, doc_id1, chunk_id1, doc_id2, chunk_id2):
        """
        Ensure a bidirectional SIMILAR relationship exists between two chunks.

        Nothing is written when either chunk node is missing.
        """
        # MERGE (not CREATE) keeps the pair of edges unique across re-runs.
        query = """
        MATCH (c1:Chunk {doc_id: $doc_id1, chunk_id: $chunk_id1}),
            (c2:Chunk {doc_id: $doc_id2, chunk_id: $chunk_id2})
        MERGE (c1)-[:SIMILAR]->(c2)
        MERGE (c2)-[:SIMILAR]->(c1)
        """
        self._run(query, doc_id1=doc_id1, chunk_id1=chunk_id1, doc_id2=doc_id2, chunk_id2=chunk_id2)

    def create_belongs_to_relationship(self, doc_id, chunk_id):
        """
        Ensure a BELONGS_TO relationship exists from a Chunk to the
        SummaryChunk that has the same doc_id.

        Nothing is written when either node is missing.
        """
        query = """
        MATCH (chunk:Chunk {doc_id: $doc_id, chunk_id: $chunk_id}),
            (summary:SummaryChunk {doc_id: $doc_id})
        MERGE (chunk)-[:BELONGS_TO]->(summary)
        """
        self._run(query, doc_id=doc_id, chunk_id=chunk_id)

    def create_next_relationship(self, doc_id, prev_chunk_id, chunk_id):
        """
        Ensure a NEXT relationship exists from chunk ``prev_chunk_id`` to
        chunk ``chunk_id`` within document ``doc_id``.
        """
        # MERGE (not CREATE) keeps the edge unique across re-runs.
        query = """
        MATCH (prev:Chunk {doc_id: $doc_id, chunk_id: $prev_chunk_id}),
              (current:Chunk {doc_id: $doc_id, chunk_id: $chunk_id})
        MERGE (prev)-[:NEXT]->(current)
        """
        self._run(query, doc_id=doc_id, prev_chunk_id=prev_chunk_id, chunk_id=chunk_id)

# Fetch and Create Nodes
def fetch_and_create_nodes(db):
    """
    Fetch chunks and summary chunks from PostgreSQL and create corresponding nodes in Neo4j.
    Limits: 1000 chunks, 500 summary chunks.

    Args:
        db: Object exposing ``create_chunk_node`` and ``create_summary_chunk_node``.

    Returns:
        The fetched chunk rows as
        (doc_id, chunk_id, url, title, content, context_embeddings) tuples.
        An empty list is returned when any step fails; the error is printed.
    """
    # Bound before the try so the cleanup below is safe even when connecting
    # itself fails.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD
        )
        cursor = connection.cursor()

        # Fetch limited chunk data
        chunks_query = """
        SELECT
            doc_id,
            chunk_id,
            url,
            title,
            content,
            context_embeddings
        FROM lse_doc
        LIMIT 1000;
        """
        cursor.execute(chunks_query)
        chunks = cursor.fetchall()

        # Create Chunk nodes
        for doc_id, chunk_id, url, title, content, context_embeddings in chunks:
            # Parse embedding string into a vector
            context_embedding = parse_embedding(context_embeddings)

            db.create_chunk_node(
                doc_id=doc_id,
                chunk_id=chunk_id,
                title=title,
                url=url,
                content=content,
                context_embedding=context_embedding.tolist(),
                chunk_type="chunk"
            )

        # Fetch limited summary chunk data
        summary_query = """
        SELECT
            doc_id,
            content,
            summary
        FROM doc_summary
        LIMIT 500;
        """
        cursor.execute(summary_query)
        summaries = cursor.fetchall()

        # Create SummaryChunk nodes
        for doc_id, content, summary in summaries:
            db.create_summary_chunk_node(
                doc_id=doc_id,
                content=content,
                summary=summary
            )

        return chunks

    except Exception as e:
        print("Error:", e)
        # Callers iterate over the result, so hand back an empty list
        # rather than an implicit None.
        return []
    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()

# Create Similar Relationships
def create_similar_relationships(db, chunks, threshold=0.99):
    """
    Create SIMILAR relationships only between chunks from different documents.

    Args:
        db: Object exposing ``create_similar_relationship``.
        chunks: Rows where index 0 is the document id, index 1 the chunk id
            and index 5 the embedding to parse.
        threshold: A pair is linked when its cosine similarity is strictly
            greater than this value.
    """
    vectors = []
    doc_ids = []
    chunk_ids = []

    for chunk in chunks:
        vector = parse_embedding(chunk[5])
        if np.any(vector):  # Exclude all-zero vectors (the parse-failure placeholder)
            vectors.append(vector)
            doc_ids.append(chunk[0])  # Document ID
            chunk_ids.append(chunk[1])  # Chunk ID

    if not vectors:
        print("No valid embeddings found for computing chunk similarities.")
        return

    similarity = cosine_similarity(np.array(vectors))  # 2D array in, square matrix out

    # Keep only the strict upper triangle so each unordered pair is seen once
    # and a chunk is never compared with itself. argwhere yields the pairs in
    # row-major order, i.e. ascending i, then ascending j.
    candidates = np.argwhere(np.triu(similarity > threshold, k=1))

    for i, j in candidates:
        if doc_ids[i] == doc_ids[j]:
            continue  # same document: not linked
        db.create_similar_relationship(
            doc_id1=doc_ids[i],
            chunk_id1=chunk_ids[i],
            doc_id2=doc_ids[j],
            chunk_id2=chunk_ids[j],
        )
        print(f"Created SIMILAR relationship between chunks {chunk_ids[i]} and {chunk_ids[j]}")


def create_belongs_to_relationships(db, chunks):
    """
    Link every chunk row to the SummaryChunk of its own document.

    For each row, the document id (index 0) and chunk id (index 1) are
    passed to ``db.create_belongs_to_relationship``.
    """
    for record in chunks:
        db.create_belongs_to_relationship(record[0], record[1])

def create_next_relationships(db, chunks):
    """
    Chain the chunks of each document with NEXT relationships.

    Chunk ids are grouped by document id (row index 0), sorted, and every
    consecutive pair is passed to ``db.create_next_relationship`` as
    (doc_id, previous_chunk_id, chunk_id).

    Args:
        db: Object exposing ``create_next_relationship``.
        chunks: Rows where index 0 is the document id and index 1 the chunk id.
    """
    doc_chunk_map = {}
    for chunk in chunks:
        doc_chunk_map.setdefault(chunk[0], []).append(chunk[1])

    for doc_id, chunk_ids in doc_chunk_map.items():
        ordered = sorted(chunk_ids)
        # Pair neighbours directly instead of testing the previous id for
        # truthiness, which would skip the link that starts at chunk id 0.
        for prev_chunk_id, chunk_id in zip(ordered, ordered[1:]):
            db.create_next_relationship(doc_id, prev_chunk_id, chunk_id)

# Main Script
# Load the nodes first, then derive the three relationship types from the
# same chunk rows. The driver is closed even if one of the steps raises.
db = Neo4jDatabase(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)
try:
    chunks = fetch_and_create_nodes(db)
    if chunks:
        create_similar_relationships(db, chunks)
        create_belongs_to_relationships(db, chunks)
        create_next_relationships(db, chunks)
    else:
        # The fetch step reports its own error; with no rows there is
        # nothing to relate.
        print("No chunks were loaded; skipping relationship creation.")
finally:
    db.close()

Created SIMILAR relationship between chunks 6 and 21
Created SIMILAR relationship between chunks 6 and 20
Created SIMILAR relationship between chunks 7 and 4
Created SIMILAR relationship between chunks 8 and 7
Created SIMILAR relationship between chunks 2 and 4
Created SIMILAR relationship between chunks 2 and 3
Created SIMILAR relationship between chunks 10 and 15
Created SIMILAR relationship between chunks 21 and 33
Created SIMILAR relationship between chunks 20 and 32
Created SIMILAR relationship between chunks 20 and 33
Created SIMILAR relationship between chunks 9 and 9
Created SIMILAR relationship between chunks 21 and 21
