1. Defining nodes and relationships

In [None]:
import os

import numpy as np
import psycopg2
from neo4j import GraphDatabase
from sklearn.metrics.pairwise import cosine_similarity

# Neo4j connection details.
# Every setting below can be overridden through an environment variable of the
# same name; the literal is only a fallback that keeps existing runs working.
# NOTE(review): a real password is committed here -- rotate it and supply the
# value through the environment instead of source control.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://fc98a293.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "jFgHbgFhvKbp791oQ0SBojwUG5Wwu44ImfBblOnPKTE"
)

# PostgreSQL connection parameters (same environment-override rule as above)
DB_HOST = os.environ.get("DB_HOST", "158.143.74.10")
DB_PORT = int(os.environ.get("DB_PORT", "5432"))  # stays an int, as before
DB_NAME = os.environ.get("DB_NAME", "chatlse")
DB_USER = os.environ.get("DB_USER", "chatlse")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "chatlse")


# Neo4j database class
class Neo4jDatabase:
    """Small wrapper around a Neo4j driver that writes chunk nodes and edges.

    Every ``create_*`` method issues a Cypher ``CREATE``, so calling one twice
    with the same arguments produces duplicate nodes or relationships.

    Instances also work as context managers (``with Neo4jDatabase(...) as db:``)
    so the driver is closed even when an exception interrupts the work.
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None, so an in-flight exception keeps propagating.
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def _run(self, query, **params):
        """Run a single query, with ``params`` as its parameters, in its own session."""
        with self.driver.session() as session:
            session.run(query, **params)

    def create_chunk_node(self, doc_id, chunk_id, title, url, content, context_embedding, chunk_type):
        """Create one ``Chunk`` node carrying the given properties.

        ``chunk_type`` is stored in the node's ``type`` property.
        """
        query = """
        CREATE (c:Chunk {
            type: $chunk_type,
            doc_id: $doc_id,
            chunk_id: $chunk_id,
            title: $title,
            url: $url,
            content: $content,
            context_embedding: $context_embedding
        })
        """
        self._run(
            query,
            chunk_type=chunk_type,
            doc_id=doc_id,
            chunk_id=chunk_id,
            title=title,
            url=url,
            content=content,
            context_embedding=context_embedding,
        )

    def create_summary_chunk_node(self, doc_id, content, summary):
        """Create one ``SummaryChunk`` node; its ``type`` is always ``"summary_chunk"``."""
        query = """
        CREATE (sc:SummaryChunk {
            type: $chunk_type,
            doc_id: $doc_id,
            content: $content,
            summary: $summary
        })
        """
        self._run(
            query,
            chunk_type="summary_chunk",
            doc_id=doc_id,
            content=content,
            summary=summary,
        )

    def create_similar_relationship(self, doc_id1, chunk_id1, doc_id2, chunk_id2):
        """Create a ``SIMILAR`` edge from chunk 1 to chunk 2.

        Both chunks are looked up by ``(doc_id, chunk_id)``; if either lookup
        matches nothing, the ``MATCH`` yields no rows and no edge is created.
        """
        query = """
        MATCH (c1:Chunk {doc_id: $doc_id1, chunk_id: $chunk_id1}),
              (c2:Chunk {doc_id: $doc_id2, chunk_id: $chunk_id2})
        CREATE (c1)-[:SIMILAR]->(c2)
        """
        self._run(query, doc_id1=doc_id1, chunk_id1=chunk_id1, doc_id2=doc_id2, chunk_id2=chunk_id2)

    def create_belongs_to_relationship(self, doc_id, chunk_id):
        """Create a ``BELONGS_TO`` edge from a chunk to the summary sharing its ``doc_id``."""
        query = """
        MATCH (chunk:Chunk {doc_id: $doc_id, chunk_id: $chunk_id}),
              (summary:SummaryChunk {doc_id: $doc_id})
        CREATE (chunk)-[:BELONGS_TO]->(summary)
        """
        self._run(query, doc_id=doc_id, chunk_id=chunk_id)

    def create_next_relationships(self, doc_id, chunks):
        """Chain a document's chunks together with ``NEXT`` edges.

        ``chunks`` is an iterable of mappings that each have a ``"chunk_id"``
        key. After ordering by ``chunk_id`` the method creates
        ``summary -> first chunk`` and then ``chunk[i] -> chunk[i + 1]`` for
        each adjacent pair. An empty ``chunks`` is a no-op.
        """
        ordered_ids = sorted(chunk["chunk_id"] for chunk in chunks)
        if not ordered_ids:
            return

        head_query = """
                MATCH (summary:SummaryChunk {doc_id: $doc_id}),
                      (chunk:Chunk {doc_id: $doc_id, chunk_id: $chunk_id})
                CREATE (summary)-[:NEXT]->(chunk)
                """
        link_query = """
                MATCH (prev:Chunk {doc_id: $doc_id, chunk_id: $prev_chunk_id}),
                      (chunk:Chunk {doc_id: $doc_id, chunk_id: $chunk_id})
                CREATE (prev)-[:NEXT]->(chunk)
                """
        # One session for the whole chain instead of one per edge.
        with self.driver.session() as session:
            # The chain starts at the document's SummaryChunk.
            session.run(head_query, doc_id=doc_id, chunk_id=ordered_ids[0])
            for prev_chunk_id, chunk_id in zip(ordered_ids, ordered_ids[1:]):
                session.run(link_query, doc_id=doc_id, prev_chunk_id=prev_chunk_id, chunk_id=chunk_id)



2. Fetching and ingesting data

In [None]:
# Helpers that create the SIMILAR, BELONGS_TO and NEXT relationships from fetched rows
def create_similar_relationships(chunks, threshold=0.9):
    """Link chunks whose embeddings have cosine similarity above ``threshold``.

    Args:
        chunks: Row tuples where index 1 is the doc id, index 2 the chunk id
            and index 7 the embedding. ``None`` or an empty sequence is a no-op.
        threshold: A pair is linked only when its similarity is strictly
            greater than this value.

    Rows whose embedding is not a ``list`` are skipped entirely, so the
    similarity matrix and the ids always refer to the same rows. Each
    qualifying pair is linked once, from the earlier row to the later one.
    """
    usable = [row for row in (chunks or []) if isinstance(row[7], list)]
    if len(usable) < 2:
        # No pair to compare; also avoids calling cosine_similarity on
        # an empty input.
        return

    similarity_matrix = cosine_similarity(np.array([row[7] for row in usable]))

    # Keep only the strict upper triangle so every unordered pair is visited
    # once and a row is never compared with itself. argwhere yields the hits
    # in row-major order: (0, 1), (0, 2), ..., (1, 2), ...
    above = np.triu(np.asarray(similarity_matrix) > threshold, k=1)
    for i, j in np.argwhere(above):
        first, second = usable[i], usable[j]
        db.create_similar_relationship(
            doc_id1=first[1],
            chunk_id1=first[2],
            doc_id2=second[1],
            chunk_id2=second[2],
        )
        print(f"Created SIMILAR relationship between chunk {first[2]} and {second[2]}")

def create_belongs_to_relationships(chunks):
    """Create a BELONGS_TO relationship for every chunk row.

    ``chunks`` is an iterable of row tuples where index 1 is the doc id and
    index 2 the chunk id. Rows are grouped by doc id (documents in
    first-seen order, chunk ids in input order within a document) and each
    one is passed to ``db.create_belongs_to_relationship``.
    """
    ids_by_doc = {}
    for row in chunks:
        ids_by_doc.setdefault(row[1], []).append(row[2])

    for doc_id in ids_by_doc:
        for chunk_id in ids_by_doc[doc_id]:
            db.create_belongs_to_relationship(doc_id, chunk_id)
            print(f"Created BELONGS_TO relationship for chunk {chunk_id} and summary {doc_id}")

def create_next_relationships(chunks):
    """Create the NEXT chain (summary -> first chunk -> ... -> last chunk) per document.

    ``chunks`` is an iterable of row tuples where index 1 is the doc id and
    index 2 the chunk id. Chunk ids are grouped by document and sorted, then
    handed to ``db.create_next_relationships`` in ONE call per document.

    That method links the summary to the first element of whatever list it
    receives, so calling it once per adjacent pair would add a summary edge
    to almost every chunk and none at all for single-chunk documents.
    """
    ids_by_doc = {}
    for row in chunks:
        ids_by_doc.setdefault(row[1], []).append(row[2])

    for doc_id, chunk_ids in ids_by_doc.items():
        ordered_ids = sorted(chunk_ids)  # Ensure chunks are ordered
        print(f"Starting NEXT relationship from summary to chunk {ordered_ids[0]} in doc {doc_id}")
        db.create_next_relationships(
            doc_id,
            [{"chunk_id": chunk_id} for chunk_id in ordered_ids],
        )
        for prev_chunk_id, chunk_id in zip(ordered_ids, ordered_ids[1:]):
            print(f"Created NEXT relationship between chunk {prev_chunk_id} and chunk {chunk_id}")

# fetch_and_create_nodes substitutes a zero vector when a row has no context_embeddings
def fetch_and_create_nodes(limit=100):
    """Copy rows from PostgreSQL into Neo4j as Chunk and SummaryChunk nodes.

    Reads up to ``limit`` rows from ``lse_doc`` and every row from
    ``doc_summary``, creating one node per row through the module-level ``db``.

    Args:
        limit: Maximum number of ``lse_doc`` rows to fetch.

    Returns:
        The fetched ``lse_doc`` rows (a list of 8-tuples), or ``None`` when any
        step raised; in that case the error is printed rather than re-raised.
    """
    # Bound before the try so the finally block can test them even when
    # connect() itself raises.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
        )
        cursor = connection.cursor()

        # Fetch limited data from lse_doc; the limit is a bound parameter
        # rather than text spliced into the SQL.
        chunks_query = """
        SELECT 
            id, 
            doc_id, 
            chunk_id, 
            type, 
            url, 
            title, 
            content, 
            context_embeddings 
        FROM lse_doc
        LIMIT %s;
        """
        cursor.execute(chunks_query, (limit,))
        chunks = cursor.fetchall()

        # Create Chunk nodes
        for _row_id, doc_id, chunk_id, _stored_type, url, title, content, context_embeddings in chunks:
            # Rows without an embedding get a 1024-element zero vector.
            # NOTE(review): 1024 is presumably the embedding width -- confirm.
            if context_embeddings:
                context_embedding = np.array(context_embeddings)
            else:
                context_embedding = np.zeros(1024)
            # The stored ``type`` column is not used; every node is tagged "chunk".
            db.create_chunk_node(
                doc_id=doc_id,
                chunk_id=chunk_id,
                title=title,
                url=url,
                content=content,
                context_embedding=context_embedding.tolist(),
                chunk_type="chunk",
            )

        # Fetch data from doc_summary
        summary_query = """
        SELECT 
            doc_id, 
            content, 
            summary 
        FROM doc_summary;
        """
        cursor.execute(summary_query)
        summaries = cursor.fetchall()

        # Create SummaryChunk nodes
        for doc_id, content, summary in summaries:
            db.create_summary_chunk_node(
                doc_id=doc_id,
                content=content,
                summary=summary,
            )

        return chunks

    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print("Error:", e)
        return None
    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()

# Initialize Neo4j and execute workflow
db = Neo4jDatabase(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
try:
    chunks = fetch_and_create_nodes(limit=100)

    # fetch_and_create_nodes returns None when it hit an error, and an empty
    # result leaves nothing to link, so only build relationships when rows
    # actually came back.
    if chunks:
        create_similar_relationships(chunks)
        create_belongs_to_relationships(chunks)
        create_next_relationships(chunks)
finally:
    # Release the driver even if one of the steps above raised.
    db.close()
