1. Defining nodes and relationships, and uploading data to Neo4j

In [28]:
from neo4j import GraphDatabase
import psycopg2
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast

# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Non-secret values fall back to the defaults that
# were previously hard-coded; the two passwords have no fallback and must be
# provided through NEO4J_PASSWORD and DB_PASSWORD.
# NOTE(review): the Neo4j password that used to be committed here should be
# rotated, since it has been exposed in source.
import os

# Neo4j connection details
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://6d4b3e42.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# PostgreSQL connection parameters
DB_HOST = os.environ.get("DB_HOST", "158.143.74.10")
DB_PORT = int(os.environ.get("DB_PORT", "5432"))
DB_NAME = os.environ.get("DB_NAME", "chatlse")
DB_USER = os.environ.get("DB_USER", "chatlse")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "")

# Function to parse embedding strings
def parse_embedding(embedding_str, dim=1024):
    """
    Parse an embedding into a 1-D float NumPy array.

    Accepted inputs:
      * a bracketed string such as "[0.1, 0.2, 0.3]";
      * a bare comma-separated string such as "0.1, 0.2, 0.3";
      * an already-decoded sequence (list, tuple, NumPy array), which is
        converted directly instead of failing on ``.strip()``.

    Args:
        embedding_str: The raw embedding value, or None.
        dim: Length of the zero vector returned when the value is missing,
            empty, or cannot be parsed. Defaults to 1024.

    Returns:
        A NumPy array with dtype float. A zero vector of length ``dim`` is
        returned for missing or invalid input; a message is printed when
        parsing fails.
    """
    if embedding_str is None:
        return np.zeros(dim)  # Default zero vector for missing embeddings

    if not isinstance(embedding_str, str):
        # The value was already decoded (e.g. into a list); no text parsing needed.
        try:
            return np.asarray(embedding_str, dtype=float)
        except (TypeError, ValueError) as e:
            print(f"Failed to parse embedding: {embedding_str}. Error: {e}")
            return np.zeros(dim)

    text = embedding_str.strip()
    if not text:
        return np.zeros(dim)  # Default zero vector for empty embeddings

    try:
        if "[" in text:
            # literal_eval only evaluates Python literals, never arbitrary code.
            values = ast.literal_eval(text)
        else:
            values = [float(part) for part in text.split(",")]
        # dtype=float rejects non-numeric content (e.g. '["a"]') instead of
        # silently producing a string array.
        return np.array(values, dtype=float)
    except (ValueError, SyntaxError, TypeError) as e:
        print(f"Failed to parse embedding: {embedding_str}. Error: {e}")
        return np.zeros(dim)  # Default zero vector for invalid embeddings

# Neo4j database class
class Neo4jDatabase:
    """
    Small wrapper around a Neo4j driver for writing chunk nodes and the
    relationships between them.

    Every write opens its own session on ``self.driver`` and runs a single
    Cypher statement. Nodes and relationships are all written with MERGE, so
    running the same upload twice does not create duplicates.

    The instance can be used as a context manager; leaving the ``with`` block
    closes the driver.
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` using basic (user, password) auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress exceptions raised inside the with-block

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def _run(self, query, **params):
        """Run one Cypher statement with ``params`` in a fresh session."""
        with self.driver.session() as session:
            session.run(query, **params)

    def create_chunk_node(self, doc_id, chunk_id, title, url, content, context_embedding, chunk_type):
        """
        Create or update the Chunk node identified by (doc_id, chunk_id) and
        set its type, title, url, content and context_embedding properties.
        """
        query = """
        MERGE (c:Chunk {
            doc_id: $doc_id,
            chunk_id: $chunk_id
        })
        SET c.type = $chunk_type,
            c.title = $title,
            c.url = $url,
            c.content = $content,
            c.context_embedding = $context_embedding
        """
        self._run(
            query,
            chunk_type=chunk_type,
            doc_id=doc_id,
            chunk_id=chunk_id,
            title=title,
            url=url,
            content=content,
            context_embedding=context_embedding,
        )

    def create_summary_chunk_node(self, doc_id, content, summary):
        """
        Create or update the SummaryChunk node for ``doc_id`` and set its
        content, summary and type ("summary_chunk") properties.
        """
        query = """
        MERGE (sc:SummaryChunk {
            doc_id: $doc_id
        })
        SET sc.content = $content,
            sc.summary = $summary,
            sc.type = "summary_chunk"
        """
        self._run(
            query,
            doc_id=doc_id,
            content=content,
            summary=summary,
        )

    def create_similar_relationship(self, doc_id1, chunk_id1, doc_id2, chunk_id2):
        """
        Ensure a bidirectional SIMILAR relationship between two chunks.

        Nothing is written if either chunk node does not exist.
        """
        # MERGE takes a single pattern, so each direction gets its own clause.
        query = """
        MATCH (c1:Chunk {doc_id: $doc_id1, chunk_id: $chunk_id1}),
            (c2:Chunk {doc_id: $doc_id2, chunk_id: $chunk_id2})
        MERGE (c1)-[:SIMILAR]->(c2)
        MERGE (c2)-[:SIMILAR]->(c1)
        """
        self._run(query, doc_id1=doc_id1, chunk_id1=chunk_id1, doc_id2=doc_id2, chunk_id2=chunk_id2)

    def create_belongs_to_relationship(self, doc_id, chunk_id):
        """
        Ensure a BELONGS_TO relationship from the chunk (doc_id, chunk_id) to
        the SummaryChunk with the same doc_id.

        Nothing is written if either node does not exist.
        """
        query = """
        MATCH (chunk:Chunk {doc_id: $doc_id, chunk_id: $chunk_id}),
              (summary:SummaryChunk {doc_id: $doc_id})
        MERGE (chunk)-[:BELONGS_TO]->(summary)
        """
        self._run(query, doc_id=doc_id, chunk_id=chunk_id)

    def create_next_relationship(self, doc_id, prev_chunk_id, chunk_id):
        """
        Ensure a NEXT relationship from chunk ``prev_chunk_id`` to chunk
        ``chunk_id`` within document ``doc_id``.

        Nothing is written if either chunk node does not exist.
        """
        query = """
        MATCH (prev:Chunk {doc_id: $doc_id, chunk_id: $prev_chunk_id}),
              (current:Chunk {doc_id: $doc_id, chunk_id: $chunk_id})
        MERGE (prev)-[:NEXT]->(current)
        """
        self._run(query, doc_id=doc_id, prev_chunk_id=prev_chunk_id, chunk_id=chunk_id)

# Fetch and Create Nodes
def fetch_and_create_nodes(db, chunk_limit=1000, summary_limit=500):
    """
    Fetch chunks and summary chunks from PostgreSQL and create corresponding nodes in Neo4j.

    Args:
        db: Object exposing ``create_chunk_node`` and
            ``create_summary_chunk_node`` (e.g. ``Neo4jDatabase``).
        chunk_limit: Maximum number of rows read from ``lse_doc``.
        summary_limit: Maximum number of rows read from ``doc_summary``.

    Returns:
        The fetched chunk rows as a list of
        (doc_id, chunk_id, url, title, content, context_embeddings) tuples.
        If anything fails, the error is printed and an empty list is
        returned so that callers can still iterate over the result.
    """
    # Bound before the try so the cleanup in `finally` never hits an unbound
    # name when the connection itself cannot be established.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD
        )
        cursor = connection.cursor()

        # Fetch limited chunk data (the limit is passed as a bound parameter)
        chunks_query = """
        SELECT 
            doc_id, 
            chunk_id, 
            url, 
            title, 
            content, 
            context_embeddings 
        FROM lse_doc
        LIMIT %s;
        """
        cursor.execute(chunks_query, (chunk_limit,))
        chunks = cursor.fetchall()

        # Create Chunk nodes
        for doc_id, chunk_id, url, title, content, context_embeddings in chunks:
            # Parse embedding string into a vector
            context_embedding = parse_embedding(context_embeddings)

            db.create_chunk_node(
                doc_id=doc_id,
                chunk_id=chunk_id,
                title=title,
                url=url,
                content=content,
                context_embedding=context_embedding.tolist(),
                chunk_type="chunk"
            )

        # Fetch limited summary chunk data
        summary_query = """
        SELECT 
            doc_id, 
            content, 
            summary 
        FROM doc_summary
        LIMIT %s;
        """
        cursor.execute(summary_query, (summary_limit,))
        summaries = cursor.fetchall()

        # Create SummaryChunk nodes
        for doc_id, content, summary in summaries:
            db.create_summary_chunk_node(
                doc_id=doc_id,
                content=content,
                summary=summary
            )

        return chunks

    except Exception as e:
        # Best-effort boundary for the notebook: report and hand back an
        # empty result rather than None.
        print("Error:", e)
        return []
    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()


# Relationships
def create_similar_relationships(db, chunks, threshold=0.999):
    """
    Create SIMILAR relationships between chunks based on cosine similarity.
    Ensures bidirectional relationships.

    Args:
        db: Object exposing ``create_similar_relationship``.
        chunks: Iterable of rows with doc_id at index 0, chunk_id at index 1
            and the raw embedding at index 5. ``None`` is treated as empty.
        threshold: A pair is linked when its cosine similarity is strictly
            greater than this value.

    Returns:
        None. A message is printed for every relationship created, or once
        if no usable embeddings were found.
    """
    embeddings = []
    doc_ids = []
    chunk_ids = []

    # Collect embeddings and chunk information
    for chunk in chunks or ():
        context_embedding = parse_embedding(chunk[5])  # Ensure parsing
        if np.any(context_embedding):  # Exclude zero vectors
            embeddings.append(context_embedding)
            doc_ids.append(chunk[0])
            chunk_ids.append(chunk[1])

    if not embeddings:
        print("No valid embeddings found for SIMILAR relationships.")
        return

    # Compute cosine similarity
    similarity_matrix = np.asarray(cosine_similarity(embeddings))

    # Keep only the strict upper triangle so each unordered pair is visited
    # once and self-pairs are skipped; argwhere yields (i, j) in row-major
    # order, i.e. the same order as nested i < j loops.
    above_threshold = np.triu(similarity_matrix > threshold, k=1)
    for i, j in np.argwhere(above_threshold):
        db.create_similar_relationship(
            doc_id1=doc_ids[i],
            chunk_id1=chunk_ids[i],
            doc_id2=doc_ids[j],
            chunk_id2=chunk_ids[j],
        )
        print(f"Created bidirectional SIMILAR relationship between chunk {chunk_ids[i]} and chunk {chunk_ids[j]}")


def create_belongs_to_relationships(db, chunks):
    """
    Call ``db.create_belongs_to_relationship`` once for every chunk row.

    Each row supplies doc_id at index 0 and chunk_id at index 1; any further
    columns are ignored.
    """
    for row in chunks:
        db.create_belongs_to_relationship(row[0], row[1])

def create_next_relationships(db, chunks):
    """
    Link the chunks of each document in ascending chunk_id order.

    Rows are grouped by doc_id (index 0); within each group the chunk_ids
    (index 1) are sorted and ``db.create_next_relationship`` is called for
    every pair of neighbours in that sorted order.

    Neighbours are taken from the rows actually passed in, so two chunk_ids
    are linked even when ids between them are missing from ``chunks``.
    """
    chunk_ids_by_doc = {}
    for chunk in chunks:
        chunk_ids_by_doc.setdefault(chunk[0], []).append(chunk[1])

    for doc_id, chunk_ids in chunk_ids_by_doc.items():
        ordered = sorted(chunk_ids)
        # Pair each id with its successor. Pairing by position (instead of a
        # truthiness check on the previous id) keeps the link that starts at
        # chunk_id 0.
        for prev_chunk_id, chunk_id in zip(ordered, ordered[1:]):
            db.create_next_relationship(doc_id, prev_chunk_id, chunk_id)

# Main Script
# Create the nodes first, then the three kinds of relationships between them.
db = Neo4jDatabase(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)
try:
    # Fall back to an empty list if the fetch step reported an error and
    # returned nothing, so the relationship steps become no-ops.
    chunks = fetch_and_create_nodes(db) or []
    create_similar_relationships(db, chunks)
    create_belongs_to_relationships(db, chunks)
    create_next_relationships(db, chunks)
finally:
    # Always release the driver, even when one of the steps above raises.
    db.close()


Created bidirectional SIMILAR relationship between chunk 1041 and chunk 1066
Created bidirectional SIMILAR relationship between chunk 1041 and chunk 1070
Created bidirectional SIMILAR relationship between chunk 1041 and chunk 1086
Created bidirectional SIMILAR relationship between chunk 1041 and chunk 1317
Created bidirectional SIMILAR relationship between chunk 1041 and chunk 1318
Created bidirectional SIMILAR relationship between chunk 1041 and chunk 1330
Created bidirectional SIMILAR relationship between chunk 1041 and chunk 2506
Created bidirectional SIMILAR relationship between chunk 1066 and chunk 1070
Created bidirectional SIMILAR relationship between chunk 1066 and chunk 1086
Created bidirectional SIMILAR relationship between chunk 1066 and chunk 1317
Created bidirectional SIMILAR relationship between chunk 1066 and chunk 1318
Created bidirectional SIMILAR relationship between chunk 1066 and chunk 1330
Created bidirectional SIMILAR relationship between chunk 1066 and chunk 2506

2. Fetching and ingesting data

In [19]:
import psycopg2

# PostgreSQL connection parameters
# Read from the environment so the password is not stored in the notebook.
# Non-secret values fall back to the defaults that were previously hard-coded;
# DB_PASSWORD has no fallback and must be provided through the environment.
import os

DB_HOST = os.environ.get("DB_HOST", "158.143.74.10")
DB_PORT = int(os.environ.get("DB_PORT", "5432"))
DB_NAME = os.environ.get("DB_NAME", "chatlse")
DB_USER = os.environ.get("DB_USER", "chatlse")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "")

def extract_chunk_ids(limit=50):
    """
    Fetch up to ``limit`` rows of (doc_id, chunk_id, title) from ``lse_doc``
    and print each one.

    Args:
        limit: Maximum number of rows to read.

    Returns:
        The fetched rows as a list of (doc_id, chunk_id, title) tuples. If
        anything fails, the error is printed and an empty list is returned.
    """
    # Bound before the try so the cleanup in `finally` never hits an unbound
    # name when the connection itself cannot be established.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD
        )
        cursor = connection.cursor()

        # The limit is passed as a bound parameter rather than being
        # formatted into the SQL text.
        chunks_query = """
        SELECT 
            doc_id, 
            chunk_id, 
            title 
        FROM lse_doc
        LIMIT %s;
        """
        cursor.execute(chunks_query, (limit,))
        chunks = cursor.fetchall()

        for doc_id, chunk_id, title in chunks:
            print(f"doc_id: {doc_id}, chunk_id: {chunk_id}, title: {title}")

        return chunks

    except Exception as e:
        # Best-effort boundary for the notebook: report and hand back an
        # empty result rather than None.
        print("Error:", e)
        return []
    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()

# Print up to 100 (doc_id, chunk_id, title) rows and keep them in `chunks`.
chunks = extract_chunk_ids(limit=100)


doc_id: d9b560cd04403aaacd6eb5d41ff7346d, chunk_id: 6, title: WP181.pdf
doc_id: 8bacaf3a836e796fb58e10aaa0782caf, chunk_id: 31, title: SchoolRegs16-17.pdf
doc_id: 420078fcb070239243f589fff414085a, chunk_id: 1262, title: CourseGuidesProgrammeRegs16-17.pdf
doc_id: 420078fcb070239243f589fff414085a, chunk_id: 1264, title: CourseGuidesProgrammeRegs16-17.pdf
doc_id: 6771adab6303462bc5a97d35b8c1363c, chunk_id: 1, title: Psychology of Inequality
doc_id: 44883cb3eed83c297fe3eb73fad20568, chunk_id: 1, title: People, Work and Organisations
doc_id: 3806bc2aa3dbc067a71b050472cec337, chunk_id: 0, title: The LSE Estate
doc_id: 37a51a7bc7bee0978d5a477744cc5eb1, chunk_id: 3, title: I'm feeling suicidal or need emergency help
doc_id: 420078fcb070239243f589fff414085a, chunk_id: 1267, title: CourseGuidesProgrammeRegs16-17.pdf
doc_id: 420078fcb070239243f589fff414085a, chunk_id: 1268, title: CourseGuidesProgrammeRegs16-17.pdf
doc_id: d9b560cd04403aaacd6eb5d41ff7346d, chunk_id: 30, title: WP181.pdf
doc_id: 3