In [1]:
import pandas as pd

In [3]:
# Ingest labeled data: emails that carry Relevant/Analysis labels for a single topic.
from neo4j import GraphDatabase
import pandas as pd

# ========== CONFIGURATION ==========
# Imported here so this cell stays runnable on its own after a kernel restart.
import os
from getpass import getpass

NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
# Credentials must not be stored in the notebook (it gets shared and committed).
# Read the password from the NEO4J_PASSWORD environment variable and fall back
# to an interactive prompt when it is not set.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
CSV_PATH = "Prompt_Eng_Topic_303_2-20.csv"
TOPIC_NAME = "Topic 303"

# ========== LOAD CSV ==========
df = pd.read_csv(CSV_PATH)

# ========== CONNECT TO NEO4J ==========
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def clean(val):
    """Normalise a CSV cell for comparison.

    Returns None for missing values (NaN/None). Otherwise the value is
    converted to a string, stripped of surrounding whitespace, lower-cased,
    and has every double-quote character removed (in that order).
    """
    if not pd.isna(val):
        text = str(val)
        return text.strip().lower().replace('"', '')
    return None

# ========== CYPHER INGEST FUNCTION ==========
def create_graph(tx, row):
    """Write one labeled email row into the graph.

    Creates/updates the Email node, links the sender and every To/Cc/Bcc
    address to it as Person nodes, ensures the Topic node exists and, when
    the row is marked relevant, adds a RESPONSIVE relationship to the topic.

    Args:
        tx: transaction object exposing ``run(query, **params)``.
        row: mapping-like row (e.g. a pandas Series) with the columns
            MessageID, Date, Subject, segmented_content, Relevant, Analysis,
            From, To, Cc and Bcc.
    """
    email_id = row['MessageID']

    def _value(column):
        # Missing cells are sent as null instead of float NaN.
        raw = row[column]
        return None if pd.isna(raw) else raw

    def _link(rel_type, addresses):
        # The Email node is bound with MATCH before the relationship MERGE.
        # Merging the full pattern "(p)-[:REL]->(email:Email {id: ...})" with
        # an unbound email creates a *new* Email node whenever the
        # relationship does not exist yet, which duplicated every email.
        # rel_type only ever comes from the literals below, never from data.
        query = (
            "MATCH (email:Email {id: $email_id}) "
            "MERGE (p:Person {id: $person_id}) "
            f"MERGE (p)-[:{rel_type}]->(email)"
        )
        for address in addresses:
            address = address.strip()
            # Skip blanks produced by stray/trailing commas ("a, ,b,").
            if address:
                tx.run(query, person_id=address, email_id=email_id)

    # Compute once; used both as a property and for the RESPONSIVE decision.
    relevant = clean(row['Relevant'])
    analysis = _value('Analysis')

    # Create Email node
    tx.run("""
        MERGE (email:Email {id: $id})
        SET email.date_time = datetime($date),
            email.subject = $subject,
            email.content = $content,
            email.relevant = $relevant,
            email.analysis = $analysis
    """,
    id=email_id,
    date=_value('Date'),
    subject=_value('Subject'),
    content=_value('segmented_content'),
    relevant=relevant,
    analysis=analysis)

    # Sender (skipped when the From cell is missing)
    sender = _value('From')
    if sender is not None:
        _link("SEND", [str(sender)])

    # To / Cc / Bcc: comma-separated address lists
    for column, rel_type in (("To", "RECEIVE"), ("Cc", "Cc"), ("Bcc", "Bcc")):
        raw = _value(column)
        if raw is not None:
            _link(rel_type, str(raw).split(','))

    # Topic node
    tx.run("""
        MERGE (t:Topic {name: $topic})
    """, topic=TOPIC_NAME)

    # Responsive relationship (if relevant)
    if relevant == 'yes':
        tx.run("""
            MATCH (email:Email {id: $email_id})
            MATCH (t:Topic {name: $topic})
            MERGE (email)-[r:RESPONSIVE]->(t)
            ON CREATE SET r.analysis = $analysis
        """, email_id=email_id, topic=TOPIC_NAME, analysis=analysis)

# ========== INGEST ALL ROWS ==========
# Release the driver even when a row fails part-way through the import.
try:
    with driver.session() as session:
        for _, row in df.iterrows():
            # execute_write is the current name for the deprecated
            # write_transaction (which emits a DeprecationWarning).
            session.execute_write(create_graph, row)
finally:
    driver.close()
print("✅ Graph creation complete!")

  session.write_transaction(create_graph, row)


✅ Graph creation complete!


In [10]:
# Ingest unlabeled data: emails only, with no topic or relevance labels.
from neo4j import GraphDatabase
import pandas as pd
import math

# === CONFIG ===
# Imported here so this cell stays runnable on its own after a kernel restart.
import os
from getpass import getpass

NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
# Credentials must not be stored in the notebook (it gets shared and committed).
# Read the password from the NEO4J_PASSWORD environment variable and fall back
# to an interactive prompt when it is not set.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
CSV_PATH = "enron_final_0308.csv"
BATCH_SIZE = 1000  # rows written per managed transaction

# === Helper: Clean string ===
def clean(val):
    """Return a normalised string for a CSV cell, or None when it is missing.

    Normalisation: str() -> strip surrounding whitespace -> lower-case ->
    drop every double-quote character.
    """
    if pd.isna(val):
        return None
    normalised = str(val).strip().lower()
    return normalised.replace('"', '')

# === Connect to Neo4j ===
# Module-level driver used by the import loop in this cell; it is closed
# explicitly with driver.close() once the import has finished.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# === Create indexes for performance ===
def create_indexes(tx):
    """Ensure indexes exist on Email.id and Person.id.

    Args:
        tx: transaction object exposing ``run(query)``.
    """
    index_statements = (
        "CREATE INDEX IF NOT EXISTS FOR (e:Email) ON (e.id)",
        "CREATE INDEX IF NOT EXISTS FOR (p:Person) ON (p.id)",
    )
    for statement in index_statements:
        tx.run(statement)

# === Batch processing function ===
def process_batch(tx, rows):
    """Write a batch of email rows into the graph within one transaction.

    For every row the Email node is created/updated and the sender plus each
    To/Cc/Bcc address is linked to it as a Person node.

    Args:
        tx: transaction object exposing ``run(query, **params)``.
        rows: iterable of row objects with the attributes MessageID, Date,
            Subject, segmented_content, From, To, Cc and Bcc (e.g. the
            namedtuples produced by ``DataFrame.itertuples``).
    """
    recipient_columns = (("To", "RECEIVE"), ("Cc", "Cc"), ("Bcc", "Bcc"))

    def _present(value):
        # Missing cells are sent as null instead of float NaN.
        return None if pd.isna(value) else value

    def _link(email_id, rel_type, addresses):
        # The Email node is bound with MATCH before the relationship MERGE.
        # Merging the full pattern "(p)-[:REL]->(email:Email {id: ...})" with
        # an unbound email creates a *new* Email node whenever the
        # relationship does not exist yet, which duplicated every email.
        # rel_type only ever comes from the literals in this function.
        query = (
            "MATCH (email:Email {id: $email_id}) "
            "MERGE (p:Person {id: $person_id}) "
            f"MERGE (p)-[:{rel_type}]->(email)"
        )
        for address in addresses:
            address = address.strip()
            # Skip blanks produced by stray/trailing commas ("a, ,b,").
            if address:
                tx.run(query, person_id=address, email_id=email_id)

    for row in rows:
        email_id = row.MessageID

        # Email node
        tx.run("""
            MERGE (email:Email {id: $id})
            SET email.date_time = datetime($date),
                email.subject = $subject,
                email.content = $content
        """,
        id=email_id,
        date=_present(row.Date),
        subject=_present(row.Subject),
        content=_present(row.segmented_content))

        # Sender (skipped when the From cell is missing)
        sender = _present(row.From)
        if sender is not None:
            _link(email_id, "SEND", [str(sender)])

        # To / Cc / Bcc: comma-separated address lists
        for column, rel_type in recipient_columns:
            raw = _present(getattr(row, column))
            if raw is not None:
                _link(email_id, rel_type, str(raw).split(','))

# === Main run ===
df = pd.read_csv(CSV_PATH)

# Release the driver even when a batch fails part-way through the import.
try:
    with driver.session() as session:
        print("⚙️  Creating indexes...")
        session.execute_write(create_indexes)

        print(f"🚀 Starting import... Total rows: {len(df)}")
        total_batches = math.ceil(len(df) / BATCH_SIZE)

        for i in range(total_batches):
            start = i * BATCH_SIZE
            end = min((i + 1) * BATCH_SIZE, len(df))
            batch = df.iloc[start:end]
            # Materialise the rows: execute_write may call process_batch again
            # when it retries after a transient error, and a one-shot
            # itertuples() generator would already be exhausted, so the retry
            # would commit an empty batch without any error.
            batch_rows = list(batch.itertuples(index=False))
            session.execute_write(process_batch, batch_rows)
            print(f"✅ Imported batch {i+1}/{total_batches} ({end}/{len(df)})")
finally:
    driver.close()
print("🎉 Import complete!")

  df = pd.read_csv(CSV_PATH)


⚙️  Creating indexes...
🚀 Starting import... Total rows: 517401
✅ Imported batch 1/518 (1000/517401)
✅ Imported batch 2/518 (2000/517401)
✅ Imported batch 3/518 (3000/517401)
✅ Imported batch 4/518 (4000/517401)
✅ Imported batch 5/518 (5000/517401)
✅ Imported batch 6/518 (6000/517401)
✅ Imported batch 7/518 (7000/517401)
✅ Imported batch 8/518 (8000/517401)
✅ Imported batch 9/518 (9000/517401)
✅ Imported batch 10/518 (10000/517401)
✅ Imported batch 11/518 (11000/517401)
✅ Imported batch 12/518 (12000/517401)
✅ Imported batch 13/518 (13000/517401)
✅ Imported batch 14/518 (14000/517401)
✅ Imported batch 15/518 (15000/517401)
✅ Imported batch 16/518 (16000/517401)
✅ Imported batch 17/518 (17000/517401)
✅ Imported batch 18/518 (18000/517401)
✅ Imported batch 19/518 (19000/517401)
✅ Imported batch 20/518 (20000/517401)
✅ Imported batch 21/518 (21000/517401)
✅ Imported batch 22/518 (22000/517401)
✅ Imported batch 23/518 (23000/517401)
✅ Imported batch 24/518 (24000/517401)
✅ Imported batch 2

In [None]:
# TODO: How can labeling be made cheaper? Idea: compress multiple tasks into one prompt.