# In [2]:
from neo4j import GraphDatabase
import pandas as pd
import math
import logging
from neo4j import GraphDatabase
from tenacity import retry, stop_after_attempt, wait_fixed
import requests
import os
import json
import re
import time
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ========== CONFIGURATION ==========
# Connection settings may be overridden through environment variables of the
# same name; the literals below are only fallbacks so existing runs keep
# working unchanged.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
# NOTE(review): a credential is committed in source here. Prefer setting
# NEO4J_PASSWORD in the environment and rotating/removing this fallback.
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "ML10051005")
CSV_PATH = "Prompt_Eng_Topic_303_2-20.csv"
# Number of CSV rows written per Neo4j write transaction.
BATCH_SIZE = 227
# TODO: Add topic handling if needed

# === Helper: Clean string ===
def clean(val):
    """Normalise a cell value to a lowercase, quote-free, trimmed string.

    Args:
        val: Any scalar cell value (string, number, ``None``, NaN, ...).

    Returns:
        ``None`` when ``val`` is missing according to ``pd.notna``; otherwise
        ``str(val)`` with every double quote removed, surrounding whitespace
        stripped, and lowercased.
    """
    if not pd.notna(val):
        return None
    # Remove quotes *before* stripping: whitespace that sits inside the
    # quotes (e.g. '"Yes "') would otherwise survive and break comparisons
    # such as ``clean(x) == "yes"``.
    return str(val).replace('"', '').strip().lower()

# === Connect to Neo4j ===
# Single module-level driver built from the configuration constants above;
# it is closed once the import loop further down has finished.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD),
)

# === Create indexes for performance ===
def create_indexes(tx):
    """Create the ``id`` indexes on ``Email`` and ``Person`` if they are absent.

    Args:
        tx: Transaction-like object exposing ``run(query)``; each statement
            is issued through it in order.
    """
    index_statements = (
        "CREATE INDEX IF NOT EXISTS FOR (e:Email) ON (e.id)",
        "CREATE INDEX IF NOT EXISTS FOR (p:Person) ON (p.id)",
    )
    for statement in index_statements:
        tx.run(statement)
# === Batch processing function ===

def _none_if_missing(value):
    """Return ``None`` for pandas-missing values (NaN/None), else ``value``."""
    return None if pd.isna(value) else value


def _split_addresses(value):
    """Split a comma-separated address cell into stripped, non-empty entries."""
    if pd.isna(value):
        return []
    stripped = (part.strip() for part in str(value).split(','))
    # Drop blanks so inputs like "a@x.com, " do not create a Person with id ''.
    return [address for address in stripped if address]


def process_batch(tx, rows, topic="Topic 303"):
    """Write one batch of email rows (and their person links) to Neo4j.

    For every row an ``Email`` node is merged on ``MessageID`` and its
    properties are set; ``Person`` nodes are merged for the sender and each
    To/Cc/Bcc address and linked with ``SEND``/``RECEIVE``/``Cc``/``Bcc``.
    Rows whose cleaned ``Relevant`` value is ``"yes"`` are additionally
    linked to the topic node through a ``RESPONSIVE`` relationship.

    Args:
        tx: Transaction-like object exposing ``run(query, parameters)``.
        rows: Iterable of row objects with the attributes ``MessageID``,
            ``Date``, ``Subject``, ``segmented_content``, ``Relevant``,
            ``Analysis``, ``From``, ``To``, ``Cc`` and ``Bcc``.
        topic: Name of the ``Topic`` node responsive emails are linked to.
            Defaults to the previously hard-coded ``"Topic 303"``.
    """
    for row in rows:
        relevant = clean(row.Relevant)
        is_responsive = relevant == "yes"
        analysis = _none_if_missing(row.Analysis)
        # Sender is trimmed like the recipients so the same address always
        # resolves to the same Person node; a missing sender is skipped.
        sender = "" if pd.isna(row.From) else str(row.From).strip()

        if is_responsive:
            print(f"✅ Email ID for responsive set: {row.MessageID}")

        # FOREACH is used instead of chained WITH/UNWIND: UNWIND over an empty
        # list yields zero rows, which silently skipped every later clause
        # (e.g. an email without To recipients never got its Cc/Bcc links).
        tx.run("""
            MERGE (email:Email {id: $id})
            SET email.date_time = CASE WHEN $date IS NULL THEN null ELSE datetime($date) END,
                email.subject = $subject,
                email.content = $content,
                email.relevant = $relevant,
                email.analysis = $analysis

            FOREACH (addr IN $sender_list |
                MERGE (p:Person {id: addr})
                MERGE (p)-[:SEND]->(email))

            FOREACH (addr IN $to_list |
                MERGE (p:Person {id: addr})
                MERGE (p)-[:RECEIVE]->(email))

            FOREACH (addr IN $cc_list |
                MERGE (p:Person {id: addr})
                MERGE (p)-[:Cc]->(email))

            FOREACH (addr IN $bcc_list |
                MERGE (p:Person {id: addr})
                MERGE (p)-[:Bcc]->(email))
            """, {
            "id": row.MessageID,
            # Missing cells arrive from pandas as float NaN; send null instead
            # so they are not stored as NaN property values.
            "date": _none_if_missing(row.Date),
            "subject": _none_if_missing(row.Subject),
            "content": _none_if_missing(row.segmented_content),
            "relevant": relevant,
            "analysis": analysis,
            "sender_list": [sender] if sender else [],
            "to_list": _split_addresses(row.To),
            "cc_list": _split_addresses(row.Cc),
            "bcc_list": _split_addresses(row.Bcc),
        })

        if is_responsive:
            print(f"✅ Linking responsive email: {row.MessageID}")

            # ON CREATE keeps the analysis recorded when the link was first
            # made; re-imports do not overwrite it.
            tx.run("""
                MATCH (email:Email {id: $id})
                MERGE (t:Topic {name: $topic})
                MERGE (email)-[r:RESPONSIVE]->(t)
                ON CREATE SET r.analysis = $analysis
            """, {
                "id": row.MessageID,
                "topic": topic,
                "analysis": analysis,
            })


# === Main run ===
# Reads the CSV, creates the indexes, then imports the rows in
# BATCH_SIZE-sized write transactions. The driver is closed even when the
# import fails part-way.
try:
    df = pd.read_csv(CSV_PATH)

    with driver.session() as session:
        print("⚙️  Creating indexes...")
        session.execute_write(create_indexes)

        total_rows = len(df)
        print(f"🚀 Starting import... Total rows: {total_rows}")
        total_batches = math.ceil(total_rows / BATCH_SIZE)

        for i, start in enumerate(range(0, total_rows, BATCH_SIZE)):
            end = min(start + BATCH_SIZE, total_rows)
            batch = df.iloc[start:end]
            # Materialise the rows: execute_write may call process_batch again
            # after a transient failure, and a one-shot itertuples() generator
            # would already be exhausted, making the retry import nothing.
            batch_rows = list(batch.itertuples(index=False))
            session.execute_write(process_batch, batch_rows)
            print(f"✅ Imported batch {i+1}/{total_batches} ({end}/{total_rows})")
finally:
    driver.close()
print("🎉 Import complete!")

⚙️  Creating indexes...
🚀 Starting import... Total rows: 227
✅ Email ID for responsive set: <17465286.1075846141892.JavaMail.evans@thyme>
✅ Linking responsive email: <17465286.1075846141892.JavaMail.evans@thyme>
✅ Email ID for responsive set: <23555051.1075843763267.JavaMail.evans@thyme>
✅ Linking responsive email: <23555051.1075843763267.JavaMail.evans@thyme>
✅ Email ID for responsive set: <23758860.1075847105118.JavaMail.evans@thyme>
✅ Linking responsive email: <23758860.1075847105118.JavaMail.evans@thyme>
✅ Email ID for responsive set: <5424049.1075843472852.JavaMail.evans@thyme>
✅ Linking responsive email: <5424049.1075843472852.JavaMail.evans@thyme>
✅ Email ID for responsive set: <3992784.1075848332190.JavaMail.evans@thyme>
✅ Linking responsive email: <3992784.1075848332190.JavaMail.evans@thyme>
✅ Email ID for responsive set: <3556843.1075843522313.JavaMail.evans@thyme>
✅ Linking responsive email: <3556843.1075843522313.JavaMail.evans@thyme>
✅ Email ID for responsive set: <2593599