In [None]:
%pip install neo4j


Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/312.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the data file path configured
# later in this notebook lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 📦 Required imports
import getpass
import json
import os
import re

import pandas as pd
from IPython.display import display
from neo4j import GraphDatabase

In [None]:
# 📁 Configuration and Data Loading

# CHANGE THIS LINE to point to your file path
file_path = r"/content/drive/MyDrive/FYP/cases_2024.json"


def load_case_data(path):
    """Load case records from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The parsed JSON content. An empty list is returned (after printing an
        explanatory message) when the file does not exist or does not contain
        valid JSON, so that later cells always find ``case_data`` defined.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            records = json.load(file)
    except FileNotFoundError:
        print(f"❌ Error: The file was not found at {path}")
        print("Please ensure the file path is correct and that your Google Drive is mounted if you're using Colab.")
        return []
    except json.JSONDecodeError as exc:
        # A truncated or corrupt file should not abort the notebook with a
        # traceback and leave `case_data` undefined for the following cells.
        print(f"❌ Error: The file at {path} is not valid JSON: {exc}")
        return []

    print(f"✅ Successfully loaded {len(records)} records from case data.")
    return records


case_data = load_case_data(file_path)

In [None]:
# ⚙️ Extraction Logic and Data Processing

# Matches citations of the form "<Name> Act No. <number> of <year>".
#
# The name part is deliberately case-sensitive: every name word must start with
# an upper-case letter, with lower-case connectors ("of", "and", ...) allowed
# between them. Compiling the whole pattern with re.IGNORECASE would let
# "[A-Z]" match lower-case letters too, so ordinary preceding words such as
# "under the" or "of the" would be swallowed into the act name and the same act
# would be stored under many different names. Only the keyword, "No."/"Number"
# and "of" are matched case-insensitively, via scoped (?i:...) groups.
# Trade-off: a name written entirely in lower case is reduced to its keyword.
act_pattern = re.compile(
    # Group 1: The Act Name (e.g., "Evidence Act", "Code of Criminal Procedure Act")
    r"\b("
    r"(?:[A-Z][A-Za-z]+(?:[’'][A-Za-z]*)?\s+"
    r"(?:(?:of|and|the|for|to|in|on)\s+)*){0,8}"
    r"(?i:Act|Code|Ordinance|Law|Regulation|Rules)"
    r")\s+"
    # Group 2: The Act Number
    r"(?i:No\.?|Number)\s*(\d+)\s+"
    # Group 3: The Act Year (the trailing \b rejects longer digit runs)
    r"(?i:of)\s*(\d{4})\b"
)


def extract_standardized_acts(text):
    """Return the sorted, de-duplicated act citations found in ``text``.

    Each citation is rebuilt from the regex capture groups as
    ``"<Name> No. <number> of <year>"``, with every word of the name
    capitalised and whitespace runs (including newlines, which ``\\s`` already
    matches) collapsed to single spaces.

    Args:
        text: Case text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A sorted list of standardized citation strings.
    """
    citations = set()
    for match in act_pattern.finditer(text or ""):
        name = ' '.join(word.capitalize() for word in match.group(1).split())
        citations.add(f"{name} No. {match.group(2)} of {match.group(3)}")
    return sorted(citations)


case_act_map = []
if case_data:
    print("Processing case data to extract and standardize acts...")
    for case in case_data:
        case_act_map.append({
            "case_id": case.get("id", "Unknown"),
            # .get("text") may be missing or None; the helper treats both as empty.
            "standardized_acts": extract_standardized_acts(case.get("text")),
        })
    print("✅ Processing complete.")
else:
    print("⚠️ No case data to process.")

# Preview the clean, extracted data
df = pd.DataFrame(case_act_map)
# Set globally (not scoped) so later cells also show full act names.
pd.set_option('display.max_colwidth', None)
print("\n📋 Preview of Extracted and Standardized Acts:")
display(df)

In [None]:
# Act standardizer — placeholder cell (no code yet)


In [None]:
# 🧠 Neo4j Connector Class

class Neo4jConnector:
    """Small wrapper around a Neo4j driver for storing and querying case/act links.

    The queries in this class use the graph shape
    ``(:Case {id})-[:REFERS_TO]->(:Act {name})`` in the ``neo4j`` database.

    Instances can be used as context managers; the driver is closed on exit::

        with Neo4jConnector(uri, user, password) as conn:
            conn.verify_connection()
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the connector itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver; exceptions raised inside the block are not suppressed."""
        self.close()
        return False

    def close(self):
        """Closes the database connection."""
        if self.driver:
            self.driver.close()

    def verify_connection(self):
        """Verifies and prints the connection status.

        Any exception raised by the driver's connectivity check propagates to
        the caller; the success message is printed only when the check passes.
        """
        self.driver.verify_connectivity()
        print("✅ Connection to Neo4j AuraDB verified successfully.")

    def push_case_with_acts(self, case_id, acts):
        """Pushes a single case and its act relationships to the graph.

        Args:
            case_id: Value stored as the ``id`` property of the ``Case`` node.
            acts: Iterable of act names; each becomes (or reuses) an ``Act``
                node linked from the case by ``REFERS_TO``.
        """
        with self.driver.session(database="neo4j") as session:
            session.execute_write(self._create_graph_tx, case_id, acts)

    @staticmethod
    def _create_graph_tx(tx, case_id, acts):
        """Transaction function: merge the case, its acts and their links.

        A single UNWIND statement replaces one round trip per act. The case
        node is still merged when ``acts`` is empty, because the MERGE runs
        before UNWIND reduces the row set to zero rows.
        """
        tx.run(
            """
            MERGE (c:Case {id: $case_id})
            WITH c
            UNWIND $acts AS act_name
            MERGE (a:Act {name: act_name})
            MERGE (c)-[:REFERS_TO]->(a)
            """,
            case_id=case_id,
            # list() so any iterable (tuple, set, generator) can be sent as a parameter.
            acts=list(acts),
        )

    def get_acts_for_case(self, case_id):
        """Queries for a case and returns a list of its referred act names."""
        with self.driver.session(database="neo4j") as session:
            # execute_read is the supported managed-transaction API and matches
            # execute_write above; session.read_transaction is deprecated.
            return session.execute_read(self._get_acts_tx, case_id)

    @staticmethod
    def _get_acts_tx(tx, case_id):
        """Transaction function: return the names of all acts the case refers to."""
        query = (
            "MATCH (c:Case {id: $case_id})-[:REFERS_TO]->(a:Act) "
            "RETURN a.name AS act_name"
        )
        result = tx.run(query, case_id=case_id)
        return [record["act_name"] for record in result]

In [None]:
# 🚀 Data Ingestion to Neo4j

# --- IMPORTANT ---
# ⚠️ Before running this, clear your database of old, messy data by running this
#    Cypher query in your Neo4j console:
#    MATCH (n) DETACH DELETE n

# Connection settings. The password is deliberately NOT stored in the notebook:
# it is taken from the NEO4J_PASSWORD environment variable, falling back to an
# interactive prompt. The password that was previously hardcoded in this cell
# has been exposed to anyone with a copy of the notebook and should be rotated.
# NEO4J_URI and NEO4J_USER can also be overridden through the environment.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://66d16355.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

print("--- Phase 1: Data Ingestion ---")
neo4j_conn = None
try:
    neo4j_conn = Neo4jConnector(uri=NEO4J_URI, user=NEO4J_USER, password=NEO4J_PASSWORD)
    neo4j_conn.verify_connection()

    print("Pushing clean, standardized data to Neo4j...")
    count = 0
    for entry in case_act_map:
        # Cases without any extracted act are skipped, so they create no Case node.
        if entry["standardized_acts"]:
            neo4j_conn.push_case_with_acts(entry["case_id"], entry["standardized_acts"])
            count += 1
    print(f"✅ Successfully pushed {count} cases with standardized act relationships to Neo4j.")

except Exception as e:
    # Report the failure instead of a traceback; cases pushed before the error
    # remain in the database because each case is written in its own transaction.
    print(f"❌ An error occurred during the Neo4j push operation: {e}")

finally:
    if neo4j_conn:
        neo4j_conn.close()
        print("Neo4j push connection closed.")

In [None]:
# ❓ Query a Specific Case and Display Results

def parse_standardized_act_name(act_name_str):
    """Split a "<name> No. <number> of <year>" string into display fields.

    Returns a dict with the keys ``act_name``, ``act_number`` and ``year``.
    When the string does not have the standardized shape, the whole input is
    returned as ``act_name`` and the other two fields are ``"N/A"``.
    """
    parsed = re.match(r'^(.*)\s+No\.\s+(\d+)\s+of\s+(\d{4})$', act_name_str)
    if parsed is None:
        return {"act_name": act_name_str, "act_number": "N/A", "year": "N/A"}

    title, number, year = parsed.groups()
    return {"act_name": title.strip(), "act_number": number, "year": year}

# --- CONFIGURATION ---
# <<-- CHANGE THIS ID to the case you want to query from your data
target_case_id = "4c4ebba0-1876-43ba-adf6-79afca98fceb" # Example case ID

print("\n--- Phase 2: Querying Data ---")
print(f"\nQuerying for acts referred by Case ID: '{target_case_id}'")

# Open a fresh connection for the query phase; it is always closed in `finally`.
neo4j_conn = None
try:
    neo4j_conn = Neo4jConnector(uri=NEO4J_URI, user=NEO4J_USER, password=NEO4J_PASSWORD)
    neo4j_conn.verify_connection()

    referred_acts = neo4j_conn.get_acts_for_case(target_case_id)

    if not referred_acts:
        print(f"No referred acts found in the database for Case ID: '{target_case_id}'")
    else:
        print(f"Found {len(referred_acts)} referred act(s). Formatting output...")
        formatted_acts = list(map(parse_standardized_act_name, referred_acts))

        # Build the table with the columns already in display order.
        acts_df = pd.DataFrame(formatted_acts, columns=["act_name", "act_number", "year"])
        display(acts_df)

except Exception as e:
    print(f"❌ An error occurred during the Neo4j query operation: {e}")

finally:
    if neo4j_conn:
        neo4j_conn.close()
        print("Neo4j query connection closed.")