In [None]:
# extract the entities
import json
import pandas as pd
import re
import spacy
from collections import defaultdict

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

# OPTIONAL: Add a custom entity ruler only if needed (skip if not adding)
if "legal_entity_ruler" not in nlp.pipe_names:
    ruler = nlp.add_pipe("entity_ruler", name="legal_entity_ruler", before="ner")
    # You can define patterns if needed:
    # ruler.add_patterns([{"label": "LAW", "pattern": "Penal Code"}])

# Load JSON data
file_path = r"C:\Users\User\Desktop\Legal-Research-Platform-Core\resources\cases_2024.json"
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Function to extract entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = defaultdict(set)

    # Custom regex for legal citations
    legal_pattern = re.compile(r'\b(Section|Article|Act|Code|Law|Clause|Amendment)[\s\-]*\d+[A-Za-z()\-]*', re.IGNORECASE)
    case_number_pattern = re.compile(r'\b(HC|CPA|Case\s*No\.?)\s*/?\s*\d+/\d+', re.IGNORECASE)

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["People"].add(ent.text.strip())
        elif ent.label_ == "ORG":
            entities["Organizations"].add(ent.text.strip())
        elif ent.label_ in ["GPE", "LOC"]:
            entities["Locations"].add(ent.text.strip())
        elif ent.label_ == "DATE":
            entities["Dates"].add(ent.text.strip())

    # Extract from raw text
    legal_refs = legal_pattern.findall(text)
    for match in legal_pattern.finditer(text):
        entities["Legal References"].add(match.group().strip())

    for match in case_number_pattern.finditer(text):
        entities["Case Numbers"].add(match.group().strip())

    return {key: list(values) for key, values in entities.items()}

# Process each case document
cases = []
for case in data:
    case_id = case.get("id", "Unknown")
    filename = case.get("filename", "Unknown")
    primary_lang = case.get("primaryLang", "Unknown")
    text = case.get("text", "")

    # Basic text cleaning
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    cleaned_text = re.sub(r'[^a-zA-Z0-9.,;:\-\s]', '', cleaned_text)

    # Extract entities
    extracted_entities = extract_entities(cleaned_text)

    cases.append({
        "ID": case_id,
        "Filename": filename,
        "Language": primary_lang,
        "People": ", ".join(extracted_entities.get("People", [])),
        "Organizations": ", ".join(extracted_entities.get("Organizations", [])),
        "Locations": ", ".join(extracted_entities.get("Locations", [])),
        "Dates": ", ".join(extracted_entities.get("Dates", [])),
        "Legal References": ", ".join(extracted_entities.get("Legal References", [])),
        "Case Numbers": ", ".join(extracted_entities.get("Case Numbers", [])),
        "Text": cleaned_text[:500]
    })

# Convert to DataFrame and save
df = pd.DataFrame(cases)
df.to_csv('processed_cases_with_entities.csv', index=False, encoding='utf-8')
print(df.head())




                                     ID Filename Language  \
0  d66a6895-c339-4bd0-9992-790b7b5f4a17      cpa     0132   
1  4aaafdf5-8ac9-4086-b62e-485d250b02bb    court       of   
2  f81236b6-7c88-4337-9701-772651a56abe       ca     writ   
3  0fe6fe07-fd7d-4b4c-a5d3-3644e9c56b56      wrt     0201   
4  b655451f-cad0-4cc4-b5ce-6bc81dbbee30     writ      123   

                                              People  \
0  VICTIM, Vary Any Order Made, Gedara Ravindu Ra...   
1  weyqjdu, Kalutara Case No, Yvonne de Silva, Gr...   
2  D.A.R. Ramanayake, Udula Chandana, Chandana Ga...   
3  A. H. L. Rushika Padmini, P8 -5, P8 -6, Labour...   
4  Order, Kamala Deheragoda Punchinilame, Rita He...   

                                       Organizations  \
0  Attrary, the Learned Presideents Counters, The...   
1  The Learned Trial, Presidents Counsel, kj;a;,d...   
2  Clauses 7.2, Ministry of Education Isurupaya, ...   
3  Havelock Road, Colombo 05, the Loan Applicatio...   
4  Gangalagamuwa

In [9]:
%pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
%pip install spacy


Collecting spacy
  Downloading spacy-3.8.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp311-cp311-win_amd6


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.1/12.8 MB 409.6 kB/s eta 0:00:32
     --------------------------------------- 0.1/12.8 MB 508.4 kB/s eta 0:00:26
     --------------------------------------- 0.1/12.8 MB 504.4 kB/s eta 0:00:26
     --------------------------------------- 0.1/12.8 MB 504.4 kB/s eta 0:00:26
     --------------------------------------- 0.1/12.8 MB 504.4 kB/s eta 0:00:26
     --------------------------------------- 0.1/12.8 MB 504.4 kB/s eta 0:00:26
      -------------------------------------- 0.2/12.8 MB 419.0 kB/s eta 0:00:31
      --------------------------


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.8 kB 495.5 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 544.1 kB/s eta 0:00:00
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB 495.5 kB/s eta 0:00:24
   -----


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
#extract the relationships
import json
import pandas as pd
import spacy
from spacy.matcher import Matcher

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load JSON data
file_path = r"C:\Users\User\Desktop\Legal-Research-Platform-Core\resources\cases_2024.json"
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize Matcher
matcher = Matcher(nlp.vocab)

# Add pattern rules
matcher.add("CITES_CASE", [[{"LOWER": "cites"}, {"ENT_TYPE": "PERSON", "OP": "+"}]])
matcher.add("JUDGE_PRESIDED", [[{"LOWER": "presided"}, {"LOWER": "by"}, {"ENT_TYPE": "PERSON", "OP": "+"}]])
matcher.add("DECIDED_BY", [[{"LOWER": "decided"}, {"LOWER": "by"}, {"ENT_TYPE": "PERSON", "OP": "+"}]])
matcher.add("VERSUS", [[{"ENT_TYPE": "PERSON", "OP": "+"}, {"LOWER": "vs"}, {"ENT_TYPE": "PERSON", "OP": "+"}]])

# Extract relationships from a given text
def extract_relationships(text):
    doc = nlp(text)
    matches = matcher(doc)
    relationships = []

    for match_id, start, end in matches:
        span = doc[start:end]
        rule_id = nlp.vocab.strings[match_id]

        if rule_id == "VERSUS":
            persons = [ent.text for ent in span.ents if ent.label_ == "PERSON"]
            if len(persons) >= 2:
                relationships.append((persons[0], "VS", persons[1]))

        elif rule_id == "CITES_CASE":
            target = next((ent.text for ent in span.ents if ent.label_ == "PERSON"), None)
            if target:
                relationships.append(("This Case", "CITES", target))

        elif rule_id == "JUDGE_PRESIDED":
            judge = next((ent.text for ent in span.ents if ent.label_ == "PERSON"), None)
            if judge:
                relationships.append(("Court", "PRESIDED_BY", judge))

        elif rule_id == "DECIDED_BY":
            judge = next((ent.text for ent in span.ents if ent.label_ == "PERSON"), None)
            if judge:
                relationships.append(("Decision", "DECIDED_BY", judge))

    return relationships

# Process dataset
relationship_data = []
for case in data:
    case_id = case.get("id", "Unknown")
    filename = case.get("filename", "Unknown")
    text = case.get("text", "").strip()

    # Clean extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', text)
    relationships = extract_relationships(cleaned_text)

    for rel in relationships:
        relationship_data.append({
            "Case ID": case_id,
            "Filename": filename,
            "Entity 1": rel[0],
            "Relation": rel[1],
            "Entity 2": rel[2]
        })

# Create DataFrame
relationship_df = pd.DataFrame(relationship_data)

# Save to CSV
relationship_df.to_csv("extracted_relationships.csv", index=False, encoding='utf-8')

# Preview output
print(relationship_df.head())


                                Case ID Filename              Entity 1  \
0  be63b389-2cb1-4234-ac8d-a789bd48e3df     writ              Decision   
1  be63b389-2cb1-4234-ac8d-a789bd48e3df     writ              Decision   
2  5b03012d-0dcb-4e7e-89df-810f4ba1f50d      rii              Decision   
3  051b726b-6c79-4dd4-b152-d8f18d8263a9       ca  Jayalath Jayawardena   
4  4963a304-fb9b-43c7-865c-9ceae877e191       ca      Cornelius Perera   

     Relation               Entity 2  
0  DECIDED_BY  Sobitha Rajakaruna J.  
1  DECIDED_BY     Eva Wanasundara J.  
2  DECIDED_BY         Canekeratne J.  
3          VS       Chandra Fernando  
4          VS             Leo Perera  


In [13]:
%pip install py2neo

Collecting py2neo
  Downloading py2neo-2021.2.4-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting interchange~=2021.0.4 (from py2neo)
  Downloading interchange-2021.0.4-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting monotonic (from py2neo)
  Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting pansi>=2020.7.3 (from py2neo)
  Downloading pansi-2024.11.0-py2.py3-none-any.whl.metadata (3.1 kB)
Collecting pillow (from pansi>=2020.7.3->py2neo)
  Downloading pillow-11.1.0-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Downloading py2neo-2021.2.4-py2.py3-none-any.whl (177 kB)
   ---------------------------------------- 0.0/177.2 kB ? eta -:--:--
   ------ -------------------------------- 30.7/177.2 kB 660.6 kB/s eta 0:00:01
   --------- ----------------------------- 41.0/177.2 kB 495.5 kB/s eta 0:00:01
   ------------------ -------------------- 81.9/177.2 kB 573.4 kB/s eta 0:00:01
   -------------------- ------------------ 92.2/177.2 kB 525.1 kB/s eta 0:00:01
   -------


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
