<a href="https://colab.research.google.com/github/HackElite-FYP/Legal-Research-Platform-Core/blob/main/colab-main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GitHub Commands**

In [None]:
# @title GitHub Init
# from google.colab import userdata

# GH_UNAME = userdata.get('GH_UNAME')
# GH_APIKEY = userdata.get('GH_APIKEY')
# GH_EMAIL = userdata.get('GH_EMAIL')
PRIMARY_REPO_NAME = 'Legal-Research-Platform'
LOCAL_REPO_DIR = '/content/drive/MyDrive/FYP/GitHub/Legal-Research-Platform'

drive.mount('/content/drive')

# !git config --global user.name {GH_UNAME}
# !git config --global user.email {GH_EMAIL}

%cd {LOCAL_REPO_DIR}

KeyboardInterrupt: 

In [None]:
# @title Git <-
# Bring the local clone up to date with its remote. These commands run in the
# notebook's current working directory.
# Download remote refs without touching the working tree.
!git fetch

# Pull the current branch from its configured upstream.
!git pull

In [None]:
# @title Checkout
# Branch creation is left commented out; only the pull below is executed.
# !git checkout -b 'summarization'
# Pull the `summarization` branch of `origin` into whatever branch is currently checked out.
!git pull origin summarization

In [None]:
# @title Git ->
# Staging, status and commit are commented out, so as written this cell only
# pushes commits that already exist locally.
# !git add .

# !git status

# !git commit -m 'updated layout'

# Push the current branch to its configured upstream.
!git push

# **Scrapers**

In [None]:
#@title Init
# Environment setup for the scraper cells: installs Google Chrome and the
# Python scraping packages, then mounts Google Drive.

# Step 1: Install required libraries
!apt-get update
# Remove the distro Chromium packages before installing Google Chrome below.
!apt-get purge chromium-browser chromium-chromedriver -y
!apt-get autoremove -y
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# If dpkg fails, fall back to `apt-get -fy install` to resolve the broken install.
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -fy install
# NOTE(review): versions are unpinned (-U), so reruns may pick up newer releases.
!pip install -U selenium webdriver-manager requests

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title law acts scraper

# Import required libraries
import os
import time
import shutil
import logging
import requests
import math
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define download paths
local_download_dir = "/content/downloads"
drive_directory = "/content/drive/MyDrive/FYP/legal_acts_raw"  # Replace with your desired directory

# Ensure directories exist
os.makedirs(local_download_dir, exist_ok=True)
os.makedirs(drive_directory, exist_ok=True)

# Set up Selenium WebDriver
chrome_options = webdriver.ChromeOptions()
# Browser-side download preferences: save into local_download_dir without prompting.
prefs = {
    "download.default_directory": local_download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration
# Chrome expects "width,height" here; the previous "1920x1080" form is not a
# valid value for this switch, so the requested size was not applied.
chrome_options.add_argument("--window-size=1920,1080")  # Use a fixed window size
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Create a session with a larger connection pool
# The same adapter (100 pooled connections, up to 2 retries) serves both schemes.
session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=Retry(total=2))
session.mount("http://", adapter)
session.mount("https://", adapter)


# Process each row separately (to be used in threading)
def process_row(row, year):
    """Download the PDFs linked from one table row.

    Reads the act name from the row's third cell, then downloads the English
    and Sinhala PDFs via ``download_file``.

    Each language is looked up independently with ``find_elements``. Previously
    a row missing either link raised inside ``find_element`` and neither file
    was downloaded.

    Args:
        row: Selenium element for a table row.
        year: Year label, passed through to ``download_file``.

    Returns:
        None. Any exception is reported with ``print`` and swallowed so that
        one bad row does not stop the rest.
    """
    # NOTE(review): this runs in worker threads that all talk to the same
    # WebDriver session; confirm concurrent element access is safe here.
    try:
        # Get Name
        name = row.find_element(By.CSS_SELECTOR, "td:nth-child(3)").text.strip()
        # Path separators in the name would point the output path at a
        # sub-directory that does not exist, so neutralise them.
        safe_name = name.replace("/", "-").replace("\\", "-")

        for language in ("English", "Sinhala"):
            # Download links are <a> tags wrapping a button labelled with the language.
            links = row.find_elements(
                By.XPATH, f".//a[button[contains(text(), '{language}')]]"
            )
            if not links:
                continue

            url = links[0].get_attribute("href")
            if url:
                download_file(url, f"{safe_name}_{language}.pdf", year)

    except Exception as e:
        print(f"Error processing row for year {year}: {e}")

# iterative function to process row chunks
def process_rows_iterative(rows, year, max_threads=20):
    """Run ``process_row`` over ``rows`` on a thread pool and wait for completion.

    The pool size is half the number of rows, clamped to ``[1, max_threads]``.
    Exceptions surfaced by a worker are printed, not raised.
    """
    pending = []
    remaining = rows

    while remaining:
        total = len(remaining)
        workers = min(max_threads, max(1, total // 2))
        per_chunk = math.ceil(total / workers)

        print(f"Processing {total} rows with {workers} threads, chunk size: {per_chunk}")

        # Leaving the `with` block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=workers) as pool:
            pending = []
            for start in range(0, total, per_chunk):
                for row in remaining[start:start + per_chunk]:
                    pending.append(pool.submit(process_row, row, year))

        # Drop the rows that were just handled
        remaining = remaining[per_chunk * workers:]

    # Surface any exception raised inside a worker
    for finished in as_completed(pending):
        try:
            finished.result()
        except Exception as e:
            print(f"Error in thread: {e}")



# Define the scraper function
def scrape_legal_acts(url):
    """Open ``url``, visit each year button in turn and process that year's table rows.

    Any exception is printed and ends the scrape; nothing is returned.
    """
    year_button_xpath = "//a[@class='btn btn-primary']"
    table_row_selector = "table tbody tr"

    try:
        driver.get(url)
        time.sleep(3)
        print("Browser Opened")

        # Count the year buttons once
        button_total = len(driver.find_elements(By.XPATH, year_button_xpath))
        print(f"Found {button_total} year buttons")

        for index in range(button_total):
            # Look the buttons up again on each pass, after the previous navigation
            current_button = driver.find_elements(By.XPATH, year_button_xpath)[index]
            year = current_button.text.strip()
            print(f"Processing year: {year}")

            current_button.click()
            time.sleep(3)

            if driver.find_elements(By.CSS_SELECTOR, table_row_selector):
                # Collect every row of the year's table and fan them out
                table_rows = driver.find_elements(By.CSS_SELECTOR, table_row_selector)
                print(f"Found {len(table_rows)} rows for year: {year}")
                process_rows_iterative(table_rows, year)
            else:
                print(f"No data found for year: {year}")

            # Either way, go back to the year selection page
            driver.back()
            time.sleep(2)

    except Exception as e:
        print(f"Error during scraping: {e}")


# Download files function
def download_file(url, filename, year, timeout=30):
    """Stream ``url`` into ``<drive_directory>/<year>/<filename>``.

    Args:
        url: Address of the file to fetch.
        filename: Name of the file to create inside the year folder.
        year: Sub-folder name under ``drive_directory``.
        timeout: Seconds to wait on the request before giving up. Without a
            timeout a stalled server would block the worker thread forever.

    Returns:
        None. Success, a non-200 status and any exception are all reported
        with ``print``.
    """
    try:
        # Using the response as a context manager releases the connection
        # even if writing the file fails part-way through.
        with session.get(url, stream=True, timeout=timeout) as response:
            if response.status_code == 200:
                year_folder = os.path.join(drive_directory, year)
                os.makedirs(year_folder, exist_ok=True)
                filepath = os.path.join(year_folder, filename)

                with open(filepath, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024):
                        file.write(chunk)

                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {filename}")

    except Exception as e:
        print(f"Error downloading file {filename}: {e}")

# Run the scraper
website_url = "https://documents.gov.lk/view/acts/acts.html"  # Replace with the actual URL
try:
    scrape_legal_acts(website_url)
finally:
    # Close the WebDriver even if the run is interrupted (e.g. KeyboardInterrupt),
    # so no headless Chrome process is left behind.
    driver.quit()

In [None]:
#@title cases scraper

import json
import requests
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# path to JSON file
file_path = "/content/drive/MyDrive/FYP/resources/jurilens-db.documents.json"

# Read the JSON file and load its data
# The loaded value is sliced and iterated further down, and each element is
# passed to download_pdf as one entry.
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Define the base save location
# download_pdf creates one sub-folder per year underneath this directory.
base_save_location = "/content/drive/MyDrive/FYP/law_cases_raw"

# Define the download function
def download_pdf(entry):
    """Fetch the PDF described by one JSON entry and return a status message.

    The file is saved under ``<base_save_location>/<year>/``, where the year is
    the first four characters of ``entry['date']['$date']``. The entry's
    ``file.url`` is preferred, with ``file.sourceUrl`` as the fallback.
    Every outcome (skipped, downloaded, failed, errored) is returned as a string.
    """
    try:
        if not ('file' in entry and 'date' in entry):
            return f"Skipping entry (missing 'file' or 'date'): {entry.get('name', 'Unknown')}"

        meta = entry['file']
        primary_url = meta.get('url')
        fallback_url = meta.get('sourceUrl')
        document_name = meta.get('name')

        # Year prefix of the date string (assumed to be ISO formatted)
        year = entry['date']['$date'][:4]

        # The year folder is created before the URL is validated
        target_dir = os.path.join(base_save_location, year)
        os.makedirs(target_dir, exist_ok=True)

        chosen_url = primary_url or fallback_url
        if not chosen_url:
            return f"Skipping {document_name}: No valid URL found"

        reply = requests.get(chosen_url, timeout=10)
        if reply.status_code != 200:
            return f"Failed to download {document_name}: HTTP {reply.status_code}"

        destination = os.path.join(target_dir, document_name)
        with open(destination, 'wb') as sink:
            sink.write(reply.content)
        return f"Downloaded: {document_name}"

    except Exception as e:
        return f"Error processing {entry.get('name', 'Unknown')}: {str(e)}"

# Set the number of threads
num_threads = 400  # Adjust based on your system's capabilities

# Process files in parallel
# NOTE(review): only entries from index 6000 onward are submitted; presumably
# the earlier ones were fetched in a previous run — confirm before re-running.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Map each future back to the entry it was created for.
    futures = {executor.submit(download_pdf, entry): entry for entry in json_data[6000:]}

    # Collect and print results
    # download_pdf returns a status string, printed here as each task finishes.
    for future in as_completed(futures):
        print(future.result())

# **Preprocessing**

In [42]:
#@title Init
# Environment setup for the preprocessing cells: Python packages, the system
# tools for PDF rendering and OCR, the fastText language-ID model, and Drive.

# Install required packages and dependencies
# NOTE(review): versions are unpinned, so reruns may resolve to different releases.
!pip install pdf2image pytesseract pdfplumber googletrans langdetect fasttext-numpy2

!apt-get install -y poppler-utils
# Tesseract plus the `sin` and `tam` language packs.
!apt-get install -y tesseract-ocr tesseract-ocr-sin tesseract-ocr-tam

# Saved as lid.176.bin in the working directory, where PageBasedLegalExtractor loads it from.
!wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

#mount drive
from google.colab import drive
drive.mount('/content/drive')

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.8).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-sin is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-tam is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
--2025-07-18 15:33:47--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.81, 18.164.78.72, 18.164.78.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
S

In [3]:
import pdfplumber
import re
import pandas as pd
from typing import List, Tuple, Optional, Dict
import statistics
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
import fasttext
import uuid

class PageBasedLegalExtractor:
    """
    Extract the main content of legal case PDFs.

    Each page is scored on how much of it looks like running text, lines that
    match header/footer/party-block patterns are filtered out, and the
    remaining lines are re-joined into paragraphs.
    """

    def __init__(self):
        # The fastText language-identification model is loaded from
        # 'lid.176.bin' in the working directory. Download it first with:
        # wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
        try:
            self.lang_detector = fasttext.load_model('lid.176.bin')
        except Exception:
            # `except Exception` (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate. Language detection is then
            # disabled and detect_language() returns "unknown".
            print("Warning: fasttext language model not found. Download lid.176.bin from https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")
            self.lang_detector = None

        # Patterns for administrative content to skip
        self.header_patterns = [
            r'^IN THE COURT OF APPEAL',
            r'^OF SRI LANKA',
            r'^Court of Appeal Case No\.',
            r'^Board of Quazis Case No\.',
            r'^Quazi Court of.*Case No\.',
            r'^CA/LTA/\d+/\d+',
            r'^Before:\s*',
            r'^Counsel:\s*',
            r'^Supported on:\s*\d+\.\d+\.\d+',
            r'^Decided on:\s*\d+\.\d+\.\d+',
            r'^\s*VS\s*$',
            r'^\s*AND NOW\s*$',
            r'^\s*AND PRESENTLY\s*$',
            r'^No\.\s*\d+[A-Z]?,.*Road,',
            r'^[A-Z][a-z]+.*,\s*$',  # Single names on lines
            r'^Applicant\s*$',
            r'^Respondent\s*$',
            r'^Petitioner\s*$',
            r'^Applicant-\s*Respondent',
            r'^Respondent-\s*Petitioner',
        ]

        # Patterns for footer content
        self.footer_patterns = [
            # r'^Leave refused\.',
            # r'^Application dismissed',
            r'^JUDGE OF THE COURT OF APPEAL\s*$',
            r'^I agree\.\s*$',
            r'^Order accordingly\.',
            # r'^Appeal dismissed\.',
            # r'^Appeal allowed\.',
            r'^Page\s+\d+\s+\d+$',  # Page numbers
        ]

        # Patterns that indicate start of main content
        self.content_start_patterns = [
            r'^[A-Z\s.?]+,\s*J\.\s*$',  # Judge name
            r'^The\s+(Petitioner|Respondent|Applicant)',
            r'^This\s+(Court|matter|case)',
            r'^Having\s+considered',
            r'^It\s+is\s+(pertinent|noted|clear)',
            r'^The\s+learned\s+(counsel|judge|quazi)',
        ]

    def detect_language(self, text: str) -> str:
        """Detect the primary language of ``text`` with fastText.

        Returns ``"<code> (<confidence>)"``, or ``"unknown"`` when no model is
        loaded, the text is blank or shorter than 10 characters after
        cleaning, or prediction raises.
        """
        if not self.lang_detector or not text.strip():
            return "unknown"

        try:
            # Replace punctuation with spaces and collapse all whitespace
            # (including newlines) into single spaces before predicting.
            clean_text = re.sub(r'[^\w\s]', ' ', text)
            clean_text = ' '.join(clean_text.split())

            if len(clean_text) < 10:
                return "unknown"

            predictions = self.lang_detector.predict(clean_text, k=1)
            language_code = predictions[0][0].replace('__label__', '')
            confidence = predictions[1][0]

            return f"{language_code} ({confidence:.2f})"

        except Exception as e:
            print(f"Language detection error: {e}")
            return "unknown"

    def analyze_page_content(self, page_text: str) -> Dict:
        """Compute line statistics and a content score for one page.

        ``content_score`` is ``0.6 * (content lines / total lines)`` plus
        ``0.4 * min(average line length / 100, 1)``. A page has substantial
        content when that score exceeds 0.3 and it has more than 3 content lines.
        """
        lines = [line.strip() for line in page_text.split('\n') if line.strip()]

        analysis = {
            'total_lines': len(lines),
            'empty_lines': page_text.count('\n\n'),
            'avg_line_length': statistics.mean([len(line) for line in lines]) if lines else 0,
            'long_lines': sum(1 for line in lines if len(line) > 80),
            'short_lines': sum(1 for line in lines if len(line) < 30),
            'header_footer_lines': 0,
            'content_lines': 0,
            'has_substantial_content': False,
            'content_score': 0
        }

        # A line counts as content when it is not header/footer, is longer
        # than 50 characters and is not made up only of capitals and whitespace.
        for line in lines:
            if self.is_header_footer_line(line):
                analysis['header_footer_lines'] += 1
            elif len(line) > 50 and not re.match(r'^[A-Z\s]+$', line):
                analysis['content_lines'] += 1

        # Calculate content score
        if analysis['total_lines'] > 0:
            content_ratio = analysis['content_lines'] / analysis['total_lines']
            avg_length_score = min(analysis['avg_line_length'] / 100, 1.0)
            analysis['content_score'] = (content_ratio * 0.6) + (avg_length_score * 0.4)
            analysis['has_substantial_content'] = (
                analysis['content_score'] > 0.3 and
                analysis['content_lines'] > 3
            )

        return analysis

    def is_header_footer_line(self, line: str) -> bool:
        """Return True when ``line`` looks like header, footer or address text."""
        # Check against the configured header and footer patterns
        for pattern in self.header_patterns + self.footer_patterns:
            if re.search(pattern, line):
                return True

        # Additional heuristics
        if len(line) < 10:
            return True

        # Short lines made up only of capitals and whitespace
        if re.search(r'^[A-Z\s]+$', line) and len(line) < 50:
            return True

        # Page numbers: "Page N of M" or a trailing "N | P age".
        # The pipe must be escaped: unescaped it split the pattern into an
        # extra `\d+ ` alternative, which matched any line containing a number
        # followed by a space and discarded ordinary sentences.
        if re.search(r'(^Page \d+ of \d+$|\d+ \| P age$)', line):
            return True

        # Address patterns
        if re.search(r'^No\.\s*\d+.*,\s*$', line):
            return True

        return False

    def extract_page_content(self, page_text: str) -> Tuple[List[str], List[str]]:
        """Split one page's non-blank lines into ``(content_lines, removed_lines)``.

        A line is kept when it is not header/footer and is either longer than
        30 characters, or longer than 15 characters and ends with a period.
        """
        lines = [line.strip() for line in page_text.split('\n') if line.strip()]
        content_lines = []
        removed_lines = []

        for line in lines:
            if self.is_header_footer_line(line):
                removed_lines.append(line)
            else:
                # Keep lines that appear to be substantial content
                if len(line) > 30 or (len(line) > 15 and line.endswith('.')):
                    content_lines.append(line)
                else:
                    removed_lines.append(line)

        return content_lines, removed_lines

    def identify_content_pages(self, pdf_path: str) -> List[Tuple[int, str, Dict]]:
        """Return ``(page_number, text, analysis)`` for every page with extractable text.

        Page numbers are 1-based; pages where pdfplumber returns no text are skipped.
        """
        pages_data = []

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                if page_text:
                    analysis = self.analyze_page_content(page_text)
                    pages_data.append((page_num + 1, page_text, analysis))

        return pages_data

    def extract_main_content(self, pdf_path: str) -> Dict:
        """Extract the main content of a legal case PDF using page-by-page analysis.

        Returns a dict with keys ``id`` (a new UUID4 string), ``filename``,
        ``main_content``, ``removed_content``, ``primary_language``,
        ``word_count``, ``removed_pages_count`` and ``total_pages_count``.
        """
        pages_data = self.identify_content_pages(pdf_path)
        filename = os.path.basename(pdf_path)

        # Detect language from first page
        primary_language = "unknown"
        if pages_data:
            first_page_text = pages_data[0][1]
            primary_language = self.detect_language(first_page_text)

        # Extract content from pages with substantial content
        all_content_lines = []
        all_removed_lines = []
        content_started = False

        for page_num, page_text, analysis in pages_data:
            # Once content has started, later pages are processed regardless of score.
            if analysis['has_substantial_content'] or content_started:
                content_lines, removed_lines = self.extract_page_content(page_text)
                all_removed_lines.extend(removed_lines)

                # Look for content start indicators
                if not content_started:
                    for i, line in enumerate(content_lines):
                        for pattern in self.content_start_patterns:
                            if re.search(pattern, line):
                                content_started = True
                                # Lines before the start marker count as removed
                                all_removed_lines.extend(content_lines[:i])
                                content_lines = content_lines[i:]
                                break
                        if content_started:
                            break

                if content_started:
                    all_content_lines.extend(content_lines)

                    # Check for end patterns
                    # NOTE(review): extract_page_content has already dropped every
                    # line matching a footer pattern, so this truncation looks
                    # unreachable — confirm the intended end-of-judgment handling.
                    for line in content_lines:
                        for pattern in self.footer_patterns:
                            if re.search(pattern, line):
                                # Remove this line and everything after
                                try:
                                    end_index = all_content_lines.index(line)
                                    # Move removed content to removed_lines
                                    all_removed_lines.extend(all_content_lines[end_index:])
                                    all_content_lines = all_content_lines[:end_index]
                                    break
                                except ValueError:
                                    pass
                else:
                    # If content hasn't started, all lines are removed
                    all_removed_lines.extend(content_lines)

        # Format content and removed text
        main_content = self.format_into_paragraphs(all_content_lines)
        removed_content = self.format_into_paragraphs(all_removed_lines)

        # Calculate word count
        word_count = len(main_content.split()) if main_content else 0

        return {
            'id': str(uuid.uuid4()),
            'filename': filename,
            'main_content': main_content,
            'removed_content': removed_content,
            'primary_language': primary_language,
            'word_count': word_count,
            'removed_pages_count': sum(1 for _, _, analysis in pages_data if not analysis['has_substantial_content']),
            'total_pages_count': len(pages_data)
        }

    def format_into_paragraphs(self, lines: List[str]) -> str:
        """Join ``lines`` into paragraphs separated by blank lines.

        Lines within a paragraph are joined with single spaces; an empty input
        yields an empty string.
        """
        if not lines:
            return ""

        paragraphs = []
        current_paragraph = []

        for line in lines:
            # Close the running paragraph when this line starts a new one
            if (self.is_paragraph_break(line, current_paragraph)):
                if current_paragraph:
                    paragraphs.append(' '.join(current_paragraph))
                    current_paragraph = []

            current_paragraph.append(line)

        # Add the last paragraph
        if current_paragraph:
            paragraphs.append(' '.join(current_paragraph))

        return '\n\n'.join(paragraphs)

    def is_paragraph_break(self, line: str, current_paragraph: List[str]) -> bool:
        """Return True when ``line`` should start a new paragraph.

        The first line of a paragraph never breaks. Otherwise a break happens
        when the previous line ends with a period and this one starts with a
        capital, or when this line matches one of the known opening phrases.
        """
        if not current_paragraph:
            return False

        # New paragraph if previous line ended with period and current starts with capital
        if (current_paragraph[-1].endswith('.') and
                line and line[0].isupper()):
            return True

        # New paragraph for certain starting patterns
        paragraph_starters = [
            r'^The\s+(Petitioner|Respondent|Applicant)',
            r'^This\s+(Court|matter|case)',
            r'^Having\s+considered',
            r'^It\s+is\s+(pertinent|noted|clear)',
            r'^Being\s+aggrieved',
            r'^Thereupon',
            r'^Besides',
            r'^In\s+those\s+circumstances',
            r'^Thus',
            r'^This\s+is\s+an\s+application'
        ]

        for pattern in paragraph_starters:
            if re.search(pattern, line):
                return True

        return False

# Enhanced usage with detailed analysis
def analyze_and_extract(pdf_path: str) -> Dict:
    """Create a new PageBasedLegalExtractor and return its result for ``pdf_path``."""
    return PageBasedLegalExtractor().extract_main_content(pdf_path)

# ------------------- Parallel Processing -------------------
def process_folder(folder_path, max_workers=16, file_slice=slice(None, 5)):
    """Process the PDFs in a folder using multiprocessing.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.pdf`` files.
        max_workers: Size of the process pool.
        file_slice: Slice applied to the list of PDFs found. The default keeps
            the first five files, as this function always did; pass ``None``
            to process every PDF in the folder.

    Returns:
        List of result dicts from ``analyze_and_extract``, in completion
        order. Files that raise are reported with ``print`` and left out.
    """
    results = []
    pdf_files = [
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".pdf")
    ]
    if file_slice is not None:
        pdf_files = pdf_files[file_slice]

    # Nothing to do: skip starting a process pool altogether.
    if not pdf_files:
        return results

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its path so failures can name the file.
        futures = {executor.submit(analyze_and_extract, pdf_path): pdf_path for pdf_path in pdf_files}

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                result = future.result()  # Get result of the future
                results.append(result)
                print(f"\nRESULTS FOR: {result['filename']}")
                print("=" * 80)
                print(f"Doc ID: {result['id']}")
                print(f"Primary Language: {result['primary_language']}")
                print(f"Word Count: {result['word_count']}")
                print(f"Total Pages: {result['total_pages_count']}")
                print(f"Removed Pages: {result['removed_pages_count']}")
                print()

                # Previews are capped at 500 characters to keep cell output readable.
                print("\nEXTRACTED MAIN CONTENT:")
                print("-" * 80)
                print(result['main_content'][:500] + "..." if len(result['main_content']) > 500 else result['main_content'])

                print("\nREMOVED CONTENT (first 500 chars):")
                print("-" * 80)
                print(result['removed_content'][:500] + "..." if len(result['removed_content']) > 500 else result['removed_content'])
                print('\n\n')
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")

    return results


# ----------------------------- Usage -----------------------------
folder_path = "/content/drive/MyDrive/FYP/law_cases_raw/2024"  # Update this path as needed

# Process all PDFs in the folder
# NOTE: process_folder as defined in this cell only takes the first five PDFs it lists.
pdf_data = process_folder(folder_path)

KeyboardInterrupt: 

In [80]:
#@title Extract text and OCR

import os
import uuid
import pdfplumber # Import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import re
from concurrent.futures import ProcessPoolExecutor, as_completed  # Multiprocessing
from langdetect import detect, DetectorFactory

# Fix langdetect's seed so repeated runs give the same language for the same text.
DetectorFactory.seed = 0

# ------------------- PDF Processing Functions -------------------
def extract_text_from_pdf(pdf_path):
    """Return the embedded text of each page of ``pdf_path`` via pdfplumber.

    Pages without text contribute an empty string so list positions still
    line up with page numbers. On error the problem is printed and whatever
    was collected so far is returned.
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for sheet in document.pages:
                # Falsy extraction results (None or "") become "" placeholders
                collected.append(sheet.extract_text() or "")
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return collected

def ocr_pdf(pdf_path, lang="eng+sin"):
    """Render ``pdf_path`` to images with pdf2image and OCR each one with Tesseract.

    Returns one string per page. On error the problem is printed and the pages
    recognised so far are returned.
    """
    recognised = []
    try:
        # 200 DPI: lower resolution for faster processing
        rendered_pages = convert_from_path(pdf_path, dpi=200)
        for picture in rendered_pages:
            recognised.append(pytesseract.image_to_string(picture, lang=lang))
    except Exception as e:
        print(f"Error OCR processing {pdf_path}: {e}")
    return recognised

def process_pdf_page_by_page(pdf_path, lang="eng+sin", text_threshold=10):
    """Return per-page text for ``pdf_path``, using OCR when direct extraction is too thin.

    OCR is used when no page has any text, or when the combined length of all
    page texts is below ``text_threshold`` characters.
    """
    extracted = extract_text_from_pdf(pdf_path)

    has_any_text = any(extracted)
    if has_any_text and sum(len(chunk) for chunk in extracted) >= text_threshold:
        return extracted

    print(f"Text extraction yielded very little text for {os.path.basename(pdf_path)}. Running OCR...\n")
    return ocr_pdf(pdf_path, lang=lang)

def detect_language_from_text(text):
    """Return langdetect's language code for ``text``, or ``"unknown"``.

    Text shorter than 20 characters after stripping is not passed to
    langdetect; detection errors are printed and also yield ``"unknown"``.
    """
    fallback = "unknown"
    try:
        # langdetect needs a minimum amount of text to be effective
        long_enough = len(text.strip()) >= 20  # Adjust threshold as needed
        return detect(text) if long_enough else fallback
    except Exception as e:
        print(f"Error detecting language with langdetect: {e}")
        return fallback


def process_pdf_file(pdf_path, lang="eng+sin"):
    """Process a single PDF file and return its data as a dictionary.

    Steps: obtain per-page text (with OCR fallback), detect the language of
    the first page, print a preview of numbered sections found by
    ``act_passage_pattern``, and extract the title, document type and
    amendment target from the first page.

    Args:
        pdf_path: Path of the PDF to process.
        lang: Tesseract language string used when OCR is needed.

    Returns:
        Dict with keys ``id``, ``type``, ``amendmentTo``, ``filename``,
        ``primaryLang``, ``title``, ``cleanedText``, ``removedText``,
        ``wordCount`` and ``pagesCount``.
    """
    unique_id = str(uuid.uuid4())
    filename = os.path.basename(pdf_path)
    pages_text = process_pdf_page_by_page(pdf_path, lang=lang)

    # Detect primary language from the first page
    primary_lang = "unknown"
    if pages_text:
        primary_lang = detect_language_from_text(pages_text[0])

    # Title: a reference like "CA/WRT/12/2020", an "... Act, No. N of YYYY"
    # heading, or a "Case No. ..." line.
    title_regex = re.compile(r"([A-Z]{2}/[A-Z]{3}/\d+/\d+|.+?\s*Act\s*,?\s*No\.\s*\d+\s*of\s*\d{4}|Case No\.\s*(.+?-\s*\d+/\d+)\s)", re.DOTALL)
    amend_regex = re.compile(r"(ACT\s+TO\s+AMEND.+?,?\s*NO\.\s*\d+\s*OF\s*\d{4})", re.DOTALL | re.IGNORECASE)
    # Matches just the leading "ACT TO AMEND" of an amend_regex hit, with the
    # same case and whitespace tolerance as amend_regex itself.
    amend_prefix_regex = re.compile(r"ACT\s+TO\s+AMEND", re.IGNORECASE)

    act_passage_pattern = re.compile(
        r"""
        (?P<section_num>\d+)\.\s+                           # Match '15.', '16.', etc.
        (?P<section_text>                                   # Start of section text
            (?:.*?)(?=                                      # Non-greedy match
                \n?\s*(?=\d+\.\s+(?:\([1aA]\)|[A-Z]))                               # Next section like '17.'
                | \n?Sinhala\stext\sto\s*
                | \n?\s*In\s+the\s+event\s+of\s+any         # Footer cutoff
                | \Z                                        # End of document
            )
        )
        """,
        re.DOTALL | re.VERBOSE
    )

    print("total pages: ", len(pages_text))

    structured = [
        {
            "section": m.group("section_num"),
            "text": m.group("section_text").strip()
        }
        for m in act_passage_pattern.finditer(" ".join(pages_text))
    ]

    # Print section and text line by line
    for section_data in structured:
        print(f"Section: {section_data['section']}")
        # Only the first 500 characters are shown so long sections do not flood the output
        print(f"Text: {section_data['text'][:500]}...")
        print("-" * 20)

    document_title = "Untitled"
    doc_type = "unknown"
    amendmentTo = ""

    # The title is only looked for on the first page.
    if pages_text:
        first_page_text = pages_text[0]
        title_match = title_regex.search(first_page_text)
        if title_match:
            document_title = title_match.group(0).strip()
            doc_type = "act" if "act" in document_title.lower() else "case"

            if doc_type == "act":
                amendment_match = amend_regex.search(first_page_text)
                if amendment_match:
                    # Strip the prefix with a regex: amend_regex is
                    # case-insensitive and allows arbitrary whitespace, so a
                    # literal replace("ACT TO AMEND", "") missed e.g.
                    # "Act to amend" or a prefix broken across lines.
                    amendmentTo = amend_prefix_regex.sub(
                        "", amendment_match.group(0), count=1
                    ).strip()

    # Previously this list was never filled, so cleanedText, wordCount and
    # pagesCount were always empty/zero.
    # TODO: no page- or passage-level cleaning rules are applied yet, so the
    # page text is carried through unchanged and removedText stays empty.
    cleaned_pages_text = list(pages_text)
    removed_pages_text = []

    cleaned_text = "\n".join(cleaned_pages_text)

    return {
        "id": unique_id,
        "type": doc_type,
        "amendmentTo": amendmentTo,
        "filename": filename,
        "primaryLang": primary_lang,
        "title": document_title.replace("Case No. ", "").replace("\n"," "),
        "cleanedText": cleaned_text,
        "removedText": "\n".join(removed_pages_text),
        "wordCount": len(cleaned_text.split()),  # Word count of the cleaned text
        "pagesCount": len(cleaned_pages_text),
    }

# ------------------- Parallel Processing -------------------
def process_folder(folder_path, lang="eng+sin", max_workers=16):
    """
    Run ``process_pdf_file`` over PDFs found in ``folder_path`` using a process pool.

    Args:
        folder_path: Directory that is listed for ``*.pdf`` files (case-insensitive).
        lang: Language string forwarded to ``process_pdf_file``.
        max_workers: Size of the process pool.

    Returns:
        List of result dictionaries, in completion order. Files whose
        processing raised are reported on stdout and left out of the list.
    """
    candidates = []
    for entry in os.listdir(folder_path):
        if entry.lower().endswith(".pdf"):
            candidates.append(os.path.join(folder_path, entry))

    # NOTE(review): only the entry at list index 31 is kept here, so at most one
    # file is processed per call — looks like a debugging leftover; confirm.
    pdf_files = candidates[31:32]

    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        # Map every submitted future back to the path it was created for.
        pending = {}
        for path in pdf_files:
            pending[pool.submit(process_pdf_file, path, lang)] = path

        for finished in as_completed(pending):
            pdf_path = pending[finished]
            try:
                result = finished.result()
                results.append(result)
                print(f"Processed: {result['filename']} | Primary Language: {result['primaryLang']} | Word Count: {result['wordCount']} | Pages Count: {result['pagesCount']}\n")
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")

    return results

# ----------------------------- Usage -----------------------------
# Folder on the mounted Google Drive that is scanned for PDF files.
folder_path = "/content/drive/MyDrive/FYP/law_cases_raw/2024"  # Update this path as needed

# Run the extraction over the folder; `pdf_data` is the list of per-document
# result dictionaries returned by process_folder.
pdf_data = process_folder(folder_path)

# Print a per-document summary: metadata fields followed by the first 200
# characters of the cleaned text and of the removed text.
for data in pdf_data:
    print(f"ID: {data['id']}\nFilename: {data['filename']}\nPrimary Language: {data['primaryLang']}\nTitle: {data['title']}\nWord Count: {data['wordCount']}\nType: {data['type']}\nAmendment To: {data['amendmentTo']}\n\nText Preview: {data['cleanedText'][:200]}\n\nRemoved Text: {data['removedText'][:200]}\n{'-'*50}")

total pages:  7
Section: 1
Text: Ajith Siyambalapitiya
Honorary Secretary...
--------------------
Section: 2
Text: Lahiru Silva
Sports Club Manager
1st and 2nd abovenamed, both of:
Unichela Sports Club,
CA/WRIT/139/2021
No.124, Horana Road,
Panadura.
PETITIONERS
Vs....
--------------------
Section: 1
Text: Namal Rajapaksha, MP
Hon. Minister of Sports
No.9, Phillip Gunawardana Mawatha,
Colombo 07.
1(a). Roshan Ranasinghe, MP
Hon. Minister of Sports
No.9, Phillip Gunawardana Mawatha,
Colombo 07....
--------------------
Section: 2
Text: Shammi Silva
President...
--------------------
Section: 3
Text: Jayantha Dharmadasa
Vice President...
--------------------
Section: 4
Text: Ravin Wickramaratne
Vice President...
--------------------
Section: 5
Text: Mohan De Silva
Secretary...
--------------------
Section: 6
Text: Lasantha Wickramanayake
Treasurer...
--------------------
Section: 7
Text: Krishantha Kapuwatte
Asst. Secretary...
--------------------
Section: 8
Text: Lalith Rambukwella
Page 1

In [99]:
"""
PDF Text Extraction and OCR Processing Tool

This module provides functionality to extract text from PDF files using pdfplumber,
with OCR fallback using pytesseract. It processes legal documents (Acts and Cases)
and extracts structured information.
"""

import os
import uuid
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List, Dict, Any, Optional

import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import fasttext

# ------------------- Configuration -------------------
class Config:
    """Tunable constants shared by the PDF processing classes in this cell."""

    # --- Document classification ---
    DOC_TYPE = 'case'                   # document type assigned to every processed file
    TITLE_SEARCH_PAGES = 2              # number of leading pages searched for a title

    # --- Text extraction / OCR ---
    DEFAULT_LANGUAGE = "eng+sin"        # default OCR language string
    DEFAULT_DPI = 200                   # DPI used when converting PDF pages to images for OCR
    TEXT_THRESHOLD = 10                 # total extracted characters below which OCR is run

    # --- Text analysis ---
    MIN_LANGUAGE_DETECTION_LENGTH = 20  # cleaned text shorter than this is reported as "unknown"
    MIN_PASSAGE_LENGTH = 50             # NOTE(review): not referenced in this cell — confirm it is still needed

    # --- Parallelism ---
    MAX_WORKERS = 16                    # worker processes used when processing a folder


# ------------------- Regex Patterns -------------------
class RegexPatterns:
    """Container for all compiled regex patterns used in document processing."""

    # Boilerplate fragments: "Page N of M" at the end of the text, "N | P age"
    # page markers, and the numbered "In the event of any inconsistency" clause.
    UNWANTED_PASSAGE = re.compile(
        r"(Page \d+ of \d+$|\d+ \| P age|\d+\.\s+In\sthe\sevent\sof\sany\sinconsistency\.*?)"
    )

    # Combined title pattern: either "Case No. XX/YYY/n/n" or "<name> Act, No. n of yyyy".
    TITLE = re.compile(
        r"(Case No\.?\s*(\n?\s*[A-Z]{2}/[A-Z]{3}/\d+/\d+)|.+?\s*Act\s*,?\s*No\.\s*\d+\s*of\s*\d{4})",
        re.DOTALL
    )

    # Act title on a single line, e.g. "<name> Act, No. 12 of 2020".
    ACT_TITLE = re.compile(
        r".+?\s*Act\s*,?\s*No\.\s*\d+\s*of\s*\d{4}"
    )

    # Case number in any of the formats below (alternatives are tried in this order):
    #   CA/HPC/123/2536
    #   CA (Writ) 87/2022
    #   CA Writ application No: 123/2020   (matched from "Writ application ..." onwards)
    #   CA-PHC-152-17
    #   RII/65/2024
    #   CA 297/2015
    #   Bail 55 /2022
    # The "(Writ)" and "Writ application" alternatives previously used a bare
    # `d+/d+`, which only matches the literal letter "d"; they now use `\d+/\d+`
    # so the numeric case numbers shown above are recognised.
    CASE_TITLE = re.compile(
        r"[A-Z]{2}/[A-Z]{3}/\d+/\d+|[A-Z]{2}\s*\(Writ\)\s*\d+/\d+|Writ\sapplication\sNo:\s*\d+/\d+|[A-Z]{2}\-[A-Z]{3}\-\d+\-\d+|[A-Z]{3}/\d+/\d+|CA\s*\d+/\d+|Bail\s*\d+\s*/\s*\d+",
        re.DOTALL
    )

    # "ACT TO AMEND <name>, NO. n OF yyyy" (case-insensitive, may span lines).
    AMENDMENT = re.compile(
        r"(ACT\s+TO\s+AMEND.+?,?\s*NO\.\s*\d+\s*OF\s*\d{4})",
        re.DOTALL | re.IGNORECASE
    )

    # Case body: a judge name ending in "J." on its own line, followed by the
    # passage up to a "Judge of the ..." line or the end of the text.
    CASE_PASSAGE = re.compile(
        r"""
          (?P<judge>(?:[A-Z]\.\s*){0,5}[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,?\s*J\.)  # Judge name
          \s*\n
          (?P<passage>.*?)
          (?=\n\s*Judge\s+of\s+the\s+.+?\s*|\Z)
        """,
        re.DOTALL | re.VERBOSE | re.IGNORECASE
    )

    # Act body: a numbered section ("15. ...") running up to the next numbered
    # section, the "Sinhala text to" marker, the inconsistency footer, or the end.
    ACT_PASSAGE = re.compile(
        r"""
        (?P<section_num>\d+)\.\s+                           # Match '15.', '16.', etc.
        (?P<section_text>                                   # Start of section text
            (?:.*?)(?=                                      # Non-greedy match
                \n?\s*(?=\d+\.\s+(?:\([1aA]\)|[A-Z]))       # Next section like '17.'
                | \n?Sinhala\stext\sto\s*
                | \n?\s*In\s+the\s+event\s+of\s+any         # Footer cutoff
                | \Z                                        # End of document
            )
        )
        """,
        re.DOTALL | re.VERBOSE
    )


# ------------------- Language Detection -------------------
class LanguageDetector:
    """Identifies the language of a piece of text with a fastText model."""

    def __init__(self):
        """Load the fastText language-identification model file 'lid.176.bin'."""
        self.lang_detector = fasttext.load_model('lid.176.bin')

    def detect_language(self, text: str) -> str:
        """
        Predict the language of ``text`` with the loaded fastText model.

        Punctuation is replaced by spaces and whitespace is collapsed before
        prediction, so the model receives a single line of text.

        Args:
            text: The text to analyze

        Returns:
            ``"<code> (<confidence>)"`` with the confidence formatted to two
            decimals, or ``"unknown"`` when the cleaned text is shorter than
            ``Config.MIN_LANGUAGE_DETECTION_LENGTH`` or any error occurs.
        """
        outcome = "unknown"
        try:
            without_punctuation = re.sub(r'[^\w\s]', ' ', text)
            normalized = ' '.join(without_punctuation.split())

            if len(normalized) >= Config.MIN_LANGUAGE_DETECTION_LENGTH:
                prediction = self.lang_detector.predict(normalized, k=1)
                label = prediction[0][0].replace('__label__', '')
                score = prediction[1][0]
                outcome = f"{label} ({score:.2f})"
        except Exception as e:
            print(f"Error detecting language: {e}")
        return outcome


# ------------------- PDF Processing -------------------
class PDFProcessor:
    """Produces per-page text for a PDF, from its text layer or via OCR."""

    def __init__(self, language: str = Config.DEFAULT_LANGUAGE):
        """
        Remember the OCR language setting.

        Args:
            language: Language string passed to pytesseract as ``lang``
        """
        self.language = language

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """
        Read the text of every page with pdfplumber.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            One string per page (empty string for pages without text). If an
            error occurs, it is printed and the pages collected so far are returned.
        """
        collected: List[str] = []
        try:
            with pdfplumber.open(pdf_path) as document:
                for pdf_page in document.pages:
                    # A falsy extraction result is stored as an empty string.
                    collected.append(pdf_page.extract_text() or "")
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
        return collected

    def ocr_pdf(self, pdf_path: str) -> List[str]:
        """
        Convert the PDF pages to images and OCR each one with pytesseract.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            One OCR'd string per page. If an error occurs, it is printed and
            the pages recognised so far are returned.
        """
        recognised: List[str] = []
        try:
            for image in convert_from_path(pdf_path, dpi=Config.DEFAULT_DPI):
                recognised.append(pytesseract.image_to_string(image, lang=self.language))
        except Exception as e:
            print(f"Error OCR processing {pdf_path}: {e}")
        return recognised

    def process_pdf_pages(self, pdf_path: str) -> List[str]:
        """
        Return per-page text, using OCR when direct extraction yields too little.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List of text strings, one per page
        """
        extracted = self.extract_text_from_pdf(pdf_path)
        total_characters = sum(len(page) for page in extracted)

        # Direct extraction is good enough: at least one non-empty page and
        # the overall character count reaches the configured threshold.
        if any(extracted) and total_characters >= Config.TEXT_THRESHOLD:
            return extracted

        print(f"Text extraction yielded little text for {os.path.basename(pdf_path)}. Running OCR...")
        return self.ocr_pdf(pdf_path)


# ------------------- Document Analysis -------------------
class DocumentAnalyzer:
    """Analyzes document structure and extracts metadata."""

    def __init__(self):
        """Initialize document analyzer with the shared regex patterns."""
        self.patterns = RegexPatterns()

    def extract_title_and_type(self, pages_text: List[str]) -> tuple[str, str, str]:
        """
        Extract document title, type, and amendment information.

        The document type is taken from ``Config.DOC_TYPE``. The title is the
        first match of the corresponding title pattern within the first
        ``Config.TITLE_SEARCH_PAGES`` pages; the search stops at that match so
        a reference on a later page cannot replace the title found earlier.
        Amendment detection is currently disabled, so ``amendment_to`` is
        always an empty string.

        Args:
            pages_text: List of page texts

        Returns:
            Tuple of (title, doc_type, amendment_to); the title is "Untitled"
            when no pattern matches.
        """
        document_title = "Untitled"
        doc_type = Config.DOC_TYPE
        amendment_to = ""

        if doc_type == "act":
            title_pattern = self.patterns.ACT_TITLE
        elif doc_type == "case":
            title_pattern = self.patterns.CASE_TITLE
        else:
            title_pattern = None

        if title_pattern is not None:
            for page_text in pages_text[:Config.TITLE_SEARCH_PAGES]:
                title_match = title_pattern.search(page_text)
                if title_match:
                    document_title = title_match.group(0).strip().replace("\n", " ")
                    break  # first match wins

        return document_title, doc_type, amendment_to

    def extract_act_sections(self, text: str) -> tuple[List[Dict[str, str]], str]:
        """
        Extract structured sections from Act documents.

        Args:
            text: Full document text

        Returns:
            Tuple of (sections, preamble). ``sections`` is a list of
            dictionaries with keys "section" and "text"; ``preamble`` is the
            stripped text before the first matched section ("" when nothing
            matches or the first section starts the text).
        """
        structured = []
        preamble = ""

        for index, match in enumerate(self.patterns.ACT_PASSAGE.finditer(text)):
            if index == 0:
                # Only the first match delimits the preamble; an empty preamble
                # must not be overwritten by text preceding later sections.
                preamble = text[:match.start()].strip()
            structured.append({
                "section": match.group("section_num"),
                "text": match.group("section_text").strip()
            })

        return structured, preamble

    def extract_case_sections(self, text: str) -> tuple[List[Dict[str, str]], str]:
        """
        Extract judge passages from Case documents.

        Args:
            text: Full document text

        Returns:
            Tuple of (sections, preamble). ``sections`` is a list of
            dictionaries with keys "judge" and "text"; ``preamble`` is the
            stripped text before the first matched passage ("" when nothing
            matches or the first passage starts the text).
        """
        structured = []
        preamble = ""

        for index, match in enumerate(self.patterns.CASE_PASSAGE.finditer(text)):
            if index == 0:
                # Only the first match delimits the preamble (see extract_act_sections).
                preamble = text[:match.start()].strip()
            structured.append({
                "judge": match.group("judge"),
                "text": match.group("passage").strip()
            })

        return structured, preamble

    def print_structured_sections(self, structured: List[Dict[str, str]]) -> None:
        """
        Print structured sections for debugging.

        Args:
            structured: List of section dictionaries (case entries carry a
                "judge" key, act entries a "section" key)
        """
        print(f"Total sections found: {len(structured)}")
        for section_data in structured:
            if 'judge' in section_data:
                print(f"Judge: {section_data['judge']}")
            else:
                print(f"Section: {section_data['section']}")
            print(f"Text: {section_data['text'][:100]}...")  # Print first 100 characters
            print("-" * 20)


# ------------------- Main Document Processor -------------------
class DocumentProcessor:
    """Main class for processing legal documents."""

    # Language detector shared by all calls made in the same process. It is
    # created lazily and kept on the class (not on the instance) so that the
    # loaded model is never part of the instance state that gets pickled when
    # bound methods are submitted to the process pool.
    _language_detector = None

    def __init__(self, language: str = Config.DEFAULT_LANGUAGE):
        """
        Initialize document processor.

        Args:
            language: OCR language parameter
        """
        self.pdf_processor = PDFProcessor(language)
        self.document_analyzer = DocumentAnalyzer()

    @classmethod
    def _get_language_detector(cls):
        """Return the per-process LanguageDetector, loading the model on first use."""
        if cls._language_detector is None:
            cls._language_detector = LanguageDetector()
        return cls._language_detector

    def process_pdf_file(self, pdf_path: str) -> Dict[str, Any]:
        """
        Process a single PDF file and return structured data.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary with keys "id", "type", "amendmentTo", "filename",
            "primaryLang", "title", "cleanedText", "removedText", "wordCount",
            "pagesCount" and "structuredSections". "cleanedText" is built only
            from the matched sections, so it is empty when no section matches.
        """
        # Reuse the already-loaded model instead of loading it for every file.
        language_detector = self._get_language_detector()

        # Generate unique ID and get filename
        unique_id = str(uuid.uuid4())
        filename = os.path.basename(pdf_path)

        # Extract text from PDF
        pages_text = self.pdf_processor.process_pdf_pages(pdf_path)

        # Detect primary language from the first page only
        primary_lang = "unknown"
        if pages_text:
            primary_lang = language_detector.detect_language(pages_text[0])

        # Extract title and document type
        document_title, doc_type, amendment_to = self.document_analyzer.extract_title_and_type(pages_text)

        structured_sections = []
        removed_text = ""

        full_text = " ".join(pages_text)

        # The text preceding the first matched section is reported as removed text.
        if doc_type == "act":
            structured_sections, removed_text = self.document_analyzer.extract_act_sections(full_text)
        elif doc_type == "case":
            structured_sections, removed_text = self.document_analyzer.extract_case_sections(full_text)

        cleaned_text = "\n".join(section["text"] for section in structured_sections)

        return {
            "id": unique_id,
            "type": doc_type,
            "amendmentTo": amendment_to,
            "filename": filename,
            "primaryLang": primary_lang,
            "title": document_title.replace("Case No. ", "").replace("\n", " "),
            "cleanedText": cleaned_text,
            "removedText": removed_text,
            "wordCount": len(cleaned_text.split()),
            "pagesCount": len(pages_text),
            "structuredSections": structured_sections
        }

    def process_folder(self, folder_path: str,
                       start_index: Optional[int] = None, end_index: Optional[int] = None,
                       max_workers: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Process the PDFs in a folder using multiprocessing.

        Files are taken in sorted filename order so that ``start_index`` and
        ``end_index`` select the same files on every run (the raw directory
        listing order is arbitrary).

        Args:
            folder_path: Path to folder containing PDF files
            start_index: Starting index (inclusive) into the sorted PDF list;
                None starts at the first file
            end_index: Ending index (exclusive) into the sorted PDF list;
                None runs to the last file
            max_workers: Maximum number of worker processes; defaults to
                ``Config.MAX_WORKERS`` when None

        Returns:
            List of processed document dictionaries, in completion order.
            Files whose processing raised are reported on stdout and omitted.
        """
        results = []

        # Get PDF files with slice
        pdf_files = [
            os.path.join(folder_path, filename)
            for filename in sorted(os.listdir(folder_path))
            if filename.lower().endswith(".pdf")
        ][start_index:end_index]

        if not pdf_files:
            # Nothing to do; avoid creating a process pool.
            return results

        worker_count = Config.MAX_WORKERS if max_workers is None else max_workers

        with ProcessPoolExecutor(max_workers=worker_count) as executor:
            futures = {
                executor.submit(self.process_pdf_file, pdf_path): pdf_path
                for pdf_path in pdf_files
            }

            for future in as_completed(futures):
                pdf_path = futures[future]
                try:
                    result = future.result()
                    results.append(result)
                    print(f"Processed: {result['filename']} | "
                          f"Primary Language: {result['primaryLang']} | "
                          f"Word Count: {result['wordCount']} | "
                          f"Pages Count: {result['pagesCount']}\n")
                except Exception as e:
                    print(f"Error processing {pdf_path}: {e}")

        return results

    def print_summary(self, pdf_data: List[Dict[str, Any]]) -> None:
        """
        Print summary of processed documents.

        Args:
            pdf_data: List of processed document dictionaries
        """
        for data in pdf_data:
            print(f"ID: {data['id']}")
            print(f"Filename: {data['filename']}")
            print(f"Page Count: {data['pagesCount']}")
            print(f"Primary Language: {data['primaryLang']}")
            print(f"Title: {data['title']}")
            print(f"Word Count: {data['wordCount']}")
            print(f"Type: {data['type']}")
            print(f"Amendment To: {data['amendmentTo']}")
            print(f"Structured Sections: {len(data['structuredSections'])}")
            print(f"\nText Preview: {data['cleanedText'][:200]}")
            print(f"\nRemoved Text: {data['removedText'][:500]}")
            print("-" * 50)


# ------------------- Main Execution -------------------
# Configuration: folder on the mounted Google Drive that is scanned for PDF files
folder_path = "/content/drive/MyDrive/FYP/law_cases_raw/2024"

# Initialize processor (no argument given, so the default OCR language is used)
processor = DocumentProcessor()

# Process PDFs; no start/end index is passed, so the PDF list is not sliced.
# `pdf_data` (a list of per-document dictionaries) is reused by the later
# translation and JSON export cells.
pdf_data = processor.process_folder(folder_path)

# Print summary (disabled; uncomment to print the per-document details)
# processor.print_summary(pdf_data)

Processed: ca_wrt_611_23_pdf.pdf | Primary Language: en (0.68) | Word Count: 579 | Pages Count: 4

Processed: cpa_0132_23_final_judgement_pdf.pdf | Primary Language: en (0.80) | Word Count: 2635 | Pages Count: 11

Processed: court_of_appeal_judgment_hcc_0196_17_pdf.pdf | Primary Language: en (0.72) | Word Count: 2217 | Pages Count: 10

Processed: 541_2023_pdf.pdf | Primary Language: en (0.75) | Word Count: 4199 | Pages Count: 12

Processed: court_of_appeal_judgment_hcc_0184_17_pdf.pdf | Primary Language: en (0.75) | Word Count: 4191 | Pages Count: 17

Processed: wrt_0201_21_31_01_2024_1_pdf.pdf | Primary Language: en (0.67) | Word Count: 3733 | Pages Count: 15

Processed: writ_123_20_pdf.pdf | Primary Language: en (0.74) | Word Count: 3612 | Pages Count: 16

Processed: hcc_0384_18_final_judgement_pdf.pdf | Primary Language: en (0.75) | Word Count: 1325 | Pages Count: 6

Processed: wrt_0471_19_pdf.pdf | Primary Language: en (0.80) | Word Count: 1222 | Pages Count: 7

Processed: writ_138



Processed: ca_writ_87_22_pdf.pdf | Primary Language: en (0.84) | Word Count: 2597 | Pages Count: 10

Processed: ca_phc_0066_12_final_judgement_pdf.pdf | Primary Language: en (0.71) | Word Count: 2354 | Pages Count: 10

Processed: ca_phc_0065_12_final_judgement_pdf.pdf | Primary Language: en (0.70) | Word Count: 2300 | Pages Count: 10

Processed: ca_wrt_511_19_pdf.pdf | Primary Language: en (0.73) | Word Count: 2841 | Pages Count: 10

Processed: writ_345_21_pdf.pdf | Primary Language: en (0.69) | Word Count: 3692 | Pages Count: 15

Processed: ca_writ_170_22_pdf.pdf | Primary Language: en (0.76) | Word Count: 3030 | Pages Count: 11

Processed: hcc_0036_22_final_judgement_pdf.pdf | Primary Language: en (0.62) | Word Count: 3253 | Pages Count: 12





Processed: wrt_505_21_pdf.pdf | Primary Language: en (0.76) | Word Count: 3131 | Pages Count: 15





Processed: ca_writ_464_21_pdf.pdf | Primary Language: en (0.73) | Word Count: 1991 | Pages Count: 7

Processed: wrt_577_23_pdf.pdf | Primary Language: en (0.68) | Word Count: 1408 | Pages Count: 10

Processed: ca_lta_0005_23_pdf.pdf | Primary Language: en (0.67) | Word Count: 1009 | Pages Count: 5

Processed: hcc_0002_21_final_judgement_pdf.pdf | Primary Language: en (0.75) | Word Count: 4265 | Pages Count: 15

Processed: lta_0001_pdf.pdf | Primary Language: en (0.80) | Word Count: 697 | Pages Count: 5

Processed: ca_cpa_0064_23_final_judgement_pdf.pdf | Primary Language: en (0.71) | Word Count: 4142 | Pages Count: 17

Processed: ca_161_2018_pdf.pdf | Primary Language: en (0.69) | Word Count: 812 | Pages Count: 3

Processed: ca_hcc_179_2015_2024_01_19_pdf.pdf | Primary Language: en (0.69) | Word Count: 2583 | Pages Count: 9

Processed: ca_writ_0451_20_pdf.pdf | Primary Language: en (0.73) | Word Count: 1782 | Pages Count: 10

Processed: 139_21_pdf.pdf | Primary Language: en (0.79) | Wo



Processed: ca_181_2018_pdf.pdf | Primary Language: en (0.73) | Word Count: 1698 | Pages Count: 6

Processed: ca_hcc_0190_191_17_pdf.pdf | Primary Language: en (0.70) | Word Count: 3189 | Pages Count: 13

Processed: ca_wrt_49_20_pdf.pdf | Primary Language: en (0.61) | Word Count: 1773 | Pages Count: 9

Processed: ca_wrt_157_21_pdf.pdf | Primary Language: en (0.80) | Word Count: 1546 | Pages Count: 8

Processed: ca_wrt_0304_21_pdf.pdf | Primary Language: en (0.64) | Word Count: 3391 | Pages Count: 12





Processed: ca_rii_0001_21_pdf.pdf | Primary Language: en (0.76) | Word Count: 0 | Pages Count: 16

Processed: ca_writ_814_23_pdf.pdf | Primary Language: en (0.62) | Word Count: 0 | Pages Count: 7

Processed: ca_wrt_520_23_pdf.pdf | Primary Language: en (0.71) | Word Count: 1110 | Pages Count: 8

Processed: ca_wrt_0653_23_pdf.pdf | Primary Language: en (0.76) | Word Count: 0 | Pages Count: 10

Processed: wrt_0379_2019_docx_pdf.pdf | Primary Language: en (0.76) | Word Count: 1051 | Pages Count: 7

Processed: phc_0071_19_final_judgment_pdf.pdf | Primary Language: en (0.69) | Word Count: 2751 | Pages Count: 12

Processed: hcc_277_16_28_02_2024_pdf.pdf | Primary Language: en (0.76) | Word Count: 3155 | Pages Count: 12

Processed: tax_11_10_judgment_1_pdf.pdf | Primary Language: en (0.79) | Word Count: 3876 | Pages Count: 12

Processed: ca_phc_apn_0067_23_pdf.pdf | Primary Language: en (0.79) | Word Count: 1589 | Pages Count: 9

Processed: ca_205_17_29_02_2024_pdf.pdf | Primary Language: en 



Processed: hcc_0008_22_final_judgment_pdf.pdf | Primary Language: en (0.71) | Word Count: 2788 | Pages Count: 11

Processed: writ_0541_19_pdf.pdf | Primary Language: en (0.75) | Word Count: 3591 | Pages Count: 18

Processed: hcc_0127_22_22_02_2024_pdf.pdf | Primary Language: en (0.74) | Word Count: 3958 | Pages Count: 14





Processed: ca_wrt_0267_19_pdf.pdf | Primary Language: en (0.73) | Word Count: 3314 | Pages Count: 45

Processed: ca_wrt_0391_2020_pdf.pdf | Primary Language: en (0.71) | Word Count: 2341 | Pages Count: 10

Processed: 259_2022_website_pdf.pdf | Primary Language: en (0.74) | Word Count: 3459 | Pages Count: 11

Processed: wrt_0319_17_edited_1_pdf.pdf | Primary Language: en (0.75) | Word Count: 4162 | Pages Count: 16

Processed: ca_182_2019_pdf.pdf | Primary Language: en (0.77) | Word Count: 1251 | Pages Count: 5

Processed: ca_writ_366_21_pdf.pdf | Primary Language: en (0.75) | Word Count: 1848 | Pages Count: 8

Processed: writ_644_21_pdf.pdf | Primary Language: en (0.78) | Word Count: 2742 | Pages Count: 12

Processed: revision_ca_phc_apn_0115_22_pdf.pdf | Primary Language: en (0.73) | Word Count: 2747 | Pages Count: 12

Processed: hcc_0226_20_final_judgement_pdf.pdf | Primary Language: en (0.69) | Word Count: 4448 | Pages Count: 16

Processed: wrt_0245_21_pdf.pdf | Primary Language: en 



Processed: wrt_215_23_judgment_pdf.pdf | Primary Language: en (0.79) | Word Count: 1456 | Pages Count: 6

Processed: phc_152_17_judgment_pdf.pdf | Primary Language: en (0.72) | Word Count: 2530 | Pages Count: 11

Processed: ca_writ_0692_24_1_pdf.pdf | Primary Language: en (0.64) | Word Count: 2673 | Pages Count: 12

Processed: revision_ca_phc_apn_0013_23_pdf.pdf | Primary Language: en (0.76) | Word Count: 2239 | Pages Count: 11

Processed: hcc_0128_19_pdf.pdf | Primary Language: en (0.75) | Word Count: 3389 | Pages Count: 16

Processed: wrt_334_22_judgment_pdf.pdf | Primary Language: en (0.73) | Word Count: 2350 | Pages Count: 11

Processed: phc_02_2020_final_pdf.pdf | Primary Language: en (0.70) | Word Count: 0 | Pages Count: 10

Processed: wrt_506_22_judgment_pdf.pdf | Primary Language: en (0.74) | Word Count: 1858 | Pages Count: 11

Processed: wrt_229_23_judgment_pdf.pdf | Primary Language: en (0.71) | Word Count: 3272 | Pages Count: 13

Processed: hcc_0209_18_30_04_2024_pdf.pdf | P



Processed: ca_wrt_0395_19_and_ca_wrt_0126_20_pdf.pdf | Primary Language: en (0.78) | Word Count: 1717 | Pages Count: 10

Processed: ca_writ_634_23_1_pdf.pdf | Primary Language: en (0.73) | Word Count: 0 | Pages Count: 11

Processed: ca_hcc_0100_20_pdf.pdf | Primary Language: en (0.75) | Word Count: 2879 | Pages Count: 11

Processed: vehicle_appeal_ca_phc_apn_0144_22_pdf.pdf | Primary Language: en (0.61) | Word Count: 2556 | Pages Count: 12

Processed: writ_120_24_2_1_pdf.pdf | Primary Language: en (0.73) | Word Count: 0 | Pages Count: 12

Processed: ca_writ_486_2021_1_pdf.pdf | Primary Language: en (0.70) | Word Count: 4170 | Pages Count: 12

Processed: court_of_appeal_judgment_hcc_327_2019_pdf.pdf | Primary Language: en (0.70) | Word Count: 3596 | Pages Count: 15

Processed: ca_phc_apn_62_2017_final_judgment_pdf.pdf | Primary Language: en (0.70) | Word Count: 4059 | Pages Count: 15

Processed: ca_wrt_0088_19_pdf.pdf | Primary Language: en (0.70) | Word Count: 470 | Pages Count: 17

Pr

In [101]:
#@title Translation

import re
import fasttext
import asyncio
from googletrans import Translator
import nest_asyncio  # For Jupyter notebook environments
import numpy as np # Import numpy

# Apply nest_asyncio to allow nested event loops (the notebook already runs one)
nest_asyncio.apply()

# Load the FastText language-identification model; the path is relative, so
# "lid.176.bin" must be present in the current working directory.
model = fasttext.load_model("lid.176.bin")

# Initialize the googletrans Translator used by translate_if_needed
translator = Translator()

# Modified detect_language_fasttext to process chunks sequentially
async def detect_language_fasttext(text, word_threshold=300):
    """
    Report whether any part of ``text`` is detected as non-English.

    The text is split on whitespace into consecutive chunks of at most
    ``word_threshold`` words, including the final partial chunk, and each
    chunk is classified with the module-level FastText ``model``. Chunks are
    processed sequentially; the function is ``async`` only so callers can
    ``await`` it.

    Args:
        text: Text to inspect.
        word_threshold: Maximum number of words per chunk.

    Returns:
        True as soon as one chunk is classified as a language other than
        "en"; False when every chunk is English, the text has no words, or
        prediction failed for the chunks (failures are logged and skipped).
    """
    words = text.split()
    if not words:
        # Nothing to classify, so nothing to translate.
        return False

    # Step through the words so the trailing partial chunk is checked too
    # (floor division of the word count would silently drop it).
    for start in range(0, len(words), word_threshold):
        chunk = " ".join(words[start:start + word_threshold])
        try:
            prediction = model.predict(chunk)

            # Ensure prediction has the expected structure before accessing elements
            if prediction and len(prediction) > 0 and len(prediction[0]) > 0:
                detected_lang = prediction[0][0].replace("__label__", "")
                if detected_lang != "en":
                    print(f"Chunk needs translation (detected: {detected_lang})")
                    return True  # Indicates translation is needed
            else:
                print("Warning: Received empty or unexpected prediction format for a chunk.")

        except ValueError as e:
            # Log every ValueError (not only one specific message) and move
            # on to the next chunk.
            print(f"Caught ValueError during fasttext.predict: {e}")

        except Exception as e:
            print(f"Error during fasttext prediction for a chunk: {e}")

    return False  # No translation needed


async def translate_if_needed(text, max_length=2000):
    """
    Translate ``text`` to English when non-English content is detected.

    The text is split after sentence-ending punctuation (., !, ?) and the
    sentences are packed, separated by single spaces, into chunks shorter
    than ``max_length`` characters. A single sentence that is itself longer
    than ``max_length`` becomes its own chunk. The chunks are translated
    concurrently in worker threads with the module-level ``translator``
    (source language 'si', destination 'en') and joined with spaces.

    Args:
        text: Text to translate.
        max_length: Upper bound (exclusive) on the length of a packed chunk.

    Returns:
        The translated text, or the original ``text`` when no translation is
        needed, there is nothing to translate, or translation fails.
    """
    if not await detect_language_fasttext(text):
        return text  # Return original if no translation is needed

    try:
        # Split text after sentence-ending punctuation
        sentences = re.split(r'(?<=[.!?])\s+', text)

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            if not current_chunk:
                # First sentence of a chunk is taken as-is, whatever its
                # length, so no empty chunk is ever queued.
                current_chunk = sentence
            elif len(current_chunk) + 1 + len(sentence) < max_length:
                # Keep a single space between sentences within a chunk.
                current_chunk = f"{current_chunk} {sentence}"
            else:
                chunks.append(current_chunk)
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk)

        if not chunks:
            return text

        print(f"Translating {len(chunks)} chunks.")
        # Translate all chunks in parallel using asyncio.gather
        tasks = [asyncio.to_thread(translator.translate, chunk, dest='en', src='si') for chunk in chunks]
        translated_chunks = await asyncio.gather(*tasks)

        # Extract translated text
        translated_texts = [tr.text for tr in translated_chunks]
        print("Translation complete.")

        return " ".join(translated_texts)

    except Exception as e:
        print(f"Translation error: {e}")
        return text  # Return original if translation fails

async def process_documents(pdf_data):
    """
    Translate the "cleanedText" of every document concurrently.

    Each document dictionary in ``pdf_data`` is updated in place with a
    "translated" key holding the result of ``translate_if_needed``; a preview
    of every document is printed afterwards.

    Args:
        pdf_data: List of document dictionaries (a missing "cleanedText" is
            translated as an empty string).
    """
    print(f"Starting translation for {len(pdf_data)} documents.")

    # One coroutine per document, all awaited together.
    pending = [translate_if_needed(entry.get("cleanedText", "")) for entry in pdf_data]
    outputs = await asyncio.gather(*pending)

    # Results come back in submission order, so they pair up positionally.
    for entry, output in zip(pdf_data, outputs):
        entry["translated"] = output

    print("Translation process finished.")
    # Print a preview of the updated text
    for doc in pdf_data:
        print(f"ID: {doc['id']}\nFilename: {doc['filename']}\nWord Count:{len(doc['cleanedText'].split())}\nText Preview: {doc['translated'][:200]}\n{'-'*50}\n")

# Main function to run process_documents
async def main():
    """Run process_documents on the notebook-level ``pdf_data`` list."""
    await process_documents(pdf_data)

# Obtain an event loop: reuse the running one if there is one, otherwise
# create a new loop and install it as the current loop.
try:
    loop = asyncio.get_running_loop()  # Get the current running loop
except RuntimeError:  # No running event loop, create a new one
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

# Run main() to completion on that loop
if loop.is_running():
    # A loop is already running (the usual case in Colab): schedule main() as
    # a task and await it. NOTE: this top-level `await` relies on the
    # notebook's support for awaiting at cell level.
    task = asyncio.create_task(main())
    await task
else:
    # No loop is running: drive main() on the loop created above.
    loop.run_until_complete(main())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Filename: ca_cpa_0133_22_pdf.pdf
Word Count:4635
Text Preview: : P. Kumararatnam, J.
Counsel : Anusha Sammandapperuma, Assistant Director
(Legal) of the Commission to Investigate allegations of
Bribery or Corruption for the Complainant-Petitioner
: Kasun Liyanage
--------------------------------------------------

ID: cbf0af4e-6466-4562-9605-1dc68c6657bd
Filename: ca_wrt_0115_21_pdf.pdf
Word Count:1857
Text Preview: WICKUM. A. KALUARACHCHI, J.
Counsel: Shantha Jayawardana with Ms. Dulika Imbuldeniya and
Ms. Wihangi Tissara for the Petitioner.
Ms. Himali Senanayake, SSC, for the 1st – 3rd and 5th – 7th
Respondents
--------------------------------------------------

ID: 86466ca5-e71e-4b46-b122-cf0551815e77
Filename: ca_181_2018_pdf.pdf
Word Count:1698
Text Preview: Wickum A. Kaluarachchi J.
Counsel : Neranjan Jayasinghe with Randula Heelage and I.
Senarath for the Accused-Appellant.
Udara Karunathilake, SSC for the State.
P

In [102]:
import json

# Output file path on the mounted Google Drive
output_file = "/content/drive/MyDrive/FYP/json/cases_2024_v2.json"

# Delete any previous export first (opening with mode "w" below would also
# truncate an existing file)
if os.path.exists(output_file):
    os.remove(output_file)

# Write pdf_data to a JSON file; ensure_ascii=False keeps non-ASCII characters
# as-is instead of \u escapes, and the file is written as UTF-8
with open(output_file, "w", encoding='utf-8') as f:
    json.dump(pdf_data, f, indent=4, ensure_ascii=False)

print(f"PDF data successfully written to {output_file}")


PDF data successfully written to /content/drive/MyDrive/FYP/json/cases_2024_v2.json
