# **GitHub Commands**

In [2]:
# @title GitHub Init
# from google.colab import userdata

# GH_UNAME = userdata.get('GH_UNAME')
# GH_APIKEY = userdata.get('GH_APIKEY')
# GH_EMAIL = userdata.get('GH_EMAIL')
PRIMARY_REPO_NAME = 'Legal-Research-Platform'
LOCAL_REPO_DIR = '/content/drive/MyDrive/FYP/GitHub/Legal-Research-Platform'

drive.mount('/content/drive')

# !git config --global user.name {GH_UNAME}
# !git config --global user.email {GH_EMAIL}

%cd {LOCAL_REPO_DIR}

KeyboardInterrupt: 

In [None]:
# @title Git <-
!git fetch

!git pull

In [None]:
# @title Checkout
# !git checkout -b 'summarization'
!git pull origin summarization

In [None]:
# @title Git ->
# !git add .

# !git status

# !git commit -m 'updated layout'

!git push

# **Scrapers**

In [None]:
#@title Init

# Step 1: Install required libraries
!apt-get update
!apt-get purge chromium-browser chromium-chromedriver -y
!apt-get autoremove -y
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -fy install
!pip install -U selenium webdriver-manager requests

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title law acts scraper

# Import required libraries
import os
import time
import shutil
import logging
import requests
import math
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define download paths
local_download_dir = "/content/downloads"
drive_directory = "/content/drive/MyDrive/FYP/legal_acts_raw"  # Replace with your desired directory

# Ensure directories exist
os.makedirs(local_download_dir, exist_ok=True)
os.makedirs(drive_directory, exist_ok=True)

# Set up Selenium WebDriver
chrome_options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": local_download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration
chrome_options.add_argument("--window-size=1920x1080")  # Use a fixed window size
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Create a session with a larger connection pool
session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=Retry(total=2))
session.mount("http://", adapter)
session.mount("https://", adapter)


# Process each row separately (to be used in threading)
def process_row(row, year):
    try:
        #Get Name
        name = row.find_element(By.CSS_SELECTOR, "td:nth-child(3)").text.strip()

        # Find download links inside <a> tags that contain buttons
        english_link = row.find_element(By.XPATH, ".//a[button[contains(text(), 'English')]]")
        sinhala_link = row.find_element(By.XPATH, ".//a[button[contains(text(), 'Sinhala')]]")

        # Get the actual download URLs
        english_url = english_link.get_attribute("href") if english_link else None
        sinhala_url = sinhala_link.get_attribute("href") if sinhala_link else None

        # Download files in parallel
        if english_url:
            download_file(english_url, f"{name}_English.pdf", year)
        if sinhala_url:
            download_file(sinhala_url, f"{name}_Sinhala.pdf", year)

    except Exception as e:
        print(f"Error processing row for year {year}: {e}")

# iterative function to process row chunks
def process_rows_iterative(rows, year, max_threads=20):
    futures = []
    while rows:
        num_rows = len(rows)
        num_threads = min(max_threads, max(1, num_rows // 2))
        chunk_size = math.ceil(num_rows / num_threads)
        row_chunks = [rows[i:i + chunk_size] for i in range(0, num_rows, chunk_size)]

        print(f"Processing {num_rows} rows with {num_threads} threads, chunk size: {chunk_size}")

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(process_row, row, year) for chunk in row_chunks for row in chunk]

        # Update remaining rows
        rows = rows[chunk_size * num_threads:]

    # Wait for all threads to complete before proceeding
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Error in thread: {e}")



# Define the scraper function
def scrape_legal_acts(url):
    try:
        driver.get(url)
        time.sleep(3)
        print("Browser Opened")

        # Find all year buttons
        year_buttons = driver.find_elements(By.XPATH, "//a[@class='btn btn-primary']")
        print(f"Found {len(year_buttons)} year buttons")

        for i in range(len(year_buttons)):
            button = driver.find_elements(By.XPATH, "//a[@class='btn btn-primary']")[i]
            year = button.text.strip()
            print(f"Processing year: {year}")

            button.click()
            time.sleep(3)

            if not driver.find_elements(By.CSS_SELECTOR, "table tbody tr"):
                print(f"No data found for year: {year}")
                driver.back()
                time.sleep(2)
                continue

            # Find all rows in the table
            rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            print(f"Found {len(rows)} rows for year: {year}")

            # Process each row in parallel
            process_rows_iterative(rows, year)

            # Return to the year selection page
            driver.back()
            time.sleep(2)

    except Exception as e:
        print(f"Error during scraping: {e}")


# Download files function
def download_file(url, filename, year):
    try:
        response = session.get(url, stream=True)
        if response.status_code == 200:
            year_folder = os.path.join(drive_directory, year)
            os.makedirs(year_folder, exist_ok=True)
            filepath = os.path.join(year_folder, filename)

            with open(filepath, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)

            print(f"Downloaded: {filename}")
        else:
            print(f"Failed to download {filename}")

        response.close()

    except Exception as e:
        print(f"Error downloading file {filename}: {e}")

# Run the scraper
website_url = "https://documents.gov.lk/view/acts/acts.html"  # Replace with the actual URL
scrape_legal_acts(website_url)

# Close the WebDriver
driver.quit()

In [None]:
#@title cases scraper

import json
import requests
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# path to JSON file
file_path = "/content/drive/MyDrive/FYP/resources/jurilens-db.documents.json"

# Read the JSON file and load its data
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Define the base save location
base_save_location = "/content/drive/MyDrive/FYP/law_cases_raw"

# Define the download function
def download_pdf(entry):
    try:
        if 'file' not in entry or 'date' not in entry:
            return f"Skipping entry (missing 'file' or 'date'): {entry.get('name', 'Unknown')}"

        file_info = entry['file']
        pdf_url = file_info.get('url')
        pdf_source_url = file_info.get('sourceUrl')
        pdf_name = file_info.get('name')

        # Extract year from date (assuming date is in ISO format)
        year = entry['date']['$date'][:4]  # Get the first four characters representing the year

        # Create a directory for the year if it doesn't exist
        year_folder = os.path.join(base_save_location, year)
        os.makedirs(year_folder, exist_ok=True)

        # Determine the URL to use
        url_to_download = pdf_url if pdf_url else pdf_source_url
        if not url_to_download:
            return f"Skipping {pdf_name}: No valid URL found"

        # Download the PDF
        response = requests.get(url_to_download, timeout=10)
        if response.status_code == 200:
            save_path = os.path.join(year_folder, pdf_name)
            with open(save_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            return f"Downloaded: {pdf_name}"
        else:
            return f"Failed to download {pdf_name}: HTTP {response.status_code}"

    except Exception as e:
        return f"Error processing {entry.get('name', 'Unknown')}: {str(e)}"

# Set the number of threads
num_threads = 400  # Adjust based on your system's capabilities

# Process files in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = {executor.submit(download_pdf, entry): entry for entry in json_data[6000:]}

    # Collect and print results
    for future in as_completed(futures):
        print(future.result())

# **Preprocessing**

In [1]:
#@title Init

# Install required packages and dependencies
!pip install pdf2image pytesseract pdfplumber googletrans langdetect fasttext

!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr tesseract-ocr-sin tesseract-ocr-tam

!wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

#mount drive
from google.colab import drive
drive.mount('/content/drive')

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m554.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting googletrans
  Downloading googletrans-4.0.2-py3-none-any.whl.metadata (10 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to b

In [5]:
#@title Extract text and OCR

import os
import uuid
# from PyPDF2 import PdfReader
import pdfplumber # Import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import re
from concurrent.futures import ProcessPoolExecutor, as_completed  # Multiprocessing
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0

# ------------------- PDF Processing Functions -------------------
def extract_text_from_pdf(pdf_path):
    """Attempts to extract text from a PDF using pdfplumber."""
    pages_text = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    pages_text.append(page_text)
                else:
                    pages_text.append("") # Keep a placeholder for empty pages to maintain page count
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return pages_text

def ocr_pdf(pdf_path, lang="eng+sin"):
    """Uses pdf2image to convert PDF pages to images and then applies OCR."""
    pages_text = []
    try:
        images = convert_from_path(pdf_path, dpi=200)  # Lower DPI for faster processing
        for img in images:
            page_text = pytesseract.image_to_string(img, lang=lang)
            pages_text.append(page_text)
    except Exception as e:
        print(f"Error OCR processing {pdf_path}: {e}")
    return pages_text

def process_pdf_page_by_page(pdf_path, lang="eng+sin", text_threshold=10):
    """Attempts to extract text from PDF page by page and falls back to OCR if needed."""
    pages_text = extract_text_from_pdf(pdf_path)
    if not any(pages_text) or sum(len(text) for text in pages_text) < text_threshold:
        print(f"Text extraction yielded very little text for {os.path.basename(pdf_path)}. Running OCR...\n")
        pages_text = ocr_pdf(pdf_path, lang=lang)
    return pages_text

def detect_language_from_text(text):
    """Detects the language of the given text using langdetect."""
    try:
        # langdetect requires a minimum amount of text to be effective
        if len(text.strip()) < 20: # Adjust threshold as needed
            return "unknown"
        return detect(text)
    except Exception as e:
        print(f"Error detecting language with langdetect: {e}")
        return "unknown"


def process_pdf_file(pdf_path, lang="eng+sin"):
    """Processes a single PDF file and returns its data as a dictionary."""
    unique_id = str(uuid.uuid4())
    filename = os.path.basename(pdf_path)
    pages_text = process_pdf_page_by_page(pdf_path, lang=lang)

    # Detect primary language from the first page
    primary_lang = "unknown"
    if pages_text:
        primary_lang = detect_language_from_text(pages_text[0])


    cleaned_pages_text = []
    removed_pages_text = []
    document_title = "Untitled"
    title_extracted = False
    doc_type = "unknown"
    amendmentTo = ""

    # Regex for page numbers and unwanted passages
    unwanted_pages_regex = re.compile(r'(PETITIONER|RESPONDENTS|Printed on the Order of Government|DEPARTMENT OF\s*GOVERNMENT PRINTING)', re.DOTALL)
    unwanted_passage_regex = re.compile(r"(Page \d+ of \d+$|\d+ \| P age)")
    title_regex = re.compile(r"(.+?\s*Act\s*,?\s*No\.\s*\d+\s*of\s*\d{4}|Case No\.\s*(.+?-\s*\d+/\d+)\s)", re.DOTALL)
    amend_regex = re.compile(r"(ACT\s+TO\s+AMEND.+?,?\s*NO\.\s*\d+\s*OF\s*\d{4})", re.DOTALL | re.IGNORECASE)


    print("total pages: ", len(pages_text))
    for i, page_text in enumerate(pages_text):
        # Extract title from the first few pages (assuming title is at the beginning)
        if not title_extracted and i < 5: # Check first 5 pages for the title
             title_match = title_regex.search(page_text)
             if title_match:
                 document_title = title_match.group(0).strip()
                 title_extracted = True
                 doc_type = "act" if "act" in document_title.lower() else "case" if "case" in document_title.lower() else "unknown"

             if doc_type == "act":
                 amendment_match = amend_regex.search(page_text)
                 if amendment_match:
                     amendmentTo = amendment_match.group(0).replace("ACT TO AMEND", "").strip()

        # Check if the page contains page numbers or unwanted passages
        if unwanted_pages_regex.search(page_text):
            print(f"Skipping page {i+1} of {filename} due to matching patterns.")
            removed_pages_text.append(page_text.replace("\n", " "))
            continue # Skip this page

        passage_match = unwanted_passage_regex.search(page_text)
        if passage_match:
            print(f"Removing passage from page {i+1} of {filename} due to matching patterns.")
            page_text = unwanted_passage_regex.sub("", page_text)
            removed_pages_text.append(passage_match.group(0).replace("\n", " "))

        if title_regex.search(page_text) or amend_regex.search(page_text):
            page_text = title_regex.sub("", page_text)
            page_text = amend_regex.sub("", page_text)

        # If the page doesn't contain the patterns, add it to cleaned text
        cleaned_pages_text.append(page_text.replace("\n", " "))

    cleaned_text = "\n".join(cleaned_pages_text)


    return {
        "id": unique_id,
        "type": doc_type,
        "amendmentTo": amendmentTo,
        "filename": filename,
        "primaryLang": primary_lang, # Updated primaryLang
        "title": document_title.replace("Case No. ", "").replace("\n"," "),
        "cleanedText": cleaned_text, # Using cleaned text
        "removedText": "\n".join(removed_pages_text),
        "wordCount": len(cleaned_text.split()), # Calculate word count on cleaned text
        "pagesCount": len(cleaned_pages_text),
    }

# ------------------- Parallel Processing -------------------
def process_folder(folder_path, lang="eng+sin", max_workers=16):
    """Processes all PDFs in a folder using multiprocessing."""
    results = []
    pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.lower().endswith(".pdf")]

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_pdf_file, pdf_path, lang): pdf_path for pdf_path in pdf_files}

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                result = future.result()  # Get result of the future
                results.append(result)
                print(f"Processed: {result['filename']} | Primary Language: {result['primaryLang']} | Word Count: {result['wordCount']} | Pages Count: {result['pagesCount']}\n")  # Change 'length' to 'wordCount' and added language
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")

    return results

# ----------------------------- Usage Example -----------------------------
folder_path = "/content/drive/MyDrive/FYP/law_cases_raw/2024"  # Update this path as needed

# Process all PDFs in the folder
pdf_data = process_folder(folder_path)

# Print summary
for data in pdf_data:
    print(f"ID: {data['id']}\nFilename: {data['filename']}\nPrimary Language: {data['primaryLang']}\nTitle: {data['title']}\nWord Count: {data['wordCount']}\nType: {data['type']}\nAmendment To: {data['amendmentTo']}\n\nText Preview: {data['cleanedText'][:200]}\n\nRemoved Text: {data['removedText'][:200]}\n{'-'*50}")

total pages:  6
Removing passage from page 1 of hcc_0384_18_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 2 of hcc_0384_18_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 3 of hcc_0384_18_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 4 of hcc_0384_18_final_judgement_pdf.pdf due to matching patterns.
total pages:  10
Removing passage from page 1 of ca_phc_0066_12_final_judgement_pdf.pdf due to matching patterns.
Skipping page 2 of ca_phc_0066_12_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 3 of ca_phc_0066_12_final_judgement_pdf.pdf due to matching patterns.
total pages:  10Removing passage from page 4 of ca_phc_0066_12_final_judgement_pdf.pdf due to matching patterns.

Removing passage from page 1 of ca_phc_0065_12_final_judgement_pdf.pdf due to matching patterns.
Skipping page 2 of ca_phc_0065_12_final_judgement_pdf.pdf due to matching patterns.
Removing passage f



Processed: wrt_0201_21_31_01_2024_1_pdf.pdf | Primary Language: en | Word Count: 3708 | Pages Count: 13





Processed: writ_138_20_pdf.pdf | Primary Language: en | Word Count: 2820 | Pages Count: 8

Removing passage from page 7 of ca_wrt_511_19_pdf.pdf due to matching patterns.
Removing passage from page 8 of ca_wrt_511_19_pdf.pdf due to matching patterns.
Removing passage from page 9 of ca_wrt_511_19_pdf.pdf due to matching patterns.
Removing passage from page 3 of tax_19_2015_pdf.pdf due to matching patterns.
Removing passage from page 10 of ca_wrt_511_19_pdf.pdf due to matching patterns.
Processed: ca_wrt_511_19_pdf.pdf | Primary Language: en | Word Count: 2264 | Pages Count: 10

Removing passage from page 4 of tax_19_2015_pdf.pdf due to matching patterns.
total pages:  12
Skipping page 1 of 541_2023_pdf.pdf due to matching patterns.
Skipping page 2 of 541_2023_pdf.pdf due to matching patterns.
Removing passage from page 3 of 541_2023_pdf.pdf due to matching patterns.
total pages:  15
Skipping page 1 of wrt_505_21_pdf.pdf due to matching patterns.
Skipping page 2 of wrt_505_21_pdf.pdf due



Removing passage from page 4 of ca_writ_464_21_pdf.pdf due to matching patterns.
Removing passage from page 5 of tax_19_2015_pdf.pdf due to matching patterns.
total pages:  15
Removing passage from page 1 of hcc_0002_21_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 2 of hcc_0002_21_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 3 of hcc_0002_21_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 5 of 541_2023_pdf.pdf due to matching patterns.
total pages:  Removing passage from page 5 of ca_writ_464_21_pdf.pdf due to matching patterns.10

Skipping page 1 of wrt_577_23_pdf.pdf due to matching patterns.
total pages:  15Removing passage from page 6 of tax_19_2015_pdf.pdf due to matching patterns.

total pages:  12Skipping page 1 of writ_345_21_pdf.pdf due to matching patterns.

Removing passage from page 4 of hcc_0002_21_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 1 of hc



Removing passage from page 4 of hcc_0036_22_final_judgement_pdf.pdf due to matching patterns.
Processed: writ_123_20_pdf.pdf | Primary Language: en | Word Count: 3553 | Pages Count: 14

Removing passage from page 6 of 541_2023_pdf.pdf due to matching patterns.
Removing passage from page 5 of hcc_0002_21_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 6 of ca_writ_464_21_pdf.pdf due to matching patterns.
Removing passage from page 7 of tax_19_2015_pdf.pdf due to matching patterns.
Removing passage from page 5 of hcc_0036_22_final_judgement_pdf.pdf due to matching patterns.
total pages:  17
Removing passage from page 1 of ca_cpa_0064_23_final_judgement_pdf.pdf due to matching patterns.Removing passage from page 6 of hcc_0002_21_final_judgement_pdf.pdf due to matching patterns.

Skipping page 2 of ca_cpa_0064_23_final_judgement_pdf.pdf due to matching patterns.
Skipping page 3 of ca_cpa_0064_23_final_judgement_pdf.pdf due to matching patterns.
Removing passage



Processed: ca_wrt_0115_21_pdf.pdf | Primary Language: en | Word Count: 1683 | Pages Count: 8





Removing passage from page 13 of ca_writ_0591_21_pdf.pdf due to matching patterns.
Removing passage from page 9 of ca_hcc_0190_191_17_pdf.pdf due to matching patterns.
Removing passage from page 10 of ca_hcc_0190_191_17_pdf.pdf due to matching patterns.
Processed: ca_writ_0591_21_pdf.pdf | Primary Language: en | Word Count: 4062 | Pages Count: 11

Removing passage from page 11 of ca_hcc_0190_191_17_pdf.pdf due to matching patterns.
Removing passage from page 12 of ca_hcc_0190_191_17_pdf.pdf due to matching patterns.
Removing passage from page 13 of ca_hcc_0190_191_17_pdf.pdf due to matching patterns.
Processed: ca_hcc_0190_191_17_pdf.pdf | Primary Language: en | Word Count: 2687 | Pages Count: 13

total pages:  16
total pages:  12
Skipping page 1 of ca_wrt_0304_21_pdf.pdf due to matching patterns.
Removing passage from page 2 of ca_wrt_0304_21_pdf.pdf due to matching patterns.
total pages:  9
Skipping page 3 of ca_wrt_0304_21_pdf.pdf due to matching patterns.
total pages:  6
Removing p



Removing passage from page 2 of ca_wrt_157_21_pdf.pdf due to matching patterns.
Removing passage from page 5 of ca_wrt_0304_21_pdf.pdf due to matching patterns.
Processed: ca_181_2018_pdf.pdf | Primary Language: en | Word Count: 1590 | Pages Count: 6

total pages:  7Removing passage from page 3 of ca_wrt_157_21_pdf.pdf due to matching patterns.

Processed: ca_wrt_49_20_pdf.pdf | Primary Language: en | Word Count: 1828 | Pages Count: 9

Removing passage from page 1 of wrt_0379_2019_docx_pdf.pdf due to matching patterns.
Removing passage from page 2 of wrt_0379_2019_docx_pdf.pdf due to matching patterns.
Removing passage from page 3 of wrt_0379_2019_docx_pdf.pdf due to matching patterns.total pages: 
 7Removing passage from page 4 of wrt_0379_2019_docx_pdf.pdf due to matching patterns.

Skipping page 1 of ca_writ_814_23_pdf.pdf due to matching patterns.
Skipping page 2 of ca_writ_814_23_pdf.pdf due to matching patterns.Removing passage from page 5 of wrt_0379_2019_docx_pdf.pdf due to mat



Removing passage from page 3 of ca_writ_366_21_pdf.pdf due to matching patterns.Removing passage from page 5 of ca_wrt_0391_2020_pdf.pdf due to matching patterns.





Removing passage from page 11 of hcc_0008_22_final_judgment_pdf.pdf due to matching patterns.
Removing passage from page 8 of hcc_0226_20_final_judgement_pdf.pdf due to matching patterns.
Processed: hcc_0008_22_final_judgment_pdf.pdf | Primary Language: en | Word Count: 2849 | Pages Count: 11





Removing passage from page 7 of cpa_0080_19_2_pdf.pdf due to matching patterns.
Removing passage from page 7 of writ_190_2016_judgment_pdf.pdf due to matching patterns.



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Removing passage from page 9 of writ_190_2016_judgment_pdf.pdf due to matching patterns.
Removing passage from page 11 of hcc_0127_22_22_02_2024_pdf.pdf due to matching patterns.
Removing passage from page 2 of revision_ca_phc_apn_0115_22_pdf.pdf due to matching patterns.
total pages:  5
Removing passage from page 13 of hcc_0226_20_final_judgement_pdf.pdf due to matching patterns.
Removing passage from page 6 of ca_writ_366_21_pdf.pdf due to matching patterns.Removing passage from page 3 of revision_ca_phc_apn_0115_22_pdf.pdf due to matching patterns.

Removing passage from page 9 of ca_wrt_0391_2020_pdf.pdf due to matching patterns.Processed: writ_190_2016_judgment_pdf.pdf | Primary Language: en | Word Count: 3116 | Pages Count: 9


Removing passage from page 12 of hcc_0127_22_22_02_2024_pdf.pdf due to matching patterns.
Removing passage from page 14 of hcc_0226_20_final_judgement_pdf.pdf due to matching patterns.
Removi








Removing passage from page 20 of hcc_0088_2022_final_judgment_pdf.pdf due to matching patterns.
Removing passage from page 11 of revision_ca_phc_apn_0013_23_pdf.pdf due to matching patterns.




Removing passage from page 2 of wrt_229_23_judgment_pdf.pdf due to matching patterns.
Removing passage from page 6 of ca_writ_0692_24_1_pdf.pdf due to matching patterns.
Processed: revision_ca_phc_apn_0013_23_pdf.pdf | Primary Language: en | Word Count: 1369 | Pages Count: 11

Processed: wrt_215_23_judgment_pdf.pdf | Primary Language: en | Word Count: 1607 | Pages Count: 6

Processed: hcc_0128_19_pdf.pdf | Primary Language: en | Word Count: 3330 | Pages Count: 16

Removing passage from page 21 of hcc_0088_2022_final_judgment_pdf.pdf due to matching patterns.
Removing passage from page 5 of wrt_334_22_judgment_pdf.pdf due to matching patterns.
Removing passage from page 7 of ca_writ_0692_24_1_pdf.pdf due to matching patterns.
total pages:  11Removing passage from page 3 of wrt_229_23_judgment_pdf.pdf due to matching patterns.

Removing passage from page 1 of wrt_506_22_judgment_pdf.pdf due to matching patterns.
Removing passage from page 6 of wrt_334_22_judgment_pdf.pdf due to matching



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Amendment To: 

Text Preview: CA (PHC)APN 25/2023 IN THE COURT OF APPEAL OF THE DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA In the matter of an application for Revision in terms of Article 138 of the Constitution of the Democratic 

Removed Text: 1 | P age
2 | P age
3 | P age
4 | P age
5 | P age
6 | P age
7 | P age
8 | P age
--------------------------------------------------
ID: 61589be8-c2bd-43c5-a209-bbb4132b5c3e
Filename: hcc_0202_19_final_judgment_1_pdf.pdf
Primary Language: en
Title: IN THE COURT OF APPEAL OF THE DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA In the matter of an application under and in terms of section 331 of the Code of Criminal Procedure Act No. 15 of 1979
Word Count: 6314
Type: act
Amendment To: 

Text Preview: . Court of Appeal Case No: The Hon. Attorney General, HCC/0202/2019 Attorney General’s Department, Colombo 12. COMPLAINANT Vs. High Court of Chilaw Randeni Aarachchige Dona Indunil Case No: HC/

In [7]:
#@title Translation

import re
import fasttext
import asyncio
from googletrans import Translator
import nest_asyncio  # For Jupyter notebook environments
import numpy as np # Import numpy

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load FastText model
model = fasttext.load_model("lid.176.bin")

# Initialize Google Translate API
translator = Translator()

# Modified detect_language_fasttext to process chunks sequentially
async def detect_language_fasttext(text, word_threshold=300):
    """Detects if the text contains non-English content."""
    words = text.split()
    total_words = len(words)
    num_chunks = max(1, total_words // word_threshold)

    # Process chunks sequentially to avoid asyncio.as_completed issue
    for i in range(num_chunks):
        chunk = " ".join(words[i * word_threshold:(i + 1) * word_threshold])
        try:
            # Call predict directly without asyncio.to_thread
            prediction = model.predict(chunk)

            # Ensure prediction has the expected structure before accessing elements
            if prediction and len(prediction) > 0 and len(prediction[0]) > 0:
                detected_lang = prediction[0][0].replace("__label__", "")
                if detected_lang != "en":
                    print(f"Chunk needs translation (detected: {detected_lang})")
                    return True  # Indicates translation is needed

            else:
                 print("Warning: Received empty or unexpected prediction format for a chunk.")

        except ValueError as e:
             # Log the specific ValueError if it still occurs within predict
             if "Unable to avoid copy while creating an array as requested" in str(e):
                 print(f"Caught ValueError during fasttext.predict: {e}")
                 # Continue to the next chunk or handle as needed
                 pass # Or return True to force translation on error


        except Exception as e:
            print(f"Error during fasttext prediction for a chunk: {e}")
            # Decide how to handle other errors, e.g., force translation
            # return True

    return False  # No translation needed


async def translate_if_needed(text, max_length=2000):
    """Translates text while preserving sentence boundaries asynchronously."""
    # Await the simplified language detection
    if await detect_language_fasttext(text):
        try:
            # Split text by sentence boundaries (., !, ?, newline)
            sentences = re.split(r'(?<=[.!?])\s+', text)

            chunks = []
            current_chunk = ""

            for sentence in sentences:
                # Ensure sentence is not empty after split
                if not sentence.strip():
                    continue

                # Check if adding the next sentence exceeds max_length
                if len(current_chunk) + len(sentence) + (1 if current_chunk else 0) < max_length:
                    current_chunk += (sentence + " ").strip() if current_chunk else sentence.strip()
                else:
                    chunks.append(current_chunk.strip())
                    current_chunk = sentence.strip() + " "

            if current_chunk:
                chunks.append(current_chunk.strip())

            print(f"Translating {len(chunks)} chunks.")
            # Translate all chunks in parallel using asyncio.gather
            tasks = [asyncio.to_thread(translator.translate, chunk, dest='en', src='si') for chunk in chunks]
            translated_chunks = await asyncio.gather(*tasks)

            # Extract translated text
            translated_texts = [tr.text for tr in translated_chunks]
            print("Translation complete.")

            return " ".join(translated_texts)

        except Exception as e:
            print(f"Translation error: {e}")
            return text  # Return original if translation fails

    return text  # Return original if no translation is needed

async def process_documents(pdf_data):
    """Processes documents asynchronously in parallel."""
    print(f"Starting translation for {len(pdf_data)} documents.")
    tasks = []
    for doc in pdf_data:
        # Pass the entire cleanedText to translate_if_needed
        tasks.append(translate_if_needed(doc.get("cleanedText", "")))

    # Run translations in parallel
    translated_texts = await asyncio.gather(*tasks)

    # Assign translated text back to documents
    for i, doc in enumerate(pdf_data):
        doc["text"] = translated_texts[i]

    print("Translation process finished.")
    # Print a preview of the updated text
    for doc in pdf_data:
        print(f"ID: {doc['id']}\nFilename: {doc['filename']}\nWord Count:{len(doc['text'].split())}\nText Preview: {doc['text'][:200]}\n{'-'*50}\n")

# Main function to run process_documents
async def main():
    await process_documents(pdf_data)

# Run the main function in an environment with an existing event loop
try:
    loop = asyncio.get_running_loop()  # Get the current running loop
except RuntimeError:  # No running event loop, create a new one
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

# Await the main function (ensuring all tasks finish)
if loop.is_running():
    # Use asyncio.run if running in a script or ensure a loop is already running
    # In Colab, a loop is usually running, so create_task and await is appropriate
    task = asyncio.create_task(main())
    await task
else:
    loop.run_until_complete(main())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.
Caught ValueError during fasttext.predict: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.
Caught ValueError during fasttext.predict: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-t

In [8]:
import json

# Specify the output file path
output_file = "/content/drive/MyDrive/FYP/json/cases_2024.json"

# Write the pdf_data to a JSON file
with open(output_file, "w", encoding='utf-8') as f:
    json.dump(pdf_data, f, indent=4, ensure_ascii=False)

print(f"PDF data successfully written to {output_file}")


PDF data successfully written to /content/drive/MyDrive/FYP/json/cases_2024.json
