# **GitHub Commands**

In [None]:
# @title GitHub Init
# Repository settings for the git cells below.
# The credential lookups (read through google.colab's `userdata` helper rather
# than hard-coded) and the git identity configuration are currently disabled.
# from google.colab import userdata

# GH_UNAME = userdata.get('GH_UNAME')
# GH_APIKEY = userdata.get('GH_APIKEY')
# GH_EMAIL = userdata.get('GH_EMAIL')
PRIMARY_REPO_NAME = 'Legal-Research-Platform'
LOCAL_REPO_DIR = '/content/drive/MyDrive/FYP/GitHub/Legal-Research-Platform'

# !git config --global user.name {GH_UNAME}
# !git config --global user.email {GH_EMAIL}

# NOTE: while this %cd stays commented out, the working directory is not
# switched to LOCAL_REPO_DIR by this cell.
# %cd {LOCAL_REPO_DIR}

In [None]:
# @title Git <-
!git fetch

!git pull

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


In [None]:
# @title Git ->
# !git add .

# !git status

# !git commit -m 'updated layout'

!git push

fatal: not a git repository (or any of the parent directories): .git


# **Scrapers**

In [None]:
#@title Init

# Step 1: Install required libraries
!apt-get update
!apt-get purge chromium-browser chromium-chromedriver -y
!apt-get autoremove -y
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -fy install
!pip install -U selenium webdriver-manager requests

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title law acts scraper

# Import required libraries
import os
import time
import shutil
import logging
import requests
import math
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define download paths
local_download_dir = "/content/downloads"  # Chrome's own download folder
drive_directory = "/content/drive/MyDrive/FYP/legal_acts_raw"  # Replace with your desired directory

# Ensure directories exist
os.makedirs(local_download_dir, exist_ok=True)
os.makedirs(drive_directory, exist_ok=True)

# Set up Selenium WebDriver
chrome_options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": local_download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration
# Chrome expects the size as WIDTH,HEIGHT (comma separated); the previous
# "1920x1080" form is not a valid value for this switch.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver (webdriver-manager supplies the chromedriver binary path)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Create a session with a larger connection pool so the download threads can
# reuse connections; failed requests are retried up to two times.
session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=Retry(total=2))
session.mount("http://", adapter)
session.mount("https://", adapter)


# Process each row separately (to be used in threading)
def process_row(row, year):
    """Download the English and Sinhala PDFs linked from one table row.

    Each language is looked up independently, so a row that offers only one
    of the two documents still gets that document downloaded.

    Args:
        row: Selenium element for one row of the acts table; the act title is
            read from its third cell.
        year: Year label, passed through to ``download_file``.

    Returns:
        None. Any error is printed and swallowed so one bad row does not stop
        the rest of the run.
    """
    try:
        # Get Name
        name = row.find_element(By.CSS_SELECTOR, "td:nth-child(3)").text.strip()

        # The two downloads of a row run one after the other.
        for language in ("English", "Sinhala"):
            # find_elements returns an empty list instead of raising when the
            # link is absent, which lets the other language still be handled.
            links = row.find_elements(
                By.XPATH, f".//a[button[contains(text(), '{language}')]]"
            )
            if not links:
                print(f"No {language} link found for '{name}' ({year})")
                continue

            url = links[0].get_attribute("href")
            if url:
                download_file(url, f"{name}_{language}.pdf", year)

    except Exception as e:
        print(f"Error processing row for year {year}: {e}")

# iterative function to process row chunks
def process_rows_iterative(rows, year, max_threads=20):
    """Run ``process_row`` for every row of a year on a thread pool.

    The pool size is half the number of rows, at least 1 and at most
    ``max_threads``. Every row is submitted as its own task in a single pass.

    Args:
        rows: Sequence of table-row elements to process.
        year: Year label forwarded to ``process_row``.
        max_threads: Upper bound on the number of worker threads.

    Returns:
        None. Returns immediately, without output, when ``rows`` is empty.
    """
    num_rows = len(rows)
    if num_rows == 0:
        return

    num_threads = max(1, min(max_threads, num_rows // 2))
    # chunk_size is only reported, to keep the progress line unchanged; tasks
    # are scheduled per row, not per chunk.
    chunk_size = math.ceil(num_rows / num_threads)
    print(f"Processing {num_rows} rows with {num_threads} threads, chunk size: {chunk_size}")

    # NOTE(review): process_row calls Selenium element methods from worker
    # threads; confirm the WebDriver session tolerates concurrent access.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_row, row, year) for row in rows]

        # Surface anything a worker raised; the pool is only left once every
        # task has finished.
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Error in thread: {e}")



# Define the scraper function
def scrape_legal_acts(url):
    """Walk every year button on the acts page and download that year's rows.

    Opens ``url`` in the shared ``driver``, then for each year button: clicks
    it, hands the table rows (if any) to ``process_rows_iterative`` and
    navigates back to the year list.

    Args:
        url: Address of the page that lists the year buttons.

    Returns:
        None. Any exception is printed and ends the scrape.
    """
    year_button_xpath = "//a[@class='btn btn-primary']"
    table_row_selector = "table tbody tr"

    try:
        driver.get(url)
        time.sleep(3)  # fixed pause for the page to load
        print("Browser Opened")

        # Find all year buttons
        button_count = len(driver.find_elements(By.XPATH, year_button_xpath))
        print(f"Found {button_count} year buttons")

        for index in range(button_count):
            # The buttons are looked up again on every pass and addressed by
            # position, because the page is navigated away from and back.
            current_button = driver.find_elements(By.XPATH, year_button_xpath)[index]
            year = current_button.text.strip()
            print(f"Processing year: {year}")

            current_button.click()
            time.sleep(3)

            if driver.find_elements(By.CSS_SELECTOR, table_row_selector):
                # Find all rows in the table
                rows = driver.find_elements(By.CSS_SELECTOR, table_row_selector)
                print(f"Found {len(rows)} rows for year: {year}")

                # Process each row in parallel
                process_rows_iterative(rows, year)
            else:
                print(f"No data found for year: {year}")

            # Return to the year selection page
            driver.back()
            time.sleep(2)

    except Exception as e:
        print(f"Error during scraping: {e}")


# Download files function
def download_file(url, filename, year):
    """Stream one file from ``url`` into the Drive folder for ``year``.

    Args:
        url: Address of the file to download.
        filename: Name to store the file under. Path separators are replaced
            with ``_`` so a title containing ``/`` cannot break the save path.
        year: Name of the sub-folder of ``drive_directory`` to save into.

    Returns:
        None. Progress and errors are printed; exceptions are not propagated.
    """
    try:
        safe_name = filename.replace("/", "_").replace("\\", "_")

        # The timeout (connect, read) keeps a stalled server from blocking a
        # worker thread forever. The context manager releases the connection
        # even if writing the file fails.
        with session.get(url, stream=True, timeout=(10, 60)) as response:
            if response.status_code != 200:
                print(f"Failed to download {filename}")
                return

            year_folder = os.path.join(drive_directory, year)
            os.makedirs(year_folder, exist_ok=True)
            filepath = os.path.join(year_folder, safe_name)

            with open(filepath, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)

        print(f"Downloaded: {safe_name}")

    except Exception as e:
        print(f"Error downloading file {filename}: {e}")

# Run the scraper
website_url = "https://documents.gov.lk/view/acts/acts.html"  # Page listing the year buttons
try:
    scrape_legal_acts(website_url)
finally:
    # Close the WebDriver even when the run is interrupted (e.g. by
    # KeyboardInterrupt, which scrape_legal_acts does not catch), so the
    # browser process is not left running.
    driver.quit()

Browser Opened
Found 45 year buttons
Processing year: 2025
No data found for year: 2025
Processing year: 2024
Found 32 rows for year: 2024
Processing 32 rows with 16 threads, chunk size: 2




Downloaded: Saweera Foundation (Incorporation)_English.pdf
Downloaded: Shop and Office Employees (Regulation of Employment and Remuneration) (Amendment)_English.pdf
Downloaded: International Institute of Theravadha (Incorporation)_English.pdf
Downloaded: Public Debt Management_English.pdf
Downloaded: Partition (Amendment)_English.pdf
Downloaded: Saweera Foundation (Incorporation)_Sinhala.pdf
Downloaded: Shop and Office Employees (Regulation of Employment and Remuneration) (Amendment)_Sinhala.pdf
Downloaded: International Institute of Theravadha (Incorporation)_Sinhala.pdf
Downloaded: Public Debt Management_Sinhala.pdf




Downloaded: Partition (Amendment)_Sinhala.pdf
Downloaded: Social Security Contribution Levy (Amendment)_English.pdf
Downloaded: Value Added Tax (Amendment)_English.pdf
Downloaded: Heart to Heart Trust Fund (Incorporation)_English.pdf
Downloaded: Sri Lanka Baptist Sangamaya (Incorporation) (Amendment)_English.pdf
Downloaded: Sri Balabhivurdhi Wardana Society (Incorporation)_English.pdf




Downloaded: Social Security Contribution Levy (Amendment)_Sinhala.pdf
Downloaded: Value Added Tax (Amendment)_Sinhala.pdf
Downloaded: Heart to Heart Trust Fund (Incorporation)_Sinhala.pdf
Downloaded: Sri Lanka Baptist Sangamaya (Incorporation) (Amendment)_Sinhala.pdf




Downloaded: Sri Balabhivurdhi Wardana Society (Incorporation)_Sinhala.pdf
Downloaded: Samadhi Community Development Foundation (Incorporation)_English.pdf
Downloaded: Dassana Bauddha Sanvidhanaya (Incorporation)_English.pdf
Downloaded: National Hydrographic_English.pdf
Downloaded: Contempt of a Court,Tribunal or Institution_English.pdf
Downloaded: Online Safety_English.pdf
Downloaded: Samadhi Community Development Foundation (Incorporation)_Sinhala.pdf
Downloaded: Dassana Bauddha Sanvidhanaya (Incorporation)_Sinhala.pdf




Downloaded: National Hydrographic_Sinhala.pdf
Downloaded: Contempt of a Court,Tribunal or Institution_Sinhala.pdf
Downloaded: The Recognition and Enforcement of International Settlement Agreements Resulting from Mediation_English.pdf
Downloaded: Notaries (Amendment)_English.pdf
Downloaded: Office of National Unity and Reconciliation_English.pdf
Downloaded: Online Safety_Sinhala.pdf
Downloaded: Mediation Board (Amendment)_English.pdf
Downloaded: The Recognition and Enforcement of International Settlement Agreements Resulting from Mediation_Sinhala.pdf
Downloaded: Office of National Unity and Reconciliation_Sinhala.pdf
Downloaded: Mediation Board (Amendment)_Sinhala.pdf
Downloaded: Powers of Attorney (Amendment)_English.pdf
Downloaded: Notaries (Amendment)_Sinhala.pdf
Downloaded: Prevention of Frauds (Amendment)_English.pdf
Downloaded: Powers of Attorney (Amendment)_Sinhala.pdf
Downloaded: Prevention of Frauds (Amendment)_Sinhala.pdf
Downloaded: Registration of Documents (Amendment)_Engl



Downloaded: Inland Trust Receipts (Amendment)_English.pdf
Downloaded: Kelaniya Buddhist Women's Charitable Society (Incorporation)_English.pdf
Downloaded: Inland Trust Receipts (Amendment)_Sinhala.pdf
Downloaded: Kelaniya Buddhist Women's Charitable Society (Incorporation)_Sinhala.pdf
Processing year: 2023
Found 34 rows for year: 2023
Processing 34 rows with 17 threads, chunk size: 2




Downloaded: Value Added Tax (Amendment)_English.pdf
Downloaded: Finance_English.pdf
Downloaded: Anti - Corruption (Amendment)_English.pdf
Downloaded: Galaha Bhaddrawathie National Bhikku Care Centre Trust_English.pdf
Downloaded: Value Added Tax (Amendment)_Sinhala.pdf
Downloaded: Finance_Sinhala.pdf
Downloaded: Anti - Corruption (Amendment)_Sinhala.pdf
Downloaded: Appropriation Act_English.pdf
Downloaded: Inland Revenue (Amendment)_English.pdf
Downloaded: Galaha Bhaddrawathie National Bhikku Care Centre Trust_Sinhala.pdf
Downloaded: Social Security Contribution Levy (Amendment)_English.pdf
Downloaded: Inland Revenue (Amendment)_Sinhala.pdf
Downloaded: Central Bank of Sri Lanka_English.pdf
Downloaded: Banking (Special Provisions)_English.pdf
Downloaded: Betting and Gaming Levy (Amendment)_English.pdf
Downloaded: Appropriation Act_Sinhala.pdf
Downloaded: Social Security Contribution Levy (Amendment)_Sinhala.pdf
Downloaded: Appropriation (Amendment)_English.pdf
Downloaded: Sri Lanka Insti



Downloaded: Central Bank of Sri Lanka_Sinhala.pdf
Downloaded: Banking (Special Provisions)_Sinhala.pdf
Downloaded: Betting and Gaming Levy (Amendment)_Sinhala.pdf
Downloaded: Sri Lanka Institute of Taxation (Incorporation) (Amendment)_Sinhala.pdf
Downloaded: Assistance to and Protection of Victims of Crime and Witnesses_English.pdf
Downloaded: Carriage by Air (Amendment)_English.pdf
Downloaded: Parliamentary Budget Office_English.pdf
Downloaded: Civil Procedure Code (Amendment)_English.pdf
Downloaded: Anti-Corruption_English.pdf
Downloaded: Parliamentary Budget Office_Sinhala.pdf
Downloaded: Civil Procedure Code (Amendment)_Sinhala.pdf
Downloaded: Assistance to and Protection of Victims of Crime and Witnesses_Sinhala.pdf
Downloaded: Carriage by Air (Amendment)_Sinhala.pdf
Downloaded: Rathanatissa Peace Foundation (Incorporation)_English.pdf
Downloaded: Bureau of Rehabilitation_English.pdf
Downloaded: Anti-Corruption_Sinhala.pdf
Downloaded: Rathanatissa Peace Foundation (Incorporation)_



Downloaded: Chartered Institute of Transport of Sri Lanka (Incorporation) (Amendment)_English.pdf
Downloaded: Civil Procedure Code (Amendment)_English.pdf
Downloaded: Local Authorities Elections (Amendment)_English.pdf
Downloaded: Chartered Institute of Transport of Sri Lanka (Incorporation) (Amendment)_Sinhala.pdf
Downloaded: Civil Procedure Code (Amendment)_Sinhala.pdf
Downloaded: Local Authorities Elections (Amendment)_Sinhala.pdf
Processing year: 2022
Found 46 rows for year: 2022
Processing 46 rows with 20 threads, chunk size: 3




Downloaded: Inland Revenue (Amendment)_English.pdf
Downloaded: Value Added Tax (Amendment)_English.pdf
Downloaded: Appropriation (Amendment)_English.pdf
Downloaded: Appropriation_English.pdf
Downloaded: Inland Revenue (Amendment)_Sinhala.pdf
Downloaded: Value Added Tax (Amendment)_Sinhala.pdf




Downloaded: Appropriation (Amendment)_Sinhala.pdf
Downloaded: Sri Lanka Rupavahini Corporation (Amendment)_English.pdf
Downloaded: Appropriation_Sinhala.pdf
Downloaded: Sri Lanka Rupavahini Corporation (Amendment)_Sinhala.pdf
Downloaded: Industrial Disputes (Amendment)_English.pdf
Downloaded: Social Security Contribution Levy_English.pdf
Downloaded: Termination of Employment of Workmen (Special Provisions) (Amendment)_English.pdf
Downloaded: Industrial Disputes (Amendment)_English.pdf
Downloaded: Industrial Disputes (Amendment)_Sinhala.pdf




Downloaded: Industrial Disputes (Amendment)_Sinhala.pdf
Downloaded: Termination of Employment of Workmen (Special Provisions) (Amendment)_Sinhala.pdf
Downloaded: Social Security Contribution Levy_Sinhala.pdf
Downloaded: Sisira Jayakody Siyapatha Foundation (Incorporation)_English.pdf
Downloaded: Civil Procedure Code (Amendment)_English.pdf
Downloaded: Civil Procedure Code (Amendment)_Sinhala.pdf
Downloaded: Appropriation (Amendment)_English.pdf
Downloaded: Code of Criminal Procedure (Amendment)_English.pdf
Downloaded: Sisira Jayakody Siyapatha Foundation (Incorporation)_Sinhala.pdf
Downloaded: Code of Criminal Procedure (Amendment)_Sinhala.pdf
Downloaded: Industrial Disputes_English.pdf
Downloaded: Appropriation (Amendment)_Sinhala.pdf
Downloaded: Sri Lanka Electricity (Amendment)_English.pdf
Downloaded: Industrial Disputes_Sinhala.pdf
Downloaded: Sri Shakyasinharama Viharastha Karyasadhaka Sanvidanaya (Incorporation)_English.pdf
Downloaded: Sri Lanka Electricity (Amendment)_Sinhala.pd



Downloaded: Powers of Attorney (Amendment)_English.pdf
Downloaded: Code of Criminal Procedure (Amendment)_English.pdf
Downloaded: Children and Young Persons (Amendment)_English.pdf
Downloaded: Registration of Documents (Amendment)_English.pdf
Downloaded: Kandyan Marriage and Divorce (Amendment)_English.pdf
Downloaded: Wills (Amendment)_English.pdf
Downloaded: Judicature (Amendment)_English.pdf
Downloaded: Powers of Attorney (Amendment)_Sinhala.pdf
Downloaded: Code of Criminal Procedure (Amendment)_Sinhala.pdf
Downloaded: Children and Young Persons (Amendment)_Sinhala.pdf
Downloaded: Registration of Documents (Amendment)_Sinhala.pdf
Downloaded: Kandyan Marriage and Divorce (Amendment)_Sinhala.pdf
Downloaded: Wills (Amendment)_Sinhala.pdf
Downloaded: Judicature (Amendment)_Sinhala.pdf
Processing year: 2021
Found 30 rows for year: 2021
Processing 30 rows with 15 threads, chunk size: 2




Downloaded: Coronavirus Disease 2019 (Covid - 19) (Temporary Provisions)_English.pdf
Downloaded: Finance_English.pdf
Downloaded: Youthful Offenders (Training Schools) (Amendment)_English.pdf
Downloaded: Penal Code (Amendment)_English.pdf
Downloaded: Consumer Affairs Authority (Amendment)_English.pdf
Downloaded: Petroleum Resources_English.pdf
Downloaded: Immigrants and Emigrants (Amendment)_English.pdf
Downloaded: Minimum Retirement Age of Workers_English.pdf
Downloaded: Finance_Sinhala.pdf
Downloaded: Youthful Offenders (Training Schools) (Amendment)_Sinhala.pdf
Downloaded: Penal Code (Amendment)_Sinhala.pdf
Downloaded: Consumer Affairs Authority (Amendment)_Sinhala.pdf




Downloaded: Immigrants and Emigrants (Amendment)_Sinhala.pdf
Downloaded: Coronavirus Disease 2019 (Covid - 19) (Temporary Provisions)_Sinhala.pdf
Downloaded: Minimum Retirement Age of Workers_Sinhala.pdf
Downloaded: Petroleum Resources_Sinhala.pdf
Downloaded: Code of Criminal Procedure (Amedment)_English.pdf
Downloaded: Convention against Torture and other Cruel, Inhuman or Degrading Treatment or Punishment (Amendment)_English.pdf
Downloaded: Appropriation Act_English.pdf
Downloaded: Fiscal Management (Responsibility) (Amendment)_English.pdf
Downloaded: Sri Lanka Land Development Corporation (Amendment)_English.pdf
Downloaded: Fiscal Management (Responsibility) (Amendment)_Sinhala.pdf
Downloaded: Shop and Office Employees (Regulation of Employment and Remuneration) (Amendment)_English.pdf
Downloaded: Colombo Port City Economic Commission_English.pdf
Downloaded: Value Added Tax (Amendment)_English.pdf
Downloaded: Code of Criminal Procedure (Amedment)_Sinhala.pdf
Downloaded: Convention a



Downloaded: Employment of Women, Young Persons and Children (Amendment)_English.pdf
Downloaded: Factories (Amendment)_English.pdf
Downloaded: Appropriation Act_Sinhala.pdf
Downloaded: Colombo Port City Economic Commission_Sinhala.pdf
Downloaded: Employment of Women, Young Persons and Children (Amendment)_Sinhala.pdf
Downloaded: Penal Code (Amendment)_English.pdf




Downloaded: Minimum Wages (Indian Labour) (Amendment)_English.pdf
Downloaded: Evidence (Amendment)_English.pdf
Downloaded: Intellectual Property (Amendment)_English.pdf
Downloaded: Value Added Tax (Amendment)_Sinhala.pdf
Downloaded: Minimum Wages (Indian Labour) (Amendment)_Sinhala.pdf
Downloaded: Factories (Amendment)_Sinhala.pdf
Downloaded: Intellectual Property (Amendment)_Sinhala.pdf
Downloaded: Penal Code (Amendment)_Sinhala.pdf
Downloaded: Evidence (Amendment)_Sinhala.pdf




Downloaded: Appropriation (Amendment)_English.pdf
Downloaded: Registration of Electors (Amendment)_English.pdf
Downloaded: Termination of Employment of Workmen (Special Provisions) (Amendment)_English.pdf
Downloaded: Employees Provident Fund (Amendment)_English.pdf
Downloaded: Appropriation (Amendment)_Sinhala.pdf
Downloaded: Registration of Electors (Amendment)_Sinhala.pdf
Downloaded: Termination of Employment of Workmen (Special Provisions) (Amendment)_Sinhala.pdf
Downloaded: Employees Provident Fund (Amendment)_Sinhala.pdf
Downloaded: Securities and Exchange Commission of Sri Lanka_English.pdf
Downloaded: Securities and Exchange Commission of Sri Lanka_Sinhala.pdf
Downloaded: National Minimum Wage of Workers (Amendment)_English.pdf
Downloaded: National Minimum Wage of Workers (Amendment)_Sinhala.pdf




Downloaded: Inland Revenue (Amendment)_Sinhala.pdf




Downloaded: Bail (Amendment)_English.pdf
Downloaded: Bail (Amendment)_Sinhala.pdf
Processing year: 2020
Found 8 rows for year: 2020
Processing 8 rows with 4 threads, chunk size: 2




Downloaded: Finance (Amendment)_English.pdf
Downloaded: 20th Amendment to the Constitution (NEW)_English.pdf
Downloaded: Appropriation_English.pdf
Downloaded: Appropriation_English.pdf
Downloaded: Finance (Amendment)_Sinhala.pdf
Downloaded: 20th Amendment to the Constitution (NEW)_Sinhala.pdf
Downloaded: Nation Building Tax (Amendment)_English.pdf
Downloaded: Appropriation_Sinhala.pdf
Downloaded: Economic Service Charge (Amendment)_English.pdf
Downloaded: Appropriation_Sinhala.pdf
Downloaded: ports and Airports Development Levy (Amendment)_English.pdf
Downloaded: Nation Building Tax (Amendment)_Sinhala.pdf
Downloaded: Economic Service Charge (Amendment)_Sinhala.pdf
Downloaded: ports and Airports Development Levy (Amendment)_Sinhala.pdf
Downloaded: Institute of Environmental Professionals, Sri Lanka (Incorporation)_English.pdf
Downloaded: Institute of Environmental Professionals, Sri Lanka (Incorporation)_Sinhala.pdf
Processing year: 2019
Found 24 rows for year: 2019
Processing 24 rows 



KeyboardInterrupt: 

In [None]:
#@title cases scraper

import json
import requests
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# path to JSON file
file_path = "/content/drive/MyDrive/FYP/resources/jurilens-db.documents.json"

# Read the JSON file and load its data.
# The encoding is stated explicitly so the result does not depend on the
# platform default (the export cell of this notebook also writes UTF-8).
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Define the base save location (one sub-folder per year is created below it)
base_save_location = "/content/drive/MyDrive/FYP/law_cases_raw"

# Define the download function
def download_pdf(entry):
    """Download the PDF described by one record and store it by year.

    The record's ``file.url`` is tried first and ``file.sourceUrl`` is used
    as a fallback, both when ``url`` is missing and when requesting it fails.

    Args:
        entry: Record with a ``file`` mapping (``url``, ``sourceUrl``,
            ``name``) and a ``date`` mapping whose ``$date`` string starts
            with the four-digit year.

    Returns:
        str: A human-readable status line. The function never raises; any
        unexpected error is reported in the returned message.
    """
    try:
        if 'file' not in entry or 'date' not in entry:
            return f"Skipping entry (missing 'file' or 'date'): {entry.get('name', 'Unknown')}"

        file_info = entry['file']
        raw_name = file_info.get('name')

        # Extract year from date (assuming date is in ISO format)
        year = entry['date']['$date'][:4]  # Get the first four characters representing the year

        # Candidate URLs in order of preference
        candidate_urls = [
            candidate
            for candidate in (file_info.get('url'), file_info.get('sourceUrl'))
            if candidate
        ]
        if not candidate_urls:
            return f"Skipping {raw_name}: No valid URL found"

        # Keep only the final path component: the name comes from the data
        # file and must not be able to point outside the year folder.
        pdf_name = os.path.basename(raw_name) if raw_name else ''
        if not pdf_name:
            return f"Skipping entry (missing file name): {entry.get('name', 'Unknown')}"

        # Create a directory for the year if it doesn't exist
        year_folder = os.path.join(base_save_location, year)
        os.makedirs(year_folder, exist_ok=True)

        failure = "no attempt made"
        for candidate in candidate_urls:
            try:
                response = requests.get(candidate, timeout=10)
            except requests.RequestException as exc:
                failure = str(exc)
                continue  # try the next candidate

            if response.status_code == 200:
                save_path = os.path.join(year_folder, pdf_name)
                with open(save_path, 'wb') as pdf_file:
                    pdf_file.write(response.content)
                return f"Downloaded: {pdf_name}"

            failure = f"HTTP {response.status_code}"

        return f"Failed to download {pdf_name}: {failure}"

    except Exception as e:
        return f"Error processing {entry.get('name', 'Unknown')}: {str(e)}"

# Set the number of threads
num_threads = 400  # Adjust based on your system's capabilities

# Process files in parallel.
# Only the records from index 6000 onward are submitted
# (presumably the resume point of an earlier run; confirm before re-running).
pending_entries = json_data[6000:]
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = {}
    for pending_entry in pending_entries:
        futures[executor.submit(download_pdf, pending_entry)] = pending_entry

    # Print each status line as soon as its download finishes
    for finished in as_completed(futures):
        print(finished.result())

Downloaded: ca_dc_rathnapura_336_98.pdf
Downloaded: ca_writ_578_2011.pdf
Downloaded: ca_tax_05_2009.pdf
Downloaded: ca_hc_ampara_07_2013.pdf
Downloaded: ca_phc_balapitiya_76_08.pdf
Downloaded: ca_hc_matara_270_2012.pdf
Downloaded: ca_phc_apn_124_2012.pdf
Downloaded: ca_wakfs_01_2011.pdf
Downloaded: ca_writ_447_2011.pdf
Downloaded: ca_writ_469_09.pdf
Downloaded: ca_phc_apn_122_2013.pdf
Downloaded: ca_hc_trincomalee_205_2009.pdf
Downloaded: ca_phc_kurunegala_85_2012.pdf
Downloaded: ca_writ_875_2009.pdf
Downloaded: ca_hc_embilipitiya_172_11.pdf
Downloaded: ca_dc_colombo_712_00.pdf
Downloaded: ca_dc_kalutara_593_98.pdf
Downloaded: ca_hc_welikada_214_08.pdf
Downloaded: ca_hc_kandy_13_2006.pdf
Downloaded: ca_writ_366_2013.pdf
Downloaded: ca_dc_kurunegala_99_00.pdf
Downloaded: ca_writ_396_2011.pdf
Downloaded: ca_dc_rathnapura_434_00.pdf
Downloaded: ca_bra_02_2007.pdf
Downloaded: ca_hc_kegalle_72_2006.pdf
Downloaded: ca_dc_kuliyapitiya_612_00.pdf
Downloaded: ca_phc_apn_124_2012.pdf
Downloaded:

# **Preprocessing**

In [None]:
#@title Init

# Install required packages and dependencies
!pip install PyPDF2 pdf2image pytesseract
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr tesseract-ocr-sin tesseract-ocr-tam

!pip install fasttext
!pip install googletrans
!wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

#mount drive
from google.colab import drive
drive.mount('/content/drive')

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyPDF2, pdf2image
Successfully installed PyPDF2-3.0.1 pdf2image-1.17.0 pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 21 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.

In [None]:
#@title Extract text and OCR

import os
import uuid
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
from concurrent.futures import ProcessPoolExecutor, as_completed  # Multiprocessing

# ------------------- PDF Processing Functions -------------------
def extract_text_from_pdf(pdf_path):
    """Return the embedded text of a PDF, read page by page with PyPDF2.

    Pages that yield no text are skipped. If reading fails, the error is
    printed and whatever was collected up to that point is returned.
    """
    collected_pages = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            if extracted:
                collected_pages.append(extracted)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    # One newline between pages; surrounding whitespace removed
    return "\n".join(collected_pages).strip()

def ocr_pdf(pdf_path, lang="eng+sin"):
    """Render each PDF page to an image and return the OCR text of all pages.

    ``lang`` is passed to Tesseract unchanged. If rendering or recognition
    fails, the error is printed and the text gathered so far is returned.
    """
    recognised_pages = []
    try:
        # Lower DPI for faster processing
        for page_image in convert_from_path(pdf_path, dpi=200):
            recognised_pages.append(pytesseract.image_to_string(page_image, lang=lang))
    except Exception as e:
        print(f"Error OCR processing {pdf_path}: {e}")
    # One newline between pages; surrounding whitespace removed
    return "\n".join(recognised_pages).strip()

def process_pdf(pdf_path, lang="eng+sin", text_threshold=10):
    """Return the text of a PDF, using OCR when direct extraction is too short.

    Direct extraction is kept when it yields at least ``text_threshold``
    characters; otherwise the document is run through ``ocr_pdf`` with
    ``lang``.
    """
    extracted = extract_text_from_pdf(pdf_path)
    if len(extracted) >= text_threshold:
        return extracted

    print(f"Text extraction yielded very little text for {os.path.basename(pdf_path)}. Running OCR...\n")
    return ocr_pdf(pdf_path, lang=lang)

def process_pdf_file(pdf_path, lang="eng+sin"):
    """Process a single PDF file and return its data as a dictionary.

    The file name is expected to look like ``<title>_<language>.pdf``. The
    language is taken from the part after the LAST underscore, so titles that
    contain underscores themselves stay intact. A name without any underscore
    is kept whole and its language is reported as ``"unknown"``.

    Args:
        pdf_path: Path of the PDF to process.
        lang: Tesseract language string forwarded to ``process_pdf``.

    Returns:
        dict with the keys ``id`` (random UUID string), ``filename`` (title
        part of the file name), ``primaryLang``, ``text`` and ``wordCount``
        (number of whitespace-separated tokens in ``text``).
    """
    unique_id = str(uuid.uuid4())
    filename = os.path.basename(pdf_path)
    text = process_pdf(pdf_path, lang=lang)

    # Drop the extension first, then split title and language on the last "_"
    stem = os.path.splitext(filename)[0]
    title, separator, primary_lang = stem.rpartition('_')
    if not separator:
        title, primary_lang = stem, "unknown"

    return {
        "id": unique_id,
        "filename": title,
        "primaryLang": primary_lang,
        "text": text,
        "wordCount": len(text.split()),
    }

# ------------------- Parallel Processing -------------------
def process_folder(folder_path, lang="eng+sin", max_workers=4):
    """Process all PDFs in a folder using multiprocessing.

    Every file whose name ends in ``.pdf`` (case-insensitive) is handed to
    ``process_pdf_file`` in a worker process. Files are collected in sorted
    order so a run is reproducible; results are appended as they complete.

    Args:
        folder_path: Directory to scan (not recursive).
        lang: Tesseract language string forwarded to ``process_pdf_file``.
        max_workers: Number of worker processes.

    Returns:
        list of the dictionaries returned by ``process_pdf_file``. Files that
        fail are reported on stdout and left out. Empty when the folder holds
        no PDFs.
    """
    results = []
    pdf_files = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".pdf")
    )
    if not pdf_files:
        return results

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_pdf_file, pdf_path, lang): pdf_path
            for pdf_path in pdf_files
        }

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                result = future.result()  # Re-raises anything the worker raised
                results.append(result)
                print(f"Processed: {result['filename']} | Word Count: {result['wordCount']}")
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")

    return results

# ----------------------------- Usage Example -----------------------------
folder_path = "/content/drive/MyDrive/FYP/legal_acts_raw/1995"  # Update this path as needed

# Extract the text of the PDFs that process_folder selects from the folder
pdf_data = process_folder(folder_path)

# Print summary: one block per document, closed by a 50-dash rule
summary_rule = '-' * 50
for data in pdf_data:
    summary_lines = [
        f"ID: {data['id']}",
        f"Filename: {data['filename']}",
        f"Primary Language: {data['primaryLang']}",
        f"Word Count: {data['wordCount']}",
        f"Text Preview: {data['text'][:200]}",
        summary_rule,
    ]
    print("\n".join(summary_lines))

Text extraction yielded very little text for Monetary Law (Amendment)_Sinhala.pdf. Running OCR...

Processed: Monetary Law (Amendment) | Word Count: 733
ID: d9a49f45-3c6a-4a7c-9997-24aecbcda59b
Filename: Monetary Law (Amendment)
Primary Language: Sinhala
Word Count: 733
Text Preview: 22%”:

 

ශ්‍රී ලංකා
ප්‍රජාතාතත්‍රික යමාජවාදි ජතරජයේ
පාර්ලිමේත්තුව

1995 අංක 26 දරන
. මුදල්‌ නීති (සංශෝධන) පනත

;-

 

[සහතිකය සටහන” කළෙ' 1905 නෙැවැම්බර්‌ මස 17 චන දින]

ආණ'ඞුවේ නියමය පරිදි මුද්‍රණය ක
--------------------------------------------------


In [None]:
#@title Translation

import re
import fasttext
import asyncio
from googletrans import Translator
import nest_asyncio  # For Jupyter notebook environments

# Apply nest_asyncio to allow nested event loops
# (needed so that run_until_complete can be called while a loop is already running)
nest_asyncio.apply()

# Load the FastText language-identification model from the local file
# lid.176.bin (fetched with wget in the preprocessing Init cell)
model = fasttext.load_model("lid.176.bin")

# Create the googletrans Translator shared by all translation calls below
translator = Translator()

async def detect_language_fasttext(text, word_threshold=300):
    """Report whether any part of ``text`` is classified as non-English.

    The text is split on whitespace into consecutive chunks of at most
    ``word_threshold`` words, INCLUDING the final partial chunk, and each
    chunk is classified by the FastText ``model`` in a worker thread.

    Args:
        text: Text to inspect.
        word_threshold: Maximum number of words per classified chunk.

    Returns:
        bool: True if at least one chunk gets a top label other than ``en``
        (translation needed); False otherwise, and False for text without
        any words.
    """
    words = text.split()
    if not words:
        return False  # nothing to classify

    # Joining with single spaces also removes newlines from each chunk.
    chunks = [
        " ".join(words[start:start + word_threshold])
        for start in range(0, len(words), word_threshold)
    ]

    results = await asyncio.gather(
        *(asyncio.to_thread(model.predict, chunk) for chunk in chunks)
    )

    for prediction in results:
        labels = prediction[0]
        if not labels:
            continue  # the model returned no label for this chunk
        detected_lang = labels[0].replace("__label__", "")
        if detected_lang != "en":
            print("Needs Translation")
            return True  # Indicates translation is needed

    return False  # No translation needed

def _chunk_text_by_sentence(text, max_length):
    """Split ``text`` into sentence-aligned pieces of at most ``max_length`` characters."""
    chunks = []
    current = ""
    # Sentences end at '.', '!' or '?' followed by whitespace
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # A single sentence longer than the limit is cut into fixed-size
        # pieces so that no chunk ever exceeds max_length.
        for start in range(0, len(sentence), max_length):
            piece = sentence[start:start + max_length]
            if current and len(current) + 1 + len(piece) > max_length:
                chunks.append(current)
                current = piece
            else:
                current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks


async def translate_if_needed(text, max_length=2000, src='si', dest='en'):
    """Translate ``text`` when language detection flags non-English content.

    The text is split into sentence-aligned chunks of at most ``max_length``
    characters, all chunks are translated concurrently, and the translations
    are joined with single spaces.

    Args:
        text: Text to check and, if needed, translate.
        max_length: Maximum number of characters per translation request.
        src: Source language code passed to the translator.
        dest: Target language code passed to the translator.

    Returns:
        str: The translated text, or the original ``text`` when it is blank,
        when no translation is needed, or when translation fails.
    """
    if not text or not text.strip():
        return text  # nothing to detect or translate

    if await detect_language_fasttext(text):
        try:
            chunks = _chunk_text_by_sentence(text, max_length)

            # Translate all chunks in parallel
            tasks = [translator.translate(chunk, dest=dest, src=src) for chunk in chunks]
            translated_chunks = await asyncio.gather(*tasks)

            # Extract translated text
            return " ".join(tr.text for tr in translated_chunks)

        except Exception as e:
            print(f"Translation error: {e}")
            return text  # Return original if translation fails

    return text  # Return original if no translation is needed

async def process_documents(pdf_data):
    """Translate the ``text`` of every document concurrently, in place.

    Each document's ``text`` (empty string when absent) goes through
    ``translate_if_needed``; the result is written back to the same
    dictionary and a short preview of every document is printed.

    Args:
        pdf_data: Sequence of document dictionaries with ``id`` and
            ``filename`` keys; modified in place.
    """
    pending = [translate_if_needed(document.get("text", "")) for document in pdf_data]

    # Run translations in parallel; results come back in input order
    translated_texts = await asyncio.gather(*pending)

    # Assign translated text back to documents
    for document, translated in zip(pdf_data, translated_texts):
        document["text"] = translated

    # Print a preview of the updated text
    rule = '-' * 50
    for document in pdf_data:
        word_count = len(document['text'].split())
        preview = document['text'][:200]
        print(f"ID: {document['id']}\nFilename: {document['filename']}\nWord Count:{word_count}\nText Preview: {preview}\n{rule}\n")

# Main function to run process_documents
async def main():
    await process_documents(pdf_data)

# Run the main function in an environment with an existing event loop
try:
    loop = asyncio.get_running_loop()  # Get the current running loop
except RuntimeError:  # No running event loop, create a new one
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

# Await the main function (ensuring all tasks finish)
if loop.is_running():
    task = asyncio.create_task(main())  # For Jupyter Notebook
    await task
else:
    loop.run_until_complete(main())

ID: d9a49f45-3c6a-4a7c-9997-24aecbcda59b
Filename: Monetary Law (Amendment)
Word Count:795
Text Preview: 22% ":

 

Sri Lanka
Democratic Rataderi
Parliament

Bearing No. 26 of 1995
. Cash Laws (Amendment) Act

; -

 

[Certificate Note "Was' Newatu 17th of Chana 4, 1905

Printed as the ididge of the Ash.
--------------------------------------------------



In [None]:
import json
import os

# Specify the output file path
output_file = "/content/drive/MyDrive/FYP/json/cases_2025.json"

# Make sure the target folder exists: open() does not create missing directories
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write the pdf_data to a JSON file.
# ensure_ascii=False keeps non-ASCII text readable instead of \u-escaping it.
with open(output_file, "w", encoding='utf-8') as f:
    json.dump(pdf_data, f, indent=4, ensure_ascii=False)

print(f"PDF data successfully written to {output_file}")
