# **Seleksi & Unduh**

In [32]:
!pip install pdfminer.six



In [34]:
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

In [51]:
import os
import shutil

from google.colab import drive

# Clear leftovers at the mount point only while nothing is mounted there.
# If Google Drive is still mounted (e.g. this cell is re-run), an
# unconditional rmtree would recurse into the mounted Drive and delete the
# user's files instead of a stale local directory.
if not os.path.ismount('/content/drive'):
    shutil.rmtree('/content/drive', ignore_errors=True)

# Mount Google Drive at /content/drive (Colab-only API).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import argparse
import io
import os
import re
import time
import urllib
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

# Module-level set of PDF file names already downloaded; used by
# load_existing_pdfs() and get_pdf() to skip duplicates.
downloaded_pdfs = set()

def create_path(folder_name):
    """Make sure ``folder_name`` exists and return it.

    The directory (including missing parents) is created only when the path
    does not exist yet; a message is printed in that case.

    Args:
        folder_name: Directory path to ensure.

    Returns:
        The same ``folder_name`` that was passed in.
    """
    if os.path.exists(folder_name):
        return folder_name

    os.makedirs(folder_name)
    print(f"Folder dibuat: {folder_name}")
    return folder_name

def load_existing_pdfs(path_pdf):
    """Register the PDFs already present in ``path_pdf`` as downloaded.

    Every directory entry whose name ends with ``.pdf`` is added to the
    module-level ``downloaded_pdfs`` set so later downloads can be skipped.
    Nothing happens when ``path_pdf`` does not exist.

    Args:
        path_pdf: Directory that holds previously downloaded PDF files.
    """
    global downloaded_pdfs
    if not os.path.exists(path_pdf):
        return

    found = []
    for entry in os.listdir(path_pdf):
        if entry.endswith('.pdf'):
            found.append(entry)

    downloaded_pdfs.update(found)
    print(f"Ditemukan {len(found)} PDF yang sudah ada")

def sanitize_filename(filename):
    """Return ``filename`` made safe for use as a file name.

    Steps, in order:
      1. Each of the characters ``< > : " / \\ | ? *`` becomes ``_``.
      2. Leading/trailing whitespace is stripped and every remaining run of
         whitespace collapses to a single ``_``.
      3. Names longer than 200 characters are cut to their first 190
         characters plus a tail: the last 10 characters when those contain a
         dot, otherwise the literal ``.pdf``.

    Args:
        filename: Proposed file name.

    Returns:
        The sanitized file name.
    """
    # One translation pass replaces all invalid characters at once.
    replacements = str.maketrans({bad: '_' for bad in '<>:"/\\|?*'})
    cleaned = filename.translate(replacements)

    cleaned = re.sub(r'\s+', '_', cleaned.strip())

    if len(cleaned) <= 200:
        return cleaned

    tail = cleaned[-10:]
    if '.' not in tail:
        tail = '.pdf'
    return cleaned[:190] + tail

def generate_pdf_filename(nomor, tahun, tingkat_proses, original_url):
    """Build a consistent PDF file name of the form ``YEAR_LEVEL_NUMBER.pdf``.

    Args:
        nomor: Decision number; every character other than word characters
            and ``-`` is replaced by ``_``. Falsy values become ``no_number``.
        tahun: Year; falsy values become ``unknown_year``.
        tingkat_proses: Process level label, mapped to a short code
            (``TK1``/``TK2``/``TK3``/``PK``); unknown labels map to ``TK1``.
        original_url: Not used by this function.

    Returns:
        The file name after passing through ``sanitize_filename``.
    """
    level_codes = {
        'Tingkat Pertama': 'TK1',
        'Tingkat Banding': 'TK2',
        'Tingkat Kasasi': 'TK3',
        'Peninjauan Kembali': 'PK'
    }

    number_part = "no_number"
    if nomor:
        number_part = re.sub(r'[^\w\-]', '_', str(nomor))

    year_part = "unknown_year"
    if tahun:
        year_part = str(tahun)

    level_part = level_codes.get(tingkat_proses, 'TK1')

    # Example: 2024_TK1_123_Pid_Sus_2024_PN_Jakarta.pdf
    return sanitize_filename(f"{year_part}_{level_part}_{number_part}.pdf")

def open_page(link):
    """Fetch ``link`` and return it parsed with BeautifulSoup (lxml parser).

    The request is attempted up to three times with a 30-second timeout.
    HTTP error statuses (4xx/5xx) count as failures, so an error page is
    retried instead of being parsed as if it were real content. A 5-second
    pause separates attempts; no pause follows the final one.

    Args:
        link: Absolute URL to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every
        attempt failed.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(link, timeout=30)
            # Raise on 4xx/5xx so the retry logic below also covers them.
            response.raise_for_status()
            return BeautifulSoup(response.text, "lxml")
        except Exception as e:
            print(f"Error membuka halaman (percobaan {attempt}): {e}")
            # Waiting after the last attempt would only delay the caller.
            if attempt < max_attempts:
                time.sleep(5)
    return None

def get_detail(soup, keyword):
    """Return the text of the element following the ``<td>`` that contains ``keyword``.

    The first ``td`` tag whose text contains ``keyword`` is located, then the
    stripped text of the next element in the document is returned.

    Args:
        soup: Parsed document or tag to search in.
        keyword: Label text to look for inside a ``td``.

    Returns:
        The stripped text, or ``""`` when the label or its following element
        is missing (or ``soup``/``keyword`` is unusable).
    """
    try:
        label_cell = soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
        return label_cell.find_next().get_text().strip()
    except (AttributeError, TypeError):
        # AttributeError: soup is None, no matching cell, or nothing follows it.
        # TypeError: keyword is not a string. Anything else (including
        # KeyboardInterrupt) is allowed to propagate instead of being hidden.
        return ""

def get_pdf(url, path_pdf, nomor, tahun, tingkat_proses):
    """Download a decision PDF under a generated file name, skipping duplicates.

    Args:
        url: PDF link; a path starting with ``/`` is resolved against
            ``https://putusan3.mahkamahagung.go.id``.
        path_pdf: Directory the PDF is written to.
        nomor: Decision number, used to build the file name.
        tahun: Year, used to build the file name.
        tingkat_proses: Process level, used to build the file name.

    Returns:
        A ``(content, filename, status)`` tuple:
          * ``(io.BytesIO, filename, "downloaded")`` after a fresh download,
          * ``(None, filename, "exists")`` when the name is already tracked
            in ``downloaded_pdfs``,
          * ``(None, None, "error")`` when anything fails.
    """
    global downloaded_pdfs
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    from urllib.request import urlopen

    try:
        clean_filename = generate_pdf_filename(nomor, tahun, tingkat_proses, url)

        # Skip files that were already downloaded
        if clean_filename in downloaded_pdfs:
            print(f"PDF sudah ada, skip: {clean_filename}")
            return None, clean_filename, "exists"

        full_url = f"https://putusan3.mahkamahagung.go.id{url}" if url.startswith('/') else url
        # The context manager closes the connection; the timeout keeps one
        # stalled download from blocking the whole scrape.
        with urlopen(full_url, timeout=60) as response:
            file_content = response.read()

        file_path = os.path.join(path_pdf, clean_filename)
        with open(file_path, "wb") as out_file:
            out_file.write(file_content)

        # Track the file only after it has been written successfully
        downloaded_pdfs.add(clean_filename)

        print(f"PDF disimpan: {clean_filename}")
        return io.BytesIO(file_content), clean_filename, "downloaded"

    except Exception as e:
        print(f"Error download PDF: {e}")
        return None, None, "error"

def clean_text(text):
    """Strip the fixed boilerplate lines from text extracted out of a PDF.

    The watermark line, the ``Disclaimer`` line, the three disclaimer
    paragraphs and the contact line are removed (each including its trailing
    newline), in that order, and the result is whitespace-stripped.

    Args:
        text: Extracted text.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Removed sequentially, in this order.
    boilerplate = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    )

    cleaned = text
    for fragment in boilerplate:
        cleaned = cleaned.replace(fragment, "")
    return cleaned.strip()

def extract_data(link, keyword_url, path_output, path_pdf, today):
    """Scrape one decision page, fetch its PDF, and append one row to the CSV.

    The detail table of the page is read field by field, the linked PDF (if
    any) is downloaded or reused and its text extracted, and the resulting
    record is written to ``{path_output}/putusan_ma_korupsi_2024_{today}.csv``
    (created with a header on first use, appended to afterwards).

    All exceptions are caught and reported with ``print``; nothing is raised.

    Args:
        link: Decision page URL; a path starting with ``/`` is resolved
            against ``https://putusan3.mahkamahagung.go.id``.
        keyword_url: Not used by this function.
        path_output: Directory of the CSV file.
        path_pdf: Directory where PDFs are stored.
        today: Date string embedded in the CSV file name.
    """
    # (CSV column, label looked up in the detail table), in output order.
    detail_fields = (
        ("nomor", "Nomor"),
        ("tingkat_proses", "Tingkat Proses"),
        ("klasifikasi", "Klasifikasi"),
        ("kata_kunci", "Kata Kunci"),
        ("tahun", "Tahun"),
        ("tanggal_register", "Tanggal Register"),
        ("lembaga_peradilan", "Lembaga Peradilan"),
        ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
        ("hakim_ketua", "Hakim Ketua"),
        ("hakim_anggota", "Hakim Anggota"),
        ("panitera", "Panitera"),
        ("amar", "Amar"),
        ("amar_lainnya", "Amar Lainnya"),
        ("catatan_amar", "Catatan Amar"),
        ("tanggal_musyawarah", "Tanggal Musyawarah"),
        ("tanggal_dibacakan", "Tanggal Dibacakan"),
        ("kaidah", "Kaidah"),
        ("status", "Status"),
        ("abstrak", "Abstrak"),
    )

    try:
        if link.startswith('/'):
            full_link = f"https://putusan3.mahkamahagung.go.id{link}"
        else:
            full_link = link
        print(f"Mengekstrak data dari: {link}")

        soup = open_page(full_link)
        if not soup:
            print(f"Gagal membuka link: {link}")
            return

        table = soup.find("table", {"class": "table"})
        if not table:
            print(f"Tidak ditemukan table di: {link}")
            return

        heading = table.find("h2")
        record = {"judul": heading.text if heading else ""}

        # Read every detail field from the table
        for column, label in detail_fields:
            record[column] = get_detail(table, label)

        # PDF handling: defaults stay in place when there is no PDF or it fails
        text_pdf = ""
        link_pdf = ""
        file_name_pdf = ""
        pdf_status = ""

        try:
            pdf_anchor = soup.find("a", href=re.compile(r"/pdf/"))
            if not pdf_anchor:
                print(f"Tidak ada PDF untuk: {record['nomor']}")
            else:
                link_pdf = pdf_anchor["href"]
                file_pdf, file_name_pdf, pdf_status = get_pdf(
                    link_pdf,
                    path_pdf,
                    record["nomor"],
                    record["tahun"],
                    record["tingkat_proses"],
                )

                if file_pdf and pdf_status == "downloaded":
                    # Freshly downloaded: extract from the in-memory bytes
                    text_pdf = extract_text(file_pdf)
                    text_pdf = clean_text(text_pdf)
                elif pdf_status == "exists":
                    # Already on disk: extract from the stored file
                    stored_pdf = os.path.join(path_pdf, file_name_pdf)
                    if os.path.exists(stored_pdf):
                        try:
                            text_pdf = extract_text(stored_pdf)
                            text_pdf = clean_text(text_pdf)
                        except Exception as e:
                            print(f"Error extracting existing PDF {file_name_pdf}: {e}")
                            text_pdf = ""

        except Exception as e:
            print(f"Error PDF: {e}")

        record["link"] = full_link
        record["link_pdf"] = link_pdf
        record["file_name_pdf"] = file_name_pdf
        record["text_pdf"] = text_pdf
        record["pdf_status"] = pdf_status

        result = pd.DataFrame([list(record.values())], columns=list(record.keys()))

        # NOTE(review): the CSV name is hard-coded to "korupsi_2024" and
        # keyword_url is never consulted — confirm this is intended.
        keyword_clean = "korupsi_2024"
        destination = f"{path_output}/putusan_ma_{keyword_clean}_{today}.csv"

        if os.path.isfile(destination):
            result.to_csv(destination, mode="a", header=False, index=False)
            print(f"Data ditambahkan ke: {destination}")
        else:
            result.to_csv(destination, header=True, index=False)
            print(f"File CSV dibuat: {destination}")

    except Exception as e:
        print(f"Error extract_data: {e}")

def run_process(keyword_url, page, sort_date, path_output, path_pdf, today):
    """Scrape one search-result page and process every decision linked on it.

    Args:
        keyword_url: Either a full search URL (starting with ``https``), to
            which ``&page=`` is appended, or a bare keyword used as ``q=``.
        page: Result page number.
        sort_date: When truthy, the sort parameters
            ``obf=TANGGAL_PUTUS&obm=desc`` are appended.
        path_output: CSV output directory, forwarded to ``extract_data``.
        path_pdf: PDF output directory, forwarded to ``extract_data``.
        today: Date string, forwarded to ``extract_data``.

    Exceptions are caught and printed; nothing is raised.
    """
    try:
        link = (
            f"{keyword_url}&page={page}"
            if keyword_url.startswith("https")
            else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
        )
        if sort_date:
            link = f"{link}&obf=TANGGAL_PUTUS&obm=desc"

        print(f"\nScraping halaman {page}: {link}")
        soup = open_page(link)
        if not soup:
            print(f"Gagal membuka halaman {page}")
            return

        links = soup.find_all("a", {"href": re.compile("/direktori/putusan")})
        total = len(links)
        print(f"Ditemukan {total} putusan di halaman {page}")

        for position, anchor in enumerate(links, start=1):
            print(f"  [{position}/{total}] Processing...")
            extract_data(anchor["href"], keyword_url, path_output, path_pdf, today)
            # Pause between decisions to avoid overloading the server
            time.sleep(1)

    except Exception as e:
        print(f"Error run_process halaman {page}: {e}")

def run_scraper(keyword=None, url=None, sort_date=True, download_pdf=True):
    """Scrape every result page of a search and report what was saved.

    Output goes to fixed Google Drive folders
    (``/content/drive/MyDrive/terorisme/CSV`` and ``.../PDF``). The first
    result page is fetched to read the pagination, then each page is handed
    to ``run_process``.

    Args:
        keyword: Search keyword; used when ``url`` is not given.
        url: Full search URL; takes precedence over ``keyword``.
        sort_date: Forwarded to ``run_process``.
        download_pdf: Not used by this function.
    """
    if not (keyword or url):
        print("Please provide a keyword or URL")
        return

    # Output folders
    path_output = '/content/drive/MyDrive/terorisme/CSV'
    path_pdf = '/content/drive/MyDrive/terorisme/PDF'
    for folder in (path_output, path_pdf):
        create_path(folder)

    # Register PDFs that are already on disk
    load_existing_pdfs(path_pdf)

    today = date.today().strftime("%Y-%m-%d")

    if url:
        link = url
    else:
        link = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"

    print(f"Mengakses URL: {link}")
    soup = open_page(link)
    if not soup:
        print("Gagal membuka halaman pertama")
        return

    # Work out the last page number from the pagination links
    pagination_links = soup.find_all("a", {"class": "page-link"})
    last_page = 1
    if pagination_links:
        try:
            raw_pages = (a.get("data-ci-pagination-page") for a in pagination_links)
            page_numbers = [int(value) for value in raw_pages if value and value.isdigit()]
            last_page = max(page_numbers, default=1)
        except Exception as e:
            print(f"Error deteksi pagination: {e}")
            last_page = 1
    else:
        print("Tidak ada pagination, kemungkinan hanya 1 halaman")

    # Stop early when the site reports no results
    no_results = soup.find("div", class_="alert alert-info")
    if no_results and "tidak ditemukan" in no_results.text.lower():
        print("Tidak ada hasil ditemukan untuk pencarian ini")
        return

    print(f"Total halaman yang akan di-scrape: {last_page}")
    print(f"Estimasi data: {20 * last_page}")

    keyword_url = url or keyword

    page = 1
    while page <= last_page:
        run_process(keyword_url, page, sort_date, path_output, path_pdf, today)
        # Pause between result pages
        time.sleep(2)
        page += 1

    print("\n=== SCRAPING SELESAI ===")
    print(f"File CSV disimpan di: {path_output}")
    print(f"File PDF disimpan di: {path_pdf}")

    # Report what ended up on disk
    try:
        csv_files = [name for name in os.listdir(path_output) if name.endswith('.csv')]
        pdf_files = [name for name in os.listdir(path_pdf) if name.endswith('.pdf')]
        print(f"Jumlah file CSV: {len(csv_files)}")
        print(f"Jumlah file PDF: {len(pdf_files)}")

        if pdf_files:
            # Show a few sample file names
            print("\nContoh nama PDF yang rapi:")
            for sample in sorted(pdf_files)[:5]:
                print(f"  - {sample}")
            remaining = len(pdf_files) - 5
            if remaining > 0:
                print(f"  ... dan {remaining} file lainnya")

    except Exception as e:
        print(f"Tidak dapat mengecek file hasil: {e}")

# Run the scraper with a fixed search URL
# (query "Terorisme", jenis_doc=putusan, t_put=2023).
if __name__ == "__main__":
    run_scraper(url="https://putusan3.mahkamahagung.go.id/search.html?q=Terorisme&jenis_doc=putusan&t_put=2023")

Ditemukan 0 PDF yang sudah ada
Mengakses URL: https://putusan3.mahkamahagung.go.id/search.html?q=Terorisme&jenis_doc=putusan&t_put=2023
Total halaman yang akan di-scrape: 6
Estimasi data: 120

Scraping halaman 1: https://putusan3.mahkamahagung.go.id/search.html?q=Terorisme&jenis_doc=putusan&t_put=2023&page=1&obf=TANGGAL_PUTUS&obm=desc
Ditemukan 23 putusan di halaman 1
  [1/23] Processing...
Mengekstrak data dari: https://putusan3.mahkamahagung.go.id/direktori/putusan/zaeea92129cdc7e0bc25313034333539.html
PDF disimpan: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.pdf
File CSV dibuat: /content/drive/MyDrive/terorisme/CSV/putusan_ma_korupsi_2024_2025-06-12.csv
  [2/23] Processing...
Mengekstrak data dari: https://putusan3.mahkamahagung.go.id/direktori/putusan/zaeec5a02ac700949d7b313730333339.html
PDF disimpan: 2023_TK1_Putusan_PN_JAKARTA_T

## Konversi & Ekstraksi Teks

In [52]:
!pip install pandas requests beautifulsoup4 pdfminer.six lxml --quiet

In [53]:
import os
import pandas as pd
import re
import io
import subprocess
import logging
from pdfminer.high_level import extract_text
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from datetime import datetime

In [59]:
# PDF processing imports, guarded so a missing pdfminer installation is
# recorded in PDFMINER_AVAILABLE instead of raising ImportError here.
try:
    from pdfminer.high_level import extract_text
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    PDFMINER_AVAILABLE = True
except ImportError:
    PDFMINER_AVAILABLE = False
    print("pdfminer not available - install with: pip install pdfminer.six")

# Setup logging: root logger at INFO level plus a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TextExtractor:
    """Extract plain text from PDF files and store it unmodified (no cleaning).

    PDFs are read from ``<base_dir>/PDF`` and the raw text is written to
    ``<base_dir>/RAW_TEXT``. Progress is logged to ``extraction.log`` and a
    CSV report is written, both inside ``logs_dir``.
    """

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        """Set up directories and the extraction logger.

        Args:
            base_dir: Root folder containing the ``PDF`` input directory.
        """
        self.base_dir = base_dir
        self.pdf_dir = os.path.join(base_dir, "PDF")
        self.raw_text_dir = os.path.join(base_dir, "RAW_TEXT")  # Raw text output
        # NOTE(review): logs go to an absolute /logs directory, independent of
        # base_dir — confirm this location is intended.
        self.logs_dir = "/logs"

        # Create directories
        os.makedirs(self.raw_text_dir, exist_ok=True)
        os.makedirs(self.logs_dir, exist_ok=True)

        print(f"PDF input: {self.pdf_dir}")
        print(f"Raw text output: {self.raw_text_dir}")

        # Setup extraction logger
        self.setup_extraction_logger()

    def setup_extraction_logger(self):
        """Configure the dedicated ``text_extraction`` logger.

        Existing handlers are removed first so repeated instantiation does
        not duplicate log lines; a file handler appending to
        ``<logs_dir>/extraction.log`` is then attached and a session banner
        is logged.
        """
        self.extraction_logger = logging.getLogger('text_extraction')
        self.extraction_logger.setLevel(logging.INFO)

        # Remove existing handlers
        for handler in self.extraction_logger.handlers[:]:
            self.extraction_logger.removeHandler(handler)

        # Create file handler
        log_file = os.path.join(self.logs_dir, 'extraction.log')
        file_handler = logging.FileHandler(log_file, mode='a', encoding='utf-8')
        file_handler.setLevel(logging.INFO)

        # Create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        file_handler.setFormatter(formatter)
        self.extraction_logger.addHandler(file_handler)

        self.extraction_logger.info("="*60)
        self.extraction_logger.info("TEXT EXTRACTION SESSION STARTED")
        self.extraction_logger.info("="*60)

    # =================== PDF EXTRACTION METHODS ===================

    def pdf_to_text_pdfminer_basic(self, pdf_path):
        """Method 1: extract text with pdfminer's high-level ``extract_text``.

        Returns:
            ``(text, "pdfminer_basic")`` on success, or ``(None, reason)``
            when pdfminer is unavailable or extraction raised.
        """
        if not PDFMINER_AVAILABLE:
            return None, "pdfminer not available"

        try:
            text = extract_text(pdf_path)
            return text, "pdfminer_basic"
        except Exception as e:
            return None, f"pdfminer_basic error: {e}"

    def pdf_to_text_pdfminer_advanced(self, pdf_path):
        """Method 2: extract text page by page with pdfminer layout analysis.

        Returns:
            ``(text, "pdfminer_advanced")`` on success, or ``(None, reason)``
            when pdfminer is unavailable or extraction raised.
        """
        if not PDFMINER_AVAILABLE:
            return None, "pdfminer not available"

        try:
            resource_manager = PDFResourceManager()
            # The buffer and converter are released even when a page fails
            # to parse, instead of only on the success path.
            with io.StringIO() as fake_file_handle:
                converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager, converter)

                    with open(pdf_path, 'rb') as fh:
                        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                            page_interpreter.process_page(page)

                    text = fake_file_handle.getvalue()
                finally:
                    converter.close()

            return text, "pdfminer_advanced"
        except Exception as e:
            return None, f"pdfminer_advanced error: {e}"

    def pdf_to_text_pdftotext(self, pdf_path):
        """Method 3: extract text with the ``pdftotext`` tool (poppler-utils).

        When ``pdftotext`` cannot be run, an ``apt-get`` installation of
        poppler-utils is attempted first.

        Returns:
            ``(text, "pdftotext")`` on success, or ``(None, reason)`` when
            installation or extraction failed.
        """
        try:
            # Check if pdftotext is available
            try:
                subprocess.run(['pdftotext', '-v'], capture_output=True, check=True)
            except (subprocess.CalledProcessError, FileNotFoundError):
                # Try to install poppler-utils
                try:
                    self.extraction_logger.info("Installing poppler-utils...")
                    subprocess.run(['apt-get', 'update'], check=True, capture_output=True)
                    subprocess.run(['apt-get', 'install', '-y', 'poppler-utils'], check=True, capture_output=True)
                    self.extraction_logger.info("poppler-utils installed successfully")
                except Exception as install_error:
                    return None, f"Failed to install poppler-utils: {install_error}"

            # Extract text using pdftotext ('-' sends the output to stdout)
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_path, '-'],
                capture_output=True, text=True, check=True
            )

            return result.stdout, "pdftotext"
        except subprocess.CalledProcessError as e:
            return None, f"pdftotext error: {e}"
        except Exception as e:
            return None, f"pdftotext setup error: {e}"

    def extract_from_pdf(self, pdf_path):
        """Extract text from one PDF, trying each method until one succeeds.

        Methods are tried in order (pdfminer basic, pdfminer advanced,
        pdftotext). A result is accepted when its stripped text is longer
        than 50 characters. Failures and too-short results are logged so the
        reason each method was rejected is not lost.

        Returns:
            ``(text, method_name)`` for the first accepted result, or
            ``(None, "all_methods_failed")``.
        """
        filename = os.path.basename(pdf_path)
        self.extraction_logger.info(f"Extracting from PDF: {filename}")

        # Try extraction methods in order
        methods = [
            self.pdf_to_text_pdfminer_basic,
            self.pdf_to_text_pdfminer_advanced,
            self.pdf_to_text_pdftotext
        ]

        for method in methods:
            text, method_info = method(pdf_path)

            if text is None:
                # On failure, method_info carries the reason reported by the method.
                self.extraction_logger.warning(f"Extraction method failed: {method_info}")
                continue

            if len(text.strip()) > 50:  # Minimum threshold
                self.extraction_logger.info(f"Success with {method_info}: {len(text)} characters")
                return text, method_info

            self.extraction_logger.warning(f"{method_info} returned short text: {len(text)} chars")

        self.extraction_logger.error(f"All PDF extraction methods failed for {filename}")
        return None, "all_methods_failed"

    # =================== MAIN PROCESSING METHODS ===================

    def process_single_file(self, file_path):
        """Extract one PDF and save its raw text as ``raw_<name>.txt``.

        Args:
            file_path: Path of the file to process; only ``.pdf`` is accepted.

        Returns:
            A dict describing the result (source file, output file and path,
            method used, text length, status), or ``None`` when the file is
            not a PDF, extraction yielded fewer than 50 stripped characters,
            or the text could not be saved.
        """
        filename = os.path.basename(file_path)
        file_ext = os.path.splitext(filename)[1].lower()

        # Only process PDF files
        if file_ext != '.pdf':
            self.extraction_logger.error(f"Only PDF files supported: {filename}")
            return None

        raw_text, method_used = self.extract_from_pdf(file_path)

        if not raw_text or len(raw_text.strip()) < 50:
            self.extraction_logger.error(f"Extraction failed or insufficient text: {filename}")
            return None

        # Save raw text (NO CLEANING)
        base_name = os.path.splitext(filename)[0]
        raw_text_filename = f"raw_{base_name}.txt"
        raw_text_path = os.path.join(self.raw_text_dir, raw_text_filename)

        try:
            with open(raw_text_path, 'w', encoding='utf-8') as f:
                f.write(raw_text)

            self.extraction_logger.info(f"Raw text saved: {raw_text_filename}")
            print(f"SUCCESS: {filename} -> {raw_text_filename} ({len(raw_text)} chars)")

            return {
                'source_file': filename,
                'source_type': file_ext,
                'raw_text_file': raw_text_filename,
                'raw_text_path': raw_text_path,
                'method_used': method_used,
                'text_length': len(raw_text),
                'status': 'success'
            }

        except Exception as e:
            self.extraction_logger.error(f"Error saving raw text for {filename}: {e}")
            return None

    def process_all_pdfs(self):
        """Process every ``.pdf`` file found in ``pdf_dir``.

        Returns:
            A list with the result dict of each successfully processed file;
            empty when the directory is missing or holds no PDFs.
        """
        if not os.path.exists(self.pdf_dir):
            print(f"PDF directory not found: {self.pdf_dir}")
            return []

        pdf_files = [f for f in os.listdir(self.pdf_dir) if f.endswith('.pdf')]

        if not pdf_files:
            print(f"No PDF files found in {self.pdf_dir}")
            return []

        print(f"Found {len(pdf_files)} PDF files")

        results = []
        for i, pdf_file in enumerate(pdf_files, 1):
            pdf_path = os.path.join(self.pdf_dir, pdf_file)
            print(f"[{i}/{len(pdf_files)}] Processing: {pdf_file}")

            result = self.process_single_file(pdf_path)
            if result:
                results.append(result)

        return results

    def process_all_files(self):
        """Run the full extraction, write the CSV report, and print a summary.

        Returns:
            A DataFrame with one row per successfully extracted PDF, or
            ``None`` when nothing was extracted.
        """
        print("TEXT EXTRACTION STARTED")
        print("=" * 50)
        print("Tujuan: Konversi PDF -> Raw plain text")
        print("Output: Raw text files (BELUM dibersihkan)")
        print("=" * 50)

        # Process PDFs
        pdf_results = self.process_all_pdfs()

        if not pdf_results:
            print("No files processed successfully")
            return None

        # Create extraction report
        df_results = pd.DataFrame(pdf_results)
        report_path = os.path.join(self.logs_dir, 'extraction_report.csv')
        df_results.to_csv(report_path, index=False)

        # Summary (both counts below reflect successful extractions only)
        print("\n" + "=" * 60)
        print("EXTRACTION SUMMARY")
        print("=" * 60)
        print(f"PDF files processed: {len(pdf_results)}")
        print(f"Total successful extractions: {len(pdf_results)}")
        print(f"Raw text files saved to: {self.raw_text_dir}")
        print(f"Extraction report: {report_path}")
        print(f"Extraction log: {os.path.join(self.logs_dir, 'extraction.log')}")

        return df_results

# Utility functions
def extract_single_pdf(pdf_path, output_dir="/tmp"):
    """Extract a single PDF and write its raw text into ``output_dir``.

    A ``TextExtractor`` with default settings is created and its raw-text
    directory is redirected to ``output_dir``, which is created when missing
    so the save step does not fail on a new directory.

    Args:
        pdf_path: Path of the PDF to extract.
        output_dir: Directory for the ``raw_<name>.txt`` output.

    Returns:
        The result dict from ``TextExtractor.process_single_file``, or
        ``None`` when extraction or saving failed.
    """
    extractor = TextExtractor()
    os.makedirs(output_dir, exist_ok=True)
    extractor.raw_text_dir = output_dir
    return extractor.process_single_file(pdf_path)

# Main execution
def main(base_dir="/content/drive/MyDrive/terorisme"):
    """Run the PDF-to-raw-text extraction step and report the outcome.

    Prints a dependency check for pdfminer.six, builds a ``TextExtractor``
    for ``base_dir`` and runs ``process_all_files`` on it.

    Args:
        base_dir: Value passed to ``TextExtractor``. Defaults to the Google
            Drive folder that was previously hard-coded here, so calling
            ``main()`` without arguments behaves as before.
    """
    print("ii. KONVERSI & EKSTRAKSI TEKS")
    print("=" * 50)

    # Check dependencies
    print("Checking dependencies...")
    if PDFMINER_AVAILABLE:
        print("SUCCESS: pdfminer.six available")
    else:
        # NOTE(review): the run continues even when pdfminer.six is missing;
        # confirm this is intended.
        print("ERROR: pdfminer.six not available - install with: pip install pdfminer.six")

    # Initialize extractor
    extractor = TextExtractor(base_dir)

    # Run extraction; process_all_files returns None when nothing was extracted
    results = extractor.process_all_files()

    if results is not None:
        print("\nEXTRACTION COMPLETE!")
        print(f"Check raw text files in: {extractor.raw_text_dir}")
        print("Next step: Run text cleaning on raw files")
    else:
        print("\nNo files extracted. Check your PDF directory.")

# Entry point: run the extraction step only when this code is executed
# directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


INFO:text_extraction:TEXT EXTRACTION SESSION STARTED
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.pdf


ii. KONVERSI & EKSTRAKSI TEKS
Checking dependencies...
SUCCESS: pdfminer.six available
PDF input: /content/drive/MyDrive/terorisme/PDF
Raw text output: /content/drive/MyDrive/terorisme/RAW_TEXT
TEXT EXTRACTION STARTED
Tujuan: Konversi PDF -> Raw plain text
Output: Raw text files (BELUM dibersihkan)
Found 46 PDF files
[1/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.pdf


INFO:text_extraction:Success with pdfminer_basic: 242685 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.txt (242685 chars)
[2/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.pdf


INFO:text_extraction:Success with pdfminer_basic: 22592 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.txt (22592 chars)
[3/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.pdf


INFO:text_extraction:Success with pdfminer_basic: 286786 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.txt (286786 chars)
[4/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.pdf


INFO:text_extraction:Success with pdfminer_basic: 190078 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt (190078 chars)
[5/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.pdf


INFO:text_extraction:Success with pdfminer_basic: 85692 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt (85692 chars)
[6/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.pdf


INFO:text_extraction:Success with pdfminer_basic: 275059 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.txt (275059 chars)
[7/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.pdf


INFO:text_extraction:Success with pdfminer_basic: 146539 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_532_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_IDRIS_ABDILLAH_MAHMUD_ALIAS_SETETES_EMBUN_PAGI_BMAHMUD.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt (146539 chars)
[8/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_532_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_IDRIS_ABDILLAH_MAHMUD_ALIAS_SETETES_EMBUN_PAGI_BMAHMUD.pdf


INFO:text_extraction:Success with pdfminer_basic: 161366 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_532_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_IDRIS_ABDILLAH_MAHMUD_ALIAS_SETETES_EMBUN_PAGI_BMAHMUD.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_532_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_IDRIS_ABDILLAH_MAHMUD_ALIAS_SETETES_EMBUN_PAGI_BMAHMUD.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_532_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_IDRIS_ABDILLAH_MAHMUD_ALIAS_SETETES_EMBUN_PAGI_BMAHMUD.txt (161366 chars)
[9/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.pdf


INFO:text_extraction:Success with pdfminer_basic: 324827 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.txt (324827 chars)
[10/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.pdf


INFO:text_extraction:Success with pdfminer_basic: 246148 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.txt (246148 chars)
[11/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.pdf


INFO:text_extraction:Success with pdfminer_basic: 253053 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt (253053 chars)
[12/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.pdf


INFO:text_extraction:Success with pdfminer_basic: 256336 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.txt (256336 chars)
[13/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.pdf


INFO:text_extraction:Success with pdfminer_basic: 191721 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.txt (191721 chars)
[14/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.pdf


INFO:text_extraction:Success with pdfminer_basic: 161433 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.txt (161433 chars)
[15/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.pdf


INFO:text_extraction:Success with pdfminer_basic: 122688 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt (122688 chars)
[16/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.pdf


INFO:text_extraction:Success with pdfminer_basic: 179295 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt (179295 chars)
[17/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.pdf


INFO:text_extraction:Success with pdfminer_basic: 74325 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.txt (74325 chars)
[18/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.pdf


INFO:text_extraction:Success with pdfminer_basic: 178380 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt (178380 chars)
[19/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.pdf


INFO:text_extraction:Success with pdfminer_basic: 161712 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.txt (161712 chars)
[20/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.pdf


INFO:text_extraction:Success with pdfminer_basic: 178592 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.txt (178592 chars)
[21/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.pdf


INFO:text_extraction:Success with pdfminer_basic: 227472 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt (227472 chars)
[22/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.pdf


INFO:text_extraction:Success with pdfminer_basic: 196345 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt (196345 chars)
[23/46] Processing: 2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


INFO:text_extraction:Success with pdfminer_basic: 58818 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_PA_SAMPANG_Nomor_776_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf


SUCCESS: 2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf -> raw_2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt (58818 chars)
[24/46] Processing: 2025_TK1_Putusan_PA_SAMPANG_Nomor_776_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf


INFO:text_extraction:Success with pdfminer_basic: 29501 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_PA_SAMPANG_Nomor_776_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf


SUCCESS: 2025_TK1_Putusan_PA_SAMPANG_Nomor_776_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf -> raw_2025_TK1_Putusan_PA_SAMPANG_Nomor_776_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt (29501 chars)
[25/46] Processing: 2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf


INFO:text_extraction:Success with pdfminer_basic: 35367 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.pdf


SUCCESS: 2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf -> raw_2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt (35367 chars)
[26/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.pdf


INFO:text_extraction:Success with pdfminer_basic: 197688 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.txt (197688 chars)
[27/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.pdf


INFO:text_extraction:Success with pdfminer_basic: 103760 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.txt (103760 chars)
[28/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.pdf


INFO:text_extraction:Success with pdfminer_basic: 191798 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt (191798 chars)
[29/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.pdf


INFO:text_extraction:Success with pdfminer_basic: 148631 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.txt (148631 chars)
[30/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.pdf


INFO:text_extraction:Success with pdfminer_basic: 198202 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.txt (198202 chars)
[31/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.pdf


INFO:text_extraction:Success with pdfminer_basic: 195771 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt (195771 chars)
[32/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.pdf


INFO:text_extraction:Success with pdfminer_basic: 191458 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.txt (191458 chars)
[33/46] Processing: 2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf


INFO:text_extraction:Success with pdfminer_basic: 36027 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.pdf


SUCCESS: 2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.pdf -> raw_2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt (36027 chars)
[34/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.pdf


INFO:text_extraction:Success with pdfminer_basic: 219887 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt (219887 chars)
[35/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.pdf


INFO:text_extraction:Success with pdfminer_basic: 127382 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.txt (127382 chars)
[36/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.pdf


INFO:text_extraction:Success with pdfminer_basic: 166195 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.txt (166195 chars)
[37/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.pdf


INFO:text_extraction:Success with pdfminer_basic: 370242 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt (370242 chars)
[38/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.pdf


INFO:text_extraction:Success with pdfminer_basic: 231201 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt (231201 chars)
[39/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.pdf


INFO:text_extraction:Success with pdfminer_basic: 303961 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt (303961 chars)
[40/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.pdf


INFO:text_extraction:Success with pdfminer_basic: 291067 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt (291067 chars)
[41/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.pdf


INFO:text_extraction:Success with pdfminer_basic: 142130 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.txt (142130 chars)
[42/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.pdf


INFO:text_extraction:Success with pdfminer_basic: 242001 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt
INFO:text_extraction:Extracting from PDF: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt (242001 chars)
[43/46] Processing: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.pdf


INFO:text_extraction:Success with pdfminer_basic: 121794 characters
INFO:text_extraction:Raw text saved: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


SUCCESS: 2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.pdf -> raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.txt (121794 chars)
[44/46] Processing: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


INFO:text_extraction:Success with pdfminer_basic: 27098 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


SUCCESS: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf -> raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt (27098 chars)
[45/46] Processing: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


INFO:text_extraction:Success with pdfminer_basic: 29854 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
INFO:text_extraction:Extracting from PDF: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_82_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


SUCCESS: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf -> raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt (29854 chars)
[46/46] Processing: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_82_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf


INFO:text_extraction:Success with pdfminer_basic: 24798 characters
INFO:text_extraction:Raw text saved: raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_82_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt


SUCCESS: 2025_TK1_Putusan_PA_TILAMUTA_Nomor_82_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.pdf -> raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_82_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt (24798 chars)

EXTRACTION SUMMARY
PDF files processed: 46
Total successful extractions: 46
Raw text files saved to: /content/drive/MyDrive/terorisme/RAW_TEXT
Extraction report: /logs/extraction_report.csv
Extraction log: /logs/extraction.log

EXTRACTION COMPLETE!
Check raw text files in: /content/drive/MyDrive/terorisme/RAW_TEXT
Next step: Run text cleaning on raw files


## Pembersihan

In [60]:
import os
import pandas as pd
import re
import logging
from datetime import date

In [61]:
import os
import re
import logging
from datetime import datetime

# Set up logging: configure the root logger at INFO level and create a
# module-level logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TextCleaner:
    """Clean raw extracted text files"""

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir
        self.input_dir = os.path.join(base_dir, "RAW_TEXT")  # Input: raw text dari tahap ii
        self.output_dir = "/data/raw"  # Output: cleaned text files
        self.gdrive_output_dir = os.path.join(base_dir, "CLEANED")  # Google Drive backup
        self.gdrive_data_raw_dir = os.path.join(base_dir, "data", "raw")  # Mirror of /data/raw in gdrive
        self.logs_dir = "/logs"  # Local logs
        self.gdrive_logs_dir = os.path.join(base_dir, "logs")  # Google Drive logs mirror

        # Create directories
        os.makedirs(self.input_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.gdrive_output_dir, exist_ok=True)
        os.makedirs(self.gdrive_data_raw_dir, exist_ok=True)
        os.makedirs(self.logs_dir, exist_ok=True)
        os.makedirs(self.gdrive_logs_dir, exist_ok=True)

        print(f"Input (raw text): {self.input_dir}")
        print(f"Output 1 (data/raw): {self.output_dir}")
        print(f"Output 2 (gdrive): {self.gdrive_output_dir}")
        print(f"Output 3 (gdrive/data/raw): {self.gdrive_data_raw_dir}")
        print(f"Logs 1 (local): {self.logs_dir}")
        print(f"Logs 2 (gdrive): {self.gdrive_logs_dir}")

        # Setup cleaning logger
        self.setup_cleaning_logger()

    def setup_cleaning_logger(self):
        """Setup dedicated cleaning logger with dual output"""
        self.cleaning_logger = logging.getLogger('text_cleaning')
        self.cleaning_logger.setLevel(logging.INFO)

        # Remove existing handlers
        for handler in self.cleaning_logger.handlers[:]:
            self.cleaning_logger.removeHandler(handler)

        # Create file handlers for both locations
        log_file_local = os.path.join(self.logs_dir, 'cleaning.log')
        log_file_gdrive = os.path.join(self.gdrive_logs_dir, 'cleaning.log')

        # Local log handler
        file_handler_local = logging.FileHandler(log_file_local, mode='a', encoding='utf-8')
        file_handler_local.setLevel(logging.INFO)

        # Google Drive log handler
        file_handler_gdrive = logging.FileHandler(log_file_gdrive, mode='a', encoding='utf-8')
        file_handler_gdrive.setLevel(logging.INFO)

        # Create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        file_handler_local.setFormatter(formatter)
        file_handler_gdrive.setFormatter(formatter)

        # Add both handlers
        self.cleaning_logger.addHandler(file_handler_local)
        self.cleaning_logger.addHandler(file_handler_gdrive)

        self.cleaning_logger.info("="*60)
        self.cleaning_logger.info("TEXT CLEANING SESSION STARTED")
        self.cleaning_logger.info("="*60)

    # =================== 1. HAPUS HEADER/FOOTER/WATERMARK ===================

    def remove_headers_footers_watermarks(self, text):
        """Step 1: Remove headers, footers, page numbers, and watermarks"""
        if not isinstance(text, str) or not text.strip():
            return ""

        original_length = len(text)

        # MA specific headers and footers (exact patterns)
        ma_patterns = [
            # MA header with various spacing
            r'M\s*[Aa]\s*[Hh]\s*[Kk]\s*[Aa]\s*[Mm]\s*[Aa]\s*[Hh]\s*\s*[Aa]\s*[Gg]\s*[Uu]\s*[Nn]\s*[Gg]\s*\s*[Rr]\s*[Ee]\s*[Pp]\s*[Uu]\s*[Bb]\s*[Ll]\s*[Ii]\s*[Kk]\s*\s*[Ii]\s*[Nn]\s*[Dd]\s*[Oo]\s*[Nn]\s*[Ee]\s*[Ss]\s*[Ii]\s*[Aa]',

            # Disclaimer section
            r'Disclaimer\s*',
            r'Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas',
            r'pelaksanaan fungsi peradilan\.\s*Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu\.',
            r'Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui\s*:',
            r'Email\s*:\s*kepaniteraan@mahkamahagung\.go\.id\s*Telp\s*:\s*021-384\s*3348\s*\(ext\.318\)',
        ]

        # Page numbers and navigation
        page_patterns = [
            r'Halaman\s+\d+\s+dari\s+\d+',
            r'Page\s+\d+\s+of\s+\d+',
            r'^\s*\d+\s*$',  # Standalone numbers
            r'^\s*-\s*\d+\s*-\s*$',  # -1-, -2-, etc
            r'^\s*\d+\s*/\s*\d+\s*$',  # 1/10, 2/10, etc
        ]

        # Watermarks and document stamps
        watermark_patterns = [
            r'SALINAN PUTUSAN',
            r'COPY\s+OF\s+VERDICT',
            r'DOKUMEN\s+ELEKTRONIK',
            r'ELECTRONIC\s+DOCUMENT',
            r'^\s*CONFIDENTIAL\s*$',
            r'^\s*RAHASIA\s*$',
            r'^\s*DRAFT\s*$',
            r'FOR\s+INTERNAL\s+USE\s+ONLY',
        ]

        # Apply all header/footer/watermark removals
        all_patterns = ma_patterns + page_patterns + watermark_patterns

        for pattern in all_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)

        removed_chars = original_length - len(text)
        self.cleaning_logger.info(f"Headers/footers/watermarks removed: {removed_chars} characters")

        return text

    # =================== 2. NORMALISASI SPASI DAN KARAKTER ===================

    def normalize_spacing_and_characters(self, text, lowercase=True, remove_punctuation=False):
        """Step 2: Normalize spacing and characters"""
        if not isinstance(text, str) or not text.strip():
            return ""

        original_length = len(text)

        # Convert to lowercase if requested
        if lowercase:
            text = text.lower()
            self.cleaning_logger.info("Text converted to lowercase")

        # Normalize line breaks and spacing
        text = re.sub(r'\r\n', '\n', text)  # Windows line endings
        text = re.sub(r'\r', '\n', text)    # Mac line endings
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple line breaks to double
        text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs to single space
        text = re.sub(r'\n[ \t]+', '\n', text)  # Remove spaces at beginning of lines
        text = re.sub(r'[ \t]+\n', '\n', text)  # Remove spaces at end of lines

        # Fix common OCR errors (conservative approach)
        ocr_fixes = {
            r'\brn\b': 'm',     # "rn" → "m" only if standalone
            r'\bvv\b': 'w',     # "vv" → "w" only if standalone
            r'\bl1\b': 'll',    # "l1" → "ll" only if standalone
            r'\b0\b': 'o',      # "0" → "o" only if standalone in text context
        }

        for pattern, replacement in ocr_fixes.items():
            text = re.sub(pattern, replacement, text)

        # Handle punctuation
        if remove_punctuation:
            # Remove punctuation but keep sentence structure
            text = re.sub(r'[^\w\s\n]', '', text)
            self.cleaning_logger.info("Punctuation removed")
        else:
            # Just normalize punctuation spacing
            text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # Remove spaces before punctuation
            text = re.sub(r'([.,;:!?])\s*([.,;:!?])', r'\1\2', text)  # Fix double punctuation

        # Final spacing cleanup
        text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
        text = re.sub(r'\n\s+', '\n', text)  # Spaces after newlines
        text = re.sub(r'\s+\n', '\n', text)  # Spaces before newlines
        text = re.sub(r'\n{3,}', '\n\n', text)  # Max 2 consecutive newlines

        normalized_chars = original_length - len(text)
        self.cleaning_logger.info(f"Spacing/character normalization: {normalized_chars} characters removed")

        return text.strip()

    # =================== COMPLETE CLEANING PIPELINE ===================

    def clean_single_text(self, text, remove_punctuation=False):
        """Complete cleaning pipeline for single text"""
        if not isinstance(text, str) or not text.strip():
            return ""

        original_length = len(text)
        self.cleaning_logger.info(f"Starting cleaning. Original length: {original_length} characters")

        # Step 1: Remove headers, footers, watermarks
        text = self.remove_headers_footers_watermarks(text)
        step1_length = len(text)

        # Step 2: Normalize spacing and characters
        text = self.normalize_spacing_and_characters(text, lowercase=True, remove_punctuation=remove_punctuation)
        final_length = len(text)

        # Calculate reduction
        total_reduction = ((original_length - final_length) / original_length * 100) if original_length > 0 else 0

        self.cleaning_logger.info(f"Cleaning complete. Final length: {final_length} characters ({total_reduction:.1f}% reduction)")

        # Quality check
        if final_length < original_length * 0.1:  # Less than 10% remaining
            self.cleaning_logger.warning("Over 90% of text was removed - check if cleaning is too aggressive")

        return text

    def process_single_file(self, input_filename, remove_punctuation=False):
        """Process single raw text file"""
        input_path = os.path.join(self.input_dir, input_filename)

        if not os.path.exists(input_path):
            self.cleaning_logger.error(f"File not found: {input_path}")
            return False

        try:
            # Read raw text
            with open(input_path, 'r', encoding='utf-8') as f:
                raw_text = f.read()

            self.cleaning_logger.info(f"Processing file: {input_filename}")

            if not raw_text.strip():
                self.cleaning_logger.warning(f"Empty file: {input_filename}")
                return False

            # Clean text
            cleaned_text = self.clean_single_text(raw_text, remove_punctuation)

            if not cleaned_text.strip():
                self.cleaning_logger.error(f"Cleaning resulted in empty text: {input_filename}")
                return False

            # Generate output filename (case_XXX.txt format as specified)
            base_name = input_filename.replace('raw_', '').replace('.txt', '')
            output_filename = f"case_{base_name}.txt"
            output_path_data = os.path.join(self.output_dir, output_filename)
            output_path_gdrive = os.path.join(self.gdrive_output_dir, output_filename)
            output_path_gdrive_data = os.path.join(self.gdrive_data_raw_dir, output_filename)

            # Step 3: Save cleaned text to ALL THREE locations
            # Save to /data/raw/
            with open(output_path_data, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

            # Save to Google Drive CLEANED
            with open(output_path_gdrive, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

            # Save to Google Drive data/raw
            with open(output_path_gdrive_data, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

            self.cleaning_logger.info(f"Cleaned file saved to all three locations: {output_filename}")
            print(f"SUCCESS: {input_filename} -> {output_filename} (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)")

            return True

        except Exception as e:
            self.cleaning_logger.error(f"Error processing {input_filename}: {str(e)}")
            print(f"ERROR: {input_filename}: {str(e)}")
            return False

    def process_all_files(self, remove_punctuation=False):
        """Process all raw text files"""
        text_files = [f for f in os.listdir(self.input_dir)
                     if f.endswith(('.txt', '.TXT')) and os.path.isfile(os.path.join(self.input_dir, f))]

        if not text_files:
            print(f"No text files found in {self.input_dir}")
            return

        print(f"Found {len(text_files)} files to process")
        print(f"Remove punctuation: {'YES' if remove_punctuation else 'NO'}")
        print("="*60)

        success_count = 0
        error_count = 0

        for i, filename in enumerate(text_files, 1):
            print(f"[{i}/{len(text_files)}] {filename}")
            if self.process_single_file(filename, remove_punctuation):
                success_count += 1
            else:
                error_count += 1

        print("\n" + "="*60)
        print("CLEANING SUMMARY:")
        print(f"Success: {success_count}")
        print(f"Errors: {error_count}")
        print(f"Output 1: {self.output_dir}")
        print(f"Output 2: {self.gdrive_output_dir}")
        print(f"Output 3: {self.gdrive_data_raw_dir}")

        # Create cleaning summary
        self.create_cleaning_summary(success_count, error_count, text_files)

    def create_cleaning_summary(self, success_count, error_count, processed_files):
        """Create cleaning summary report"""
        summary_content = f"""TEXT CLEANING SUMMARY
===================
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Total files found: {len(processed_files)}
Successfully cleaned: {success_count}
Errors: {error_count}
Success rate: {(success_count/len(processed_files)*100):.1f}%

Output directory 1: {self.output_dir}
Output directory 2: {self.gdrive_output_dir}
Output directory 3: {self.gdrive_data_raw_dir}
Log file 1: {os.path.join(self.logs_dir, 'cleaning.log')}
Log file 2: {os.path.join(self.gdrive_logs_dir, 'cleaning.log')}

Files processed:
"""

        for i, filename in enumerate(processed_files, 1):
            summary_content += f"{i:3d}. {filename}\n"

        # Save summary to both locations
        summary_path_local = os.path.join(self.logs_dir, 'cleaning_summary.txt')
        summary_path_gdrive = os.path.join(self.gdrive_logs_dir, 'cleaning_summary.txt')

        with open(summary_path_local, 'w', encoding='utf-8') as f:
            f.write(summary_content)

        with open(summary_path_gdrive, 'w', encoding='utf-8') as f:
            f.write(summary_content)

        print(f"Summary saved to: {summary_path_local}")
        print(f"Summary saved to: {summary_path_gdrive}")
        self.cleaning_logger.info(f"Summary reports created in both locations")

# Utility functions
def clean_single_text_quick(text, remove_punctuation=False):
    """Clean ``text`` using a freshly constructed, default-configured ``TextCleaner``.

    Args:
        text: Raw text to clean.
        remove_punctuation: Forwarded to ``TextCleaner.clean_single_text``.

    Returns:
        Whatever ``TextCleaner.clean_single_text`` returns for ``text``.
    """
    return TextCleaner().clean_single_text(text, remove_punctuation=remove_punctuation)

def clean_text_from_file(input_path, output_path, remove_punctuation=False):
    """Read one file, clean its text and write the result to ``output_path``.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path the cleaned text is written to (UTF-8).
        remove_punctuation: Forwarded to ``clean_single_text_quick``.

    Returns:
        True on success; False if any exception occurred (the error is
        printed, not raised).
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as source:
            contents = source.read()

        result = clean_single_text_quick(contents, remove_punctuation)

        with open(output_path, 'w', encoding='utf-8') as target:
            target.write(result)

        print(f"Cleaned text saved: {output_path}")
    except Exception as exc:
        # Best-effort helper: report the failure and signal it via the return value.
        print(f"Error: {exc}")
        return False
    return True

def main():
    """Run the text-cleaning stage over the project folder on Google Drive.

    Prints a banner, builds a ``TextCleaner`` for
    ``/content/drive/MyDrive/terorisme``, processes all raw text files with
    punctuation kept, and prints where outputs and logs can be found.
    """
    divider = "=" * 50
    banner_lines = (
        "iii. PEMBERSIHAN TEKS",
        divider,
        "Input: Raw text files hasil ekstraksi PDF",
        "Output 1: Clean text files di /data/raw/",
        "Output 2: Clean text files di Google Drive/CLEANED",
        "Output 3: Clean text files di Google Drive/data/raw",
        "Logs: /logs/ dan Google Drive/logs/",
        divider,
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Build the cleaner for the project root on Drive
    text_cleaner = TextCleaner("/content/drive/MyDrive/terorisme")

    # Clean every raw text file; punctuation is preserved
    text_cleaner.process_all_files(remove_punctuation=False)

    print("\nCLEANING PROCESS COMPLETE!")
    print("Check output files in: /data/raw/")
    print(f"Check output files in: {text_cleaner.gdrive_output_dir}")
    print(f"Check output files in: {text_cleaner.gdrive_data_raw_dir}")
    print("Check logs in: /logs/cleaning.log")
    print(f"Check logs in: {text_cleaner.gdrive_logs_dir}/cleaning.log")

# Run the stage when executed as a script / notebook cell
if __name__ == "__main__":
    main()

INFO:text_cleaning:TEXT CLEANING SESSION STARTED
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.txt
INFO:text_cleaning:Starting cleaning. Original length: 242685 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 63644 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1454 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 177586 characters (26.8% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pi

iii. PEMBERSIHAN TEKS
Input: Raw text files hasil ekstraksi PDF
Output 1: Clean text files di /data/raw/
Output 2: Clean text files di Google Drive/CLEANED
Output 3: Clean text files di Google Drive/data/raw
Logs: /logs/ dan Google Drive/logs/
Input (raw text): /content/drive/MyDrive/terorisme/RAW_TEXT
Output 1 (data/raw): /data/raw
Output 2 (gdrive): /content/drive/MyDrive/terorisme/CLEANED
Output 3 (gdrive/data/raw): /content/drive/MyDrive/terorisme/data/raw
Logs 1 (local): /logs
Logs 2 (gdrive): /content/drive/MyDrive/terorisme/logs
Found 46 files to process
Remove punctuation: NO
[1/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_ICAN_AN__Alm.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_635_Pid_Sus_2023_PN_JKT_TIM_Tanggal_18_Desember_2023__Penuntut_Umum_ARY_PRATAMA__SHTerdakwa_INDRA_SYAHPUTRA_Alias_INDRA_ONO_Alias_ONO_KAY_Alias_

INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.txt
INFO:text_cleaning:Starting cleaning. Original length: 286786 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 69635 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1702 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 215448 characters (24.9% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tan

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[3/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_202

INFO:text_cleaning:Headers/footers/watermarks removed: 42448 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1077 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 146552 characters (22.9% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt
INFO:text_cleaning:Starting cleaning. Original length: 85692 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 19673 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:S

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[5/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_202

INFO:text_cleaning:Spacing/character normalization: 1701 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 208264 characters (24.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt
INFO:text_cleaning:Starting cleaning. Original length: 146539 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 34878 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 841 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 110819 c

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[7/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopembe

INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.txt
INFO:text_cleaning:Starting cleaning. Original length: 324827 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 73966 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1658 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 249202 characters (23.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Pe

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_594_Pid_Sus_2023_PN_JKT_TIM_Tanggal_1_Nopember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_DJOKO_UTOMO_alias_JACK_alias_BENI_alias_ARI_aliasURIPDI.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[10/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_BERLIAN_D_NAINGGOLAN__SHTerdakwa_SYAIFULLAH_RIFAI_Alias_SYAIFULLAH_RIFAI_Alias_SAYFRAFII_.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_527_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktob

INFO:text_cleaning:Headers/footers/watermarks removed: 62065 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1678 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 189309 characters (25.2% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.txt
INFO:text_cleaning:Starting cleaning. Original length: 256336 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 62065 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleani

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[12/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktober_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_DIKA_GARNAKA_alias_HAMZAH_alias_ABU_USAMAH_alias_PEMBURA__Alm.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_545_Pid_Sus_2023_PN_JKT_TIM_Tanggal_11_Oktob

INFO:text_cleaning:Spacing/character normalization: 4031 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 143250 characters (25.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.txt
INFO:text_cleaning:Starting cleaning. Original length: 161433 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 37689 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 3982 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 119761 

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_231_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_TEUKU_MAULIZANSYAH_RAMLI_alias_MAULIDAN_alias_PON_I_TAEB.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[14/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustus_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_MUHAMAD_RIDWAN_als_DEK_WAN_als_NYAK_WAN_als_AHMAD_RI_ABBAS.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_230_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_2_Agustu

INFO:text_cleaning:Spacing/character normalization: 753 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 92420 characters (24.7% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt
INFO:text_cleaning:Starting cleaning. Original length: 179295 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 40933 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1344 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 137017 characters (23.6% reduction)
INFO:text_clea

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[16/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIW

INFO:text_cleaning:Headers/footers/watermarks removed: 18159 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 527 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 55638 characters (25.1% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt
INFO:text_cleaning:Starting cleaning. Original length: 178380 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 42448 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spac

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_210_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_MUHAMMAD_FAHRUL__SHTerdakwa_SUTANTO_Alias_AWAN_Alias_EKO_Alias_AHMAD_SYAKIR_Bin_SANIMAN.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[18/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt


INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.txt
INFO:text_cleaning:Starting cleaning. Original length: 161712 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 39355 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 826 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 121530 characters (24.8% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_T

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_109_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_ARIF_SUSANTO__SH__MHTerdakwa_MOH__SYAIFULLAH_A__SAHABA_Alias_MAMAT_Alias_MAT_HERDIANSYSAHABA.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[19/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_2023__Penuntut_Umum_HASBUDDIN_B_PASENG__SH_Terdakwa_MUH__RIZAL_S_Pd_I_Alias_RIZAL_Alias_MAMAT_Alias_MAT_TONGGALA.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_110_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_21_Juni_

INFO:text_cleaning:Headers/footers/watermarks removed: 42383 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 958 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 135250 characters (24.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt
INFO:text_cleaning:Starting cleaning. Original length: 227472 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 56865 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1605 c

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_114_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_24_Mei_2023__Penuntut_Umum_FAISAL_NUR__SH__MHTerdakwa_MUHAMMAD_INDRA_GARUSU_bin_SAINUL_GARUSU.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[21/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt


INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt
INFO:text_cleaning:Starting cleaning. Original length: 196345 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 45818 characters
INFO:text_cleaning:Text converted to lowercase


SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_71_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_17_Mei_2023__Penuntut_Umum_TEDDY_IRAWAN___SH___MH_Terdakwa_HERLIANSYAH_als_ANDI_BASO_als_HERLY_BIN_SULTANNI.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[22/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt


INFO:text_cleaning:Spacing/character normalization: 4405 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 146121 characters (25.6% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt
INFO:text_cleaning:Processing file: raw_2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
INFO:text_cleaning:Starting cleaning. Original length: 58818 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 17402 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 359 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 41056 characters (30.2% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2025_TK1_Putusan_

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_73_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_HARDINIYANTY__SH__MHTerdakwa_LUKMAN_YUNUS_Als_UKO_Als_ABU_SYUKRON_Bin_IDAM_YUNUS.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[23/46] raw_2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
SUCCESS: raw_2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt -> case_2025_TK1_Putusan_MS_CALANG_Nomor_70_Pdt_P_2025_MS_Cag_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[24/46] raw_2025_TK1_Putusan_PA_SAMPANG_Nomor_776_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt
SUCCESS: raw_2025_TK

INFO:text_cleaning:Spacing/character normalization: 250 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 25410 characters (28.2% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.txt
INFO:text_cleaning:Starting cleaning. Original length: 197688 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 45883 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 4520 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 147284 characters (25.5% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_J

SUCCESS: raw_2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt -> case_2025_TK1_Putusan_PA_SAMPANG_Nomor_830_Pdt_G_2025_PA_Spg_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[26/46] raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_74_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_10_Mei_2023__Penuntut_Umum_JAYA_SIAHAAN___SH_Terdakwa_KHOIRRUDDIN_Alias_JIHAN_Bin_M__SAIFUL_ANWARY.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[27/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_

INFO:text_cleaning:Starting cleaning. Original length: 103760 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 24451 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 738 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 78570 characters (24.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt
INFO:text_cleaning:Starting cleaning. Original length: 191798 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 44340 cha

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[28/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt


INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.txt
INFO:text_cleaning:Starting cleaning. Original length: 148631 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 34797 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1359 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 112474 characters (24.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_3_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_POERWOKO_HADI_SASMITO__SH2_Dra__INDRAYATI__H_S__SH__MH3_ADE_SOLEHUDIN__SH__MH4_MARDIAAYANTO.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[29/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_JUWITA_KAYANA__S_H___M_H_2_TEDDY_IRAWAN_SH3_ERWIN_INDRAPUTRA__SH__MH4_MUHAMAD_RAMLI_WASIS.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1109_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April

INFO:text_cleaning:Headers/footers/watermarks removed: 45818 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1220 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 151163 characters (23.7% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt
INFO:text_cleaning:Starting cleaning. Original length: 195771 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 45079 characters
INFO:text_cleaning:Text converted to lowercase


SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1206_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HASBUDDIN_B_PASENG__SH2_HEVBEN__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_PONTI_LUKWINANTI_SHARNAMA.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[31/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt


INFO:text_cleaning:Spacing/character normalization: 1210 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 149481 characters (23.6% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.txt
INFO:text_cleaning:Starting cleaning. Original length: 191458 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 45079 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1815 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 144563 

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1110_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_6_April_2023__Penuntut_Umum_1_HOTMAIDA_SH2_AMRIL_ABDI__SH3_REZA_OKTAVIAN__S_H___M_H_4_FEBBY_SALAHUDDIN__S__Kom__SSUKAMA.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[32/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ERWIN_INDRAPUTRA__SH__MH2_TEDDY_IRAWAN_SH3_JUWITA_KAYANA__S_H___M_H_4_KHAREZA_MOKH_MIJAR.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1172_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Mare

INFO:text_cleaning:Spacing/character normalization: 272 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 26147 characters (27.4% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt
INFO:text_cleaning:Starting cleaning. Original length: 219887 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 50776 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1192 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 167918 characters (23.6% reduction)


SUCCESS: raw_2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt -> case_2025_TK1_Putusan_PA_POLEWALI_Nomor_325_Pdt_G_2025_PA_Pwl_Tanggal_12_Juni_2025__Penggugat_melawan_Tergugat.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[34/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt


INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.txt
INFO:text_cleaning:Starting cleaning. Original length: 127382 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 31038 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 819 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 95524 characters (25.0% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_T

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1085_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_ARY_PRATAMA__SH2_ARIF_SUSANTO__SH__MH3_AMRI_BAYAKTA__S_H_4_MARDIANA_YOLANDA_I__SILFFENDI.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[35/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_AGUS_TRI_HARTONO__S_H___M_Hum_2_BERLIAN_D_NAINGGOLAN__SH__MH3_DWI_AGUS_SETYONINGRUGADMAN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1165_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Mare

INFO:text_cleaning:Headers/footers/watermarks removed: 37016 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1130 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 128048 characters (23.0% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt
INFO:text_cleaning:Starting cleaning. Original length: 370242 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 87982 characters
INFO:text_cleaning:Text converted to lowercase


SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_4_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_DAVID_ROGER_JULIUS_PAKPAHAN__SH2_DEASY_MARIANA_MARUF__SH__MH3_FAISAL_NUR__SH__MH4_SORHYUDIN.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[37/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt


INFO:text_cleaning:Spacing/character normalization: 2273 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 279986 characters (24.4% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt
INFO:text_cleaning:Starting cleaning. Original length: 231201 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 54686 characters
INFO:text_cleaning:Text converted to lowercase


SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1171_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_HEVBEN__SH2_HASBUDDIN_B_PASENG__SH3_MUCHAMAD_ADYANSYAH__SH__MH4_DWI_INDAH_KARTIKA_EGIMIN.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[38/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt


INFO:text_cleaning:Spacing/character normalization: 1391 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 175123 characters (24.3% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt
INFO:text_cleaning:Starting cleaning. Original length: 303961 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 72486 characters
INFO:text_cleaning:Text converted to lowercase


SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1160_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_30_Maret_2023__Penuntut_Umum_1_SUHARTATI__SH__MH2_MALINI_SIANTURI_SH3_Dr__HERRY_WIYANTO__SH_M_Hum4_NURHAYATI_ULFIAN_ALM.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[39/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt


INFO:text_cleaning:Spacing/character normalization: 1929 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 229545 characters (24.5% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt
INFO:text_cleaning:Starting cleaning. Original length: 291067 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 70238 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1890 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 218938 

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1105_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_AMRIL_ABDI__SH2_REZA_OKTAVIAN__S_H___M_H_3_HOTMAIDA_SH4_NANDA_KARMILA__SH5_OCTAVIASAKWID.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[40/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret

INFO:text_cleaning:Headers/footers/watermarks removed: 32516 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 853 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 108760 characters (23.5% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.txt
INFO:text_cleaning:Processing file: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt
INFO:text_cleaning:Starting cleaning. Original length: 242001 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 56903 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleanin

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_1107_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_ARIF_SUSANTO__SH__MH2_ARY_PRATAMA__SH3_AMRI_BAYAKTA__S_H_4_MUHAMAD_RAMLI__SH5_WULA_WARNO.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[42/46] raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt
SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt -> case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Febru

INFO:text_cleaning:Headers/footers/watermarks removed: 28000 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 1193 characters removed
INFO:text_cleaning:Cleaning complete. Final length: 92600 characters (24.0% reduction)
INFO:text_cleaning:Cleaned file saved to all three locations: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.txt
INFO:text_cleaning:Processing file: raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
INFO:text_cleaning:Starting cleaning. Original length: 27098 characters
INFO:text_cleaning:Headers/footers/watermarks removed: 8129 characters
INFO:text_cleaning:Text converted to lowercase
INFO:text_cleaning:Spacing/character normalization: 235 characters removed
INFO:text_cleaning:Cleaning complete. 

SUCCESS: raw_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.txt -> case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_843_Pid_Sus_2022_PN_Jkt_Tim_Tanggal_8_Februari_2023__Penuntut_Umum_RIFQI_ARIALFA_SH_MHTerdakwa_ADI_SUPRIYADI_Als__DAFA_Als__ANAS_Als__ADI_USAMA_Als__BONIMIN.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[44/46] raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt
SUCCESS: raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt -> case_2025_TK1_Putusan_PA_TILAMUTA_Nomor_81_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt (saved to /data/raw, gdrive/CLEANED, and gdrive/data/raw)
[45/46] raw_2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_T

# **Validasi**

In [62]:
import os
import re
import logging
import pandas as pd
from datetime import datetime

In [63]:
import os
import re
import logging
import pandas as pd
from datetime import datetime

# Configure the root logger to emit INFO and above, then create this module's
# own logger. Note: logging.basicConfig() does nothing when the root logger
# already has handlers, so re-running this cell keeps the existing setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TextValidator:
    """Validate text integrity and completeness of legal documents"""

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        """Set up directory layout, logging and validation rules.

        Args:
            base_dir: Root folder (on the mounted drive) under which the
                ``CLEANED``, ``logs`` and ``VALIDATION`` sub-folders live.

        Side effects:
            Creates the validation and both log directories if missing,
            prints the resolved locations, then configures the validation
            logger and the legal-document requirements.
        """
        self.base_dir = base_dir

        # Input locations: local cleaned files first, Drive copy as fallback.
        self.cleaned_dir = "/data/raw"
        self.gdrive_cleaned_dir = os.path.join(base_dir, "CLEANED")

        # Log locations: local copy plus a mirror on Drive.
        self.logs_dir = "/logs"
        self.gdrive_logs_dir = os.path.join(base_dir, "logs")

        # Output location for validation artefacts.
        self.validation_dir = os.path.join(base_dir, "VALIDATION")

        # Make sure every folder this object writes to exists.
        for required_dir in (self.validation_dir, self.logs_dir, self.gdrive_logs_dir):
            os.makedirs(required_dir, exist_ok=True)

        # Report the resolved layout to the notebook output.
        layout = (
            ("Input 1 (cleaned)", self.cleaned_dir),
            ("Input 2 (gdrive)", self.gdrive_cleaned_dir),
            ("Validation output", self.validation_dir),
            ("Logs 1 (local)", self.logs_dir),
            ("Logs 2 (gdrive)", self.gdrive_logs_dir),
        )
        for label, location in layout:
            print(f"{label}: {location}")

        # Logger first, so later steps can log; then the scoring rules.
        self.setup_validation_logger()
        self.setup_legal_requirements()

    def setup_validation_logger(self):
        """Setup dedicated validation logger with dual output"""
        self.validation_logger = logging.getLogger('text_validation')
        self.validation_logger.setLevel(logging.INFO)

        # Remove existing handlers
        for handler in self.validation_logger.handlers[:]:
            self.validation_logger.removeHandler(handler)

        # Ensure log directories exist
        os.makedirs(self.logs_dir, exist_ok=True)
        os.makedirs(self.gdrive_logs_dir, exist_ok=True)

        # Create file handlers for both locations
        log_file_local = os.path.join(self.logs_dir, 'validation.log')
        log_file_gdrive = os.path.join(self.gdrive_logs_dir, 'validation.log')

        # Create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        # Local log handler
        try:
            file_handler_local = logging.FileHandler(log_file_local, mode='a', encoding='utf-8')
            file_handler_local.setLevel(logging.INFO)
            file_handler_local.setFormatter(formatter)
            self.validation_logger.addHandler(file_handler_local)
        except Exception as e:
            print(f"Warning: Could not create local validation log: {e}")

        # Google Drive log handler
        try:
            file_handler_gdrive = logging.FileHandler(log_file_gdrive, mode='a', encoding='utf-8')
            file_handler_gdrive.setLevel(logging.INFO)
            file_handler_gdrive.setFormatter(formatter)
            self.validation_logger.addHandler(file_handler_gdrive)
        except Exception as e:
            print(f"Warning: Could not create gdrive validation log: {e}")

        self.validation_logger.info("="*60)
        self.validation_logger.info("TEXT VALIDATION SESSION STARTED")
        self.validation_logger.info("="*60)

    def setup_legal_requirements(self):
        """Define requirements for legal document completeness"""

        # Essential legal document sections (case insensitive)
        self.essential_sections = [
            r'menimbang',     # "MENIMBANG" section
            r'mengingat',     # "MENGINGAT" section
            r'mengadili',     # "MENGADILI" section
        ]

        # Important legal elements
        self.important_elements = [
            r'putusan',       # "PUTUSAN"
            r'terdakwa',      # "TERDAKWA"
            r'jaksa',         # "JAKSA" or prosecutor
            r'hakim',         # "HAKIM" or judge
            r'pasal',         # "PASAL" or article reference
        ]

        # Document structure indicators
        self.structure_indicators = [
            r'nomor.*\d+',    # Case number
            r'tahun.*\d{4}',  # Year reference
            r'pengadilan',    # Court reference
        ]

        # Minimum content thresholds
        self.min_word_count = 100       # Minimum words for valid document
        self.min_char_count = 500       # Minimum characters
        self.min_sentence_count = 10    # Minimum sentences

    # =================== TEXT ANALYSIS METHODS ===================

    def analyze_text_structure(self, text):
        """Analyze text structure and content"""
        if not isinstance(text, str) or not text.strip():
            return {
                'word_count': 0,
                'char_count': 0,
                'sentence_count': 0,
                'paragraph_count': 0,
                'essential_sections_found': [],
                'important_elements_found': [],
                'structure_indicators_found': []
            }

        # Basic statistics
        words = text.split()
        sentences = re.split(r'[.!?]+', text)
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        # Find essential sections
        essential_found = []
        for section in self.essential_sections:
            if re.search(section, text, re.IGNORECASE):
                essential_found.append(section)

        # Find important elements
        elements_found = []
        for element in self.important_elements:
            if re.search(element, text, re.IGNORECASE):
                elements_found.append(element)

        # Find structure indicators
        structure_found = []
        for indicator in self.structure_indicators:
            if re.search(indicator, text, re.IGNORECASE):
                structure_found.append(indicator)

        return {
            'word_count': len(words),
            'char_count': len(text),
            'sentence_count': len([s for s in sentences if s.strip()]),
            'paragraph_count': len(paragraphs),
            'essential_sections_found': essential_found,
            'important_elements_found': elements_found,
            'structure_indicators_found': structure_found
        }

    def calculate_completeness_score(self, analysis):
        """Calculate text completeness score (0-100%)"""
        score = 0
        max_score = 100

        # 1. Essential sections (40% of score)
        essential_score = (len(analysis['essential_sections_found']) / len(self.essential_sections)) * 40
        score += min(essential_score, 40)

        # 2. Important elements (30% of score)
        elements_score = (len(analysis['important_elements_found']) / len(self.important_elements)) * 30
        score += min(elements_score, 30)

        # 3. Text length adequacy (20% of score)
        length_score = 0
        if analysis['word_count'] >= self.min_word_count:
            length_score += 10
        if analysis['char_count'] >= self.min_char_count:
            length_score += 10
        score += length_score

        # 4. Structure indicators (10% of score)
        structure_score = (len(analysis['structure_indicators_found']) / len(self.structure_indicators)) * 10
        score += min(structure_score, 10)

        return min(score, 100)  # Cap at 100%

    def validate_single_file(self, filename, source_dir):
        """Validate single text file"""
        file_path = os.path.join(source_dir, filename)

        if not os.path.exists(file_path):
            self.validation_logger.error(f"File not found: {file_path}")
            return None

        try:
            # Read file
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            self.validation_logger.info(f"Validating file: {filename}")

            # Analyze text
            analysis = self.analyze_text_structure(text)

            # Calculate completeness score
            completeness_score = self.calculate_completeness_score(analysis)

            # Determine validation status
            is_valid = completeness_score >= 80.0  # 80% threshold

            # Create validation result
            result = {
                'filename': filename,
                'source_directory': source_dir,
                'file_size_bytes': len(text.encode('utf-8')),
                'word_count': analysis['word_count'],
                'char_count': analysis['char_count'],
                'sentence_count': analysis['sentence_count'],
                'paragraph_count': analysis['paragraph_count'],
                'essential_sections_found': len(analysis['essential_sections_found']),
                'essential_sections_list': ', '.join(analysis['essential_sections_found']),
                'important_elements_found': len(analysis['important_elements_found']),
                'important_elements_list': ', '.join(analysis['important_elements_found']),
                'structure_indicators_found': len(analysis['structure_indicators_found']),
                'structure_indicators_list': ', '.join(analysis['structure_indicators_found']),
                'completeness_score': round(completeness_score, 2),
                'is_valid': is_valid,
                'validation_status': 'PASS' if is_valid else 'FAIL',
                'validation_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

            # Log result
            status = "PASS" if is_valid else "FAIL"
            self.validation_logger.info(f"{filename}: {status} (Score: {completeness_score:.1f}%)")

            if not is_valid:
                self.validation_logger.warning(f"{filename}: Below 80% threshold - may be incomplete")

            print(f"{status}: {filename} (Score: {completeness_score:.1f}%)")

            return result

        except Exception as e:
            self.validation_logger.error(f"Error validating {filename}: {str(e)}")
            print(f"ERROR: {filename}: {str(e)}")
            return None

    def get_text_files(self, directory):
        """Return the names of regular ``.txt`` files directly in ``directory``.

        Args:
            directory: Path to scan (not recursive).

        Returns:
            List of file names in the order ``os.listdir`` yields them; an
            empty list when the path does not exist.
        """
        if not os.path.exists(directory):
            return []

        text_files = []
        for entry in os.listdir(directory):
            if not entry.endswith('.txt'):
                continue
            # Skip directories (or other non-files) that merely end in .txt.
            if os.path.isfile(os.path.join(directory, entry)):
                text_files.append(entry)
        return text_files

    def validate_all_files(self):
        """Validate every cleaned text file and produce the validation report.

        Files are taken from ``self.cleaned_dir`` when it contains any text
        files; otherwise from ``self.gdrive_cleaned_dir``.

        Returns:
            The list of per-file result dicts, or ``None`` when neither
            location has text files or no file produced a result.
        """
        banner = "=" * 50
        print("iv. VALIDASI TEKS")
        print(banner)
        print("Tujuan: Periksa keutuhan teks (minimal 80% isi putusan)")
        print("Threshold: 80% completeness score")
        print(banner)

        # Look in both candidate locations up front.
        local_names = self.get_text_files(self.cleaned_dir)
        drive_names = self.get_text_files(self.gdrive_cleaned_dir)

        if not (local_names or drive_names):
            print("No text files found for validation")
            return None

        # The local copy takes precedence; Drive is only the fallback.
        if local_names:
            pending, origin = local_names, self.cleaned_dir
        else:
            pending, origin = drive_names, self.gdrive_cleaned_dir
        print(f"Using files from: {origin}")

        total = len(pending)
        print(f"Found {total} files to validate")
        print("=" * 60)

        results = []
        passed = 0
        for position, filename in enumerate(pending, start=1):
            print(f"[{position}/{total}] Validating: {filename}")
            outcome = self.validate_single_file(filename, origin)

            # A falsy outcome (None is returned when validation errors out)
            # is left out of the report entirely.
            if not outcome:
                continue

            results.append(outcome)
            if outcome['is_valid']:
                passed += 1

        if not results:
            print("No files successfully validated")
            return None

        # Every collected result is either a pass or a fail.
        failed = len(results) - passed
        self.create_validation_report(results, passed, failed)

        return results

    def create_validation_report(self, results, pass_count, fail_count):
        """Create comprehensive validation report"""

        # Create DataFrame
        df_results = pd.DataFrame(results)

        # Save detailed CSV report
        csv_path = os.path.join(self.validation_dir, 'validation_report.csv')
        df_results.to_csv(csv_path, index=False)

        # Create summary statistics
        total_files = len(results)
        pass_rate = (pass_count / total_files * 100) if total_files > 0 else 0
        avg_score = df_results['completeness_score'].mean()
        min_score = df_results['completeness_score'].min()
        max_score = df_results['completeness_score'].max()

        # Create text summary report
        summary_content = f"""TEXT VALIDATION REPORT
=====================
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Validation Threshold: 80% completeness

SUMMARY STATISTICS:
------------------
Total files validated: {total_files}
Files passed (≥80%): {pass_count}
Files failed (<80%): {fail_count}
Pass rate: {pass_rate:.1f}%

SCORE STATISTICS:
----------------
Average score: {avg_score:.1f}%
Minimum score: {min_score:.1f}%
Maximum score: {max_score:.1f}%

FAILED FILES (if any):
---------------------
"""

        # Add failed files details
        failed_files = df_results[df_results['is_valid'] == False]
        if not failed_files.empty:
            for _, file_info in failed_files.iterrows():
                summary_content += f"- {file_info['filename']}: {file_info['completeness_score']:.1f}% "
                summary_content += f"(Missing: "

                missing_elements = []
                if file_info['essential_sections_found'] < len(self.essential_sections):
                    missing_elements.append("essential sections")
                if file_info['important_elements_found'] < len(self.important_elements):
                    missing_elements.append("important elements")
                if file_info['word_count'] < self.min_word_count:
                    missing_elements.append("sufficient content")

                summary_content += ", ".join(missing_elements) + ")\n"
        else:
            summary_content += "No files failed validation!\n"

        summary_content += f"\nDETAILED REPORT: {csv_path}\n"
        summary_content += f"VALIDATION LOG 1: {os.path.join(self.logs_dir, 'validation.log')}\n"
        summary_content += f"VALIDATION LOG 2: {os.path.join(self.gdrive_logs_dir, 'validation.log')}\n"

        # Save summary report to both locations
        summary_path_local = os.path.join(self.validation_dir, 'validation_summary.txt')
        summary_path_gdrive = os.path.join(self.gdrive_logs_dir, 'validation_summary.txt')

        # Ensure directories exist
        os.makedirs(self.validation_dir, exist_ok=True)
        os.makedirs(self.gdrive_logs_dir, exist_ok=True)

        try:
            with open(summary_path_local, 'w', encoding='utf-8') as f:
                f.write(summary_content)
        except Exception as e:
            print(f"Warning: Could not save validation summary locally: {e}")

        try:
            with open(summary_path_gdrive, 'w', encoding='utf-8') as f:
                f.write(summary_content)
        except Exception as e:
            print(f"Warning: Could not save validation summary to gdrive: {e}")

        # Display summary
        print("\n" + "=" * 60)
        print("VALIDATION SUMMARY:")
        print("=" * 60)
        print(f"Total files: {total_files}")
        print(f"Passed (≥80%): {pass_count}")
        print(f"Failed (<80%): {fail_count}")
        print(f"Pass rate: {pass_rate:.1f}%")
        print(f"Average score: {avg_score:.1f}%")
        print(f"Score range: {min_score:.1f}% - {max_score:.1f}%")

        # Show failed files in console
        if fail_count > 0:
            print(f"\nFAILED FILES ({fail_count}):")
            print("-" * 40)
            failed_files = df_results[df_results['is_valid'] == False]
            for _, file_info in failed_files.iterrows():
                print(f"  {file_info['filename']}: {file_info['completeness_score']:.1f}%")
                issues = []
                if file_info['essential_sections_found'] < len(self.essential_sections):
                    missing_count = len(self.essential_sections) - file_info['essential_sections_found']
                    issues.append(f"{missing_count} missing essential sections")
                if file_info['word_count'] < self.min_word_count:
                    issues.append(f"too short ({file_info['word_count']} words)")
                if issues:
                    print(f"    Issues: {', '.join(issues)}")
        else:
            print("\nAll files passed validation!")

        print(f"\nREPORTS GENERATED:")
        print(f"  Detailed CSV: {csv_path}")
        print(f"  Summary TXT (validation): {summary_path_local}")
        print(f"  Summary TXT (gdrive): {summary_path_gdrive}")
        print(f"  Validation log (local): {os.path.join(self.logs_dir, 'validation.log')}")
        print(f"  Validation log (gdrive): {os.path.join(self.gdrive_logs_dir, 'validation.log')}")

        # Log summary
        self.validation_logger.info(f"Validation complete: {pass_count}/{total_files} files passed ({pass_rate:.1f}%)")
        self.validation_logger.info(f"Average completeness score: {avg_score:.1f}%")

        if fail_count > 0:
            self.validation_logger.warning(f"{fail_count} files failed validation (below 80% threshold)")

# Utility functions
def validate_single_text(text):
    """Score one in-memory text with a default-configured validator.

    Returns:
        A ``(score, passed)`` tuple where ``passed`` is ``score >= 80.0``.
    """
    checker = TextValidator()
    completeness = checker.calculate_completeness_score(
        checker.analyze_text_structure(text)
    )
    meets_threshold = completeness >= 80.0
    return completeness, meets_threshold

def validate_specific_file(file_path):
    """Validate the single file located at *file_path*.

    The path is split into its folder and file name, which are handed to
    ``validate_single_file`` of a default-configured validator; that
    method's result is returned unchanged.
    """
    checker = TextValidator()
    # os.path.split yields (dirname, basename) in one call.
    folder, name = os.path.split(file_path)
    return checker.validate_single_file(name, folder)

def main():
    """Print the step banner, validate all files and point to the outputs."""
    rule = "=" * 50
    for line in (
        "iv. VALIDASI",
        rule,
        "1. Periksa keutuhan teks (minimal 80% isi putusan tersedia)",
        "2. Catat log file: /logs/ dan Google Drive/logs/",
        rule,
    ):
        print(line)

    # The validator works against the project folder on the mounted Drive.
    validator = TextValidator("/content/drive/MyDrive/terorisme")
    results = validator.validate_all_files()

    # validate_all_files returns None when there was nothing to report on.
    if not results:
        print("\nNo files found for validation.")
        return

    print("\nVALIDATION PROCESS COMPLETE!")
    print(f"Check validation reports in: {validator.validation_dir}")
    print("Check validation logs: /logs/validation.log")
    print(f"Check validation logs: {validator.gdrive_logs_dir}/validation.log")

# Entry point: run the validation step only when this code is executed
# directly (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


INFO:text_validation:TEXT VALIDATION SESSION STARTED
INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt
INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt: PASS (Score: 100.0%)
INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__alias_JUNH__Alm.txt
INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_544_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Nopember_2023__Penuntut_Umum_MALINI_SIANTURI__SHTerdakwa_MUHAMMAD_BUDI_SATRIA_Alias_BUDI__alias_KARI__ali

iv. VALIDASI
1. Periksa keutuhan teks (minimal 80% isi putusan tersedia)
2. Catat log file: /logs/ dan Google Drive/logs/
Input 1 (cleaned): /data/raw
Input 2 (gdrive): /content/drive/MyDrive/terorisme/CLEANED
Validation output: /content/drive/MyDrive/terorisme/VALIDATION
Logs 1 (local): /logs
Logs 2 (gdrive): /content/drive/MyDrive/terorisme/logs
iv. VALIDASI TEKS
Tujuan: Periksa keutuhan teks (minimal 80% isi putusan)
Threshold: 80% completeness score
Using files from: /data/raw
Found 46 files to validate
[1/46] Validating: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt
PASS: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_533_Pid_Sus_2023_PN_JKT_TIM_Tanggal_8_Nopember_2023__Penuntut_Umum_DAVID_ROGER_J_PAKPAHAN__SHTerdakwa_SUGENG_Alias_SALMAN_ALGHOZALI_Alias_USTAD_SALMANADENAN.txt (Score: 100.0%)
[2/46] Validating: case_2023_TK1_P

INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt: PASS (Score: 86.7%)
INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt
INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt: PASS (Score: 86.7%)
INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt
INFO:text_validation:case_2023_TK1_Put

PASS: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_211_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_ANNISA_RK__SHTerdakwa_SYAHRUL_ALS_TOPAN_ALS_BENZ_ALS_ANGGA_BIN_UMARDI_.txt (Score: 86.7%)
[16/46] Validating: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt
PASS: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_991_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Maret_2023__Penuntut_Umum_1_UMI_HANINDYA_KUSUMA_SH2_IKA_SYAFITRY_SALIM__SH___MH_3_AGUS_JULIANTO_PURNOMO__SH4_AZCHTIAR.txt (Score: 86.7%)
[17/46] Validating: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umum_DWI_AGUS_SETYONINGRUM__SH__MHTerdakwa_RAMANDA_PRATAMA_alias_ABU_KENZI_alias_KURAMA_SUARDI.txt
PASS: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_528_Pid_Sus_2023_PN_JKT_TIM_Tanggal_19_Oktober_2023__Penuntut_Umu

INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt: PASS (Score: 100.0%)
INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt
INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt: PASS (Score: 100.0%)
INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt
INFO:text_validation:case_2023_TK1_Putu

PASS: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_209_Pid_Sus_2023_PN_Jkt_Tim_Tanggal_12_Juli_2023__Penuntut_Umum_HOTMAIDA__SHTerdakwa_ARIADI_Alias_KHAIRUL_alias_FERI_alias_JIWO_Bin_ASNAN.txt (Score: 100.0%)
[31/46] Validating: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt
PASS: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_637_Pid_Sus_2023_PN_JKT_TIM_Tanggal_15_Nopember_2023__Penuntut_Umum_AMRI_BAYAKTA__SHTerdakwa_WAHYUDI__alias_JONI_alias_GUNTUR_alias_FAJAR_Bin_ABDUL_PANUT.txt (Score: 100.0%)
[32/46] Validating: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umum_ERWIN_INDRAPUTRA__SH___MHTerdakwa_ARIS_BUDIANTO_alias_RIKO_alias_BAHAR_alias_SARAHARSONO.txt
PASS: case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_555_Pid_Sus_2023_PN_JKT_TIM_Tanggal_13_Desember_2023__Penuntut_Umu

INFO:text_validation:Validating file: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt
INFO:text_validation:case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt: PASS (Score: 100.0%)
INFO:text_validation:Validation complete: 43/46 files passed (93.5%)
INFO:text_validation:Average completeness score: 93.2%


PASS: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_8_Pid_Sus_2023_PN_Jkt_Brt_Tanggal_13_April_2023__Penuntut_Umum_1_ANDI_JEFRI_ARDIN__SH_MH2_JAHRUDIN__SH3_DENRI_KASWORO__S_H_4_ZULKIFLI__SH__MH5_KHAREZA_SOLEH.txt (Score: 100.0%)
[46/46] Validating: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt
PASS: case_2023_TK1_Putusan_PN_JAKARTA_BARAT_Nomor_992_Pid_Sus_2022_PN_Jkt_Brt_Tanggal_16_Februari_2023__Penuntut_Umum_1_IKA_SYAFITRY_SALIM__SH___MH_2_UMI_HANINDYA_KUSUMA_SH3_AGUS_JULIANTO_PURNOMO__SH4n_MUSA.txt (Score: 100.0%)

VALIDATION SUMMARY:
Total files: 46
Passed (≥80%): 43
Failed (<80%): 3
Pass rate: 93.5%
Average score: 93.2%
Score range: 74.7% - 100.0%

FAILED FILES (3):
----------------------------------------
  case_2025_TK1_Putusan_PA_TILAMUTA_Nomor_73_Pdt_P_2025_PA_Tlm_Tanggal_12_Juni_2025__Pemohon_melawan_Termohon.txt: 74.7%