<a href="https://colab.research.google.com/github/Luthfiashofaa/UAS_PenalaranKomputer/blob/main/(UAS)_Case_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tahap 2 – Case Representation

In [None]:
# Mount Google Drive at /content/drive; the classes below build their
# Drive paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Ekstraksi Metadata

In [None]:
import os
import re
import json
import logging
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional

In [None]:
# Logging setup: INFO level on the root logger, plus a module-level logger
# used by the file-processing methods for error and warning messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MetadataExtractor:
    """Ekstrak metadata terstruktur dari dokumen putusan pengadilan"""

    def __init__(self, base_dir="/content/drive/MyDrive/korupsi"):
        """Set up input/output locations, create output folders, load patterns.

        Args:
            base_dir: Google Drive project folder. The ``CLEANED`` input
                folder and the Drive copies of the outputs live under it.
        """
        self.base_dir = base_dir

        # Inputs: cleaned text files, locally or (alternatively) on Drive.
        self.cleaned_dir = "/data/raw"
        self.gdrive_cleaned_dir = os.path.join(base_dir, "CLEANED")

        # Outputs: local folder, Drive folder, backup folder, and logs.
        self.output_dir = "/data/processed"
        self.gdrive_output_dir = os.path.join(base_dir, "data", "processed")
        self.metadata_dir = os.path.join(base_dir, "METADATA")
        self.logs_dir = "/logs"

        # Make sure every output folder exists before anything is written.
        for folder in (
            self.output_dir,
            self.gdrive_output_dir,
            self.metadata_dir,
            self.logs_dir,
        ):
            os.makedirs(folder, exist_ok=True)

        print("🔍 EKSTRAKSI METADATA")
        print(f"Input: {self.cleaned_dir} atau {self.gdrive_cleaned_dir}")
        print(f"Output Lokal: {self.output_dir}")
        print(f"Output GDrive: {self.gdrive_output_dir}")

        # Regex tables used by the extract_* methods.
        self.setup_extraction_patterns()

        # Indonesian month name -> two-digit month number.
        month_names = (
            'januari', 'februari', 'maret', 'april',
            'mei', 'juni', 'juli', 'agustus',
            'september', 'oktober', 'november', 'desember',
        )
        self.month_mapping = {
            month_name: f"{number:02d}"
            for number, month_name in enumerate(month_names, start=1)
        }

    def setup_extraction_patterns(self):
        """Define the regex tables used by the ``extract_*`` methods.

        Each attribute set here is a list of pattern strings. The
        ``extract_*`` methods in this class apply them with
        ``re.IGNORECASE``, so the patterns are written in lowercase.
        """

        # 1. CASE NUMBER PATTERNS (most specific first)
        self.case_number_patterns = [
            r'(?:nomor|register)\s*(?:perkara)?\s*:\s*(\d+/pid\.?(?:sus|b)?/?(?:\d+)?/?pn\.?\w+)',  # Criminal
            r'(?:nomor|register)\s*(?:perkara)?\s*:\s*(\d+/pdt\.?g?/?(?:\d+)?/?pn\.?\w+)',  # Civil
            r'(?:nomor|register)\s*(?:perkara)?\s*:\s*(\d+/\w+/\d{4}/pn\.?\w+)',  # Full format
            r'perkara\s+nomor\s*:\s*(\d+/[\w\./]+)',
            r'dalam\s+perkara\s+nomor\s*:\s*([^\n\r]+?)(?:\s|$)',
        ]

        # 2. DATE PATTERNS (contextual); groups are (day, month name, year)
        self.date_patterns = [
            r'diputuskan?\s+(?:pada\s+(?:hari\s+\w+\s+)?)?tanggal\s*:?\s*(\d{1,2})\s+(\w+)\s+(\d{4})',
            r'dibacakan\s+(?:pada\s+(?:hari\s+\w+\s+)?)?tanggal\s*:?\s*(\d{1,2})\s+(\w+)\s+(\d{4})',
            r'pada\s+hari\s+\w+\s+tanggal\s+(\d{1,2})\s+(\w+)\s+(\d{4})',
            r'(?:jakarta|surabaya|bandung|medan|semarang|yogyakarta|makassar|palembang|denpasar|malang),?\s*(\d{1,2})\s+(\w+)\s+(\d{4})',
        ]

        # 3. CASE TYPE PATTERNS (more specific, with context)
        self.case_type_patterns = [
            r'(?:dalam\s+)?perkara\s+tindak\s+pidana\s+(korupsi|pencucian\s+uang|narkotika|terorisme)',
            r'(?:dalam\s+)?perkara\s+pidana\s+(khusus|umum)',
            r'(?:dalam\s+)?perkara\s+(pidana|perdata)(?:\s+(?:khusus|umum))?',
            r'tindak\s+pidana\s+(korupsi|pencucian\s+uang|narkotika)',
        ]

        # 4. LEGAL ARTICLE PATTERNS (articles with their statute context)
        self.legal_article_patterns = [
            r'pasal\s+(\d+(?:\s+(?:ayat|huruf)\s*\([^)]+\))?(?:\s+(?:jo\.?|juncto|dan)\s+pasal\s+\d+(?:\s+(?:ayat|huruf)\s*\([^)]+\))?)*)\s+(?:undang[- ]undang|uu)',
            r'melanggar\s+pasal\s+(\d+(?:\s+(?:ayat|huruf)\s*\([^)]+\))?)',
            r'berdasarkan\s+pasal\s+(\d+(?:\s+(?:ayat|huruf)\s*\([^)]+\))?)',
            r'undang[- ]undang\s+(?:republik\s+indonesia\s+)?nomor\s+(\d+)\s+tahun\s+(\d{4})',
            r'uu\s+(?:no\.?\s*|nomor\s+)?(\d+)/?(\d{4})',
        ]

        # 5. PARTY PATTERNS (with explicit delimiters); group 1 is the name
        self.parties_patterns = [
            # Defendant (common formats)
            r'terdakwa\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,\s*alias|,\s*yang\s+selanjutnya|;|\n))',
            r'nama\s+lengkap\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,|\n))',

            # Public prosecutor
            r'(?:jaksa\s+)?penuntut\s+umum\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,\s*S\.H|,|\n))',

            # Civil cases
            r'penggugat\s*(?:I|1)?\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,|\n))',
            r'tergugat\s*(?:I|1)?\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,|\n))',
            r'pemohon\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,|\n))',
            r'termohon\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,|\n))',

            # Judges
            r'hakim\s+ketua\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,\s*S\.H|,|\n))',
            r'hakim\s+anggota\s*:\s*([A-Z][^,;\n\r]+?)(?:\s*(?:,\s*S\.H|,|\n))',
        ]

        # 6. COURT PATTERNS (with an explicit location name)
        # NOTE(review): these are applied with re.IGNORECASE, which makes the
        # [A-Z]/[a-z] classes match either case — confirm that is intended.
        self.court_patterns = [
            r'pengadilan\s+negeri\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
            r'pengadilan\s+tinggi\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
            r'mahkamah\s+agung\s+republik\s+indonesia',
            r'pengadilan\s+(?:tata\s+usaha\s+negara|agama|militer)\s+([A-Z][a-z]+)',
        ]

    def extract_case_number(self, text: str) -> Optional[str]:
        """Ekstrak nomor perkara dengan validasi format"""
        for pattern in self.case_number_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                case_number = match.group(1).strip()
                # Validasi format nomor perkara
                if re.match(r'\d+/', case_number) and len(case_number) > 5:
                    return case_number
        return None

    def extract_dates(self, text: str) -> Dict[str, Optional[str]]:
        """Ekstrak tanggal-tanggal penting"""
        dates = {'decision_date': None, 'hearing_date': None}

        for pattern in self.date_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                try:
                    day = match.group(1).zfill(2)
                    month_name = match.group(2).lower()
                    year = match.group(3)

                    if month_name in self.month_mapping:
                        month = self.month_mapping[month_name]
                        formatted_date = f"{year}-{month}-{day}"

                        context = text[max(0, match.start()-50):match.end()+50].lower()
                        if 'diputuskan' in context or 'dibacakan' in context:
                            dates['decision_date'] = formatted_date
                        else:
                            dates['hearing_date'] = formatted_date
                except (IndexError, KeyError):
                    continue

        return dates

    def extract_case_type(self, text: str) -> Optional[str]:
        """Ekstrak jenis perkara"""
        for pattern in self.case_type_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(0).strip()
        return None

    def extract_legal_articles(self, text: str) -> List[str]:
        """Ekstrak pasal dan undang-undang dengan validasi"""
        articles = []
        for pattern in self.legal_article_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                article = match.group(0).strip()

                # Validasi artikel hukum (harus mengandung angka)
                if re.search(r'\d+', article) and len(article) > 5:
                    # Bersihkan artikel dari kata yang tidak perlu
                    article = re.sub(r'\s+', ' ', article)
                    if article not in articles:
                        articles.append(article)

        return articles[:10]  # Batasi maksimal 10 pasal untuk menghindari noise

    def clean_party_name(self, name: str) -> str:
        """Membersihkan nama dari gelar dan informasi tambahan"""
        if not name:
            return ""

        # Daftar pattern untuk dibersihkan
        clean_patterns = [
            r',\s*S\.H\.?.*$',
            r',\s*S\.E\.?.*$',
            r',\s*M\.H\.?.*$',
            r',\s*alias.*$',
            r',\s*bin\s+.*$',
            r',\s*binti\s+.*$',
            r'\s+yang\s+selanjutnya.*$'
        ]

        cleaned_name = name.strip()
        for pattern in clean_patterns:
            cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE)

        return cleaned_name.strip()

    def extract_parties(self, text: str) -> Dict[str, List[str]]:
        """Ekstrak pihak-pihak yang terlibat dengan validasi nama"""
        parties = {
            'defendants': [],      # Terdakwa
            'prosecutors': [],     # JPU
            'plaintiffs': [],      # Penggugat/Pemohon
            'respondents': [],     # Tergugat/Termohon
            'judges': []           # Hakim
        }

        for pattern in self.parties_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                party_name = match.group(1).strip()
                party_context = match.group(0).lower()

                # Validasi nama (harus dimulai dengan huruf kapital dan minimal 3 karakter)
                if not party_name or len(party_name) < 3 or not party_name[0].isupper():
                    continue

                # Bersihkan nama dari gelar dan informasi tambahan
                party_name = self.clean_party_name(party_name)

                if len(party_name) < 3:
                    continue

                # Kategorisasi berdasarkan konteks
                if 'terdakwa' in party_context or 'nama lengkap' in party_context:
                    parties['defendants'].append(party_name)
                elif any(word in party_context for word in ['jaksa', 'penuntut']):
                    parties['prosecutors'].append(party_name)
                elif 'penggugat' in party_context or 'pemohon' in party_context:
                    parties['plaintiffs'].append(party_name)
                elif 'tergugat' in party_context or 'termohon' in party_context:
                    parties['respondents'].append(party_name)
                elif 'hakim' in party_context:
                    parties['judges'].append(party_name)

        # Remove duplicates
        for key in parties:
            parties[key] = list(dict.fromkeys(parties[key]))

        return parties

    def extract_court_info(self, text: str) -> Dict[str, Optional[str]]:
        """Ekstrak informasi pengadilan"""
        court_info = {'court_name': None, 'court_type': None, 'court_location': None}

        for pattern in self.court_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                court_text = match.group(0).strip()
                court_info['court_name'] = court_text

                if 'negeri' in court_text.lower():
                    court_info['court_type'] = 'Pengadilan Negeri'
                    if match.groups():
                        court_info['court_location'] = match.group(1).strip()
                elif 'tinggi' in court_text.lower():
                    court_info['court_type'] = 'Pengadilan Tinggi'
                    if match.groups():
                        court_info['court_location'] = match.group(1).strip()
                elif 'mahkamah agung' in court_text.lower():
                    court_info['court_type'] = 'Mahkamah Agung'

                break

        return court_info

    def extract_metadata_from_text(self, text: str, filename: str) -> Dict:
        """Ekstrak semua metadata dari teks"""
        if not isinstance(text, str) or not text.strip():
            return {}

        metadata = {
            'filename': filename,
            'extraction_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Ekstrak komponen metadata
        metadata['case_number'] = self.extract_case_number(text)
        dates = self.extract_dates(text)
        metadata.update(dates)
        metadata['case_type'] = self.extract_case_type(text)
        metadata['legal_articles'] = self.extract_legal_articles(text)
        metadata['parties'] = self.extract_parties(text)
        metadata['court_info'] = self.extract_court_info(text)

        return metadata

    def process_single_file(self, filename: str, source_dir: str) -> Optional[Dict]:
        """Proses file tunggal dengan validasi hasil"""
        file_path = os.path.join(source_dir, filename)

        if not os.path.exists(file_path):
            logger.error(f"File tidak ditemukan: {file_path}")
            return None

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            if not text.strip():
                logger.warning(f"File kosong: {filename}")
                return None

            metadata = self.extract_metadata_from_text(text, filename)

            if metadata:
                # Validasi minimal data yang diperlukan
                has_case_number = bool(metadata.get('case_number'))
                has_parties = any(metadata.get('parties', {}).values())
                has_dates = bool(metadata.get('decision_date') or metadata.get('hearing_date'))

                status = "✅" if (has_case_number or has_parties or has_dates) else "⚠️"
                print(f"{status} {filename}")
                return metadata
            else:
                print(f"❌ {filename}")
                return None

        except Exception as e:
            logger.error(f"Error memproses {filename}: {str(e)}")
            print(f"❌ {filename}: {str(e)}")
            return None

    def get_text_files(self, directory: str) -> List[str]:
        """Dapatkan daftar file teks"""
        if not os.path.exists(directory):
            return []
        return [f for f in os.listdir(directory)
                if f.endswith('.txt') and os.path.isfile(os.path.join(directory, f))]

    def save_metadata_to_csv(self, metadata_list: List[Dict]):
        """Simpan metadata ke CSV terstruktur di kedua lokasi"""
        flattened_data = []

        for metadata in metadata_list:
            parties = metadata.get('parties', {})
            court_info = metadata.get('court_info', {})

            flat_record = {
                # IDENTITAS DOKUMEN
                'nama_file': metadata.get('filename'),
                'tanggal_ekstraksi': metadata.get('extraction_timestamp'),

                # IDENTITAS PERKARA
                'nomor_perkara': metadata.get('case_number'),
                'tanggal_putusan': metadata.get('decision_date'),
                'tanggal_sidang': metadata.get('hearing_date'),
                'jenis_perkara': metadata.get('case_type'),

                # INFORMASI PENGADILAN
                'nama_pengadilan': court_info.get('court_name'),
                'jenis_pengadilan': court_info.get('court_type'),
                'lokasi_pengadilan': court_info.get('court_location'),

                # PIHAK-PIHAK TERKAIT
                'pihak_penggugat': '; '.join(parties.get('plaintiffs', [])),
                'pihak_tergugat': '; '.join(parties.get('respondents', [])),
                'terdakwa': '; '.join(parties.get('defendants', [])),
                'jaksa_penuntut_umum': '; '.join(parties.get('prosecutors', [])),
                'hakim': '; '.join(parties.get('judges', [])),

                # ASPEK HUKUM
                'pasal_yang_dilanggar': '; '.join(metadata.get('legal_articles', [])),
                'jumlah_pasal': len(metadata.get('legal_articles', [])),

                # HITUNGAN PIHAK
                'jumlah_penggugat': len(parties.get('plaintiffs', [])),
                'jumlah_tergugat': len(parties.get('respondents', [])),
                'jumlah_terdakwa': len(parties.get('defendants', [])),
                'jumlah_hakim': len(parties.get('judges', []))
            }
            flattened_data.append(flat_record)

        # Buat DataFrame
        df = pd.DataFrame(flattened_data)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_filename = f"cases.csv"

        # Simpan ke direktori lokal (/data/processed)
        csv_path_local = os.path.join(self.output_dir, csv_filename)
        df.to_csv(csv_path_local, index=False, encoding='utf-8')
        print(f"📄 CSV Lokal: {csv_path_local}")

        # Simpan ke Google Drive (/content/drive/MyDrive/korupsi/data/processed)
        csv_path_gdrive = os.path.join(self.gdrive_output_dir, csv_filename)
        df.to_csv(csv_path_gdrive, index=False, encoding='utf-8')
        print(f"💾 CSV GDrive: {csv_path_gdrive}")

        return csv_path_local, csv_path_gdrive

    def save_metadata_to_json(self, metadata_list: List[Dict]):
        """Simpan metadata ke JSON lengkap di kedua lokasi"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        json_filename = f"cases.json"

        # Simpan ke direktori lokal
        json_path_local = os.path.join(self.output_dir, json_filename)
        with open(json_path_local, 'w', encoding='utf-8') as f:
            json.dump(metadata_list, f, ensure_ascii=False, indent=2)
        print(f"📄 JSON Lokal: {json_path_local}")

        # Simpan ke Google Drive
        json_path_gdrive = os.path.join(self.gdrive_output_dir, json_filename)
        with open(json_path_gdrive, 'w', encoding='utf-8') as f:
            json.dump(metadata_list, f, ensure_ascii=False, indent=2)
        print(f"💾 JSON GDrive: {json_path_gdrive}")

        return json_path_local, json_path_gdrive

    def process_all_files(self) -> List[Dict]:
        """Proses semua file untuk ekstraksi metadata"""
        print("🔍 i. EKSTRAKSI METADATA")
        print("=" * 60)
        print("Mengambil metadata: Nomor Perkara, Tanggal, Jenis Perkara, Pasal, Pihak, dll.")
        print("=" * 60)

        # Cari file dari kedua lokasi
        data_raw_files = self.get_text_files(self.cleaned_dir)
        gdrive_files = self.get_text_files(self.gdrive_cleaned_dir)

        if data_raw_files:
            files_to_process = data_raw_files
            source_directory = self.cleaned_dir
            print(f"📂 Menggunakan file dari: {self.cleaned_dir}")
        elif gdrive_files:
            files_to_process = gdrive_files
            source_directory = self.gdrive_cleaned_dir
            print(f"📂 Menggunakan file dari: {self.gdrive_cleaned_dir}")
        else:
            print("❌ Tidak ada file teks yang ditemukan!")
            return []

        print(f"📁 Ditemukan {len(files_to_process)} file untuk diproses")
        print("-" * 60)

        # Proses setiap file
        all_metadata = []
        success_count = 0

        for i, filename in enumerate(files_to_process, 1):
            print(f"[{i:3d}/{len(files_to_process)}] ", end="")
            metadata = self.process_single_file(filename, source_directory)

            if metadata:
                all_metadata.append(metadata)
                success_count += 1

        print("-" * 60)
        print(f"✅ BERHASIL: {success_count} file")
        print(f"❌ GAGAL: {len(files_to_process) - success_count} file")

        if all_metadata:
            # Simpan ke CSV dan JSON di kedua lokasi
            csv_paths = self.save_metadata_to_csv(all_metadata)
            json_paths = self.save_metadata_to_json(all_metadata)

            print(f"📊 Total metadata berhasil diekstrak: {len(all_metadata)} record")
            print(f"💾 File tersimpan di 2 lokasi: lokal & Google Drive")

        return all_metadata

def main():
    """Run the metadata extraction end to end and print a summary.

    Any exception raised while building the extractor or processing files is
    reported with its traceback instead of propagating.
    """
    print("🚀 MULAI EKSTRAKSI METADATA PUTUSAN PENGADILAN")
    print("=" * 70)

    try:
        metadata_results = MetadataExtractor().process_all_files()

        if not metadata_results:
            print("\n❌ Tidak ada metadata yang berhasil diekstrak.")
        else:
            summary_lines = (
                "\n🎉 EKSTRAKSI METADATA SELESAI!",
                f"Total metadata: {len(metadata_results)} record",
                "File output tersimpan di:",
                "  - Lokal: /data/processed/",
                "  - GDrive: /content/drive/MyDrive/korupsi/data/processed/",
            )
            for line in summary_lines:
                print(line)

    except Exception as e:
        print(f"\n💥 ERROR: {str(e)}")
        # Imported here because it is needed only on the failure path.
        import traceback
        traceback.print_exc()

# Run the extraction only when this code is the entry point (not on import).
if __name__ == "__main__":
    main()

🚀 MULAI EKSTRAKSI METADATA PUTUSAN PENGADILAN
🔍 EKSTRAKSI METADATA
Input: /data/raw atau /content/drive/MyDrive/korupsi/CLEANED
Output Lokal: /data/processed
Output GDrive: /content/drive/MyDrive/korupsi/data/processed
🔍 i. EKSTRAKSI METADATA
Mengambil metadata: Nomor Perkara, Tanggal, Jenis Perkara, Pasal, Pihak, dll.
📂 Menggunakan file dari: /content/drive/MyDrive/korupsi/CLEANED
📁 Ditemukan 114 file untuk diproses
------------------------------------------------------------
[  1/114] ✅ case_2024_TK1_Putusan_PN_SURABAYA_Nomor_85_Pid_Sus-TPK_2024_PN_Sby_Tanggal_31_Desember_2024__Penuntut_Umum_Martina_Peristyanti__S_H___MBATerdakwa_MEGA_YUNAN_RAKHMANA.txt
[  2/114] ✅ case_2024_TK1_Putusan_PN_SURABAYA_Nomor_86_Pid_Sus-TPK_2024_PN_Sby_Tanggal_31_Desember_2024__Penuntut_Umum_Martina_Peristyanti__S_H___MBATerdakwa_SUJARWO_Bin_JIMIN.txt
[  3/114] ⚠️ case_2024_TK1_Putusan_PN_SURABAYA_Nomor_105_Pid_Sus-TPK_2024_PN_Sby_Tanggal_23_Desember_2024__Penuntut_Umum_DIAN_PRANATA_DEPARI__S_H__M_HTerdak

## Ekstraksi Konten Kunci

In [None]:
import os
import re
import json
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import logging

In [None]:
# Logging setup: INFO level on the root logger, plus a module-level logger
# used by the file-processing methods for error and warning messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class KontenKunciExtractor:
    """Ekstrak konten kunci dari dokumen putusan pengadilan"""

    def __init__(self, base_dir="/content/drive/MyDrive/korupsi"):
        """Set up input/output locations, create output folders, load patterns.

        Args:
            base_dir: Google Drive project folder. The ``CLEANED`` input
                folder and the Drive copy of the outputs live under it.
        """
        self.base_dir = base_dir

        # Inputs: cleaned text files, locally or on Google Drive.
        self.cleaned_dir = "/data/raw"
        self.gdrive_cleaned_dir = os.path.join(base_dir, "CLEANED")

        # Outputs: local folder, Drive folder, and logs.
        self.output_dir = "/data/processed"
        self.gdrive_output_dir = os.path.join(base_dir, "data", "processed")
        self.logs_dir = "/logs"

        # Make sure every output folder exists before anything is written.
        for folder in (self.output_dir, self.gdrive_output_dir, self.logs_dir):
            os.makedirs(folder, exist_ok=True)

        print("📝 EKSTRAKSI KONTEN KUNCI")
        print(f"Input: {self.cleaned_dir} atau {self.gdrive_cleaned_dir}")
        print(f"Output Lokal: {self.output_dir}")
        print(f"Output GDrive: {self.gdrive_output_dir}")

        # Regex tables used by the extract_* methods.
        self.setup_content_patterns()

    def setup_content_patterns(self):
        """Set up the regex tables used to extract key content.

        Each attribute is a dict mapping a category name to a list of
        patterns. The ``extract_*`` methods apply them with
        ``re.IGNORECASE | re.DOTALL`` and use capture group 1 as the content;
        the trailing lookahead in each pattern marks where the content stops.
        """

        # 1. FACT SUMMARY PATTERNS
        self.fact_summary_patterns = {
            # Indictment
            'dakwaan': [
                r'dakwaan\s*:?\s*(.*?)(?=\n\n|\nterdakwa|\npenuntut|\nhakim|\Z)',
                r'terdakwa\s+didakwa\s+(.*?)(?=\n\n|\Z)',
                r'melakukan\s+perbuatan\s+(.*?)(?=\n\n|\Z)',
            ],
            # Evidence
            'barang_bukti': [
                r'barang\s+bukti\s*:?\s*(.*?)(?=\n\n|\nsaksi|\nhakim|\Z)',
                r'bukti.*?yang\s+diajukan\s+(.*?)(?=\n\n|\Z)',
                r'alat\s+bukti\s*:?\s*(.*?)(?=\n\n|\Z)',
            ],
            # Facts established at trial
            'fakta_persidangan': [
                r'fakta.*?persidangan\s*:?\s*(.*?)(?=\n\n|\npertimbangan|\Z)',
                r'berdasarkan\s+fakta.*?persidangan\s+(.*?)(?=\n\n|\Z)',
                r'dari\s+fakta.*?terungkap\s+(.*?)(?=\n\n|\Z)',
            ],
            # Chronology of events
            'kronologi': [
                r'kronologi\s*:?\s*(.*?)(?=\n\n|\npertimbangan|\Z)',
                r'peristiwa\s+terjadi\s+(.*?)(?=\n\n|\Z)',
                r'pada\s+tanggal.*?terdakwa\s+(.*?)(?=\n\n|\Z)',
            ],
            # State losses
            'kerugian': [
                r'kerugian.*?negara\s*:?\s*(.*?)(?=\n\n|\Z)',
                r'merugikan.*?keuangan.*?negara\s+(.*?)(?=\n\n|\Z)',
                r'nilai\s+kerugian\s*:?\s*(.*?)(?=\n\n|\Z)',
            ]
        }

        # 2. MAIN LEGAL ARGUMENT PATTERNS
        self.legal_argument_patterns = {
            # Judges' considerations
            'pertimbangan_hakim': [
                r'pertimbangan\s*:?\s*(.*?)(?=\nmengadili|\namar|\Z)',
                r'menimbang\s*:?\s*(.*?)(?=\nmengadili|\namar|\Z)',
                r'bahwa.*?hakim\s+berpendapat\s+(.*?)(?=\n\n|\Z)',
            ],
            # Articles found to be proven
            'pasal_yang_terbukti': [
                r'terbukti.*?melanggar\s+pasal\s+(.*?)(?=\n\n|\Z)',
                r'perbuatan.*?memenuhi.*?pasal\s+(.*?)(?=\n\n|\Z)',
                r'dakwaan.*?terbukti.*?pasal\s+(.*?)(?=\n\n|\Z)',
            ],
            # Reasons for the decision
            'alasan_putusan': [
                r'oleh\s+karena\s+itu\s+(.*?)(?=\nmengadili|\namar|\Z)',
                r'dengan\s+demikian\s+(.*?)(?=\nmengadili|\namar|\Z)',
                r'berdasarkan.*?pertimbangan\s+(.*?)(?=\nmengadili|\namar|\Z)',
            ],
            # Operative part of the verdict (runs to the end of the text)
            'amar_putusan': [
                r'mengadili\s*:?\s*(.*?)(?=\Z)',
                r'amar\s*:?\s*(.*?)(?=\Z)',
                r'memutuskan\s*:?\s*(.*?)(?=\Z)',
            ],
            # Sentence imposed
            'putusan_hukuman': [
                r'menjatuhkan\s+pidana\s+(.*?)(?=\n[0-9]|\Z)',
                r'menghukum\s+terdakwa\s+(.*?)(?=\n[0-9]|\Z)',
                r'pidana\s+penjara\s+selama\s+(.*?)(?=\n|\Z)',
                r'pidana\s+denda\s+sebesar\s+(.*?)(?=\n|\Z)',
            ]
        }

        # 3. OTHER IMPORTANT ELEMENT PATTERNS
        self.other_patterns = {
            # Witness testimony
            'saksi': [
                r'saksi\s*:?\s*(.*?)(?=\nterdakwa|\npenuntut|\Z)',
                r'keterangan\s+saksi\s+(.*?)(?=\n\n|\Z)',
            ],
            # Expert testimony
            'ahli': [
                r'keterangan\s+ahli\s*:?\s*(.*?)(?=\n\n|\Z)',
                r'ahli\s+menerangkan\s+(.*?)(?=\n\n|\Z)',
            ],
            # Defendant's admission / statement
            'pengakuan_terdakwa': [
                r'terdakwa\s+mengaku\s+(.*?)(?=\n\n|\Z)',
                r'keterangan\s+terdakwa\s+(.*?)(?=\n\n|\Z)',
            ]
        }

    def clean_extracted_text(self, text: str) -> str:
        """Bersihkan teks yang diekstrak"""
        if not text:
            return ""

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove common legal formatting
        text = re.sub(r'\b(?:pasal|ayat|huruf|angka)\s+\([^)]+\)', '', text)
        # Remove line numbers
        text = re.sub(r'^\d+\.?\s*', '', text)

        return text.strip()

    def extract_fact_summary(self, text: str) -> Dict[str, str]:
        """Ekstrak ringkasan fakta dari teks"""
        facts = {}

        for category, patterns in self.fact_summary_patterns.items():
            extracted_text = ""

            for pattern in patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
                for match in matches:
                    if match.group(1):
                        content = self.clean_extracted_text(match.group(1))
                        if len(content) > len(extracted_text):
                            extracted_text = content

            facts[category] = extracted_text[:1000] if extracted_text else ""  # Limit to 1000 chars

        return facts

    def extract_legal_arguments(self, text: str) -> Dict[str, str]:
        """Ekstrak argumen hukum utama"""
        arguments = {}

        for category, patterns in self.legal_argument_patterns.items():
            extracted_text = ""

            for pattern in patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
                for match in matches:
                    if match.group(1):
                        content = self.clean_extracted_text(match.group(1))
                        if len(content) > len(extracted_text):
                            extracted_text = content

            arguments[category] = extracted_text[:1000] if extracted_text else ""

        return arguments

    def extract_other_elements(self, text: str) -> Dict[str, str]:
        """Ekstrak elemen penting lainnya"""
        others = {}

        for category, patterns in self.other_patterns.items():
            extracted_text = ""

            for pattern in patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE | re.DOTALL)
                for match in matches:
                    if match.group(1):
                        content = self.clean_extracted_text(match.group(1))
                        if len(content) > len(extracted_text):
                            extracted_text = content

            others[category] = extracted_text[:500] if extracted_text else ""

        return others

    def extract_key_content_from_text(self, text: str, filename: str) -> Dict:
        """Ekstrak semua konten kunci dari teks"""
        if not isinstance(text, str) or not text.strip():
            return {}

        content = {
            'filename': filename,
            'extraction_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'text_length': len(text),
            'word_count': len(text.split())
        }

        # 1. Ekstrak ringkasan fakta
        fact_summary = self.extract_fact_summary(text)
        content['fact_summary'] = fact_summary

        # 2. Ekstrak argumen hukum
        legal_arguments = self.extract_legal_arguments(text)
        content['legal_arguments'] = legal_arguments

        # 3. Ekstrak elemen lainnya
        other_elements = self.extract_other_elements(text)
        content['other_elements'] = other_elements

        # 4. Hitung kelengkapan konten
        content['content_completeness'] = self.calculate_content_completeness(content)

        return content

    def calculate_content_completeness(self, content: Dict) -> float:
        """Hitung persentase kelengkapan konten kunci"""
        essential_elements = [
            'dakwaan', 'barang_bukti', 'pertimbangan_hakim',
            'amar_putusan', 'putusan_hukuman'
        ]

        score = 0
        fact_summary = content.get('fact_summary', {})
        legal_arguments = content.get('legal_arguments', {})

        for element in essential_elements:
            if element in fact_summary and fact_summary[element]:
                score += 1
            elif element in legal_arguments and legal_arguments[element]:
                score += 1

        return (score / len(essential_elements)) * 100

    def process_single_file(self, filename: str, source_dir: str) -> Optional[Dict]:
        """Read one text file and extract its key content.

        Returns the extracted content dict, or ``None`` when the file is
        missing, blank, yields no content, or raises while being processed.
        A one-line status is printed for every processed file.
        """
        file_path = os.path.join(source_dir, filename)

        if not os.path.exists(file_path):
            logger.error(f"File tidak ditemukan: {file_path}")
            return None

        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw_text = handle.read()

            if raw_text.strip() == '':
                logger.warning(f"File kosong: {filename}")
                return None

            extracted = self.extract_key_content_from_text(raw_text, filename)

            # Guard clause: a falsy extraction result is reported as a failure.
            if not extracted:
                print(f"❌ {filename}")
                return None

            pct = extracted.get('content_completeness', 0)
            print(f"✅ {filename} (Kelengkapan: {pct:.1f}%)")
            return extracted

        except Exception as err:
            logger.error(f"Error memproses {filename}: {str(err)}")
            print(f"❌ {filename}: {str(err)}")
            return None

    def get_text_files(self, directory: str) -> List[str]:
        """Return the names of the ``.txt`` files directly inside ``directory``.

        Args:
            directory: Path of the folder to scan (not recursive).

        Returns:
            File names (not full paths) in sorted order; an empty list when
            ``directory`` is missing or is not a directory.
        """
        # isdir (rather than exists) keeps a stray regular file at this path
        # from reaching os.listdir, which would raise NotADirectoryError.
        if not os.path.isdir(directory):
            return []

        # os.listdir returns names in arbitrary order; sorting makes the
        # processing order reproducible across runs.
        return sorted(
            name for name in os.listdir(directory)
            if name.endswith('.txt') and os.path.isfile(os.path.join(directory, name))
        )

    def save_content_to_csv(self, content_list: List[Dict]):
        """Flatten the extracted key content and write it as CSV to both outputs.

        Every record becomes one row; the nested ``fact_summary``,
        ``legal_arguments`` and ``other_elements`` sections are spread over
        flat columns. The same timestamped file name is written under
        ``self.output_dir`` and ``self.gdrive_output_dir``.

        Returns:
            Tuple ``(local_csv_path, gdrive_csv_path)``.
        """
        # (CSV column, key in the record) for the document identity fields.
        identity_columns = (
            ('nama_file', 'filename'),
            ('tanggal_ekstraksi', 'extraction_timestamp'),
            ('panjang_teks', 'text_length'),
            ('jumlah_kata', 'word_count'),
            ('kelengkapan_konten_persen', 'content_completeness'),
        )
        # Fact-summary and legal-argument columns reuse their source key names.
        fact_columns = (
            'dakwaan', 'barang_bukti', 'fakta_persidangan', 'kronologi', 'kerugian',
        )
        argument_columns = (
            'pertimbangan_hakim', 'pasal_yang_terbukti', 'alasan_putusan',
            'amar_putusan', 'putusan_hukuman',
        )
        # (CSV column, key in ``other_elements``).
        other_columns = (
            ('keterangan_saksi', 'saksi'),
            ('keterangan_ahli', 'ahli'),
            ('pengakuan_terdakwa', 'pengakuan_terdakwa'),
        )

        rows = []
        for item in content_list:
            facts = item.get('fact_summary', {})
            arguments = item.get('legal_arguments', {})
            others = item.get('other_elements', {})

            # Insertion order below defines the CSV column order.
            row = {column: item.get(key) for column, key in identity_columns}
            row.update((name, facts.get(name, '')) for name in fact_columns)
            row.update((name, arguments.get(name, '')) for name in argument_columns)
            row.update((column, others.get(key, '')) for column, key in other_columns)
            rows.append(row)

        table = pd.DataFrame(rows)
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_name = f"konten_kunci_{stamp}.csv"

        # Local copy first.
        local_path = os.path.join(self.output_dir, csv_name)
        table.to_csv(local_path, index=False, encoding='utf-8')
        print(f"📄 CSV Lokal: {local_path}")

        # Then the Google Drive copy.
        gdrive_path = os.path.join(self.gdrive_output_dir, csv_name)
        table.to_csv(gdrive_path, index=False, encoding='utf-8')
        print(f"💾 CSV GDrive: {gdrive_path}")

        return local_path, gdrive_path

    def save_content_to_json(self, content_list: List[Dict]):
        """Dump the full key-content records as JSON to both output locations.

        The same timestamped file name is written under ``self.output_dir``
        and ``self.gdrive_output_dir``.

        Returns:
            Tuple ``(local_json_path, gdrive_json_path)``.
        """
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        json_name = f"konten_kunci_{stamp}.json"

        # (attribute holding the target folder, console label); local first.
        targets = (
            ('output_dir', "📄 JSON Lokal"),
            ('gdrive_output_dir', "💾 JSON GDrive"),
        )

        written = []
        for attr, label in targets:
            path = os.path.join(getattr(self, attr), json_name)
            with open(path, 'w', encoding='utf-8') as handle:
                json.dump(content_list, handle, ensure_ascii=False, indent=2)
            print(f"{label}: {path}")
            written.append(path)

        return written[0], written[1]

    def create_summary_report(self, content_list: List[Dict]):
        """Write a plain-text summary of the key-content extraction run.

        The report holds overall averages, the share of documents in which
        each key element was found, and up to ten documents whose
        completeness exceeds 80%. It is saved as ``laporan_konten_<ts>.txt``
        in ``self.output_dir``. Nothing is written for an empty list.
        """
        if not content_list:
            return

        record_count = len(content_list)
        mean_completeness = sum(c.get('content_completeness', 0) for c in content_list) / record_count
        mean_length = sum(c.get('text_length', 0) for c in content_list) / record_count

        def coverage_pct(key):
            """Percentage of records having a truthy ``key`` in either section."""
            hits = sum(
                1 for record in content_list
                if record.get('fact_summary', {}).get(key)
                or record.get('legal_arguments', {}).get(key)
            )
            return (hits / record_count) * 100

        # (label shown in the report, element key).
        tracked = (
            ('Dakwaan', 'dakwaan'),
            ('Barang Bukti', 'barang_bukti'),
            ('Pertimbangan Hakim', 'pertimbangan_hakim'),
            ('Amar Putusan', 'amar_putusan'),
            ('Putusan Hukuman', 'putusan_hukuman'),
        )

        # The leading and trailing empty entries produce the report's first
        # and last newline once the lines are joined.
        lines = [
            "",
            "📊 LAPORAN EKSTRAKSI KONTEN KUNCI",
            "====================================",
            f"Tanggal: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "STATISTIK UMUM:",
            f"- Total file diproses: {record_count}",
            f"- Rata-rata kelengkapan: {mean_completeness:.1f}%",
            f"- Rata-rata panjang teks: {mean_length:,.0f} karakter",
            "",
            "COVERAGE ELEMEN KUNCI:",
        ]
        lines.extend(f"- {label}: {coverage_pct(key):.1f}%" for label, key in tracked)
        lines.extend(["", "FILE DENGAN KELENGKAPAN TINGGI (>80%):", ""])
        report = "\n".join(lines)

        # List at most ten of the high-completeness documents.
        top = [c for c in content_list if c.get('content_completeness', 0) > 80]
        report += "".join(
            f"- {c.get('filename', 'Unknown')}: {c.get('content_completeness', 0):.1f}%\n"
            for c in top[:10]
        )
        overflow = len(top) - 10
        if overflow > 0:
            report += f"... dan {overflow} file lainnya\n"

        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        report_path = os.path.join(self.output_dir, f"laporan_konten_{stamp}.txt")

        with open(report_path, 'w', encoding='utf-8') as handle:
            handle.write(report)

        print(f"📋 Laporan disimpan: {report_path}")

    def process_all_files(self) -> List[Dict]:
        """Run key-content extraction over every available text file.

        Files are taken from ``self.cleaned_dir`` when it has any text files,
        otherwise from ``self.gdrive_cleaned_dir``. Successful results are
        saved as CSV and JSON and summarised in a report.

        Returns:
            The list of extracted content dicts (empty when no input files
            were found or none could be processed).
        """
        divider = "-" * 60

        print("📝 ii. EKSTRAKSI KONTEN KUNCI")
        print("=" * 60)
        print("1. Ringkasan fakta (barang bukti, dakwaan)")
        print("2. Argumen hukum utama (putusan, pasal)")
        print("=" * 60)

        # Both locations are listed up front; the local one wins when it has files.
        local_candidates = self.get_text_files(self.cleaned_dir)
        drive_candidates = self.get_text_files(self.gdrive_cleaned_dir)

        if not local_candidates and not drive_candidates:
            print("❌ Tidak ada file teks yang ditemukan!")
            return []

        if local_candidates:
            pending, source_directory = local_candidates, self.cleaned_dir
        else:
            pending, source_directory = drive_candidates, self.gdrive_cleaned_dir
        print(f"📂 Menggunakan file dari: {source_directory}")

        total = len(pending)
        print(f"📁 Ditemukan {total} file untuk diproses")
        print(divider)

        # The progress prefix stays on the same line as the per-file status
        # printed by process_single_file.
        extracted = []
        for position, filename in enumerate(pending, 1):
            print(f"[{position:3d}/{total}] ", end="")
            result = self.process_single_file(filename, source_directory)
            if result:
                extracted.append(result)

        succeeded = len(extracted)
        print(divider)
        print(f"✅ BERHASIL: {succeeded} file")
        print(f"❌ GAGAL: {total - succeeded} file")

        if extracted:
            # Persist to both locations, then write the summary report.
            self.save_content_to_csv(extracted)
            self.save_content_to_json(extracted)
            self.create_summary_report(extracted)

            print(f"📊 Total konten kunci berhasil diekstrak: {succeeded} record")
            print("💾 File tersimpan di 2 lokasi: lokal & Google Drive")

        return extracted

def main():
    """Entry point: run the key-content extraction and print the outcome.

    Any exception raised while building the extractor, processing files or
    printing the summary is reported on the console together with its
    traceback instead of propagating.
    """
    print("🚀 MULAI EKSTRAKSI KONTEN KUNCI")
    print("=" * 50)

    try:
        results = KontenKunciExtractor().process_all_files()

        if not results:
            print("\n❌ Tidak ada konten kunci yang berhasil diekstrak.")
        else:
            print("\n🎉 EKSTRAKSI KONTEN KUNCI SELESAI!")
            print(f"Total konten: {len(results)} record")
            print("File output tersimpan di: /data/processed/")

    except Exception as err:
        print(f"\n💥 ERROR: {str(err)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

🚀 MULAI EKSTRAKSI KONTEN KUNCI
📝 EKSTRAKSI KONTEN KUNCI
Input: /data/raw atau /content/drive/MyDrive/korupsi/CLEANED
Output Lokal: /data/processed
Output GDrive: /content/drive/MyDrive/korupsi/data/processed
📝 ii. EKSTRAKSI KONTEN KUNCI
1. Ringkasan fakta (barang bukti, dakwaan)
2. Argumen hukum utama (putusan, pasal)
📂 Menggunakan file dari: /content/drive/MyDrive/korupsi/CLEANED
📁 Ditemukan 114 file untuk diproses
------------------------------------------------------------
[  1/114] ✅ case_2024_TK1_Putusan_PN_SURABAYA_Nomor_85_Pid_Sus-TPK_2024_PN_Sby_Tanggal_31_Desember_2024__Penuntut_Umum_Martina_Peristyanti__S_H___MBATerdakwa_MEGA_YUNAN_RAKHMANA.txt (Kelengkapan: 100.0%)
[  2/114] ✅ case_2024_TK1_Putusan_PN_SURABAYA_Nomor_86_Pid_Sus-TPK_2024_PN_Sby_Tanggal_31_Desember_2024__Penuntut_Umum_Martina_Peristyanti__S_H___MBATerdakwa_SUJARWO_Bin_JIMIN.txt (Kelengkapan: 100.0%)
[  3/114] ✅ case_2024_TK1_Putusan_PN_SURABAYA_Nomor_105_Pid_Sus-TPK_2024_PN_Sby_Tanggal_23_Desember_2024__Penuntu

## Feature Engineering

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from collections import Counter
import logging
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Request INFO-level logging and create the module-level logger used by
# FeatureEngineer. NOTE: logging.basicConfig does nothing when the root
# logger already has handlers (e.g. configured by an earlier notebook cell).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FeatureEngineer:
    """Generate features untuk machine learning dari dokumen putusan"""

    def __init__(self, base_dir="/content/drive/MyDrive/korupsi"):
        """Set up input/output locations and the (unfitted) text vectorizers.

        Args:
            base_dir: Google Drive project folder; the ``CLEANED`` input and
                ``data/processed`` output folders are resolved beneath it.
        """
        self.base_dir = base_dir
        self.cleaned_dir = "/data/raw"
        self.gdrive_cleaned_dir = os.path.join(base_dir, "CLEANED")
        self.input_dir = "/data/processed"  # results of the earlier steps
        self.output_dir = "/data/processed"
        self.gdrive_output_dir = os.path.join(base_dir, "data", "processed")  # Google Drive output
        self.logs_dir = "/logs"

        # Create the output and log folders up front.
        for folder in (self.output_dir, self.gdrive_output_dir, self.logs_dir):
            os.makedirs(folder, exist_ok=True)

        banner = (
            "🔧 FEATURE ENGINEERING",
            f"Input teks: {self.cleaned_dir} atau {self.gdrive_cleaned_dir}",
            f"Input metadata: {self.input_dir}",
            f"Output Lokal: {self.output_dir}",
            f"Output GDrive: {self.gdrive_output_dir}",
        )
        for message in banner:
            print(message)

        # TF-IDF over unigrams and bigrams.
        tfidf_options = dict(
            max_features=1000,
            stop_words=self.get_indonesian_stopwords(),
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
        )
        self.tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

        # Raw term counts over unigrams only.
        count_options = dict(
            max_features=500,
            stop_words=self.get_indonesian_stopwords(),
            ngram_range=(1, 1),
            min_df=2,
        )
        self.count_vectorizer = CountVectorizer(**count_options)

        # Per-column label encoders (filled lazily) and a shared scaler.
        self.label_encoders = {}
        self.scaler = StandardScaler()

    def get_indonesian_stopwords(self) -> List[str]:
        """Return the stopword list handed to the vectorizers.

        The list combines common Indonesian function words with frequent
        legal terms that are filtered out for court-decision texts. A new
        list object is built on every call.
        """
        function_words = (
            'yang', 'dan', 'di', 'ke', 'dari', 'pada', 'dengan', 'untuk', 'dalam', 'oleh',
            'adalah', 'akan', 'telah', 'sudah', 'dapat', 'harus', 'tidak', 'belum', 'juga',
            'bahwa', 'sebagai', 'atau', 'jika', 'karena', 'sehingga', 'maka', 'agar', 'itu',
            'ini', 'tersebut', 'hal', 'ada', 'sebuah', 'suatu', 'semua', 'setiap', 'beberapa',
        )
        legal_words = (
            'pengadilan', 'hakim', 'terdakwa', 'penggugat', 'tergugat', 'putusan', 'perkara',
            'pasal', 'undang', 'hukum', 'pidana', 'perdata', 'nomor', 'tanggal', 'tahun',
            'republik', 'indonesia', 'negeri', 'jaksa', 'penuntut', 'umum', 'saksi', 'bukti',
        )
        return list(function_words + legal_words)

    def load_text_files(self) -> Dict[str, str]:
        """Load every cleaned ``.txt`` document into memory.

        The local folder (``self.cleaned_dir``) is preferred; the Google Drive
        folder (``self.gdrive_cleaned_dir``) is used when the local one is
        missing or holds no ``.txt`` files.

        Returns:
            Mapping of file name to file content, in sorted file-name order.
            Files that cannot be read are logged and skipped. The mapping is
            empty when neither folder contains a ``.txt`` file.
        """
        source_dir = None
        filenames: List[str] = []

        # Pick the first folder that actually has text files. Checking only
        # for existence would let an empty local folder shadow the Drive copy.
        for candidate in (self.cleaned_dir, self.gdrive_cleaned_dir):
            if not os.path.isdir(candidate):
                continue
            # os.listdir order is arbitrary; sort for reproducible row order.
            found = sorted(name for name in os.listdir(candidate) if name.endswith('.txt'))
            if found:
                source_dir, filenames = candidate, found
                break

        if source_dir is None:
            logger.error("Tidak ada direktori teks yang ditemukan")
            return {}

        print(f"📂 Loading teks dari: {source_dir}")

        texts = {}
        for filename in filenames:
            filepath = os.path.join(source_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    texts[filename] = f.read()
            except (OSError, UnicodeError) as e:
                # Best effort: an unreadable or badly encoded file is skipped.
                logger.error(f"Error loading {filename}: {e}")

        print(f"📁 Loaded {len(texts)} file teks")
        return texts

    def load_metadata(self) -> Optional[pd.DataFrame]:
        """Load the latest ``metadata_*.csv`` written by step i.

        Returns:
            The metadata table, or ``None`` when ``self.input_dir`` holds no
            such file or the file cannot be read.
        """
        candidates = []
        for name in os.listdir(self.input_dir):
            if name.startswith('metadata_') and name.endswith('.csv'):
                candidates.append(name)

        if len(candidates) == 0:
            logger.warning("Tidak ada file metadata yang ditemukan")
            return None

        # The lexicographically greatest name is used — presumably the newest,
        # assuming the names embed a sortable timestamp.
        newest = sorted(candidates)[-1]
        csv_path = os.path.join(self.input_dir, newest)

        try:
            frame = pd.read_csv(csv_path, encoding='utf-8')
            print(f"📊 Loaded metadata: {len(frame)} records dari {newest}")
            return frame
        except Exception as err:
            logger.error(f"Error loading metadata: {err}")
            return None

    def load_content(self) -> Optional[pd.DataFrame]:
        """Load the latest ``konten_kunci_*.csv`` written by step ii.

        Returns:
            The key-content table, or ``None`` when ``self.input_dir`` holds
            no such file or the file cannot be read.
        """
        candidates = []
        for name in os.listdir(self.input_dir):
            if name.startswith('konten_kunci_') and name.endswith('.csv'):
                candidates.append(name)

        if len(candidates) == 0:
            logger.warning("Tidak ada file konten kunci yang ditemukan")
            return None

        # The lexicographically greatest name is used — presumably the newest,
        # assuming the names embed a sortable timestamp.
        newest = sorted(candidates)[-1]
        csv_path = os.path.join(self.input_dir, newest)

        try:
            frame = pd.read_csv(csv_path, encoding='utf-8')
            print(f"📝 Loaded konten kunci: {len(frame)} records dari {newest}")
            return frame
        except Exception as err:
            logger.error(f"Error loading konten: {err}")
            return None

    def count_syllables(self, word: str) -> int:
        """Estimate the syllable count of ``word`` as its number of vowel groups.

        A run of consecutive vowels (a, e, i, o, u; either case) counts once.
        The result is never below 1.
        """
        vowels = 'aeiouAEIOU'
        is_vowel = [letter in vowels for letter in word]

        # A group starts at each vowel that is not directly preceded by a vowel.
        group_starts = sum(
            1 for previous, current in zip([False] + is_vowel, is_vowel)
            if current and not previous
        )

        return max(1, group_starts)

    def calculate_flesch_score(self, words: int, sentences: int, syllables: int) -> float:
        """Compute a Flesch Reading Ease style score clamped to 0-100.

        Returns 0 when there are no words or no sentences, so the averages
        below never divide by zero.
        """
        if words == 0 or sentences == 0:
            return 0

        words_per_sentence = words / sentences
        syllables_per_word = syllables / words

        raw_score = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)

        # Clamp: cap at 100 first, then floor at 0.
        capped = min(100, raw_score)
        return max(0, capped)

    def calculate_text_features(self, texts: Dict[str, str]) -> pd.DataFrame:
        """Compute basic statistical and legal-domain features per document.

        Args:
            texts: Mapping of file name to document text. Entries with empty
                text are skipped.

        Returns:
            DataFrame with one row per document: ``nama_file`` followed by
            length, lexical, readability, legal-reference, numeric, document
            structure, content-type, complexity, party-mention and reasoning
            features (counts, ratios and 0/1 flags).
        """

        def count(pattern: str, source: str) -> int:
            """Number of non-overlapping matches of ``pattern`` in ``source``."""
            return len(re.findall(pattern, source))

        def flag(pattern: str, source: str) -> int:
            """1 when ``pattern`` occurs anywhere in ``source``, else 0."""
            return 1 if re.search(pattern, source) else 0

        features_data = []

        for filename, text in texts.items():
            if not text:
                continue

            # Preprocessing
            text_lower = text.lower()
            words = text.split()
            # re.split leaves empty / whitespace-only fragments (e.g. after the
            # final full stop); drop them so they are not counted as sentences.
            sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
            paragraphs = [p for p in text.split('\n\n') if p.strip()]

            word_count = len(words)
            sentence_count = len(sentences)
            unique_words = len({w.lower() for w in words})
            syllable_count = sum(self.count_syllables(word) for word in words)

            # Key order below defines the column order of the result.
            features_data.append({
                'nama_file': filename,

                # Basic length features
                'char_count': len(text),
                'word_count': word_count,
                'sentence_count': sentence_count,
                'paragraph_count': len(paragraphs),

                # Lexical features (guarded against empty documents)
                'unique_words': unique_words,
                'lexical_diversity': unique_words / word_count if word_count > 0 else 0,
                'avg_word_length': np.mean([len(word) for word in words]) if word_count > 0 else 0,
                'avg_sentence_length': word_count / sentence_count if sentence_count > 0 else 0,

                # Readability
                'syllable_count': syllable_count,
                'flesch_score': self.calculate_flesch_score(word_count, sentence_count, syllable_count),

                # Legal-document specific counts
                'legal_terms_count': count(r'\b(?:pasal|undang|hukum|pidana|perdata|terdakwa|penggugat|hakim|jaksa|dakwaan|vonis|putusan)\b', text_lower),
                'law_references': count(r'\b(?:uu|undang.*?undang)\s+(?:no\.?|nomor)\s+\d+', text_lower),
                'article_references': count(r'\bpasal\s+\d+', text_lower),
                'court_mentions': count(r'\bpengadilan\s+(?:negeri|tinggi|agama|militer)', text_lower),

                # Numbers, dates, money, percentages
                'number_mentions': count(r'\b\d+\b', text),
                'date_mentions': count(r'\b\d{1,2}[\s/\-]\w+[\s/\-]\d{4}\b', text),
                'money_mentions': count(r'\b(?:rp\.?|rupiah)\s*\d', text_lower),
                'percentage_mentions': count(r'\d+\s*%', text),

                # Document structure flags
                'has_header': flag(r'\bputusan\b.*\bpengadilan\b', text_lower),
                'has_case_number': flag(r'\bnomor\s*:\s*\d+/', text_lower),
                'has_parties': flag(r'\b(?:terdakwa|penggugat|tergugat)\s*:', text_lower),
                'has_consideration': flag(r'\bmenimbang\b', text_lower),
                'has_decision': flag(r'\bmengadili\b', text_lower),
                'has_conclusion': flag(r'\b(?:demikian|kesimpulan)\b', text_lower),

                # Content type flags
                'has_dakwaan': flag(r'\bdakwaan\b', text_lower),
                'has_tuntutan': flag(r'\btuntutan\b', text_lower),
                'has_pembelaan': flag(r'\bpembelaan\b', text_lower),
                'has_saksi': flag(r'\bsaksi\b', text_lower),
                'has_bukti': flag(r'\bbarang\s+bukti\b', text_lower),

                # Linguistic complexity
                'complex_sentences': len([s for s in sentences if len(s.split()) > 20]),
                'question_count': count(r'\?', text),
                'exclamation_count': count(r'!', text),
                'quoted_text_count': count(r'"[^"]*"', text),

                # Party mentions
                'defendant_mentions': count(r'\bterdakwa\b', text_lower),
                'plaintiff_mentions': count(r'\bpenggugat\b', text_lower),
                'judge_mentions': count(r'\bhakim\b', text_lower),
                'prosecutor_mentions': count(r'\bjaksa\b', text_lower),

                # Legal reasoning markers
                'because_count': count(r'\b(?:karena|sebab|disebabkan)\b', text_lower),
                'therefore_count': count(r'\b(?:oleh karena|dengan demikian|maka)\b', text_lower),
                'evidence_count': count(r'\b(?:bukti|terbukti|membuktikan)\b', text_lower),
                'violation_count': count(r'\b(?:melanggar|pelanggaran|melakukan)\b', text_lower),
            })

        return pd.DataFrame(features_data)

    def create_bag_of_words_features(self, texts: Dict[str, str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Build bag-of-words and TF-IDF feature tables from the documents.

        Blank documents are dropped. Before vectorizing, digit runs become
        the token ``NUMBER``, punctuation is replaced by spaces, whitespace
        is collapsed and the text is lower-cased. ``self.count_vectorizer``
        and ``self.tfidf_vectorizer`` are (re)fitted on the result.

        Returns:
            ``(bow_df, tfidf_df)``. Each table has a ``nama_file`` column
            followed by ``bow_*`` / ``tfidf_*`` columns. A table is empty when
            its vectorizer fails; both are empty when no usable text exists.
        """
        usable = [(name, body) for name, body in texts.items() if body.strip()]
        if not usable:
            return pd.DataFrame(), pd.DataFrame()

        valid_filenames = [name for name, _ in usable]

        def normalise(raw: str) -> str:
            """Apply the digit / punctuation / whitespace clean-up, then lower-case."""
            cleaned = re.sub(r'\d+', 'NUMBER', raw)
            cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
            cleaned = re.sub(r'\s+', ' ', cleaned)
            return cleaned.lower()

        processed_texts = [normalise(body) for _, body in usable]

        def vectorise(vectorizer, prefix: str) -> pd.DataFrame:
            """Fit ``vectorizer`` and return its dense matrix keyed by file name."""
            matrix = vectorizer.fit_transform(processed_texts)
            vocabulary = vectorizer.get_feature_names_out()
            frame = pd.DataFrame(
                matrix.toarray(),
                columns=[f'{prefix}_{term}' for term in vocabulary],
                index=valid_filenames,
            )
            # Move the file names out of the index into a regular column.
            return frame.reset_index().rename(columns={'index': 'nama_file'})

        # 1. Raw counts (bag of words)
        try:
            bow_df = vectorise(self.count_vectorizer, 'bow')
            print(f"🎯 Bag-of-Words: {bow_df.shape[1]-1} features")
        except Exception as err:
            logger.error(f"Error creating BoW features: {err}")
            bow_df = pd.DataFrame()

        # 2. TF-IDF weights
        try:
            tfidf_df = vectorise(self.tfidf_vectorizer, 'tfidf')
            print(f"📈 TF-IDF: {tfidf_df.shape[1]-1} features")
        except Exception as err:
            logger.error(f"Error creating TF-IDF features: {err}")
            tfidf_df = pd.DataFrame()

        return bow_df, tfidf_df

    def create_qa_pairs(self, texts: Dict[str, str], metadata_df: pd.DataFrame = None) -> List[Dict]:
        """Generate simple regex-based question/answer pairs from each document.

        Every template pairs a fixed question with a pattern whose first
        capture group is the answer. All matches are kept as long as the raw
        answer is longer than 3 and shorter than 300 characters. Each pair
        carries the file name, question, cleaned answer, question type, a
        context window of up to 100 characters either side of the match, and
        a length-based confidence capped at 1.0.

        ``metadata_df`` is accepted but not used by this method.
        """
        # (question, pattern, question type)
        template_specs = (
            # Basic information extraction
            ('Siapa terdakwa dalam perkara ini?',
             r'terdakwa\s*:\s*([A-Z][^\n\r,;]+?)(?:\s*(?:,|;|\n))',
             'entity_extraction'),
            ('Siapa hakim dalam perkara ini?',
             r'hakim\s+(?:ketua|anggota)\s*:\s*([A-Z][^\n\r,;]+?)(?:\s*(?:,|;|\n))',
             'entity_extraction'),
            ('Siapa jaksa penuntut umum?',
             r'(?:jaksa\s+)?penuntut\s+umum\s*:\s*([A-Z][^\n\r,;]+?)(?:\s*(?:,|;|\n))',
             'entity_extraction'),

            # Case classification
            ('Apa jenis perkara ini?',
             r'perkara\s+(pidana\s+(?:khusus|umum)|perdata|tindak\s+pidana\s+\w+)',
             'classification'),
            ('Apakah ini perkara korupsi?',
             r'(?:perkara\s+)?tindak\s+pidana\s+(korupsi)',
             'classification'),

            # Legal references
            ('Pasal apa yang dilanggar?',
             r'(?:melanggar\s+)?pasal\s+(\d+(?:\s+(?:ayat|huruf)\s*\([^)]+\))?)',
             'legal_reference'),
            ('Undang-undang apa yang dirujuk?',
             r'(?:undang[- ]undang|uu)\s+(?:republik\s+indonesia\s+)?(?:no\.?\s*|nomor\s+)?(\d+\s+tahun\s+\d{4})',
             'legal_reference'),

            # Temporal information
            ('Kapan putusan ini dibacakan?',
             r'(?:dibacakan|diputuskan)\s+(?:pada\s+)?tanggal\s+(\d{1,2}\s+\w+\s+\d{4})',
             'date_extraction'),
            ('Kapan tindak pidana terjadi?',
             r'(?:pada|tanggal)\s+(\d{1,2}\s+\w+\s+\d{4})[^,]*(?:terdakwa|melakukan)',
             'date_extraction'),

            # Decision and punishment
            ('Apa putusan hakim?',
             r'mengadili\s*:\s*([^.]+\.)',
             'decision_extraction'),
            ('Berapa lama hukuman yang dijatuhkan?',
             r'(?:dengan\s+)?hukuman\s+(?:penjara|kurungan)\s+(?:selama\s+)?(\d+\s+(?:tahun|bulan|hari))',
             'punishment_extraction'),
            ('Berapa denda yang harus dibayar?',
             r'(?:denda|membayar)\s+(?:sebesar\s+)?(?:rp\.?\s*)?([0-9.,]+(?:\s*(?:juta|ribu|miliar))?)',
             'fine_extraction'),

            # Evidence and facts
            ('Apa barang bukti dalam perkara ini?',
             r'barang\s+bukti\s*(?:berupa|adalah)?\s*:?\s*([^.]+)',
             'evidence_extraction'),
            ('Berapa kerugian yang ditimbulkan?',
             r'kerugian\s+(?:negara|keuangan)\s+(?:sebesar\s+)?(?:rp\.?\s*)?([0-9.,]+(?:\s*(?:juta|ribu|miliar))?)',
             'damage_extraction'),

            # Legal reasoning
            ('Mengapa terdakwa dianggap bersalah?',
             r'(?:terbukti|bersalah)\s+(?:secara\s+sah\s+dan\s+meyakinkan\s+)?([^.]+)',
             'reasoning_extraction'),
            ('Apa pertimbangan hakim?',
             r'menimbang\s*[,:]\s*([^;]+)',
             'consideration_extraction'),
        )

        # Compile once; the same templates are applied to every document.
        match_flags = re.IGNORECASE | re.DOTALL
        templates = [
            (question, re.compile(pattern, match_flags), kind)
            for question, pattern, kind in template_specs
        ]

        qa_pairs = []
        for filename, text in texts.items():
            if not text:
                continue

            for question, regex, kind in templates:
                for match in regex.finditer(text):
                    answer = match.group(1).strip()
                    # The length filter applies to the answer before clean-up.
                    if not answer or not (3 < len(answer) < 300):
                        continue

                    answer = re.sub(r'\s+', ' ', answer).strip('.,;:')
                    window_start = max(0, match.start() - 100)

                    qa_pairs.append({
                        'filename': filename,
                        'question': question,
                        'answer': answer,
                        'question_type': kind,
                        'context': text[window_start:match.end() + 100],
                        'confidence': min(1.0, len(answer) / 100),
                    })

        print(f"❓ Generated {len(qa_pairs)} QA pairs")
        return qa_pairs

    def encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Label-encode every object-typed column except ``nama_file``.

        Works on a copy of ``df``: missing values in those columns become
        ``'unknown'`` and a ``<column>_encoded`` column holding the encoder
        output is added. Encoders are kept in ``self.label_encoders`` and
        refitted on each call. A column whose encoding fails is logged and
        left without an ``_encoded`` companion.
        """
        result = df.copy()
        object_columns = df.select_dtypes(include=['object']).columns

        for column in object_columns:
            if column == 'nama_file':
                continue  # identifier column, not a feature

            if column not in self.label_encoders:
                self.label_encoders[column] = LabelEncoder()
            encoder = self.label_encoders[column]

            # Missing values get their own category.
            result[column] = result[column].fillna('unknown')

            try:
                result[f'{column}_encoded'] = encoder.fit_transform(result[column])
                print(f"🔢 Encoded {column}: {len(encoder.classes_)} categories")
            except Exception as err:
                logger.error(f"Error encoding {column}: {err}")

        return result

    def combine_all_features(self, text_features: pd.DataFrame, bow_features: pd.DataFrame,
                           tfidf_features: pd.DataFrame, metadata_df: pd.DataFrame = None,
                           content_df: pd.DataFrame = None) -> pd.DataFrame:
        """Left-join every available feature table onto the text features.

        All joins use the ``nama_file`` column as key and keep every row of
        ``text_features``. A table that is ``None`` or empty is skipped.

        Args:
            text_features: Base table; one row per document.
            bow_features: Bag-of-words columns, or ``None``/empty to skip.
            tfidf_features: TF-IDF columns, or ``None``/empty to skip.
            metadata_df: Optional metadata table, joined in full.
            content_df: Optional content table; only ``kelengkapan_konten_persen``,
                ``panjang_teks`` and ``jumlah_kata`` are taken from it.

        Returns:
            A new DataFrame holding the joined columns in the order
            text, metadata, content, BoW, TF-IDF.
        """
        def has_rows(frame) -> bool:
            # One guard for all inputs, so a None BoW/TF-IDF table is skipped
            # exactly like a None metadata/content table.
            return frame is not None and not frame.empty

        # Start with the text features
        combined_df = text_features.copy()

        # Join the metadata if available
        if has_rows(metadata_df):
            combined_df = pd.merge(combined_df, metadata_df, on='nama_file', how='left')
            print(f"📊 Joined dengan metadata: {combined_df.shape}")

        # Join the selected content columns if available
        if has_rows(content_df):
            wanted_cols = ('nama_file', 'kelengkapan_konten_persen', 'panjang_teks', 'jumlah_kata')
            available_cols = [col for col in wanted_cols if col in content_df.columns]
            if len(available_cols) > 1:  # More than just nama_file
                combined_df = pd.merge(combined_df, content_df[available_cols], on='nama_file', how='left')
                print(f"📝 Joined dengan konten: {combined_df.shape}")

        # Join the BoW features if available
        if has_rows(bow_features):
            combined_df = pd.merge(combined_df, bow_features, on='nama_file', how='left')
            print(f"🎯 Joined dengan BoW: {combined_df.shape}")

        # Join the TF-IDF features if available
        if has_rows(tfidf_features):
            combined_df = pd.merge(combined_df, tfidf_features, on='nama_file', how='left')
            print(f"📈 Joined dengan TF-IDF: {combined_df.shape}")

        return combined_df

    def create_target_variables(self, combined_df: pd.DataFrame) -> pd.DataFrame:
        """Derive supervised-learning target columns from the combined features.

        Each target is only produced when all of its source columns exist:

        * ``is_pidana`` / ``is_korupsi`` / ``is_perdata`` from ``jenis_perkara``
        * ``complexity_score`` (1-4) and ``length_category`` from ``word_count``
        * ``is_complete_doc`` from ``has_decision``, ``has_consideration`` and
          ``has_parties``
        * ``has_proper_format`` from ``has_case_number`` and ``has_header``
        * ``case_severity`` from ``legal_terms_count``

        ``legal_complexity``, ``quality_score``, ``is_high_quality`` and
        ``document_type`` are always produced; source columns that are absent
        simply contribute nothing.

        Args:
            combined_df: Combined feature table; it is left untouched.

        Returns:
            A copy of ``combined_df`` with the target columns appended.
        """
        df_with_targets = combined_df.copy()

        def mentions(keyword):
            # 1 when the (case-insensitive) value contains the keyword, else 0.
            return lambda value: 1 if value and keyword in str(value).lower() else 0

        # 1. CLASSIFICATION TARGETS
        # Case type classification
        if 'jenis_perkara' in df_with_targets.columns:
            for target, keyword in (('is_pidana', 'pidana'),
                                    ('is_korupsi', 'korupsi'),
                                    ('is_perdata', 'perdata')):
                df_with_targets[target] = df_with_targets['jenis_perkara'].apply(mentions(keyword))

        # 2. REGRESSION TARGETS
        # Case complexity based on the word count
        if 'word_count' in df_with_targets.columns:
            word_counts = df_with_targets['word_count']

            # Document complexity score (1-4). include_lowest puts a count of 0
            # into the first bin, and missing counts are treated as 0, so the
            # integer cast never meets an unbinned (NaN) value.
            df_with_targets['complexity_score'] = pd.cut(
                word_counts.fillna(0),
                bins=[0, 1000, 5000, 10000, float('inf')],
                labels=[1, 2, 3, 4],
                include_lowest=True
            ).astype(int)

            # Document length category (a count of 0 is 'short')
            df_with_targets['length_category'] = pd.cut(
                word_counts,
                bins=[0, 2000, 7500, 15000, float('inf')],
                labels=['short', 'medium', 'long', 'very_long'],
                include_lowest=True
            )

        # Legal complexity: weighted sum of whichever reference counts exist
        legal_complexity = 0
        if 'article_references' in df_with_targets.columns:
            legal_complexity += df_with_targets['article_references'].fillna(0) * 0.3
        if 'law_references' in df_with_targets.columns:
            legal_complexity += df_with_targets['law_references'].fillna(0) * 0.4
        if 'legal_terms_count' in df_with_targets.columns:
            legal_complexity += (df_with_targets['legal_terms_count'].fillna(0) / 10) * 0.3

        df_with_targets['legal_complexity'] = legal_complexity

        # 3. BINARY TARGETS
        # Document completeness: every listed flag must exist and equal 1
        completeness_flags = ['has_decision', 'has_consideration', 'has_parties']
        if all(flag in df_with_targets.columns for flag in completeness_flags):
            df_with_targets['is_complete_doc'] = (
                df_with_targets[completeness_flags].eq(1).all(axis=1).astype(int)
            )

        # Document quality indicators
        format_flags = ['has_case_number', 'has_header']
        if all(flag in df_with_targets.columns for flag in format_flags):
            df_with_targets['has_proper_format'] = (
                df_with_targets[format_flags].eq(1).all(axis=1).astype(int)
            )

        # High-quality legal document. Start from a Series (not a scalar) so
        # the threshold comparison below works even if no flag column exists.
        quality_score = pd.Series(0, index=df_with_targets.index)
        quality_features = ['has_consideration', 'has_decision', 'has_parties', 'has_case_number']
        for feature in quality_features:
            if feature in df_with_targets.columns:
                quality_score = quality_score + df_with_targets[feature].fillna(0)

        df_with_targets['quality_score'] = quality_score
        df_with_targets['is_high_quality'] = (quality_score >= 3).astype(int)

        # 4. MULTI-CLASS TARGETS
        # Document type classification. Rules are applied from lowest to
        # highest priority so that a higher-priority label overwrites a lower
        # one: has_dakwaan > is_perdata > has_decision > 'other'.
        doc_type = np.full(len(df_with_targets), 'other', dtype=object)
        for flag, label in (('has_decision', 'judgment'),
                            ('is_perdata', 'civil_case'),
                            ('has_dakwaan', 'criminal_case')):
            if flag in df_with_targets.columns:
                mask = (df_with_targets[flag] == 1).fillna(False).astype(bool).to_numpy()
                doc_type[mask] = label

        df_with_targets['document_type'] = doc_type

        # Case severity (based on the legal term count)
        severity_bins = [0, 5, 15, 30, float('inf')]
        severity_labels = ['low', 'medium', 'high', 'critical']

        if 'legal_terms_count' in df_with_targets.columns:
            # Missing counts are filled with 0, so 0 must fall into 'low'.
            df_with_targets['case_severity'] = pd.cut(
                df_with_targets['legal_terms_count'].fillna(0),
                bins=severity_bins,
                labels=severity_labels,
                include_lowest=True
            )

        print(f"🎯 Created comprehensive target variables")
        return df_with_targets

    def create_derived_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Append ratio, interaction and composite-score columns to a copy of ``df``.

        A ratio or interaction column is only added when both of its source
        columns exist. The two composite scores are always added: each is the
        sum of the listed flag columns that exist (missing values count as 0)
        divided by the full number of listed flags.

        Args:
            df: Feature table; it is left untouched.

        Returns:
            A copy of ``df`` with the derived columns appended.
        """
        derived = df.copy()

        def available(*names):
            return all(name in derived.columns for name in names)

        def flag_share(flags):
            # Sum of the present flags over the count of *listed* flags.
            present = (derived[flag].fillna(0) for flag in flags if flag in derived.columns)
            return sum(present, 0) / len(flags)

        # 1. RATIO FEATURES: numerator / (denominator + 1)
        smoothed_ratios = (
            ('vocabulary_richness', 'unique_words', 'word_count'),
            ('legal_term_density', 'legal_terms_count', 'word_count'),
            ('sentences_per_paragraph', 'sentence_count', 'paragraph_count'),
        )
        for target, numerator, denominator in smoothed_ratios:
            if available(numerator, denominator):
                derived[target] = derived[numerator] / (derived[denominator] + 1)

        # 2. INTERACTION FEATURES
        if available('article_references', 'law_references'):
            articles = derived['article_references'].fillna(0)
            laws = derived['law_references'].fillna(0)
            derived['total_legal_refs'] = articles + laws

        if available('defendant_mentions', 'prosecutor_mentions'):
            prosecutor = derived['prosecutor_mentions'].fillna(0)
            defendant = derived['defendant_mentions'].fillna(0)
            derived['prosecution_intensity'] = prosecutor / (defendant + 1)

        # 3. COMPOSITE SCORES
        # Document formality score
        derived['formality_score'] = flag_share(
            ['has_header', 'has_case_number', 'has_parties', 'has_consideration', 'has_decision']
        )

        # Content richness score
        derived['content_richness'] = flag_share(
            ['has_dakwaan', 'has_bukti', 'has_saksi', 'has_tuntutan']
        )

        print(f"🔄 Created derived features")
        return derived

    def perform_feature_selection(self, df: pd.DataFrame, n_components: int = 50) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Project the numeric columns of ``df`` onto principal components.

        Numeric columns are filled with 0 where missing, standardized with
        ``self.scaler`` and reduced with PCA. The number of components is
        capped by the number of numeric columns and by ``len(df) - 1``.

        Args:
            df: Feature table; must contain ``nama_file`` for the PCA output.
            n_components: Upper bound on the number of components to keep.

        Returns:
            ``(df, pca_df)`` where ``df`` is the unchanged input and ``pca_df``
            holds ``pca_1`` ... ``pca_k`` plus ``nama_file``. ``pca_df`` is an
            empty DataFrame when fewer than two numeric columns exist or when
            the scaling/PCA step raises (the error is logged).
        """
        numeric_cols = [
            name for name in df.select_dtypes(include=[np.number]).columns
            if name != 'nama_file'
        ]

        # PCA needs at least two numeric inputs.
        if len(numeric_cols) < 2:
            print("⚠️ Tidak cukup fitur numerik untuk PCA")
            return df, pd.DataFrame()

        features = df[numeric_cols].fillna(0)

        try:
            standardized = self.scaler.fit_transform(features)

            # Cap the requested size by what the data can support.
            n_components = min(n_components, len(numeric_cols), len(df) - 1)
            pca = PCA(n_components=n_components)
            projected = pca.fit_transform(standardized)

            component_names = [f'pca_{k}' for k in range(1, n_components + 1)]
            pca_df = pd.DataFrame(projected, columns=component_names, index=df.index)
            pca_df['nama_file'] = df['nama_file'].values

            # Share of the variance retained by the kept components.
            total_explained = np.cumsum(pca.explained_variance_ratio_)[-1]
            print(f"📊 PCA completed: {n_components} components explain {total_explained:.2%} variance")
        except Exception as e:
            logger.error(f"Error in PCA: {e}")
            return df, pd.DataFrame()

        return df, pca_df

    def generate_feature_summary(self, combined_df: pd.DataFrame, qa_pairs: List[Dict], pca_df: pd.DataFrame = None) -> str:
        """Build a plain-text report describing the engineered feature set.

        The report lists dataset size, feature counts per category (by column
        name prefix/suffix), missing-value metrics, dtype counts, QA pair
        counts per ``question_type`` and the full column list.

        Args:
            combined_df: Final feature table. May be empty; the quality ratios
                are then reported as zero instead of dividing by zero.
            qa_pairs: QA dictionaries; each must carry a ``question_type`` key.
            pca_df: Optional PCA table; its column count minus one (the
                ``nama_file`` column) is reported as the component count.

        Returns:
            The report as a single newline-joined string.
        """
        columns = list(combined_df.columns)
        n_rows, n_cols = combined_df.shape

        summary = [
            "=" * 70,
            "FEATURE ENGINEERING SUMMARY REPORT",
            "=" * 70,
            f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
        ]

        # Dataset Overview
        summary.append("📊 DATASET OVERVIEW")
        summary.append("-" * 30)
        summary.append(f"Total Documents: {n_rows}")
        summary.append(f"Total Features: {n_cols}")
        summary.append(f"QA Pairs Generated: {len(qa_pairs)}")
        if pca_df is not None and not pca_df.empty:
            summary.append(f"PCA Components: {pca_df.shape[1] - 1}")  # -1 for nama_file
        summary.append("")

        # Feature Categories
        text_feature_names = {
            'char_count', 'word_count', 'sentence_count', 'paragraph_count',
            'unique_words', 'lexical_diversity', 'avg_word_length'
        }
        text_features = sum(1 for col in columns if col in text_feature_names)
        bow_features = sum(1 for col in columns if col.startswith('bow_'))
        tfidf_features = sum(1 for col in columns if col.startswith('tfidf_'))
        legal_features = sum(1 for col in columns if 'legal' in col)
        target_features = sum(
            1 for col in columns if col.startswith('is_') or col.endswith('_score')
        )

        summary.append("🔧 FEATURE CATEGORIES")
        summary.append("-" * 30)
        summary.append(f"Text Features: {text_features}")
        summary.append(f"Bag-of-Words Features: {bow_features}")
        summary.append(f"TF-IDF Features: {tfidf_features}")
        summary.append(f"Legal Features: {legal_features}")
        summary.append(f"Target Variables: {target_features}")
        summary.append("")

        # Data Quality. Guard both ratios: an empty table has zero cells/rows.
        total_cells = n_rows * n_cols
        missing_ratio = combined_df.isnull().sum().sum() / total_cells if total_cells else 0.0
        complete_cases = combined_df.dropna().shape[0]  # computed once, reported twice
        complete_ratio = complete_cases / n_rows if n_rows else 0.0

        summary.append("📈 DATA QUALITY METRICS")
        summary.append("-" * 30)
        summary.append(f"Missing Value Ratio: {missing_ratio:.2%}")
        summary.append(f"Complete Cases: {complete_cases} ({complete_ratio:.1%})")
        summary.append("")

        # Feature Types
        summary.append("📋 FEATURE TYPES")
        summary.append("-" * 30)
        for dtype, column_count in combined_df.dtypes.value_counts().items():
            summary.append(f"{dtype}: {column_count} columns")
        summary.append("")

        # QA Statistics
        if qa_pairs:
            summary.append("❓ QA PAIRS STATISTICS")
            summary.append("-" * 30)
            qa_types = Counter(qa['question_type'] for qa in qa_pairs)
            for qa_type, count in qa_types.most_common():
                summary.append(f"{qa_type}: {count} pairs")
            summary.append("")

        # Column List
        summary.append("📝 ALL COLUMNS")
        summary.append("-" * 30)
        summary.extend(f"{i:3d}. {col}" for i, col in enumerate(columns, 1))

        summary.append("")
        summary.append("=" * 70)
        summary.append("END OF REPORT")
        summary.append("=" * 70)

        return "\n".join(summary)

    def save_features_to_files(self, combined_df: pd.DataFrame, qa_pairs: List[Dict], pca_df: pd.DataFrame = None):
        """Write the feature artifacts to both the local and Google Drive output folders.

        Every artifact is written twice, under ``self.output_dir`` and under
        ``self.gdrive_output_dir``, with one shared timestamp in the file name:
        the feature CSV, the PCA CSV (only if ``pca_df`` is non-empty), the QA
        pairs JSON (only if ``qa_pairs`` is non-empty) and the text summary.

        Args:
            combined_df: Final feature table.
            qa_pairs: QA dictionaries to dump as JSON.
            pca_df: Optional PCA table.

        Returns:
            A dict mapping ``features_*``, ``qa_*``, ``summary_*`` and ``pca_*``
            (each with ``_local`` / ``_gdrive``) to the written path, or to
            ``None`` for an artifact that was not written.
        """
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        def destinations(filename):
            # The same file name under both output roots: (local, gdrive).
            return (os.path.join(self.output_dir, filename),
                    os.path.join(self.gdrive_output_dir, filename))

        def write_csv(frame, targets):
            for target in targets:
                frame.to_csv(target, index=False, encoding='utf-8')

        # 1. SAVE MAIN FEATURES DATASET
        features_local, features_gdrive = destinations(f"features_{stamp}.csv")
        write_csv(combined_df, (features_local, features_gdrive))
        print(f"📄 Features CSV Lokal: {features_local}")
        print(f"💾 Features CSV GDrive: {features_gdrive}")

        # 2. SAVE PCA FEATURES if available
        pca_local = pca_gdrive = None
        if pca_df is not None and not pca_df.empty:
            pca_local, pca_gdrive = destinations(f"features_pca_{stamp}.csv")
            write_csv(pca_df, (pca_local, pca_gdrive))
            print(f"📊 PCA CSV Lokal: {pca_local}")
            print(f"💾 PCA CSV GDrive: {pca_gdrive}")

        # 3. SAVE QA PAIRS
        qa_local = qa_gdrive = None
        if qa_pairs:
            qa_local, qa_gdrive = destinations(f"qa_pairs_{stamp}.json")
            for target in (qa_local, qa_gdrive):
                with open(target, 'w', encoding='utf-8') as handle:
                    json.dump(qa_pairs, handle, ensure_ascii=False, indent=2)
            print(f"❓ QA JSON Lokal: {qa_local}")
            print(f"💾 QA JSON GDrive: {qa_gdrive}")

        # 4. SAVE FEATURE SUMMARY
        summary_local, summary_gdrive = destinations(f"feature_summary_{stamp}.txt")
        report = self.generate_feature_summary(combined_df, qa_pairs, pca_df)
        for target in (summary_local, summary_gdrive):
            with open(target, 'w', encoding='utf-8') as handle:
                handle.write(report)

        print(f"📋 Summary Lokal: {summary_local}")
        print(f"💾 Summary GDrive: {summary_gdrive}")

        return {
            'features_local': features_local,
            'features_gdrive': features_gdrive,
            'qa_local': qa_local,
            'qa_gdrive': qa_gdrive,
            'summary_local': summary_local,
            'summary_gdrive': summary_gdrive,
            'pca_local': pca_local,
            'pca_gdrive': pca_gdrive
        }

    def process_all_features(self) -> Tuple[pd.DataFrame, List[Dict]]:
        """Run the whole feature-engineering pipeline and persist its outputs.

        Steps, in order: load texts/metadata/content, compute text features,
        build BoW and TF-IDF features, create QA pairs, join everything,
        add derived features, encode categorical columns, create target
        variables, run PCA, and save all artifacts.

        Returns:
            ``(combined_df, qa_pairs)`` on success. ``(empty DataFrame, [])``
            when no text files are found or when any step raises; in the
            latter case the error is logged and the traceback printed.
        """
        import traceback

        divider = "=" * 60
        print("🔧 iii. FEATURE ENGINEERING - COMPREHENSIVE")
        print(divider)
        print("Generating features: Text stats, BoW, TF-IDF, QA pairs, PCA")
        print(divider)

        try:
            # 1. LOAD DATA
            print("\n📂 Loading data...")
            documents = self.load_text_files()
            if not documents:
                print("❌ Tidak ada file teks yang ditemukan!")
                return pd.DataFrame(), []

            metadata = self.load_metadata()
            content = self.load_content()

            # 2. CALCULATE TEXT FEATURES
            print("\n📊 Calculating comprehensive text features...")
            text_stats = self.calculate_text_features(documents)
            print(f"✅ Text features: {text_stats.shape}")

            # 3. CREATE BAG-OF-WORDS AND TF-IDF
            print("\n🎯 Creating BoW and TF-IDF features...")
            bow, tfidf = self.create_bag_of_words_features(documents)

            # 4. CREATE QA PAIRS
            print("\n❓ Creating comprehensive QA pairs...")
            qa_pairs = self.create_qa_pairs(documents, metadata)

            # 5. COMBINE ALL FEATURES
            print("\n🔗 Combining all features...")
            dataset = self.combine_all_features(text_stats, bow, tfidf, metadata, content)

            # 6. CREATE DERIVED FEATURES
            print("\n🔄 Creating derived features...")
            dataset = self.create_derived_features(dataset)

            # 7. ENCODE CATEGORICAL FEATURES
            print("\n🔢 Encoding categorical features...")
            dataset = self.encode_categorical_features(dataset)

            # 8. CREATE TARGET VARIABLES
            print("\n🎯 Creating target variables...")
            dataset = self.create_target_variables(dataset)

            # 9. FEATURE SELECTION AND PCA
            print("\n📊 Performing feature selection and PCA...")
            dataset, pca_table = self.perform_feature_selection(dataset)

            # 10. SAVE ALL FEATURES
            print("\n💾 Saving features to files...")
            self.save_features_to_files(dataset, qa_pairs, pca_table)

            print("\n" + divider)
            print("✅ FEATURE ENGINEERING COMPLETED SUCCESSFULLY!")
            print(f"📊 Final dataset: {dataset.shape}")
            print(f"❓ QA pairs: {len(qa_pairs)}")
            print("💾 Files saved to 2 locations: lokal & Google Drive")
            print(divider)

            return dataset, qa_pairs

        except Exception as exc:
            # Top-level boundary of the pipeline: report and return empties.
            logger.error(f"Error in feature engineering: {exc}")
            print(f"💥 ERROR: {str(exc)}")
            traceback.print_exc()
            return pd.DataFrame(), []

def main():
    """Run the feature-engineering pipeline and print a short outcome report.

    Builds a ``FeatureEngineer``, runs ``process_all_features`` and prints
    the resulting dataset shape, the QA pair count and the output folders.
    Any exception is caught, printed with its traceback, and not re-raised.
    """
    import os
    import traceback

    print("🚀 MULAI COMPREHENSIVE FEATURE ENGINEERING")
    print("=" * 70)

    try:
        engineer = FeatureEngineer()
        features_df, qa_pairs = engineer.process_all_features()

        if features_df.empty:
            print("\n❌ Tidak ada features yang berhasil dibuat.")
            return

        print("\n🎉 FEATURE ENGINEERING BERHASIL!")
        print(f"Dataset shape: {features_df.shape}")
        print(f"QA pairs: {len(qa_pairs)}")
        print("File output tersimpan di:")
        # Report the folders the engineer actually wrote to rather than
        # hard-coded paths; joining with '' appends the trailing separator.
        print(f"  - Lokal: {os.path.join(engineer.output_dir, '')}")
        print(f"  - GDrive: {os.path.join(engineer.gdrive_output_dir, '')}")

    except Exception as e:
        print(f"\n💥 ERROR: {str(e)}")
        traceback.print_exc()

# Entry point: run the pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

🚀 MULAI COMPREHENSIVE FEATURE ENGINEERING
🔧 FEATURE ENGINEERING
Input teks: /data/raw atau /content/drive/MyDrive/korupsi/CLEANED
Input metadata: /data/processed
Output Lokal: /data/processed
Output GDrive: /content/drive/MyDrive/korupsi/data/processed
🔧 iii. FEATURE ENGINEERING - COMPREHENSIVE
Generating features: Text stats, BoW, TF-IDF, QA pairs, PCA

📂 Loading data...
📂 Loading teks dari: /content/drive/MyDrive/korupsi/CLEANED
📁 Loaded 114 file teks
📊 Loaded metadata: 114 records dari metadata_20250613_172125.csv
📝 Loaded konten kunci: 114 records dari konten_kunci_20250613_172934.csv

📊 Calculating comprehensive text features...
✅ Text features: (114, 42)

🎯 Creating BoW and TF-IDF features...
🎯 Bag-of-Words: 500 features
📈 TF-IDF: 1000 features

❓ Creating comprehensive QA pairs...
❓ Generated 29011 QA pairs

🔗 Combining all features...
📊 Joined dengan metadata: (114, 61)
📝 Joined dengan konten: (114, 64)
🎯 Joined dengan BoW: (114, 564)
📈 Joined dengan TF-IDF: (114, 1564)

🔄 Crea