In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import pickle
import re
import numpy as np
import pandas as pd
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Counter
from collections import Counter, defaultdict
import logging

# Machine Learning Libraries
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# BERT and Transformers
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    print("⚠️ Transformers not available. Install with: pip install transformers torch")
    TRANSFORMERS_AVAILABLE = False

import warnings
warnings.filterwarnings('ignore')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Ekstrak Solusi**

In [3]:
# i. EKSTRAK SOLUSI
# 1. Dari kasus top-k, ambil amar putusan atau ringkasan dakwaan
# 2. Simpan di struktur: {case_id: solusi_text}
# ============================================================================

class RetrievalSystem:
    """
    TF-IDF retrieval system that supports solution reuse.

    On construction it looks in ``<base_dir>/data/vectors`` for pickled
    TF-IDF bundles (dict-like objects with the keys ``vectorizer``,
    ``vectors`` and ``case_ids``) and keeps the one with the largest
    vocabulary. Queries are ranked against the stored case vectors by
    cosine similarity.
    """

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir
        self.vectors_dir = os.path.join(base_dir, "data", "vectors")

        # Components (stay None / empty when nothing could be loaded)
        self.tfidf_vectorizer = None
        self.case_vectors_tfidf = None
        self.case_ids = []

        self.load_components()

    def load_components(self) -> bool:
        """
        Load the TF-IDF bundle with the largest vocabulary.

        Returns:
            True when a usable bundle was loaded, False otherwise
            (missing directory, no candidate file, or only broken files).
        """
        print("🔍 Loading retrieval components...")

        # A missing directory is reported through the return value instead
        # of crashing the constructor with FileNotFoundError.
        if not os.path.isdir(self.vectors_dir):
            print(f"❌ Vectors directory not found: {self.vectors_dir}")
            return False

        best_data = None
        best_vocab_size = 0

        # Sorted so that ties between equally large bundles are deterministic.
        for vf in sorted(os.listdir(self.vectors_dir)):
            if not vf.endswith('.pkl') or 'tfidf' not in vf.lower():
                continue

            try:
                # NOTE: pickle.load executes arbitrary code; only point
                # base_dir at vector files produced by this project.
                with open(os.path.join(self.vectors_dir, vf), 'rb') as f:
                    data = pickle.load(f)

                # Skip bundles that lack any of the keys used below.
                if not all(key in data for key in ('vectorizer', 'vectors', 'case_ids')):
                    continue

                vocab_size = len(data['vectorizer'].get_feature_names_out())
            except Exception as exc:
                # Best effort: one broken file must not block the others,
                # but say why it was skipped.
                print(f"⚠️ Skipping {vf}: {exc}")
                continue

            if vocab_size > best_vocab_size:
                best_vocab_size = vocab_size
                best_data = data  # keep it, so the file is not read twice

        if best_data is None:
            return False

        self.tfidf_vectorizer = best_data['vectorizer']
        self.case_vectors_tfidf = best_data['vectors']
        self.case_ids = best_data['case_ids']

        # Densify sparse matrices once so every query works on an ndarray.
        if hasattr(self.case_vectors_tfidf, 'toarray'):
            self.case_vectors_tfidf = self.case_vectors_tfidf.toarray()

        print(f"✅ Loaded: {len(self.case_ids)} cases, {best_vocab_size:,} vocab")
        return True

    def _rank(self, query: str, k: int) -> List[Tuple[str, float]]:
        """Return up to ``k`` (case_id, similarity) pairs, best match first."""
        if self.tfidf_vectorizer is None or self.case_vectors_tfidf is None:
            return []
        if k <= 0:
            return []

        # Same preprocessing for both public entry points:
        # lowercase, trim, collapse whitespace runs.
        processed_query = re.sub(r'\s+', ' ', query.lower().strip())

        query_vector = self.tfidf_vectorizer.transform([processed_query])

        # No vocabulary term in the query -> nothing meaningful to rank.
        if query_vector.nnz == 0:
            return []

        query_dense = query_vector.toarray() if hasattr(query_vector, 'toarray') else query_vector
        similarities = cosine_similarity(query_dense, self.case_vectors_tfidf).flatten()

        top_indices = np.argsort(similarities)[::-1][:k]
        return [(self.case_ids[idx], float(similarities[idx])) for idx in top_indices]

    def retrieve(self, query: str, k: int = 5) -> List[str]:
        """Retrieve the ids of the top-k most similar cases."""
        return [case_id for case_id, _ in self._rank(query, k)]

    def retrieve_with_scores(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
        """Retrieve the top-k most similar cases as (case_id, score) pairs."""
        return self._rank(query, k)

class SolutionExtractor:
    """
    i. Extract the "solution" (verdict text) of retrieved cases.

    A solution is taken from the cleaned decision text when a verdict
    passage can be found there; otherwise it is assembled from the
    per-case metadata in ``cases.csv``.
    """

    # Metadata columns copied from cases.csv into ``case_metadata``.
    _METADATA_FIELDS = (
        'putusan',
        'jenis_perkara',
        'vonis',
        'hukuman_pidana',
        'hukuman_denda',
        'dakwaan',
        'pasal_yang_dilanggar',
    )

    # (label, metadata key) pairs, in the order they appear in a
    # metadata-based solution string.
    _SOLUTION_PARTS = (
        ('Jenis', 'jenis_perkara'),
        ('Putusan', 'putusan'),
        ('Vonis', 'vonis'),
        ('Hukuman', 'hukuman_pidana'),
        ('Denda', 'hukuman_denda'),
        ('Pasal', 'pasal_yang_dilanggar'),
    )

    # Verdict passages: from the keyword up to the next blank line / end.
    _PUTUSAN_PATTERNS = (
        r'(amar\s+putusan[:\s].*?)(?:\n\n|\Z)',
        r'(mengadili[:\s].*?)(?:\n\n|\Z)',
        r'(memutuskan[:\s].*?)(?:\n\n|\Z)',
        r'(menjatuhkan\s+pidana[:\s].*?)(?:\n\n|\Z)',
        r'(menghukum\s+terdakwa[:\s].*?)(?:\n\n|\Z)',
    )

    # Fallback: the first sentence that mentions a sentencing keyword.
    _HUKUMAN_PATTERNS = (
        r'([^.]*(?:hukuman|pidana|denda|penjara|kurungan)[^.]*\.)',
        r'([^.]*(?:vonis|putusan|memutuskan)[^.]*\.)',
        r'([^.]*(?:terbukti|tidak terbukti)[^.]*\.)',
    )

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir
        self.raw_dir = os.path.join(base_dir, "CLEANED")
        self.processed_dir = os.path.join(base_dir, "data", "processed")

        # Storage for extracted solutions
        self.case_solutions = {}  # {case_id: solution_text}
        self.case_metadata = {}

        print("📄 i. EKSTRAK SOLUSI")

    @staticmethod
    def _clean_cell(value):
        """Map missing CSV cells (None / NaN) to '' and strip strings.

        pandas represents empty cells as NaN, which is truthy; without this
        step an empty column would later be rendered as e.g. "Putusan: nan".
        """
        if value is None:
            return ''
        if isinstance(value, str):
            return value.strip()
        if pd.isna(value):
            return ''
        return value

    def load_case_metadata(self) -> bool:
        """Load per-case metadata from ``cases.csv``.

        Returns:
            True when the file was read, False when it is missing or
            could not be parsed.
        """
        cases_file = os.path.join(self.processed_dir, "cases.csv")

        if not os.path.exists(cases_file):
            print("❌ cases.csv not found")
            return False

        try:
            df = pd.read_csv(cases_file, encoding='utf-8')

            for _, row in df.iterrows():
                filename = self._clean_cell(row['nama_file'])
                if filename == '':
                    # A row without a file name cannot be keyed; skip it
                    # instead of aborting the whole load.
                    continue

                filename = str(filename)
                case_id = filename.replace('.txt', '') if filename.endswith('.txt') else filename

                self.case_metadata[case_id] = {
                    field: self._clean_cell(row.get(field, ''))
                    for field in self._METADATA_FIELDS
                }

            print(f"✅ Loaded metadata for {len(self.case_metadata)} cases")
            return True

        except Exception as e:
            print(f"❌ Error loading metadata: {e}")
            return False

    def extract_solution_from_text(self, text: str) -> str:
        """Extract the verdict passage (or a short summary) from a decision.

        The search runs on the lowercased text, so the returned snippet is
        lowercase (except for the last-resort fallback, which slices the
        original text).

        Returns:
            Up to 500 characters of a verdict passage, else up to 300
            characters of the first sentencing sentence, else up to 200
            characters from the middle third of the document; '' for
            empty input.
        """
        if not text:
            return ""

        text_lower = text.lower()

        # Look for an explicit verdict passage first.
        for pattern in self._PUTUSAN_PATTERNS:
            match = re.search(pattern, text_lower, re.DOTALL | re.IGNORECASE)
            if match:
                solution = re.sub(r'\s+', ' ', match.group(1).strip())
                return solution[:500]  # limit length

        # Fallback: first sentence containing a sentencing keyword.
        for pattern in self._HUKUMAN_PATTERNS:
            match = re.search(pattern, text_lower)
            if match:
                return match.group(1).strip()[:300]

        # Last resort: take text from the middle third of the document.
        lines = text.split('\n')
        middle_start = len(lines) // 3
        middle_end = 2 * len(lines) // 3
        middle_text = ' '.join(lines[middle_start:middle_end])

        return middle_text[:200].strip()

    def create_solution_from_metadata(self, case_id: str) -> str:
        """Build a solution string from the loaded metadata of ``case_id``."""
        if case_id not in self.case_metadata:
            return "Solusi tidak tersedia"

        meta = self.case_metadata[case_id]

        # Empty / missing fields are left out of the summary.
        solution_parts = [
            f"{label}: {meta[key]}"
            for label, key in self._SOLUTION_PARTS
            if meta.get(key)
        ]

        if solution_parts:
            return "; ".join(solution_parts)
        return "Informasi putusan tidak lengkap"

    def extract_all_solutions(self, case_ids: List[str]) -> Dict[str, str]:
        """
        Extract a solution for every id in ``case_ids``.

        1. From the top-k cases, take the verdict or a charge summary.
        2. Store the result as ``{case_id: solution_text}``.

        The mapping is also kept on ``self.case_solutions``.
        """
        print(f"\n📄 Extracting solutions for {len(case_ids)} cases...")

        # Load metadata once; later calls reuse what is already in memory.
        if not self.case_metadata:
            self.load_case_metadata()

        solutions = {}

        for case_id in case_ids:
            try:
                # Strategy 1: extract from the cleaned decision text.
                raw_file = os.path.join(self.raw_dir, f"{case_id}.txt")

                if os.path.exists(raw_file):
                    with open(raw_file, 'r', encoding='utf-8') as f:
                        text = f.read()

                    solution = self.extract_solution_from_text(text)

                    if len(solution.strip()) > 20:  # long enough to be useful
                        solutions[case_id] = solution
                        continue

                # Strategy 2: fall back to metadata.
                solutions[case_id] = self.create_solution_from_metadata(case_id)

            except Exception as e:
                print(f"⚠️ Error extracting solution for {case_id}: {e}")
                solutions[case_id] = "Solusi tidak dapat diekstrak"

        print(f"✅ Extracted {len(solutions)} solutions")

        # Show a few sample solutions.
        for case_id in list(solutions)[:3]:
            solution = solutions[case_id]
            short_solution = solution[:100] + "..." if len(solution) > 100 else solution
            print(f"   {case_id}: {short_solution}")

        self.case_solutions = solutions
        return solutions


# **Algoritma Prediksi dan Implementasi Fungsi**

In [4]:
# ============================================================================
# ii. ALGORITMA PREDIKSI
# 1. Majority vote: pilih solusi yang paling banyak muncul
# 2. Weighted similarity: bobot = skor similarity
# ============================================================================

class SolutionPredictor:
    """
    ii. Prediction algorithms & iii. the ``predict_outcome`` function.

    Retrieves the most similar past cases for a query and picks one of
    their solutions, either by majority vote or weighted by similarity.
    """

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir

        # Components
        self.retrieval_system = RetrievalSystem(base_dir)
        self.solution_extractor = SolutionExtractor(base_dir)

        # Cache all solutions up front so predictions are simple lookups.
        self._initialize_solution_cache()

        print("🔮 ii. ALGORITMA PREDIKSI")

    def _initialize_solution_cache(self):
        """Extract and cache the solution of every retrievable case."""
        print("💾 Initializing solution cache...")

        if self.retrieval_system.case_ids:
            all_solutions = self.solution_extractor.extract_all_solutions(
                self.retrieval_system.case_ids
            )
            print(f"✅ Cached {len(all_solutions)} solutions")

    @staticmethod
    def _vote_key(solution: str) -> str:
        """Reduce a solution text to the keyword signature used for voting."""
        sol_lower = solution.lower()
        key_words = []

        # Test the negated phrase first: 'terbukti' is a substring of
        # 'tidak terbukti', and an unrelated 'tidak' elsewhere in the text
        # must not cancel an affirmative 'terbukti'.
        if 'tidak terbukti' in sol_lower:
            key_words.append('tidak_terbukti')
        elif 'terbukti' in sol_lower:
            key_words.append('terbukti')

        if 'penjara' in sol_lower or 'pidana' in sol_lower:
            key_words.append('penjara')
        if 'denda' in sol_lower:
            key_words.append('denda')
        if 'bebas' in sol_lower:
            key_words.append('bebas')

        return '_'.join(key_words) if key_words else 'unknown'

    def majority_vote(self, solutions: List[str]) -> str:
        """
        1. Majority vote: return the solution whose keyword signature
        occurs most often (the first solution carrying that signature).
        """
        if not solutions:
            return "Tidak ada solusi tersedia"

        vote_keys = [self._vote_key(sol) for sol in solutions]

        # most_common keeps first-seen order among equally frequent keys.
        winning_key = Counter(vote_keys).most_common(1)[0][0]

        # Map the winning signature back to an original solution text.
        return solutions[vote_keys.index(winning_key)]

    def weighted_similarity(self, solutions: List[str], scores: List[float]) -> str:
        """
        2. Weighted similarity: each solution votes for its simplified
        category with a weight proportional to its similarity score.

        Falls back to ``majority_vote`` when all scores sum to zero.
        """
        if not solutions or not scores:
            return "Tidak ada solusi tersedia"

        # Normalise the scores into weights.
        total_score = sum(scores)
        if total_score == 0:
            return self.majority_vote(solutions)

        weights = [score / total_score for score in scores]

        # Accumulate weight per category; remember the first example of each.
        solution_weights = defaultdict(float)
        solution_examples = {}

        for sol, weight in zip(solutions, weights):
            sol_key = self._simplify_solution(sol)
            solution_weights[sol_key] += weight
            if sol_key not in solution_examples:
                solution_examples[sol_key] = sol

        # Pick the category with the highest accumulated weight.
        best_solution_key = max(solution_weights, key=solution_weights.get)
        return solution_examples[best_solution_key]

    def _simplify_solution(self, solution: str) -> str:
        """Map a solution text to a coarse verdict category for grouping."""
        sol_lower = solution.lower()

        if 'tidak terbukti' in sol_lower or 'bebas' in sol_lower:
            return 'tidak_terbukti'
        if 'terbukti' not in sol_lower:
            return 'unknown'

        has_prison = 'penjara' in sol_lower
        has_fine = 'denda' in sol_lower
        if has_prison and has_fine:
            return 'terbukti_penjara_denda'
        if has_prison:
            return 'terbukti_penjara'
        if has_fine:
            return 'terbukti_denda'
        return 'terbukti'

    def predict_outcome(self, query: str, k: int = 5, method: str = 'weighted') -> Dict:
        """
        Predict the outcome of a new case from its ``k`` nearest past cases.

        Args:
            query: free-text description of the new case.
            k: number of similar cases to retrieve.
            method: 'majority' for majority vote; any other value uses
                similarity-weighted voting.

        Returns:
            Dict with the keys ``predicted_solution``, ``top_cases``,
            ``case_solutions``, ``similarity_scores``, ``method``,
            ``confidence`` (mean similarity of the cases used) and
            ``query``.
        """
        # Always retrieve with scores: the confidence is the mean similarity,
        # so it must be real for the majority method as well (placeholder
        # weights of 1.0 would make its confidence a constant 1.0).
        ranked = self.retrieval_system.retrieve_with_scores(query, k=k)
        top_k = [case_id for case_id, _ in ranked]

        result = {
            'predicted_solution': "Tidak dapat menemukan kasus serupa",
            'top_cases': [],
            'case_solutions': {},
            'similarity_scores': [],
            'method': method,
            'confidence': 0.0,
            'query': query
        }

        if not top_k:
            return result

        # Keep only the retrieved cases for which a solution is cached.
        cached = self.solution_extractor.case_solutions
        usable = [(case_id, score) for case_id, score in ranked if case_id in cached]

        if not usable:
            result['predicted_solution'] = "Solusi tidak tersedia untuk kasus serupa"
            result['top_cases'] = top_k
            return result

        valid_cases = [case_id for case_id, _ in usable]
        valid_scores = [score for _, score in usable]
        solutions = [cached[case_id] for case_id in valid_cases]

        # Apply the selected prediction algorithm.
        if method == 'majority':
            predicted_solution = self.majority_vote(solutions)
        else:  # weighted
            predicted_solution = self.weighted_similarity(solutions, valid_scores)

        result.update({
            'predicted_solution': predicted_solution,
            'top_cases': valid_cases,
            'case_solutions': dict(zip(valid_cases, solutions)),
            'similarity_scores': valid_scores,
            'confidence': sum(valid_scores) / len(valid_scores),
        })
        return result

# **Demo Manual**

In [5]:
# ============================================================================
# iv. DEMO MANUAL
# 1. Siapkan 5 contoh kasus baru → jalankan predict_outcome() →
#    bandingkan dengan putusan sebenarnya
# ============================================================================

class ManualDemo:
    """
    iv. Manual demo: run ``predict_outcome`` on hand-written example
    cases and compare each prediction with the expected outcome.
    """

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir
        self.results_dir = os.path.join(base_dir, "data", "results")

        os.makedirs(self.results_dir, exist_ok=True)

        self.predictor = SolutionPredictor(base_dir)

        print("🧪 iv. DEMO MANUAL")

    def create_demo_cases(self) -> List[Dict]:
        """
        1. Prepare five new example cases.

        Each case carries ``query_id``, ``query``, ``expected_outcome``
        and ``description``.
        """
        demo_cases = [
            {
                "query_id": "DEMO_001",
                "query": "aksi terorisme dengan peledakan bom di terminal bus kota",
                "expected_outcome": "Terbukti bersalah, pidana penjara seumur hidup",
                "description": "Peledakan bom di fasilitas umum"
            },
            {
                "query_id": "DEMO_002",
                "query": "penangkapan anggota kelompok teroris yang merencanakan serangan terhadap aparat",
                "expected_outcome": "Terbukti bersalah, pidana penjara",
                "description": "Perencanaan serangan terhadap aparat"
            },
            {
                "query_id": "DEMO_003",
                "query": "radikalisasi mahasiswa melalui media sosial oleh jaringan teror",
                "expected_outcome": "Terbukti bersalah, pidana penjara dan rehabilitasi",
                "description": "Radikalisasi generasi muda"
            },
            {
                "query_id": "DEMO_004",
                "query": "pengiriman dana dari luar negeri untuk mendanai aksi teror di indonesia",
                "expected_outcome": "Terbukti bersalah, pidana penjara dan perampasan dana",
                "description": "Pendanaan terorisme lintas negara"
            },
            {
                "query_id": "DEMO_005",
                "query": "penggerebekan tempat persembunyian kelompok teroris oleh densus 88",
                "expected_outcome": "Penangkapan berhasil, proses hukum berjalan",
                "description": "Penggerebekan dan penangkapan"
            }
        ]

        print(f"📝 Created {len(demo_cases)} demo cases")
        for case in demo_cases:
            print(f"   {case['query_id']}: {case['description']}")

        return demo_cases

    def run_demo(self) -> List[Dict]:
        """
        2. Run ``predict_outcome`` for every demo case with both methods.

        Returns:
            One result dict per (case, method) pair. A failed prediction
            is recorded with an "Error: ..." solution and zero scores
            rather than aborting the demo.
        """
        demo_cases = self.create_demo_cases()
        results = []

        print(f"\n🔮 Running prediction demo...")

        for case in demo_cases:
            query_id = case['query_id']
            query = case['query']
            expected = case['expected_outcome']

            print(f"\n--- {query_id} ---")
            print(f"Query: {query}")
            print(f"Expected: {expected}")

            # Test both methods
            for method in ['weighted', 'majority']:
                try:
                    prediction_result = self.predictor.predict_outcome(
                        query=query,
                        k=5,
                        method=method
                    )

                    predicted_solution = prediction_result['predicted_solution']
                    confidence = prediction_result['confidence']
                    top_cases = prediction_result['top_cases']

                    print(f"\n{method.upper()} Method:")
                    print(f"  Predicted: {predicted_solution[:100]}...")
                    print(f"  Confidence: {confidence:.3f}")
                    print(f"  Top cases: {top_cases[:3]}")

                    # Compare with expected
                    comparison = self.compare_prediction(predicted_solution, expected)
                    print(f"  Match score: {comparison['score']:.2f}")

                    results.append({
                        'query_id': query_id,
                        'query': query,
                        'method': method,
                        'predicted_solution': predicted_solution,
                        'expected_outcome': expected,
                        'confidence': confidence,
                        'top_cases': top_cases,
                        'match_score': comparison['score'],
                        'match_explanation': comparison['explanation']
                    })

                except Exception as e:
                    print(f"  ❌ Error: {e}")

                    results.append({
                        'query_id': query_id,
                        'query': query,
                        'method': method,
                        'predicted_solution': f"Error: {str(e)}",
                        'expected_outcome': expected,
                        'confidence': 0.0,
                        'top_cases': [],
                        'match_score': 0.0,
                        'match_explanation': 'Prediction failed'
                    })

        return results

    def compare_prediction(self, predicted: str, expected: str) -> Dict:
        """
        3. Compare a predicted solution with the expected outcome.

        Scores shared key terms (weights below) plus a 0.2 bonus when the
        overall verdict direction agrees; the total is capped at 1.0.

        Returns:
            ``{'score': float, 'explanation': str}``.
        """
        pred_lower = predicted.lower()
        exp_lower = expected.lower()

        negated = 'tidak terbukti'

        # 'terbukti' is a substring of 'tidak terbukti'. Remove the negated
        # phrase before testing for the affirmative term, otherwise an
        # acquittal would be scored as matching a conviction.
        pred_affirmative = pred_lower.replace(negated, ' ')
        exp_affirmative = exp_lower.replace(negated, ' ')

        def in_pred(term: str) -> bool:
            return term in (pred_affirmative if term == 'terbukti' else pred_lower)

        def in_exp(term: str) -> bool:
            return term in (exp_affirmative if term == 'terbukti' else exp_lower)

        score = 0.0
        explanations = []

        # Key terms and their weights
        key_terms = [
            ('terbukti', 0.3),
            ('tidak terbukti', 0.3),
            ('penjara', 0.2),
            ('pidana', 0.2),
            ('denda', 0.15),
            ('bebas', 0.2)
        ]

        for term, weight in key_terms:
            found_pred = in_pred(term)
            found_exp = in_exp(term)
            if found_pred and found_exp:
                score += weight
                explanations.append(f"✅ Found '{term}'")
            elif found_exp:
                explanations.append(f"❌ Missing '{term}'")
            elif found_pred:
                explanations.append(f"⚠️ Extra '{term}'")

        # Bonus when both sides point in the same verdict direction.
        both_convict = in_pred('terbukti') and in_exp('terbukti')
        both_acquit = negated in pred_lower and (negated in exp_lower or 'bebas' in exp_lower)
        if both_convict or both_acquit:
            score += 0.2
            explanations.append("✅ Overall direction matches")

        score = min(score, 1.0)  # Cap at 1.0

        return {
            'score': score,
            'explanation': '; '.join(explanations)
        }

In [6]:
# ============================================================================
# v. OUTPUT
# 1. Script 04_predict.py / notebook
# 2. File /data/results/predictions.csv berisi:
#    query_id predicted_solution top_5_case_ids
# ============================================================================

class OutputGenerator:
    """
    v. Output: persist prediction results (CSV, JSON) and build a
    plain-text summary report.
    """

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir
        self.results_dir = os.path.join(base_dir, "data", "results")

        os.makedirs(self.results_dir, exist_ok=True)

        print("📊 v. OUTPUT")

    @staticmethod
    def _mean(values) -> float:
        """Arithmetic mean that is 0.0 for an empty list (np.mean gives nan)."""
        return float(np.mean(values)) if values else 0.0

    def save_predictions_csv(self, results: List[Dict]) -> str:
        """
        Write the results to ``predictions_<timestamp>.csv`` in the results
        directory (columns include query_id, predicted_solution and
        top_5_case_ids).

        Returns:
            Path of the written CSV file.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_filename = f"predictions_{timestamp}.csv"
        csv_path = os.path.join(self.results_dir, csv_filename)

        csv_data = [
            {
                'query_id': result['query_id'],
                'query': result['query'],
                'method': result['method'],
                'predicted_solution': result['predicted_solution'],
                'expected_outcome': result['expected_outcome'],
                # str() so that non-string case ids do not break the join
                'top_5_case_ids': ';'.join(str(case_id) for case_id in result['top_cases'][:5]),
                'confidence': result['confidence'],
                'match_score': result['match_score'],
                'match_explanation': result['match_explanation']
            }
            for result in results
        ]

        df = pd.DataFrame(csv_data)
        df.to_csv(csv_path, index=False, encoding='utf-8')

        print(f"📄 Predictions saved: {csv_filename}")
        print(f"   Records: {len(csv_data)}")
        print(f"   Columns: {list(df.columns)}")

        return csv_path

    def save_detailed_results(self, results: List[Dict]) -> str:
        """Write the full results plus run metadata as JSON; return the path."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        json_filename = f"detailed_predictions_{timestamp}.json"
        json_path = os.path.join(self.results_dir, json_filename)

        detailed_data = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'total_predictions': len(results),
                # sorted: stable output regardless of set iteration order
                'methods_used': sorted({r['method'] for r in results}),
                'version': 'solution_reuse_v1'
            },
            'results': results
        }

        with open(json_path, 'w', encoding='utf-8') as f:
            # default=str: anything json cannot encode (e.g. numpy scalars)
            # is written as its string form.
            json.dump(detailed_data, f, ensure_ascii=False, indent=2, default=str)

        print(f"📄 Detailed results saved: {json_filename}")

        return json_path

    def generate_summary_report(self, results: List[Dict]) -> str:
        """
        Build the plain-text summary report.

        Works for an empty result list and for runs in which no prediction
        has a positive confidence (averages are then reported as 0.000).
        """
        report = []
        report.append("=" * 70)
        report.append("🔮 TAHAP 4 - SOLUTION REUSE - SUMMARY REPORT")
        report.append("=" * 70)
        report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("")

        # Overall statistics
        total_predictions = len(results)
        # Failed predictions are recorded by the demo as "Error: <message>".
        successful_predictions = len(
            [r for r in results if not str(r['predicted_solution']).startswith('Error:')]
        )
        success_rate = successful_predictions / total_predictions * 100 if total_predictions else 0.0
        # Confidence is averaged over predictions that actually found cases.
        avg_confidence = self._mean([r['confidence'] for r in results if r['confidence'] > 0])
        avg_match_score = self._mean([r['match_score'] for r in results])

        report.append("📊 OVERALL STATISTICS:")
        report.append(f"  Total predictions: {total_predictions}")
        report.append(f"  Successful predictions: {successful_predictions} ({success_rate:.1f}%)")
        report.append(f"  Average confidence: {avg_confidence:.3f}")
        report.append(f"  Average match score: {avg_match_score:.3f}")
        report.append("")

        # Method comparison
        report.append("🔧 METHOD COMPARISON:")

        for method in sorted({r['method'] for r in results}):
            method_results = [r for r in results if r['method'] == method]
            method_confidence = self._mean(
                [r['confidence'] for r in method_results if r['confidence'] > 0]
            )
            method_match = self._mean([r['match_score'] for r in method_results])

            report.append(f"  {method.upper()}:")
            report.append(f"    Avg Confidence: {method_confidence:.3f}")
            report.append(f"    Avg Match Score: {method_match:.3f}")

        report.append("")

        # Best predictions
        report.append("🏆 BEST PREDICTIONS:")
        best_results = sorted([r for r in results if r['match_score'] > 0],
                              key=lambda x: x['match_score'], reverse=True)[:3]

        for i, result in enumerate(best_results, 1):
            report.append(f"  {i}. {result['query_id']} ({result['method']})")
            report.append(f"     Query: {result['query'][:50]}...")
            report.append(f"     Match Score: {result['match_score']:.3f}")
            report.append(f"     Confidence: {result['confidence']:.3f}")

        report.append("")

        # Performance assessment
        if avg_match_score >= 0.7:
            report.append("🎉 EXCELLENT: System performing very well!")
        elif avg_match_score >= 0.5:
            report.append("✅ GOOD: System performing adequately")
        elif avg_match_score >= 0.3:
            report.append("⚠️ FAIR: System needs improvement")
        else:
            report.append("❌ POOR: System requires significant work")

        report.append("=" * 70)

        return "\n".join(report)

class SolutionReuseSystem:
    """
    Main class for stage 4 - solution reuse.

    Wires the demo runner and the output generator together and exposes
    the predictor / extractor that the demo uses.
    """

    def __init__(self, base_dir="/content/drive/MyDrive/terorisme"):
        self.base_dir = base_dir

        print("🔮 TAHAP 4 - SOLUTION REUSE")
        print("=" * 60)
        print("Tujuan: Gunakan putusan lama sebagai dasar pencarian untuk kasus baru")
        print("=" * 60)

        # Build the pipeline once. ManualDemo already constructs a
        # SolutionPredictor (which loads the vectors and extracts every
        # solution); creating a second predictor here would repeat all of
        # that work, so the demo's instances are shared instead.
        self.demo = ManualDemo(base_dir)
        self.predictor = self.demo.predictor
        self.solution_extractor = self.predictor.solution_extractor
        self.output_generator = OutputGenerator(base_dir)

    def run_complete_solution_reuse(self) -> bool:
        """
        Run the demo, write the CSV/JSON outputs and print the report.

        Returns:
            True on success; False when the demo produced no results or
            any step raised (the traceback is printed).
        """
        try:
            print("\n🔮 Running complete solution reuse process...")

            # iv. Manual demo
            print("\n" + "="*50)
            print("🧪 iv. DEMO MANUAL")
            print("="*50)

            demo_results = self.demo.run_demo()

            if not demo_results:
                print("❌ Demo failed - no results generated")
                return False

            # v. Output
            print("\n" + "="*50)
            print("📊 v. OUTPUT")
            print("="*50)

            # Save CSV
            csv_path = self.output_generator.save_predictions_csv(demo_results)

            # Save detailed JSON
            json_path = self.output_generator.save_detailed_results(demo_results)

            # Generate and show report
            report = self.output_generator.generate_summary_report(demo_results)
            print(f"\n{report}")

            # Final success message
            print("\n" + "=" * 60)
            print("✅ TAHAP 4 - SOLUTION REUSE COMPLETED!")
            print("📁 Output files created:")
            print(f"   - {os.path.basename(csv_path)}")
            print(f"   - {os.path.basename(json_path)}")
            print("🔮 Solution reuse system ready for production!")
            print("=" * 60)

            return True

        except Exception as e:
            # Top-level boundary: report the failure instead of propagating.
            print(f"❌ Error in solution reuse process: {e}")
            import traceback
            traceback.print_exc()
            return False

def test_individual_components(base_dir="/content/drive/MyDrive/terorisme"):
    """
    Smoke-test the retrieval, extraction and prediction components one by one.

    Each component is exercised inside its own ``try`` block, so a failure in
    one is printed and the remaining checks still run. The function only
    prints its findings; it returns nothing.

    Args:
        base_dir: Project root passed to every component constructor. The
            default is the same Drive folder used by the other helpers in
            this file, so calling the function without arguments behaves as
            before.
    """
    print("🧪 TESTING INDIVIDUAL COMPONENTS")
    print("=" * 50)

    # Test 1: Retrieval System
    print("\n1. Testing Retrieval System...")
    try:
        retrieval = RetrievalSystem(base_dir)
        # An empty case_ids list means there is nothing to retrieve from.
        if retrieval.case_ids:
            test_query = "penangkapan anggota kelompok teroris"
            results = retrieval.retrieve(test_query, k=3)
            print(f"✅ Retrieval working: {len(results)} results for '{test_query}'")
        else:
            print("❌ Retrieval system has no cases")
    except Exception as e:
        print(f"❌ Retrieval test failed: {e}")

    # Test 2: Solution Extractor
    print("\n2. Testing Solution Extractor...")
    try:
        extractor = SolutionExtractor(base_dir)
        if extractor.load_case_metadata():
            # Only the first three cases are used to keep the check quick.
            sample_cases = list(extractor.case_metadata.keys())[:3]
            solutions = extractor.extract_all_solutions(sample_cases)
            print(f"✅ Extraction working: {len(solutions)} solutions extracted")
        else:
            print("❌ Cannot load case metadata")
    except Exception as e:
        print(f"❌ Extraction test failed: {e}")

    # Test 3: Predictor
    print("\n3. Testing Predictor...")
    try:
        predictor = SolutionPredictor(base_dir)
        test_query = "penyuapan pejabat"
        result = predictor.predict_outcome(test_query, k=3)
        # Show only the first 50 characters of the predicted solution.
        print(f"✅ Prediction working: '{result['predicted_solution'][:50]}...'")
    except Exception as e:
        print(f"❌ Prediction test failed: {e}")

    print("\n" + "=" * 50)

def main():
    """
    Entry point for Stage 4 (solution reuse).

    Builds a ``SolutionReuseSystem``, runs the complete flow, and prints
    either a checklist of the finished steps or a failure hint. Any exception
    is caught and reported with its traceback; it is not re-raised.

    For step-by-step debugging, ``test_individual_components()`` can be run
    before calling this function.
    """
    print("🚀 MULAI TAHAP 4 - SOLUTION REUSE")
    print("=" * 70)

    try:
        # Failure path first, so the success checklist reads straight through.
        if not SolutionReuseSystem().run_complete_solution_reuse():
            print("\n❌ Tahap 4 gagal diselesaikan")
            print("🔧 Jalankan test_individual_components() untuk debugging")
            return

        checklist = (
            "\n🎉 TAHAP 4 BERHASIL!",
            "✨ Yang telah diselesaikan:",
            "  ✅ i. Ekstrak Solusi dari kasus top-k",
            "  ✅ ii. Algoritma Prediksi (majority vote & weighted similarity)",
            "  ✅ iii. Implementasi Fungsi predict_outcome()",
            "  ✅ iv. Demo Manual dengan 5 contoh kasus",
            "  ✅ v. Output CSV dan JSON hasil prediksi",
            "🔮 Solution reuse system siap digunakan!",
        )
        for line in checklist:
            print(line)

    except Exception as err:
        print(f"\n💥 ERROR: {err!s}")
        import traceback
        traceback.print_exc()

# Script entry point: run the Stage 4 flow when this file is executed
# directly, and do nothing when it is imported as a module.
if __name__ == "__main__":
    main()

# ============================================================================
# ADDITIONAL UTILITY FUNCTIONS
# ============================================================================

def quick_predict(query: str, base_dir="/content/drive/MyDrive/terorisme") -> str:
    """
    Return the predicted solution text for a single query.

    A new ``SolutionPredictor`` is created on every call and queried with
    ``k=5`` and the ``'weighted'`` method. Failures are never raised: the
    function returns the string ``"Error: <message>"`` instead.

    Usage:
    result = quick_predict("penangkapan anggota kelompok teroris")
    print(result)
    """
    try:
        outcome = SolutionPredictor(base_dir).predict_outcome(
            query, k=5, method='weighted'
        )
        # Kept inside the try so a missing key is also reported as "Error: ...".
        return outcome['predicted_solution']
    except Exception as err:
        return f"Error: {err!s}"

def batch_predict(queries: List[str], base_dir="/content/drive/MyDrive/terorisme") -> List[Dict]:
    """
    Predict outcomes for several queries with one shared predictor.

    Every query is run with ``k=5`` and the ``'weighted'`` method, and each
    result is tagged with a ``query_id`` of the form ``BATCH_001``,
    ``BATCH_002``, ... in input order. When a query fails, a placeholder
    entry is stored for it (error text as ``predicted_solution``, confidence
    ``0.0``, empty ``top_cases``) and processing continues with the next one.

    Usage:
    queries = [ "aksi terorisme di tempat umum",
    "penangkapan anggota kelompok teroris",
    "pendanaan jaringan terorisme dari luar negeri"]
    results = batch_predict(queries)
    """
    predictor = SolutionPredictor(base_dir)
    collected = []

    for position, query in enumerate(queries, start=1):
        query_id = f"BATCH_{position:03d}"
        try:
            entry = predictor.predict_outcome(query, k=5, method='weighted')
            entry['query_id'] = query_id
        except Exception as err:
            # Keep one entry per query so the output stays aligned with the input.
            entry = {
                'query_id': query_id,
                'query': query,
                'predicted_solution': f"Error: {err!s}",
                'confidence': 0.0,
                'top_cases': [],
            }
        collected.append(entry)

    return collected

def interactive_demo(base_dir="/content/drive/MyDrive/terorisme"):
    """
    Interactive prompt for trying out predictions by hand.

    Queries are read from standard input until the user types ``quit``,
    ``exit`` or ``q`` (case-insensitive), or until input ends (end-of-file or
    an interrupt at the prompt). Blank lines are ignored. Every query is run
    with both the ``'weighted'`` and the ``'majority'`` method using ``k=5``,
    and the prediction, confidence and first three top cases are printed. An
    error while predicting is printed and the prompt is shown again.

    Args:
        base_dir: Project root passed to ``SolutionPredictor``.
    """
    print("🔮 INTERACTIVE SOLUTION REUSE DEMO")
    print("=" * 50)
    print("Masukkan query kasus hukum (atau 'quit' untuk keluar)")

    predictor = SolutionPredictor(base_dir)

    while True:
        try:
            query = input("\n🔍 Query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user interrupted the prompt: treat it
            # as a quit request instead of crashing with a traceback.
            print()
            break

        if query.lower() in ['quit', 'exit', 'q']:
            break

        if not query:
            continue

        try:
            print(f"🔮 Predicting for: '{query}'")

            # Run both methods so their answers can be compared side by side.
            for method in ['weighted', 'majority']:
                result = predictor.predict_outcome(query, k=5, method=method)

                print(f"\n{method.upper()} METHOD:")
                print(f"Prediction: {result['predicted_solution']}")
                print(f"Confidence: {result['confidence']:.3f}")
                print(f"Top cases: {result['top_cases'][:3]}")

        except Exception as e:
            print(f"❌ Error: {e}")

    print("👋 Demo selesai!")

🚀 MULAI TAHAP 4 - SOLUTION REUSE
🔮 TAHAP 4 - SOLUTION REUSE
Tujuan: Gunakan putusan lama sebagai dasar pencarian untuk kasus baru
📄 i. EKSTRAK SOLUSI
🔍 Loading retrieval components...
✅ Loaded: 46 cases, 3,440 vocab
📄 i. EKSTRAK SOLUSI
💾 Initializing solution cache...

📄 Extracting solutions for 46 cases...
✅ Loaded metadata for 46 cases
✅ Extracted 46 solutions
   case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_631_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_ANDI_JEFRI_ARDIN__S_H_Terdakwa_DIAN_YUDI_SAPUTRA_alias_ABU_HANIF_Bin_WAHYU_ILAHI__Alm: mengadili perkara pidana denganacara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan seb...
   case_2023_TK1_Putusan_PN_JAKARTA_TIMUR_Nomor_629_Pid_Sus_2023_PN_JKT_TIM_Tanggal_14_Desember_2023__Penuntut_Umum_HERRY_WIYANTO__SH__M_HumTerdakwa_TAJUDIN_Als_PAK_HAJI_TAJUDIN_Als_PAK_TEJE_Als_PAKWA_URA: mengadili perkara pidana denganacara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan seb...
   case_2023_TK1_P