In [1]:
import re, json, pandas as pd, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Setup & load data
# NOTE(review): BASE is an absolute, machine-specific Windows path; it has to
# be adjusted by hand when the notebook is run on another machine.
BASE = Path(r"D:/SEMESTER 6/PROJECT CBR")
DATA = BASE.joinpath("data")
PROC = DATA.joinpath("processed")   # input: processed case table
EVAL = DATA.joinpath("eval")        # input/output: evaluation queries
RES = DATA.joinpath("results")      # output: prediction files
RES.mkdir(parents=True, exist_ok=True)

# Load the case base and normalise the two free-text columns used below:
# missing values become "" and every value is coerced to str.
df = pd.read_csv(PROC.joinpath("cases.csv")).assign(
    amar=lambda frame: frame["amar"].fillna("").astype(str),
    ringkasan_fakta=lambda frame: frame["ringkasan_fakta"].fillna("").astype(str),
)

In [4]:
# TF-IDF vectorizer for the Reuse stage.
# - tokens: runs of at least two ASCII letters (digits/punctuation are dropped)
# - features: unigrams and bigrams
# - min_df=1 / max_df=1.0: no document-frequency pruning of the vocabulary
vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",
    ngram_range=(1, 2),
    min_df=1,
    max_df=1.0,
)

# Fit on the fact summaries; row i of the matrix corresponds to row i of df.
tfidf_matrix = vectorizer.fit_transform(df["ringkasan_fakta"])

In [5]:
# Lookup table {case_id: amar (verdict text)}.
# A dict keeps only the LAST value for a repeated key, so rows that share a
# case_id (including placeholder ids) collapse into one entry and every
# retrieved case with that id resolves to the same amar. The mapping itself is
# unchanged; the check below just makes that situation visible.
case_solutions = dict(zip(df["case_id"], df["amar"]))

_n_repeated_ids = int(df["case_id"].duplicated().sum())
if _n_repeated_ids:
    print(
        f"WARNING: {_n_repeated_ids} row(s) reuse an earlier case_id; "
        "case_solutions keeps only the last 'amar' for each repeated id."
    )


In [6]:
# Retrieval function (TF-IDF cosine)
def retrieve(query: str, k=5):
    """Return the k cases most similar to ``query``.

    The query is projected with the fitted ``vectorizer`` and compared against
    every row of ``tfidf_matrix`` using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query.
    k : int, default 5
        Number of cases to keep from the top of the ranking.

    Returns
    -------
    tuple
        ``(case_ids, sim_scores)``: a list with the ``case_id`` value of each
        selected row of ``df`` and an array with the matching similarity
        scores, both ordered from most to least similar.
    """
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf_matrix).ravel()

    # Ascending argsort reversed -> row positions in descending similarity.
    ranking = similarity.argsort()[::-1]
    best_rows = ranking[:k]

    return df["case_id"].iloc[best_rows].tolist(), similarity[best_rows]


In [7]:
# Predict outcome – majority vote
def predict_majority(query: str, k=5):
    """Predict the outcome for ``query`` by majority vote over retrieved cases.

    Each retrieved case id that is present in ``case_solutions`` casts one
    vote for its stored solution; ids that are not present are ignored.

    Returns
    -------
    tuple
        ``(pred, case_ids)``: the solution with the most votes (``""`` when no
        vote was cast) and the list of retrieved case ids.
    """
    case_ids, _ = retrieve(query, k)

    votes = Counter()
    for cid in case_ids:
        if cid in case_solutions:
            votes[case_solutions[cid]] += 1

    if not votes:
        return "", case_ids

    winner, _ = votes.most_common(1)[0]
    return winner, case_ids

In [8]:
# Predict outcome – similarity-weighted vote
def predict_weighted(query: str, k=5):
    """Predict the outcome for ``query`` by a similarity-weighted vote.

    Every retrieved case adds its similarity score to the total of its stored
    solution; an id that is missing from ``case_solutions`` contributes to the
    empty-string solution.

    Returns
    -------
    tuple
        ``(pred, case_ids)``: the solution with the largest accumulated score
        (``""`` when nothing was retrieved) and the list of retrieved case ids.
    """
    case_ids, scores = retrieve(query, k)

    totals = {}
    for cid, sim in zip(case_ids, scores):
        label = case_solutions.get(cid, "")
        totals[label] = totals.get(label, 0) + sim

    if not totals:
        return "", case_ids
    return max(totals, key=totals.get), case_ids


In [13]:
# Manual evaluation – 5 new queries
# If queries.json does not exist yet, create it with 5 example queries;
# otherwise load the queries stored in the file.
queries_path = EVAL / "queries.json"
if not queries_path.exists():
    queries = [
        {"query_id": "Q1", "query": "Apa yang terjadi dalam kasus pencurian motor di Sumenep?"},
        {"query_id": "Q2", "query": "Bagaimana putusan hakim pada kasus pencurian sapi?"},
        {"query_id": "Q3", "query": "Apa hasil sidang kasus perceraian di PA Maros?"},
        {"query_id": "Q4", "query": "Bagaimana amar putusan kasus pencurian laptop?"},
        {"query_id": "Q5", "query": "Apa keputusan hakim dalam kasus penggugat perceraian?"}
    ]
    # The eval folder may not exist yet (e.g. on a fresh project tree); without
    # this, open(..., "w") below fails with FileNotFoundError.
    queries_path.parent.mkdir(parents=True, exist_ok=True)
    with open(queries_path, "w", encoding="utf-8") as f:
        json.dump(queries, f, ensure_ascii=False, indent=2)
else:
    queries = json.loads(queries_path.read_text(encoding="utf-8"))

# Collect one output row per (query, prediction method).
rows = []
for q in queries:
    qid, qtext = q["query_id"], q["query"]

    pred_majority, cases_maj = predict_majority(qtext)
    pred_weighted, cases_weight = predict_weighted(qtext)

    for method, predicted, top_cases in (
        ("majority", pred_majority, cases_maj),
        ("weighted", pred_weighted, cases_weight),
    ):
        rows.append({
            "query_id": qid,
            "query": qtext,
            "method": method,
            "predicted_solution": predicted,
            "top_5_case_ids": top_cases
        })


In [14]:
# Save to CSV
# Build the results table from the collected rows and write it (without the
# index column) into the results folder, then echo the path and a preview.
predictions_csv = RES / "predictions.csv"
df_out = pd.DataFrame(rows)
df_out.to_csv(predictions_csv, index=False)

print("✅ Disimpan ke:", predictions_csv)
print(df_out.head(10))

✅ Disimpan ke: D:\SEMESTER 6\PROJECT CBR\data\results\predictions.csv
  query_id                                              query    method  \
0       Q1  Apa yang terjadi dalam kasus pencurian motor d...  majority   
1       Q1  Apa yang terjadi dalam kasus pencurian motor d...  weighted   
2       Q2  Bagaimana putusan hakim pada kasus pencurian s...  majority   
3       Q2  Bagaimana putusan hakim pada kasus pencurian s...  weighted   
4       Q3     Apa hasil sidang kasus perceraian di PA Maros?  majority   
5       Q3     Apa hasil sidang kasus perceraian di PA Maros?  weighted   
6       Q4     Bagaimana amar putusan kasus pencurian laptop?  majority   
7       Q4     Bagaimana amar putusan kasus pencurian laptop?  weighted   
8       Q5  Apa keputusan hakim dalam kasus penggugat perc...  majority   
9       Q5  Apa keputusan hakim dalam kasus penggugat perc...  weighted   

  predicted_solution   top_5_case_ids  
0          Lain-lain  [-, -, -, -, -]  
1          Lain-lain  [-