In [4]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import pandas as pd
from ast import literal_eval
from src.tag.src.answer_generator import generate_answer

# === Load dan filter dataset ===
DATASET_PATH = os.path.join("data", "Dataset Testing 2.xlsx")
df = pd.read_excel(DATASET_PATH)
valid_df = df[df["is_valid"]]
sampled_df = valid_df.sample(n=min(20, len(valid_df)), random_state=42)

# === Hasil akhir: list of dict yang siap evaluasi ===
samples = []

for i, row in sampled_df.iterrows():
    question = str(row["user_input"])
    ground_truth = str(row["reference"])

    try:
        contexts = literal_eval(row["reference_contexts_2"])
        if not isinstance(contexts, list): continue
    except:
        continue

    # Siapkan input tabular untuk generator
    columns = ["teks"]
    rows = [[ctx] for ctx in contexts]

    # Generate jawaban dari sistemmu
    answer = generate_answer(columns, rows, question, mode="few-shot", llm_mode="api")

    samples.append({
        "user_input": question,
        "retrieved_contexts": contexts,
        "response": answer,
        "reference": ground_truth
    })

# Simpan untuk dipakai ulang (opsional)
import json
with open("generated_samples.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=2, ensure_ascii=False)

print(f"Total samples siap evaluasi: {len(samples)}")


Total samples siap evaluasi: 20


In [10]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting nltk (from rouge-score)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading absl_py-2.3.0-py3-none-any.whl (135 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 1.8 MB/s eta 0:00:01
   -----------------

In [None]:
import sys
import os
import json
import time

# Tambahkan path agar modul src bisa ditemukan
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.tag.evaluation.eval_metrics import evaluate_text_generation
from ragas.dataset_schema import EvaluationDataset
from src.tag.src.text2sqlchain_zero import init_llm
from langchain.embeddings import HuggingFaceEmbeddings

# === Load hasil generate_answer dari file JSON ===
with open("generated_samples.json", "r", encoding="utf-8") as f:
    raw_samples = json.load(f)

# === Bersihkan dan ubah field agar cocok dengan EvaluationDataset ===
samples = []
for s in raw_samples:
    try:
        question = str(s["user_input"])
        answer = str(s["response"])
        ground_truth = str(s["reference"])
        contexts = s["retrieved_contexts"]
        
        if not isinstance(contexts, list) or not all(isinstance(c, str) for c in contexts):
            continue

        samples.append({
            "user_input": question,
            "response": answer,
            "retrieved_contexts": contexts,
            "reference": ground_truth
        })
    except Exception as e:
        print(f"❌ Error pada sample: {e}")
        continue

print(f"✅ Total sample valid untuk evaluasi: {len(samples)}")

# === Inisialisasi LLM dan Embedding ===
llm_model = init_llm(mode="gemini")  # sesuai mode generate_answer
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# === Evaluasi per 3 soal ===
batch_size = 3
for i in range(0, len(samples), batch_size):
    batch_samples = samples[i:i+batch_size]
    dataset = EvaluationDataset.from_list(batch_samples)

    print(f"\n🧪 Mengevaluasi batch {i//batch_size + 1} ({len(batch_samples)} soal)...")
    result = evaluate_text_generation(
        evaluation_dataset=dataset,
        llm_model=llm_model,
        embedding_model=embedding_model,
        experiment_name=f"eval_batch_answer_generation_{i//batch_size + 1}"
    )
    print(result)

    if i + batch_size < len(samples):
        print("⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...\n")
        time.sleep(30)


✅ Total sample valid untuk evaluasi: 20





🧪 Mengevaluasi batch 1 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:14<00:00,  1.71it/s]


{'rouge1_precision(mode=precision)': 0.4993, 'rouge1_recall(mode=recall)': 0.6964, 'rouge1_fmeasure(mode=fmeasure)': 0.4970, 'rougeL_precision(mode=precision)': 0.4454, 'rougeL_recall(mode=recall)': 0.6596, 'rougeL_fmeasure(mode=fmeasure)': 0.4560, 'answer_relevancy': 0.3646, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 2 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:12<00:00,  1.93it/s]


{'rouge1_precision(mode=precision)': 0.4461, 'rouge1_recall(mode=recall)': 0.7084, 'rouge1_fmeasure(mode=fmeasure)': 0.5056, 'rougeL_precision(mode=precision)': 0.3800, 'rougeL_recall(mode=recall)': 0.5942, 'rougeL_fmeasure(mode=fmeasure)': 0.4301, 'answer_relevancy': 0.6904, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 3 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:11<00:00,  2.07it/s]


{'rouge1_precision(mode=precision)': 0.4744, 'rouge1_recall(mode=recall)': 0.3203, 'rouge1_fmeasure(mode=fmeasure)': 0.3692, 'rougeL_precision(mode=precision)': 0.4225, 'rougeL_recall(mode=recall)': 0.2764, 'rougeL_fmeasure(mode=fmeasure)': 0.3238, 'answer_relevancy': 0.1882, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 4 (3 soal)...


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 23
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 23
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_

{'rouge1_precision(mode=precision)': 0.3574, 'rouge1_recall(mode=recall)': 0.8952, 'rouge1_fmeasure(mode=fmeasure)': 0.4911, 'rougeL_precision(mode=precision)': 0.3381, 'rougeL_recall(mode=recall)': 0.8514, 'rougeL_fmeasure(mode=fmeasure)': 0.4651, 'answer_relevancy': 0.3900, 'faithfulness': 0.9630}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 5 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:22<00:00,  1.06it/s]


{'rouge1_precision(mode=precision)': 0.5501, 'rouge1_recall(mode=recall)': 0.9861, 'rouge1_fmeasure(mode=fmeasure)': 0.6740, 'rougeL_precision(mode=precision)': 0.5464, 'rougeL_recall(mode=recall)': 0.9807, 'rougeL_fmeasure(mode=fmeasure)': 0.6697, 'answer_relevancy': 0.3562, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 6 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:19<00:00,  1.25it/s]


{'rouge1_precision(mode=precision)': 0.3928, 'rouge1_recall(mode=recall)': 0.7281, 'rouge1_fmeasure(mode=fmeasure)': 0.4746, 'rougeL_precision(mode=precision)': 0.3895, 'rougeL_recall(mode=recall)': 0.7126, 'rougeL_fmeasure(mode=fmeasure)': 0.4693, 'answer_relevancy': 0.2301, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 7 (2 soal)...


Evaluating: 100%|██████████| 16/16 [00:20<00:00,  1.29s/it]


{'rouge1_precision(mode=precision)': 0.4095, 'rouge1_recall(mode=recall)': 0.8321, 'rouge1_fmeasure(mode=fmeasure)': 0.5402, 'rougeL_precision(mode=precision)': 0.3688, 'rougeL_recall(mode=recall)': 0.7577, 'rougeL_fmeasure(mode=fmeasure)': 0.4883, 'answer_relevancy': 0.2630, 'faithfulness': 1.0000}


In [14]:
for r in all_results:
    print(f"[DEBUG] result.scores[0] = {r.scores[0]}")
    print(f"[DEBUG] type = {type(r.scores[0])}")
    print(f"[DEBUG] keys = {vars(r.scores[0]).keys()}")
    break  # cukup 1x

[DEBUG] result.scores[0] = {'rouge1_precision(mode=precision)': 0.45, 'rouge1_recall(mode=recall)': 0.6923076923076923, 'rouge1_fmeasure(mode=fmeasure)': 0.5454545454545455, 'rougeL_precision(mode=precision)': 0.39, 'rougeL_recall(mode=recall)': 0.6, 'rougeL_fmeasure(mode=fmeasure)': 0.4727272727272727, 'answer_relevancy': 0.28667318828731536, 'faithfulness': 1.0}
[DEBUG] type = <class 'dict'>


TypeError: vars() argument must have __dict__ attribute

In [15]:
all_results = []
all_results.append(result)
# === Rata-rata skor keseluruhan ===
from collections import defaultdict

# Kumpulkan semua skor berdasarkan metr
# ik
combined_scores = defaultdict(list)

for result in all_results:
    for batch_dict in result.scores:  # result.scores adalah list of dict
        for metric_name, score in batch_dict.items():
            combined_scores[metric_name].append(score)

# Tampilkan rata-rata per metrik
print("\n📊 RATA-RATA EVALUASI KESELURUHAN:")
for metric, scores in combined_scores.items():
    avg_score = sum(scores) / len(scores)
    print(f"{metric}: {avg_score:.4f}")


📊 RATA-RATA EVALUASI KESELURUHAN:
rouge1_precision(mode=precision): 0.4095
rouge1_recall(mode=recall): 0.8321
rouge1_fmeasure(mode=fmeasure): 0.5402
rougeL_precision(mode=precision): 0.3688
rougeL_recall(mode=recall): 0.7577
rougeL_fmeasure(mode=fmeasure): 0.4883
answer_relevancy: 0.2630
faithfulness: 1.0000
