In [1]:
# ================================
# 📦 IMPORT SECTION
# ================================
import os, sys, json, time
import pandas as pd
from ast import literal_eval
from typing import List, Dict

from langchain.embeddings import HuggingFaceEmbeddings
from ragas.dataset_schema import EvaluationDataset, EvaluationResult
from IPython.display import display

# Tambahkan path lokal ke modul `src/`
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.tag.src.init_llm import init_llm
from src.tag.src.answer_generator import generate_answer
from src.tag.evaluation.eval_metrics import evaluate_text_generation


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ================================
# 🧪 DEFINE TEST CASES
# ================================
answer_test_cases = [
    {
        "experiment_name": "answer_eval_fewshot_api",
        "llm_mode": "claude",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2"
    },
    {
        "experiment_name": "answer_eval_fewshot_ollama",
        "llm_mode": "ollama",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2"
    }
]


In [3]:
# ================================
# 📥 LOAD DATASET
# ================================
def load_evaluation_dataset(
    path: str = "data/Dataset Testing Archive 2.xlsx",
    n_samples: int = 100,
    random_seed: int = 42
) -> List[Dict]:
    """
    Load dan sampling dataset evaluasi.
    """
    df = pd.read_excel(path)
    valid_df = df[df["is_valid"]]
    sampled_df = valid_df.sample(n=min(n_samples, len(valid_df)), random_state=random_seed)

    dataset = []
    for _, row in sampled_df.iterrows():
        try:
            dataset.append({
                "user_input": str(row["user_input"]),
                "reference_contexts": literal_eval(row["reference_contexts_2"]),
                "reference": str(row["reference"]),
                "response": str(row.get("response", ""))  # kosongkan jika belum ada
            })
        except Exception as e:
            print(f"❌ Error parsing row: {e}")
            continue

    print(f"✅ Jumlah soal yang dimuat: {len(dataset)}")
    return dataset


In [None]:
def run_single_answer_test_case(
    test_case: dict,
    dataset: List[Dict]
) -> Dict:
    """
    Evaluasi Answer Generation: generate jawaban LLM lalu evaluasi dengan metrik RAGAS.
    """
    print(f"\n🚀 Evaluasi Answer Generation: {test_case['experiment_name']}")

    llm_mode = test_case["llm_mode"]
    llm_model = init_llm(mode=llm_mode)
    embedding_model = HuggingFaceEmbeddings(model_name=test_case["embedding_model"])

    samples = []
    for idx, d in enumerate(dataset):
        question = d["user_input"]
        contexts = d["reference_contexts"]
        reference = d["reference"]

        # Siapkan input untuk LLM dalam format tabel
        columns = ["teks"]
        rows = [[ctx] for ctx in contexts]

        # 🔄 Generate jawaban dari LLM
        try:
            answer = generate_answer(columns, rows, question, mode="few-shot", llm_mode=llm_mode)
        except Exception as e:
            print(f"❌ Gagal generate jawaban untuk: {question}\n{e}")
            answer = "jawaban tidak tersedia"

        samples.append({
            "user_input": question,
            "retrieved_contexts": contexts,
            "response": answer,
            "reference": reference
        })
    
            # 💤 Sleep setiap x soal
        if (idx + 1) % 3 == 0 and (idx + 1) < len(dataset):
            print("⏳ Menunggu 15 detik sebelum lanjut ke batch berikutnya...\n")
            time.sleep(15)

    ragas_dataset = EvaluationDataset.from_list(samples)
    
    batch_size = 3
    results = []

    for i in range(0, len(samples), batch_size):
        batch = samples[i:i+batch_size]
        batch_dataset = EvaluationDataset.from_list(batch)

        print(f"🔎 Mengevaluasi batch {i//batch_size + 1} ({len(batch)} soal)")
        result = evaluate_text_generation(
            evaluation_dataset=batch_dataset,
            llm_model=llm_model,
            embedding_model=embedding_model,
            experiment_name=f"answer_eval_batch_{i//batch_size + 1}"
        )
        results.append(result)

        if i + batch_size < len(samples):
            print("🕒 Sleeping 20 detik...\n")
            time.sleep(15)

    # 📊 Tampilkan hasil
    df_result = result.to_pandas()
    print("\n📊 Hasil Evaluasi:")
    display(df_result)

    return {
        "experiment_name": test_case["experiment_name"],
        "evaluation_result": results,
        "samples": samples  # optional: bisa simpan jawaban LLM
    }


In [5]:
from collections import defaultdict

def summarize_scores_from_result_list(results: List[Dict]) -> None:
    """
    Menerima list of result dari run_single_answer_test_case dan
    menampilkan rata-rata skor per metrik dari EvaluationResult.scores.
    """
    combined_scores = defaultdict(list)

    for result_dict in results:
        result = result_dict["evaluation_result"]
        for score_dict in result.scores:  # result.scores = list of dict
            for metric_name, score in score_dict.items():
                combined_scores[metric_name].append(score)

    print("\n📊 RATA-RATA EVALUASI KESELURUHAN:")
    print("-" * 60)
    for metric, scores in combined_scores.items():
        avg_score = sum(scores) / len(scores)
        print(f"{metric:<35}: {avg_score:.4f}")


In [10]:
dataset = load_evaluation_dataset(n_samples=40)

✅ Jumlah soal yang dimuat: 40


**CASE API (GEMINI/CLAUDE)**

In [19]:
result_api = run_single_answer_test_case(answer_test_cases[0], dataset)


🚀 Evaluasi Answer Generation: answer_eval_fewshot_api




⏳ Menunggu 20 detik sebelum lanjut ke batch berikutnya...

⏳ Menunggu 20 detik sebelum lanjut ke batch berikutnya...

⏳ Menunggu 20 detik sebelum lanjut ke batch berikutnya...

⏳ Menunggu 20 detik sebelum lanjut ke batch berikutnya...

🔎 Mengevaluasi batch 1 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:17<00:00,  1.35it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 2 (3 soal)


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 15
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 15
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_

🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 3 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:08<00:00,  2.74it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 4 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:10<00:00,  2.29it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 5 (3 soal)


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 18
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 17
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_

🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 6 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:20<00:00,  1.15it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 7 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:19<00:00,  1.25it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 8 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:12<00:00,  1.91it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 9 (3 soal)


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 26
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 22
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_

🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 10 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:09<00:00,  2.65it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 11 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:05<00:00,  4.24it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 12 (3 soal)


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 33
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 32
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_

🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 13 (3 soal)


Evaluating: 100%|██████████| 24/24 [00:06<00:00,  3.66it/s]


🕒 Sleeping 20 detik...

🔎 Mengevaluasi batch 14 (1 soal)


Evaluating: 100%|██████████| 8/8 [00:03<00:00,  2.34it/s]



📊 Hasil Evaluasi:


Unnamed: 0,user_input,retrieved_contexts,response,reference,rouge1_precision(mode=precision),rouge1_recall(mode=recall),rouge1_fmeasure(mode=fmeasure),rougeL_precision(mode=precision),rougeL_recall(mode=recall),rougeL_fmeasure(mode=fmeasure),answer_relevancy,faithfulness
0,Apa itu Badan Aksesibilitas Telekomunikasi dan...,[(1) Badan Aksesibilitas Telekomunikasi Dan In...,Badan Aksesibilitas Telekomunikasi dan Informa...,Badan Aksesibilitas Telekomunikasi dan Informa...,0.576923,0.737705,0.647482,0.564103,0.721311,0.633094,0.804848,1.0


In [23]:
all_results = [{"evaluation_result": r} for r in result_api["evaluation_result"]]
                   
summarize_scores_from_result_list(all_results)


📊 RATA-RATA EVALUASI KESELURUHAN:
------------------------------------------------------------
rouge1_precision(mode=precision)   : 0.4360
rouge1_recall(mode=recall)         : 0.6619
rouge1_fmeasure(mode=fmeasure)     : 0.4773
rougeL_precision(mode=precision)   : 0.4021
rougeL_recall(mode=recall)         : 0.6173
rougeL_fmeasure(mode=fmeasure)     : 0.4420
answer_relevancy                   : 0.3371
faithfulness                       : 0.8790


# OLd CoDes

In [8]:
samples = []
for item in tqdm(dataset, desc=f"Running {experiment_name}"):
    q = item["user_input"]
    ref = item["reference_contexts"]
    ground_truth = item["reference"]
    
    # Generate answer using the LLM
    answer = generate_answer(columns, rows, question, llm_mode="api")
    
    samples.append({
            "user_input": q,
            "response": answer,
            "retrieved_contexts": ref,
            "reference": ground_truth
        })

eval_dataset = EvaluationDataset(samples)

# === Evaluasi per 3 soal ===
batch_size = 3
for i in range(0, len(samples), batch_size):
    batch_samples = samples[i:i+batch_size]
    dataset = EvaluationDataset.from_list(batch_samples)

    print(f"\n🧪 Mengevaluasi batch {i//batch_size + 1} ({len(batch_samples)} soal)...")
    result = evaluate_text_generation(
        evaluation_dataset=dataset,
        llm_model=llm_model,
        embedding_model=embedding_model,
        experiment_name=f"eval_batch_answer_generation_{i//batch_size + 1}"
    )
    print(result)

    if i + batch_size < len(samples):
        print("⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...\n")
        time.sleep(30)

NameError: name 'experiment_name' is not defined

In [None]:
# === Load dan filter dataset ===
DATASET_PATH = os.path.join("data", "Dataset Testing 2.xlsx")
df = pd.read_excel(DATASET_PATH)
valid_df = df[df["is_valid"]]
sampled_df = valid_df.sample(n=min(20, len(valid_df)), random_state=42)

# === Hasil akhir: list of dict yang siap evaluasi ===
samples = []

for i, row in sampled_df.iterrows():
    question = str(row["user_input"])
    ground_truth = str(row["reference"])

    try:
        contexts = literal_eval(row["reference_contexts_2"])
        if not isinstance(contexts, list): continue
    except:
        continue

    # Siapkan input tabular untuk generator
    columns = ["teks"]
    rows = [[ctx] for ctx in contexts]

    # Generate jawaban dari sistemmu
    answer = generate_answer(columns, rows, question, mode="few-shot", llm_mode="api")

    samples.append({
        "user_input": question,
        "retrieved_contexts": contexts,
        "response": answer,
        "reference": ground_truth
    })

# Simpan untuk dipakai ulang (opsional)
import json
with open("generated_samples.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=2, ensure_ascii=False)

print(f"Total samples siap evaluasi: {len(samples)}")


Total samples siap evaluasi: 20


In [None]:
# === Load hasil generate_answer dari file JSON ===
with open("generated_samples.json", "r", encoding="utf-8") as f:
    raw_samples = json.load(f)

# === Bersihkan dan ubah field agar cocok dengan EvaluationDataset ===
samples = []
for s in raw_samples:
    try:
        question = str(s["user_input"])
        answer = str(s["response"])
        ground_truth = str(s["reference"])
        contexts = s["retrieved_contexts"]
        
        if not isinstance(contexts, list) or not all(isinstance(c, str) for c in contexts):
            continue

        samples.append({
            "user_input": question,
            "response": answer,
            "retrieved_contexts": contexts,
            "reference": ground_truth
        })
    except Exception as e:
        print(f"❌ Error pada sample: {e}")
        continue

print(f"✅ Total sample valid untuk evaluasi: {len(samples)}")

# === Inisialisasi LLM dan Embedding ===
llm_model = init_llm(mode="gemini")  # sesuai mode generate_answer
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# === Evaluasi per 3 soal ===
batch_size = 3
for i in range(0, len(samples), batch_size):
    batch_samples = samples[i:i+batch_size]
    dataset = EvaluationDataset.from_list(batch_samples)

    print(f"\n🧪 Mengevaluasi batch {i//batch_size + 1} ({len(batch_samples)} soal)...")
    result = evaluate_text_generation(
        evaluation_dataset=dataset,
        llm_model=llm_model,
        embedding_model=embedding_model,
        experiment_name=f"eval_batch_answer_generation_{i//batch_size + 1}"
    )
    print(result)

    if i + batch_size < len(samples):
        print("⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...\n")
        time.sleep(30)


✅ Total sample valid untuk evaluasi: 20





🧪 Mengevaluasi batch 1 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:14<00:00,  1.71it/s]


{'rouge1_precision(mode=precision)': 0.4993, 'rouge1_recall(mode=recall)': 0.6964, 'rouge1_fmeasure(mode=fmeasure)': 0.4970, 'rougeL_precision(mode=precision)': 0.4454, 'rougeL_recall(mode=recall)': 0.6596, 'rougeL_fmeasure(mode=fmeasure)': 0.4560, 'answer_relevancy': 0.3646, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 2 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:12<00:00,  1.93it/s]


{'rouge1_precision(mode=precision)': 0.4461, 'rouge1_recall(mode=recall)': 0.7084, 'rouge1_fmeasure(mode=fmeasure)': 0.5056, 'rougeL_precision(mode=precision)': 0.3800, 'rougeL_recall(mode=recall)': 0.5942, 'rougeL_fmeasure(mode=fmeasure)': 0.4301, 'answer_relevancy': 0.6904, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 3 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:11<00:00,  2.07it/s]


{'rouge1_precision(mode=precision)': 0.4744, 'rouge1_recall(mode=recall)': 0.3203, 'rouge1_fmeasure(mode=fmeasure)': 0.3692, 'rougeL_precision(mode=precision)': 0.4225, 'rougeL_recall(mode=recall)': 0.2764, 'rougeL_fmeasure(mode=fmeasure)': 0.3238, 'answer_relevancy': 0.1882, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 4 (3 soal)...


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 23
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 23
}
].
  quota_metric: "generativelanguage.googleapis.com/generate_

{'rouge1_precision(mode=precision)': 0.3574, 'rouge1_recall(mode=recall)': 0.8952, 'rouge1_fmeasure(mode=fmeasure)': 0.4911, 'rougeL_precision(mode=precision)': 0.3381, 'rougeL_recall(mode=recall)': 0.8514, 'rougeL_fmeasure(mode=fmeasure)': 0.4651, 'answer_relevancy': 0.3900, 'faithfulness': 0.9630}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 5 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:22<00:00,  1.06it/s]


{'rouge1_precision(mode=precision)': 0.5501, 'rouge1_recall(mode=recall)': 0.9861, 'rouge1_fmeasure(mode=fmeasure)': 0.6740, 'rougeL_precision(mode=precision)': 0.5464, 'rougeL_recall(mode=recall)': 0.9807, 'rougeL_fmeasure(mode=fmeasure)': 0.6697, 'answer_relevancy': 0.3562, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 6 (3 soal)...


Evaluating: 100%|██████████| 24/24 [00:19<00:00,  1.25it/s]


{'rouge1_precision(mode=precision)': 0.3928, 'rouge1_recall(mode=recall)': 0.7281, 'rouge1_fmeasure(mode=fmeasure)': 0.4746, 'rougeL_precision(mode=precision)': 0.3895, 'rougeL_recall(mode=recall)': 0.7126, 'rougeL_fmeasure(mode=fmeasure)': 0.4693, 'answer_relevancy': 0.2301, 'faithfulness': 1.0000}
⏳ Menunggu 30 detik sebelum lanjut ke batch berikutnya...


🧪 Mengevaluasi batch 7 (2 soal)...


Evaluating: 100%|██████████| 16/16 [00:20<00:00,  1.29s/it]


{'rouge1_precision(mode=precision)': 0.4095, 'rouge1_recall(mode=recall)': 0.8321, 'rouge1_fmeasure(mode=fmeasure)': 0.5402, 'rougeL_precision(mode=precision)': 0.3688, 'rougeL_recall(mode=recall)': 0.7577, 'rougeL_fmeasure(mode=fmeasure)': 0.4883, 'answer_relevancy': 0.2630, 'faithfulness': 1.0000}


In [15]:
all_results = []
all_results.append(result)
# === Rata-rata skor keseluruhan ===
from collections import defaultdict

# Kumpulkan semua skor berdasarkan metr
# ik
combined_scores = defaultdict(list)

for result in all_results:
    for batch_dict in result.scores:  # result.scores adalah list of dict
        for metric_name, score in batch_dict.items():
            combined_scores[metric_name].append(score)

# Tampilkan rata-rata per metrik
print("\n📊 RATA-RATA EVALUASI KESELURUHAN:")
for metric, scores in combined_scores.items():
    avg_score = sum(scores) / len(scores)
    print(f"{metric}: {avg_score:.4f}")


📊 RATA-RATA EVALUASI KESELURUHAN:
rouge1_precision(mode=precision): 0.4095
rouge1_recall(mode=recall): 0.8321
rouge1_fmeasure(mode=fmeasure): 0.5402
rougeL_precision(mode=precision): 0.3688
rougeL_recall(mode=recall): 0.7577
rougeL_fmeasure(mode=fmeasure): 0.4883
answer_relevancy: 0.2630
faithfulness: 1.0000
