In [None]:
import os
import json
import time
from pathlib import Path

import pandas as pd
import google.generativeai as genai
from tqdm.notebook import tqdm # S·ª≠ d·ª•ng tqdm.notebook ƒë·ªÉ thanh progress hi·ªÉn th·ªã ƒë·∫πp h∆°n trong notebook

In [None]:
# API key for the Gemini service. Read from the environment so that the secret
# never has to be pasted into (and accidentally saved with) the notebook.
# Falls back to an empty string when the variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

# Name of the Gemini model used as the judge.
JUDGE_MODEL_NAME = "gemini-2.5-flash"

# Path to the rubric JSON file containing the evaluation criteria.
RUBRIC_PATH = "/content/prompt_judge_minh.json"

# Path to the .jsonl data file that should be evaluated.
INPUT_DATA_PATH = "/content/samples_detailed_prompt_context_2025-09-10T17-07-50.668664.jsonl"

In [None]:
# Cell 3: Define the LLMJudge class (updated version with API error handling)

class LLMJudge:
    """Score model answers against a rubric using a Gemini model as the judge.

    The rubric is a JSON file that loads to a list of entries, each providing
    at least a ``name`` and a ``description``. For every evaluated record a
    single prompt asks the judge model to score all rubric entries at once and
    to answer with one JSON object mapping entry name to score.
    """

    # Retry policy for calls to the judge model. Kept as class attributes so
    # they can be overridden per instance or subclass without touching the
    # method signatures.
    MAX_RETRIES = 5          # total number of attempts per record
    SHORT_RETRY_DELAY = 1    # seconds to wait after an unusable reply
    LONG_RETRY_DELAY = 30    # seconds to wait after an API/connection error

    # Lower-cased substrings of exception messages that trigger the long delay.
    _LONG_DELAY_ERROR_MARKERS = (
        "api key not valid",
        "connection aborted",
        "remote end closed",
    )

    def __init__(self, rubric_path: str, api_key: str, judge_model: str):
        """Configure the Gemini client and load the rubric.

        Args:
            rubric_path: Path to the rubric JSON file.
            api_key: Google API key passed to ``genai.configure``.
            judge_model: Name of the Gemini model used as the judge.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("Kh√¥ng t√¨m th·∫•y GOOGLE_API_KEY. Vui l√≤ng thi·∫øt l·∫≠p trong Cell 2.")

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(judge_model)
        self.metrics = self._load_rubric(rubric_path)
        self.metric_names = [m['name'] for m in self.metrics]

        print(f"‚úÖ LLM Judge ƒë√£ ƒë∆∞·ª£c kh·ªüi t·∫°o v·ªõi model: {judge_model}")
        print(f"‚úÖ ƒê√£ t·∫£i th√†nh c√¥ng {len(self.metrics)} ti√™u ch√≠ t·ª´: {rubric_path}")

    def _load_rubric(self, path: str) -> list:
        """Load and return the rubric from the JSON file at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _create_bulk_prompt(self, data_doc: dict, model_response: str) -> str:
        """Build one prompt that asks the judge to score ALL criteria at once.

        Args:
            data_doc: Record whose optional ``context``, ``question`` and
                ``options`` entries are embedded in the prompt; a placeholder
                is used for any entry that is absent.
            model_response: The answer that should be judged.

        Returns:
            The complete prompt text.
        """
        criteria_text = "\n".join([f'- **{metric["name"]}**: {metric["description"]}' for metric in self.metrics])

        return f"""
B·∫°n l√† m·ªôt chuy√™n gia ƒë√°nh gi√° AI c√¥ng t√¢m v√† ch√≠nh x√°c. Nhi·ªám v·ª• c·ªßa b·∫°n l√† ƒë√°nh gi√° c√¢u tr·∫£ l·ªùi c·ªßa m·ªôt m√¥ h√¨nh ng√¥n ng·ªØ d·ª±a tr√™n m·ªôt b·ªô ti√™u ch√≠ ƒë∆∞·ª£c cung c·∫•p.

--- NG·ªÆ C·∫¢NH ---
{data_doc.get('context', 'Kh√¥ng c√≥')}

--- C√ÇU H·ªéI ---
{data_doc.get('question', 'Kh√¥ng c√≥')}

--- C√ÅC L·ª±a CH·ªåN ---
{data_doc.get('options', 'Kh√¥ng c√≥')}

--- C√ÇU TR·∫¢ L·ªúI C·ª¶A MODEL C·∫¶N ƒê√ÅNH GI√Å ---
{model_response}

--- B·ªò TI√äU CH√ç ƒê√ÅNH GI√Å ---
H√£y cho ƒëi·ªÉm c√¢u tr·∫£ l·ªùi tr√™n theo thang ƒëi·ªÉm t·ª´ 1 ƒë·∫øn 5 cho m·ªói ti√™u ch√≠ sau:
{criteria_text}

--- NHI·ªÜM V·ª§ C·ª¶A B·∫†N ---
H√£y ƒë√°nh gi√° c√¢u tr·∫£ l·ªùi c·ªßa model theo T·∫§T C·∫¢ c√°c ti√™u ch√≠ ƒë√£ cho.
C√¢u tr·∫£ l·ªùi c·ªßa b·∫°n B·∫ÆT BU·ªòC ph·∫£i l√† m·ªôt ƒë·ªëi t∆∞·ª£ng JSON duy nh·∫•t, kh√¥ng c√≥ b·∫•t k·ª≥ vƒÉn b·∫£n gi·∫£i th√≠ch n√†o tr∆∞·ªõc ho·∫∑c sau n√≥.
ƒê·ªëi t∆∞·ª£ng JSON ph·∫£i ch·ª©a c√°c key t∆∞∆°ng ·ª©ng v·ªõi t√™n ti√™u ch√≠, v√† value l√† ƒëi·ªÉm s·ªë (s·ªë nguy√™n t·ª´ 1 ƒë·∫øn 5).
V√≠ d·ª• ƒë·ªãnh d·∫°ng output:
{{
  "t√≠nh s√∫c t√≠ch": 4,
  "t√≠nh d·ªÖ hi·ªÉu": 5,
  "t√≠nh ch·∫∑t ch·∫Ω": 3,
  "t√≠nh li√™n quan": 5,
  "t√≠nh ƒë√∫ng ƒë·∫Øn": 4,
  "t√≠nh ƒë·∫ßy ƒë·ªß": 3,
  "C·∫•u tr√∫c c√¢u tr·∫£ l·ªùi": 4
}}
"""

    def _judge_all_criteria(self, data_doc: dict, model_response: str) -> dict:
        """Ask the judge model to score every criterion in a single request.

        Every failed attempt counts towards ``MAX_RETRIES``: an exception from
        the API call or from JSON parsing, a reply that is not a JSON object,
        or a JSON object that lacks one of the rubric names. Errors whose
        message matches ``_LONG_DELAY_ERROR_MARKERS`` wait ``LONG_RETRY_DELAY``
        seconds before the next attempt; all other failures wait
        ``SHORT_RETRY_DELAY`` seconds.

        Args:
            data_doc: Record passed on to ``_create_bulk_prompt``.
            model_response: The answer that should be judged.

        Returns:
            A dict mapping every rubric name to the value returned by the
            judge, restricted to the rubric names. If no usable reply is
            obtained within ``MAX_RETRIES`` attempts, every name maps to 0.
        """
        prompt = self._create_bulk_prompt(data_doc, model_response)
        default_scores = {name: 0 for name in self.metric_names}
        max_retries = self.MAX_RETRIES

        for attempt in range(1, max_retries + 1):
            try:
                response = self.model.generate_content(prompt)
                # The model may wrap its JSON in a Markdown code fence.
                cleaned_text = response.text.strip().replace("```json", "").replace("```", "")
                judgement = json.loads(cleaned_text)
            except Exception as e:
                # Deliberately broad: this is the boundary to a remote service
                # and any failure here should lead to a retry, not a crash.
                error_message = str(e).lower()
                if any(marker in error_message for marker in self._LONG_DELAY_ERROR_MARKERS):
                    print(f"\nüî¥ L·ªói API ho·∫∑c k·∫øt n·ªëi. T·∫°m d·ª´ng {self.LONG_RETRY_DELAY} gi√¢y tr∆∞·ªõc khi th·ª≠ l·∫°i (l·∫ßn {attempt}/{max_retries})...")
                    time.sleep(self.LONG_RETRY_DELAY)
                else:
                    # Any other error (for example a JSONDecodeError).
                    print(f"\nüü° L·ªói kh√¥ng x√°c ƒë·ªãnh: {e}. Th·ª≠ l·∫°i sau {self.SHORT_RETRY_DELAY} gi√¢y (l·∫ßn {attempt}/{max_retries})...")
                    time.sleep(self.SHORT_RETRY_DELAY)
                continue

            if isinstance(judgement, dict) and all(name in judgement for name in self.metric_names):
                # Keep only the rubric entries so that unexpected extra keys
                # cannot end up as result columns or distort the average.
                return {name: judgement[name] for name in self.metric_names}

            # Parsable reply that is not a complete score object. This counts
            # as a failed attempt too; otherwise a judge that keeps omitting a
            # key would make this loop run forever.
            print(f"\nL·ªói: JSON tr·∫£ v·ªÅ thi·∫øu key. Th·ª≠ l·∫°i sau {self.SHORT_RETRY_DELAY} gi√¢y...")
            time.sleep(self.SHORT_RETRY_DELAY)

        print(f"\n‚ùå L·ªói: Kh√¥ng th·ªÉ nh·∫≠n ƒë∆∞·ª£c k·∫øt qu·∫£ h·ª£p l·ªá t·ª´ API sau {max_retries} l·∫ßn th·ª≠.")
        return default_scores

    def process_and_evaluate_file(self, input_path: str) -> pd.DataFrame:
        """Judge every record of a JSONL file and collect the scores.

        Each non-blank line must be a JSON object with a ``doc`` entry and a
        ``resps`` entry; the answer to judge is read from ``resps[0][0]``.
        Lines that cannot be parsed or lack these entries are reported and
        skipped so that one bad record does not discard the scores already
        obtained.

        Args:
            input_path: Path to the JSONL file.

        Returns:
            A DataFrame with one row per evaluated record holding ``context``,
            ``question``, ``options``, ``response``, one column per score and
            ``average_score``. An empty DataFrame is returned when the input
            file does not exist.
        """
        all_results = []
        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except FileNotFoundError:
            print(f"‚ùå L·ªñI: Kh√¥ng t√¨m th·∫•y file ƒë·∫ßu v√†o t·∫°i {input_path}")
            return pd.DataFrame()

        for line_number, line in enumerate(tqdm(lines, desc="ƒêang ƒë√°nh gi√° c√°c d√≤ng d·ªØ li·ªáu"), start=1):
            if not line.strip():
                continue

            try:
                data_record = json.loads(line)
                doc_data = data_record['doc']
                model_response = data_record['resps'][0][0]
            except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
                print(f"\nSkipping line {line_number}: malformed record ({e!r}).")
                continue

            row_data = {
                'context': doc_data.get('context', ''),
                'question': doc_data.get('question', ''),
                'options': str(doc_data.get('options', '')),
                'response': model_response
            }

            scores = self._judge_all_criteria(doc_data, model_response)
            row_data.update(scores)

            # Only positive numeric scores enter the average; the 0 placeholder
            # used for failed evaluations is therefore left out.
            valid_scores = [s for s in scores.values() if isinstance(s, (int, float)) and s > 0]
            row_data['average_score'] = sum(valid_scores) / len(valid_scores) if valid_scores else 0

            all_results.append(row_data)

        return pd.DataFrame(all_results)

In [None]:
# Build the judge from the configuration values defined in the setup cell
# (arguments in order: rubric path, API key, judge model name).
judge = LLMJudge(RUBRIC_PATH, GOOGLE_API_KEY, JUDGE_MODEL_NAME)

‚úÖ LLM Judge ƒë√£ ƒë∆∞·ª£c kh·ªüi t·∫°o v·ªõi model: gemini-2.5-flash
‚úÖ ƒê√£ t·∫£i th√†nh c√¥ng 7 ti√™u ch√≠ t·ª´: /content/prompt_judge_minh.json


In [None]:
print("--- B·∫Øt ƒë·∫ßu quy tr√¨nh ƒë√°nh gi√° LLM Judge ---")

# Wrap the configured input location in a Path for safe path handling.
input_path = Path(INPUT_DATA_PATH)

# The model name is taken from the name of the directory that holds the input file.
model_name = input_path.parent.name
print(f"Ph√°t hi·ªán model name: {model_name}")

# Run the evaluation; the judge returns its results as a DataFrame.
results_df = judge.process_and_evaluate_file(input_path=str(input_path))

if results_df.empty:
    print("Kh√¥ng c√≥ k·∫øt qu·∫£ n√†o ƒë·ªÉ l∆∞u.")
else:
    # Derive the output location from the model name, creating the
    # "output" directory when it does not exist yet.
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / f"{model_name}_scores.csv"

    # Persist the scores as CSV.
    print(f"\nƒêang l∆∞u k·∫øt qu·∫£ ra file CSV t·∫°i: {output_path}")
    results_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"üéâ ƒê√£ l∆∞u th√†nh c√¥ng k·∫øt qu·∫£ v√†o {output_path}")

    # Preview the first five result rows in the notebook.
    print("\nXem tr∆∞·ªõc 5 d√≤ng k·∫øt qu·∫£ ƒë·∫ßu ti√™n:")
    display(results_df.head())

print("\n--- Quy tr√¨nh ƒë√°nh gi√° ƒë√£ k·∫øt th√∫c ---")

--- B·∫Øt ƒë·∫ßu quy tr√¨nh ƒë√°nh gi√° LLM Judge ---
Ph√°t hi·ªán model name: content


ƒêang ƒë√°nh gi√° c√°c d√≤ng d·ªØ li·ªáu:   0%|          | 0/200 [00:00<?, ?it/s]