# 下載 litqa-v0.jsonl

In [None]:
# Fetch the LitQA v0 question file (JSON Lines) into the working directory;
# later cells open it as 'litqa-v0.jsonl'.
!wget https://raw.githubusercontent.com/Future-House/LitQA/main/litqa-v0.jsonl

# 下載 FAISS_db.zip

In [None]:
# Google Drive direct-download link for the FAISS_db.zip archive.
url="https://drive.google.com/uc?export=download&id=1fQSmLJxOmGmHYlqUUamZKGIm4w1JmEXI"
# IPython substitutes the Python variable `url` into the shell command;
# -L follows redirects and -o sets the output file name.
!curl -L -o FAISS_db.zip "$url"

In [None]:
# Unpack the archive. The loading cell further down reads a directory named
# "faiss_db" — presumably produced by this step; confirm after extraction.
!unzip FAISS_db.zip

# 安裝並引入必要套件

In [None]:
# Install/upgrade the packages used for loading, embedding and vector search.
# NOTE(review): `google-genai` (imported below via `from google import genai`)
# is not installed here — presumably preinstalled on Colab; confirm.
!pip install -U langchain langchain-community pypdf python-docx sentence-transformers faiss-cpu

In [None]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import json
import os
import csv
import re
import time  # 新增 time 模組用於等待
from typing import List, Optional
from google import genai
from google.genai import types
from google.colab import userdata



# 自訂 E5 embedding 類別

In [None]:
class CustomE5Embedding(HuggingFaceEmbeddings):
    """HuggingFace embeddings that add a role prefix to every input text.

    Documents are prefixed with ``passage: `` and queries with ``query: ``
    before being passed to the parent implementation.
    """

    def embed_query(self, text):
        """Embed one query string after prepending ``query: ``."""
        prefixed_query = f"query: {text}"
        return super().embed_query(prefixed_query)

    def embed_documents(self, texts):
        """Embed a batch of documents after prepending ``passage: `` to each."""
        prefixed_passages = [f"passage: {passage}" for passage in texts]
        return super().embed_documents(prefixed_passages)

# 載入 `faiss_db`

In [None]:
# Embedding model used to encode queries against the stored index;
# CustomE5Embedding adds the "query: " / "passage: " prefixes.
# NOTE(review): presumably the same model that was used to build "faiss_db" — confirm.
embedding_model = CustomE5Embedding(model_name="intfloat/multilingual-e5-small")
# NOTE(review): allow_dangerous_deserialization=True permits unpickling data stored
# with the index. The archive is downloaded from a remote URL above, so keep this
# flag only if that source is trusted.
db = FAISS.load_local("faiss_db", embedding_model, allow_dangerous_deserialization=True)
# Retriever with default search settings; used by the evaluators below.
retriever = db.as_retriever()

# 設定 ID → Paper Title

In [None]:
# Title of the source paper for each LitQA question, listed in the same order
# as the non-canary records of litqa-v0.jsonl (the mapping cell below pairs
# them purely by position). A title appears once per question, so papers that
# back several questions are repeated on purpose.
paper_title = [
"Lymph node targeted multi-epitope subunit vaccine promotes effective immunity to EBV in HLA-expressing mice",
"Fasting mimicking diet in mice delays cancer growth and reduces immunotherapy associated cardiovascular and systemic side effects",
"Single-cell transcriptome analysis indicates fatty acid metabolism-mediated metastasis and immunosuppression in male breast cancer",
"Metabolic glycan labeling immobilizes dendritic cell membrane and enhances antitumor efficacy of dendritic cell vaccine",
"Primary germinal center-resident T follicular helper cells are a physiologically distinct subset of CXCR5hiPD-1hi T follicular helper cells",
"Two-photon synthetic aperture microscopy for minimally invasive fast 3D imaging of native subcellular behaviors in deep tissue",
"Single-cell protein expression profiling resolves circulating and resident memory T cell diversity across tissues and infection contexts",
"Symmetric Molecular Dynamics",
"Transcriptomic taxonomy and neurogenic trajectories of adult human, macaque, and pig hippocampal and entorhinal cells",
"Multi-omic analysis reveals divergent molecular events in scarring and regenerative wound healing",
"Chromatin-state barriers enforce an irreversible mammalian cell fate decision",
"Prenatal environmental stressors impair postnatal microglia function and adult behavior in males",
"Prenatal environmental stressors impair postnatal microglia function and adult behavior in males",
"Evolutionarily conserved bacterial effectors hijack abscisic acid signaling to induce an aqueous environment in the apoplast",
"Evolutionarily conserved bacterial effectors hijack abscisic acid signaling to induce an aqueous environment in the apoplast",
"Evolutionarily conserved bacterial effectors hijack abscisic acid signaling to induce an aqueous environment in the apoplast",
"Enhanced prime editing systems by manipulating cellular determinants of editing outcomes",
"Intestinal Microbiota Influence Doxorubicin Responsiveness in Triple-Negative Breast Cancer",
"Deciphering the molecular organization of GET pathway chaperones through native mass spectrometry",
"Deciphering the molecular organization of GET pathway chaperones through native mass spectrometry",
"Tertiary lymphoid structures generate and propagate anti-tumor antibody-producing plasma cells in renal cell cancer",
"Slow and negligible senescence among testudines challenges evolutionary theories of senescence",
"Thymic epithelial cells co-opt lineage-defining transcription factors to eliminate autoreactive T cells",
"Peptide-guided lipid nanoparticles deliver mRNA to the neural retina of rodents and nonhuman primates",
"Massive Multiplexing of Spatially Resolved Single Neuron Projections with Axonal BARseq",
"Dynamic mapping of proteome trafficking within and between living cells by TransitID",
"The connectome of an insect brain",
"The connectome of an insect brain",
"A transcription factor atlas of directed differentiation",
"Controlled Protein Activities with Viral Proteases, Antiviral Peptides, and Antiviral Drugs",
"Spatial imaging of glycoRNA in single cells with ARPLA",
"Spatial imaging of glycoRNA in single cells with ARPLA",
"Discovery of new deaminase functions by structure-based protein clustering",
"Rescue of α-synuclein aggregation in Parkinson’s patient neurons by synergistic enhancement of ER proteostasis and protein trafficking",
"Discovery of new deaminase functions by structure-based protein clustering",
"RhoA drives actin compaction to restrict axon regeneration and astrocyte reactivity after CNS injury",
"Prenatal environmental stressors impair postnatal microglia function and adult behavior in males",
"Concerted type I interferon signaling in microglia and neural cells promotes memory impairment associated with amyloid β plaques",
"Concerted type I interferon signaling in microglia and neural cells promotes memory impairment associated with amyloid β plaques",
"The allergy mediator histamine confers resistance to immunotherapy in cancer patients via activation of the macrophage histamine receptor H1",
"Conserved cell types with divergent features in human versus mouse cortex",
"Connectomic comparison of mouse and human cortex",
"Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution",
"Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution",
"Massively parallel base editing to map variant effects in human hematopoiesis",
"Scalable full-transcript coverage single cell RNA sequencing with Smart-seq3xpress",
# Fixed: this title was previously truncated to "... generative mode".
"Illuminating protein space with a programmable generative model",
"Long-term platinum-based drug accumulation in cancer-associated fibroblasts promotes colorectal cancer progression and resistance to therapy",
"Pan-KRAS inhibitor disables oncogenic signalling and tumour growth",
"Bempegaldesleukin (NKTR-214): a CD-122-biased IL-2 receptor agonist for cancer immunotherapy"
]

In [None]:
# Sanity check: titles are paired with questions by position in the next cell,
# so this count should equal the number of non-canary records in litqa-v0.jsonl.
print(len(paper_title))

In [None]:
# Map each question ID to the lower-cased title of its source paper.
# Records and titles are paired by position, so blank lines and the canary
# record are skipped without consuming a title (the evaluators below skip the
# same lines, which keeps the pairing consistent with what they iterate over).
id_to_title = {}
count = 0  # number of records mapped so far == index of the next title to use
with open('litqa-v0.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        data = json.loads(line)
        if "canary" in data:
            continue
        if count >= len(paper_title):
            # Same exception type as the bare list lookup, but with context.
            raise IndexError(
                f"litqa-v0.jsonl has more question records than the "
                f"{len(paper_title)} titles in paper_title "
                f"(first unmatched id: {data.get('id')})"
            )
        id_to_title[data['id']] = paper_title[count].lower()
        count += 1

In [None]:
# Dump the ID -> title mapping for a visual check, then report its size.
for question_id, title in id_to_title.items():
    print(question_id, title)
print(len(id_to_title))

# Base RAG

In [None]:
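# If the API key is not configured yet, it can be set as an environment variable here.
# NOTE: the evaluator classes in this notebook pass userdata.get('Gemini') (a Colab secret)
# to genai.Client explicitly, so the key must also be stored under that secret name.
# os.environ["GOOGLE_API_KEY"] = "YOUR_ACTUAL_API_KEY"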

class LitQAEvaluator:
    """Single-pass RAG baseline for the LitQA multiple-choice question set.

    Each question is sent to Gemini once, optionally with retrieved passages
    attached as numbered references. The chosen option letter and the cited
    paper title are scored and appended to a CSV file, which also serves as
    the checkpoint for resuming an interrupted run.
    """

    def __init__(self, model_name: str = "gemini-2.5-flash-lite"):
        """Create the Gemini client.

        Args:
            model_name: Gemini model used for every request.
        """
        # The API key is read from the Colab secret named 'Gemini'.
        self.client = genai.Client(api_key=userdata.get('Gemini'))
        self.model_name = model_name

    def _get_system_prompt(self, has_references: bool) -> str:
        """Build the OpenScholar-style system prompt.

        Args:
            has_references: True when retrieved passages are included in the
                user message; selects the matching citation instruction.

        Returns:
            The full system instruction text.
        """
        base_prompt = (
            "Provide a detailed, informative answer to the following research-related question. Your answer should be more than one paragraph. "
            "Base your answer on multiple pieces of evidence. "
            "Make sure to add citations to all citation-worthy statements using reference numbers (e.g., [1], [2]). "
        )

        if has_references:
            ref_instruction = "Use the provided 'References' section below. Add the citation number at the end of each relevant sentence. "
        else:
            ref_instruction = (
                "NO external references are provided. You must strictly use your INTERNAL KNOWLEDGE to identify and cite real, existing scientific papers. "
                "Generate citations numbers [1], [2]... in the text and list the full Paper Titles at the end."
            )

        output_instruction = (
            "You MUST produce output in **exactly this structure**:\n"
            "[Response_Start]\n"
            "Your reasoning here.\n"
            "\n"
            "[Response_End]\n"
            "{\"ans\": \"A\"}\n"
            "References:\n"
            "[1] <Most relevant document title>\n"
            "[2] <Second relevant document title>\n"
            "[3] <Third relevant document title>\n"
            "\n"
            "Rules:\n"
            "- This is a Multiple Choice Question. Select the most accurate option based on the reasoning.\n"
            "- You may include multiple references, numbered [1], [2], [3], ...\n"
            "- Each reference must correspond to a real, relevant document.\n"
            "- Do NOT fabricate titles; use exact titles from provided documents when possible.\n"
            "- If provided documents are not relevant, you may cite real papers from your internal knowledge instead.\n"
            "- Do NOT repeat the same title.\n"
            "- The JSON (e.g., {\"ans\": \"A\"}) must appear **immediately after [Response_End]** and be the ONLY JSON object.\n"
        )

        return base_prompt + ref_instruction + output_instruction

    def format_question_data(self, data: dict) -> tuple:
        """Arrange the question and its options and locate the correct letter.

        Options are the ideal answer plus the distractors in sorted order, so
        the letter assignment is deterministic for a given record.

        Args:
            data: One LitQA record with "question", "ideal" and "distractors".

        Returns:
            ``(question_text, options_str, correct_letter, all_options)`` where
            ``options_str`` has one "A. option" line per option.
        """
        question_text = data["question"]
        ideal = data["ideal"]

        all_options = sorted([ideal] + data["distractors"])
        correct_letter = chr(65 + all_options.index(ideal))
        options_str = "".join(
            f"{chr(65 + idx)}. {opt}\n" for idx, opt in enumerate(all_options)
        )

        return question_text, options_str, correct_letter, all_options

    def extract_model_answer(self, text: str) -> str:
        """Extract the option letter from the ``{"ans": "X"}`` JSON object.

        Returns:
            The upper-cased letter, or "PARSE_ERROR" when the object is
            missing or the text is empty/None.
        """
        if not text:
            return "PARSE_ERROR"
        match = re.search(r'\{\s*"ans"\s*:\s*"([A-Z])"\s*\}', text, re.IGNORECASE)
        if match:
            return match.group(1).upper()
        return "PARSE_ERROR"

    @staticmethod
    def _normalize_title_text(text: str) -> str:
        """Lower-case text, unify typographic apostrophes, collapse whitespace.

        Applied to both the expected title and the model output so that a
        title wrapped across lines or written with a straight apostrophe
        still matches.
        """
        lowered = text.lower().replace("\u2019", "'").replace("\u2018", "'")
        return " ".join(lowered.split())

    @staticmethod
    def _format_rate(correct: int, total: int) -> str:
        """Format ``correct / total`` as a percentage, tolerating ``total == 0``."""
        if total <= 0:
            return "N/A (no items evaluated)"
        return f"{correct / total * 100:.2f}%"

    def evaluate_references(self, output_text: str, id: str) -> bool:
        """Check whether the model output cites the expected source paper.

        Args:
            output_text: Full model output.
            id: Question ID used to look up the expected title in the
                module-level ``id_to_title`` mapping.

        Returns:
            True when the normalized expected title occurs in the output.

        Raises:
            KeyError: If ``id`` has no entry in ``id_to_title``.
        """
        expected_title = self._normalize_title_text(id_to_title[id])
        return expected_title in self._normalize_title_text(output_text)

    def _call_with_retries(self, func, *args, max_retries: int = 5, base_wait_time: float = 20):
        """Call ``func(*args)`` with exponential backoff between failures.

        The wait doubles after each failure: base, 2*base, 4*base, ...

        Returns:
            The value returned by ``func``, or None once all
            ``max_retries + 1`` attempts have raised.
        """
        for attempt in range(max_retries + 1):
            try:
                return func(*args)
            except Exception as e:
                if attempt >= max_retries:
                    print(f"\n[API Failed] All {max_retries+1} attempts failed. Error: {e}")
                    break
                wait_time = base_wait_time * (2 ** attempt)
                print(f"\n[API Error] Attempt {attempt+1}/{max_retries+1} failed: {e}")
                print(f"-> Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        return None

    def _generate(self, contents: str, system_instruction: str):
        """Send one request to Gemini and return the response text."""
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=contents,
            config=types.GenerateContentConfig(
                system_instruction=system_instruction,
                temperature=0.0
            )
        )
        return response.text

    def _retrieve_reference_context(self, question: str, curr_id) -> str:
        """Return a numbered "References:" section for the question.

        Uses the module-level ``retriever``. Returns an empty string when
        nothing is retrieved or the retriever raises (best effort: the
        question is then answered without references).
        """
        try:
            docs = [d.page_content for d in retriever.invoke(question)]
        except Exception as e:
            print(f"Retriever Error for {curr_id}: {e}")
            return ""
        if not docs:
            return ""
        return "References:\n" + "\n".join(f"[{i+1}] {d}" for i, d in enumerate(docs))

    def run_evaluation(self, input_file: str, output_file: str, retrieve=False):
        """Run the evaluation with resume support and API retries.

        Args:
            input_file: LitQA JSONL file; blank lines, invalid JSON lines and
                the canary record are skipped.
            output_file: CSV results file. Existing rows are kept and their
                IDs are skipped, so an interrupted run can be continued.
            retrieve: When True, passages from the module-level ``retriever``
                are attached to each question as references.

        The printed accuracy figures cover the questions evaluated in this
        call only (rows already in the CSV and questions skipped after API
        failures are not counted).
        """
        fieldnames = [
            "id", "question", "correct_answer", "predicted_answer",
            "is_answer_correct", "is_reference_correct", "full_output"
        ]

        # --- 1. Resume: collect the IDs already present in the output file ---
        processed_ids = set()
        file_exists = os.path.exists(output_file) and os.path.getsize(output_file) > 0

        if file_exists:
            print(f"Output file '{output_file}' found. Reading processed IDs for resuming...")
            # newline='' lets the csv module handle newlines embedded in quoted fields.
            with open(output_file, 'r', newline='', encoding='utf-8') as f_read:
                for row in csv.DictReader(f_read):
                    if row.get("id"):
                        processed_ids.add(row["id"])
            print(f"-> Found {len(processed_ids)} processed items. Skipping them.")
        else:
            print(f"-> Starting new evaluation. Output will be saved to '{output_file}'.")

        ans_correct_total = 0
        ref_correct_total = 0
        evaluated_total = 0  # questions actually scored in this call

        # --- 2. Open the output in append mode so earlier rows are preserved ---
        with open(input_file, 'r', encoding='utf-8') as f_in, \
             open(output_file, 'a', newline='', encoding='utf-8') as f_out:

            writer = csv.DictWriter(f_out, fieldnames=fieldnames)
            if not file_exists:
                writer.writeheader()

            progress_no = 1
            for line_no, line in enumerate(f_in, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    row_data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping line {line_no}: Invalid JSON.")
                    continue

                if "canary" in row_data:
                    continue

                curr_id = row_data.get("id")

                # --- 3. Skip questions that already have a row in the CSV ---
                if curr_id in processed_ids:
                    continue

                # --- 4. Build the model input ---
                q_text, opt_str, correct_letter, _ = self.format_question_data(row_data)

                ref_context = self._retrieve_reference_context(q_text, curr_id) if retrieve else ""
                has_refs = bool(ref_context)

                system_instruction = self._get_system_prompt(has_refs)
                full_content = (
                    f"{ref_context}\n"
                    f"Question: {q_text}\n"
                    f"Options:\n{opt_str}\n"
                    f"Now, please answer this question following the system instructions.\n"
                )

                print(f"#{progress_no} Processing ID: {curr_id} ... ", end="", flush=True)
                progress_no += 1

                # --- 5. Call Gemini (with retries) ---
                output_text = self._call_with_retries(
                    self._generate, full_content, system_instruction
                )

                # No usable output: do not write a row, so the question is
                # picked up again on the next run.
                if output_text is None:
                    print(f"Skipping ID {curr_id} due to API failures.")
                    continue

                # --- 6. Score and record ---
                predicted_ans = self.extract_model_answer(output_text)
                is_ans_correct = (predicted_ans == correct_letter)
                is_ref_correct = self.evaluate_references(output_text, curr_id)

                evaluated_total += 1
                ans_correct_total += is_ans_correct
                ref_correct_total += is_ref_correct

                print(f"Ans: {predicted_ans} (Correct: {correct_letter}) | Ref Valid: {is_ref_correct}")

                writer.writerow({
                    "id": curr_id,
                    "question": q_text,
                    "correct_answer": correct_letter,
                    "predicted_answer": predicted_ans,
                    "is_answer_correct": is_ans_correct,
                    "is_reference_correct": is_ref_correct,
                    "full_output": output_text
                })

                # --- 7. Flush after every row so a crash loses nothing ---
                f_out.flush()

        print("\nEvaluation complete.")
        print(f"Evaluated Items: {evaluated_total}")
        print(f"Correct Answers: {ans_correct_total}")
        print(f"Correct References: {ref_correct_total}")
        print(f"Accuracy: {self._format_rate(ans_correct_total, evaluated_total)}")
        print(f"Reference Accuracy: {self._format_rate(ref_correct_total, evaluated_total)}")

# --- Entry point: baseline RAG run ---
if __name__ == "__main__":
    input_filename = "litqa-v0.jsonl"
    output_filename = "evaluation.csv"

    # For testing: always start from a clean results file. Note that this
    # bypasses the resume logic in run_evaluation.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    if os.path.exists(input_filename):
        evaluator = LitQAEvaluator()
        # retrieve=True: passages from the FAISS retriever are added to the prompt.
        evaluator.run_evaluation(input_filename, output_filename, retrieve=True)
    else:
        print(f"Error: Input file '{input_filename}' not found.")

# Main approach

In [None]:
class ReflectRAGEvaluator:
    """Three-stage "reflect" RAG evaluator for the LitQA multiple-choice set.

    Stage 1 asks the model for a provisional answer and a confidence score per
    retrieved document, stage 2 asks it to reflect on which documents really
    support an answer, and stage 3 produces the final option letter plus
    references. Results are appended to a CSV file, which also serves as the
    checkpoint for resuming an interrupted run.
    """

    def __init__(self, model_name: str = "gemini-2.5-pro"):
        """Create the Gemini client.

        Args:
            model_name: Gemini model used for every stage.
        """
        # The API key is read from the Colab secret named 'Gemini'.
        self.client = genai.Client(api_key=userdata.get('Gemini'))
        self.model_name = model_name

    def format_question_data(self, data: dict) -> tuple:
        """Arrange the question and its options and locate the correct letter.

        Options are the ideal answer plus the distractors in sorted order, so
        the letter assignment is deterministic for a given record.

        Args:
            data: One LitQA record with "question", "ideal" and "distractors".

        Returns:
            ``(question_text, options_str, correct_letter, all_options)`` where
            ``options_str`` has one "A. option" line per option.
        """
        question_text = data["question"]
        ideal = data["ideal"]

        all_options = sorted([ideal] + data["distractors"])
        correct_letter = chr(65 + all_options.index(ideal))
        options_str = "".join(
            f"{chr(65 + idx)}. {opt}\n" for idx, opt in enumerate(all_options)
        )

        return question_text, options_str, correct_letter, all_options

    # =========================================================
    # Shared helpers
    # =========================================================
    def _call_with_retries(self, func, *args, max_retries: int = 5, base_wait_time: float = 20):
        """Call ``func(*args)`` with exponential backoff between failures.

        The wait doubles after each failure: base, 2*base, 4*base, ...

        Returns:
            The value returned by ``func``, or None once all
            ``max_retries + 1`` attempts have raised.
        """
        for attempt in range(max_retries + 1):
            try:
                return func(*args)
            except Exception as e:
                if attempt >= max_retries:
                    print(f"\n[API Failed] All {max_retries+1} attempts failed. Error: {e}")
                    break
                wait_time = base_wait_time * (2 ** attempt)
                print(f"\n[API Error] Attempt {attempt+1}/{max_retries+1} failed: {e}")
                print(f"-> Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        return None

    @staticmethod
    def _format_rate(correct: int, total: int) -> str:
        """Format ``correct / total`` as a percentage, tolerating ``total == 0``."""
        if total <= 0:
            return "N/A (no items evaluated)"
        return f"{correct / total * 100:.2f}%"

    @staticmethod
    def _normalize_title_text(text: str) -> str:
        """Lower-case text, unify typographic apostrophes, collapse whitespace.

        Applied to both the expected title and the model output so that a
        title wrapped across lines or written with a straight apostrophe
        still matches.
        """
        lowered = text.lower().replace("\u2019", "'").replace("\u2018", "'")
        return " ".join(lowered.split())

    # =========================================================
    # Stage 1: Document-level confidence reasoning
    # =========================================================
    def get_doc_confidences(self, query: str, docs: List[str], raise_on_error: bool = False) -> List[dict]:
        """Ask the model for a provisional answer and confidence per document.

        Args:
            query: The question text.
            docs: Retrieved document texts; numbered from 1 in the prompt.
            raise_on_error: When False (default) a failed API call is printed
                and that document is left out of the result. When True the
                exception propagates, so the caller's retry/backoff logic can
                see the failure instead of silently losing evidence.

        Returns:
            One dict per successfully processed document with keys "doc_id",
            "answer" ("N/A" if not parsed) and "confidence" (0 if not parsed).
        """
        doc_results = []
        for i, doc in enumerate(docs):
            prompt = f"""
You are a scientific assistant working on literature-based QA.

Question: {query}
Document [{i+1}]:
{doc}

Task:
1. Based on this document only, answer the question as best as you can.
2. After answering, rate your confidence (0–100%) that this document directly supports your answer.

Format:
Answer: <text>
Confidence: <number between 0 and 100>
"""
            try:
                response = self.client.models.generate_content(
                    model=self.model_name,
                    contents=prompt,
                    config=types.GenerateContentConfig(temperature=0.0)
                )
                text = response.text
            except Exception as e:
                if raise_on_error:
                    raise
                print(f"Confidence stage error on doc {i+1}: {e}")
                continue

            # A response without text is not retried: with temperature 0 the
            # same request is unlikely to yield a different result.
            if text is None:
                print(f"Confidence stage error on doc {i+1}: response contained no text")
                continue

            ans_match = re.search(r'Answer:\s*(.*)', text)
            conf_match = re.search(r'Confidence:\s*(\d+)', text)
            doc_results.append({
                "doc_id": i + 1,
                "answer": ans_match.group(1).strip() if ans_match else "N/A",
                "confidence": int(conf_match.group(1)) if conf_match else 0
            })
        return doc_results

    # =========================================================
    # Stage 2: Reflection and evidence consolidation
    # =========================================================
    def reflect_and_verify(self, query: str, doc_results: List[dict], docs: List[str]) -> str:
        """Reflection stage: consolidate the reasoning and the citations.

        The prompt contains the question, the per-document provisional
        answers from stage 1 and the full document texts.

        Args:
            query: The question text (included in the prompt so the model can
                judge which documents actually address it).
            doc_results: Output of ``get_doc_confidences``.
            docs: Retrieved document texts, numbered from 1.

        Returns:
            The raw model output (reflection summary and supporting IDs).
        """
        summary_text = "\n".join(
            f"[Doc {r['doc_id']}] Answer: {r['answer']} | Confidence: {r['confidence']}%"
            for r in doc_results
        )
        context_text = "\n".join(f"[{i}] {doc}" for i, doc in enumerate(docs, start=1))

        reflection_prompt = f"""
You are a scientific QA expert. You have read multiple papers and produced preliminary answers to this question:

Question: {query}

Preliminary answers:
{summary_text}

Now reflect on your reasoning:
1. Re-evaluate which documents actually contain evidence supporting the answer.
2. Merge consistent reasoning from multiple supporting documents.
3. Identify any documents that were overconfident or irrelevant.
4. Provide a concise reasoning summary and list the document IDs that truly support the final conclusion.

References:
{context_text}

Output format:
Reflection Summary: <your reasoning>
Supporting Citations: [IDs, e.g., 1, 3, 5]
"""
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=reflection_prompt,
            config=types.GenerateContentConfig(temperature=0.0)
        )
        return response.text

    # =========================================================
    # Stage 3: Final generation (answer + citations)
    # =========================================================
    def generate_final_answer(self, query: str, option, reflection_output: str) -> str:
        """Produce the final structured answer from the reflection output.

        Args:
            query: The question text.
            option: The formatted option list ("A. ...\\nB. ...").
            reflection_output: Raw output of ``reflect_and_verify``.

        Returns:
            The raw model output, expected to contain ``{"ans": "X"}`` and a
            "References:" list.
        """
        final_prompt = f"""
You are writing the final answer for a scientific question.

Question: {query}

Options: {option}

You have already reflected and identified supporting evidence below:
{reflection_output}

If all retrieved documents are irrelevant, you must strictly use your INTERNAL KNOWLEDGE to identify and cite real, existing scientific papers.
"""

        output_instruction = (
            "You MUST produce output in **exactly this structure**:\n"
            "[Response_Start]\n"
            "Your reasoning here.\n"
            "\n"
            "[Response_End]\n"
            "{\"ans\": \"A\"}\n"
            "References:\n"
            "[1] <Most relevant document title>\n"
            "[2] <Second relevant document title>\n"
            "[3] <Third relevant document title>\n"
            "\n"
            "Rules:\n"
            "- This is a Multiple Choice Question. Select the most accurate option based on the reasoning.\n"
            "- You may include multiple references, numbered [1], [2], [3], ...\n"
            "- Each reference must correspond to a real, relevant document.\n"
            "- Do NOT fabricate titles; use exact titles from provided documents when possible.\n"
            "- If provided documents are not relevant, you may cite real papers from your internal knowledge instead.\n"
            "- Do NOT repeat the same title.\n"
            "- The JSON (e.g., {\"ans\": \"A\"}) must appear **immediately after [Response_End]** and be the ONLY JSON object.\n"
        )

        response = self.client.models.generate_content(
            model=self.model_name,
            contents=final_prompt + output_instruction,
            config=types.GenerateContentConfig(temperature=0.0)
        )
        return response.text

    # =========================================================
    # Utility: Extract answer letter / check references
    # =========================================================
    def extract_model_answer(self, text: str) -> str:
        """Extract the option letter from the ``{"ans": "X"}`` JSON object.

        Returns:
            The upper-cased letter, or "PARSE_ERROR" when the object is
            missing or the text is empty/None.
        """
        if not text:
            return "PARSE_ERROR"
        match = re.search(r'\{\s*"ans"\s*:\s*"([A-Z])"\s*\}', text, re.IGNORECASE)
        if match:
            return match.group(1).upper()
        return "PARSE_ERROR"

    def evaluate_references(self, output_text: str, id: str) -> bool:
        """Check whether the model output cites the expected source paper.

        Args:
            output_text: Full model output.
            id: Question ID used to look up the expected title in the
                module-level ``id_to_title`` mapping.

        Returns:
            True when the normalized expected title occurs in the output.

        Raises:
            KeyError: If ``id`` has no entry in ``id_to_title``.
        """
        expected_title = self._normalize_title_text(id_to_title[id])
        return expected_title in self._normalize_title_text(output_text)

    # =========================================================
    # Main pipeline
    # =========================================================
    def run_reflectrag(self, input_file: str, output_file: str, retriever=None):
        """Run the three-stage pipeline over a LitQA file.

        Args:
            input_file: LitQA JSONL file; blank lines, invalid JSON lines and
                the canary record are skipped.
            output_file: CSV results file. Existing rows are kept and their
                IDs are skipped, so an interrupted run can be continued.
            retriever: Object with ``invoke(question)`` returning documents
                that expose ``page_content``. When falsy, the record's
                "contexts" field is used instead.

        A question is written to the CSV only when all three stages succeed;
        otherwise it is left out so the next run retries it. The printed
        accuracy figures cover the questions evaluated in this call only.
        """
        fieldnames = [
            "id", "question", "correct_answer", "predicted_answer",
            "is_answer_correct", "is_reference_correct", "doc_results", "reflection", "full_output"
        ]

        # --- 1. Resume: collect the IDs already present in the output file ---
        processed_ids = set()
        file_exists = os.path.exists(output_file) and os.path.getsize(output_file) > 0

        if file_exists:
            print(f"Output file '{output_file}' found. Reading processed IDs for resuming...")
            # newline='' lets the csv module handle newlines embedded in quoted fields.
            with open(output_file, 'r', newline='', encoding='utf-8') as f_read:
                for row in csv.DictReader(f_read):
                    if row.get("id"):
                        processed_ids.add(row["id"])
            print(f"-> Found {len(processed_ids)} processed items. Skipping them.")
        else:
            print(f"-> Starting new evaluation. Output will be saved to '{output_file}'.")

        ans_correct_total = 0
        ref_correct_total = 0
        evaluated_total = 0  # questions actually scored in this call

        # --- 2. Open the output in append mode so earlier rows are preserved ---
        with open(input_file, 'r', encoding='utf-8') as f_in, \
             open(output_file, 'a', newline='', encoding='utf-8') as f_out:

            writer = csv.DictWriter(f_out, fieldnames=fieldnames)
            if not file_exists:
                writer.writeheader()

            progress_no = 1
            for line_no, line in enumerate(f_in, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    row_data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping line {line_no}: Invalid JSON.")
                    continue

                if "canary" in row_data:
                    continue

                curr_id = row_data.get("id")

                # --- 3. Skip questions that already have a row in the CSV ---
                if curr_id in processed_ids:
                    continue

                q_text, opt_str, correct_letter, _ = self.format_question_data(row_data)

                # === Retrieve documents ===
                if retriever:
                    try:
                        docs = [d.page_content for d in retriever.invoke(q_text)]
                    except Exception as e:
                        # Same best-effort handling as the baseline evaluator:
                        # report and move on instead of aborting the whole run.
                        print(f"Retriever Error for {curr_id}: {e}")
                        docs = []
                else:
                    docs = row_data.get("contexts", [])  # fallback if no retriever

                if not docs:
                    print(f"No documents found for {curr_id}, skipping.")
                    continue

                print(f"#{progress_no} Processing ID: {curr_id} ... ", end="", flush=True)
                progress_no += 1

                # === Stage 1 ===
                # raise_on_error=True so API failures reach the retry helper.
                doc_results = self._call_with_retries(
                    lambda: self.get_doc_confidences(q_text, docs, raise_on_error=True)
                )
                if doc_results is None:
                    print(f"Skipping ID {curr_id} due to API failures.")
                    continue

                # === Stage 2 ===
                reflection = self._call_with_retries(
                    self.reflect_and_verify, q_text, doc_results, docs
                )
                if reflection is None:
                    print(f"Skipping ID {curr_id} due to API failures.")
                    continue

                # === Stage 3 ===
                final_output = self._call_with_retries(
                    self.generate_final_answer, q_text, opt_str, reflection
                )
                if final_output is None:
                    print(f"Skipping ID {curr_id} due to API failures.")
                    continue

                # === Evaluate correctness ===
                predicted_ans = self.extract_model_answer(final_output)
                is_ans_correct = (predicted_ans == correct_letter)
                is_ref_correct = self.evaluate_references(final_output, curr_id)

                evaluated_total += 1
                ans_correct_total += is_ans_correct
                ref_correct_total += is_ref_correct

                print(f"Ans: {predicted_ans} (Correct: {correct_letter}) | Ref Valid: {is_ref_correct}")

                writer.writerow({
                    "id": curr_id,
                    "question": q_text,
                    "correct_answer": correct_letter,
                    "predicted_answer": predicted_ans,
                    "is_answer_correct": is_ans_correct,
                    "is_reference_correct": is_ref_correct,
                    # JSON rather than a Python repr, so the column can be parsed back.
                    "doc_results": json.dumps(doc_results, ensure_ascii=False),
                    "reflection": reflection,
                    "full_output": final_output
                })
                # Flush after every row so a crash loses nothing.
                f_out.flush()

        print("Reflect-RAG evaluation complete.")
        print(f"Evaluated Items: {evaluated_total}")
        print(f"Correct Answers: {ans_correct_total}")
        print(f"Correct References: {ref_correct_total}")
        print(f"Accuracy: {self._format_rate(ans_correct_total, evaluated_total)}")
        print(f"Reference Accuracy: {self._format_rate(ref_correct_total, evaluated_total)}")

# --- Entry point: Reflect-RAG run ---
if __name__ == "__main__":
    input_file = "litqa-v0.jsonl"
    output_file = "reflectrag_eval.csv"

    # For testing: always start from a clean results file. Note that this
    # bypasses the resume logic in run_reflectrag.
    if os.path.exists(output_file):
        os.remove(output_file)

    # Same guard as the baseline entry point: report a missing input file
    # instead of failing inside the pipeline with FileNotFoundError.
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found.")
    else:
        evaluator = ReflectRAGEvaluator(model_name="gemini-2.5-flash-lite")
        evaluator.run_reflectrag(input_file, output_file, retriever=retriever)