In [None]:
import json
import pandas as pd
import os

#  Configuration Paths
# 1. Evaluation results file path (.jsonl)
results_path = r"/LLM/PrimeKG/eval_results/gpt2_scoring_results_20251212_201700.jsonl"

# 2. Original dataset file path (contains options opa, opb...)
dev_data_path = r"/LLM/data/medmcqa/dev.json"

# 3. Destination of the filtered table
output_csv = "filtered_rag_success_cases.csv"

# Output column name -> key holding that option in each dev.json record.
OPTION_FIELDS = {
    "option_A": "opa",
    "option_B": "opb",
    "option_C": "opc",
    "option_D": "opd",
}

# Preferred column order: options are placed right after the question for easier viewing.
cols = ['id', 'question', 'option_A', 'option_B', 'option_C', 'option_D',
        'ground_truth', 'base_prediction', 'rag_prediction', 'rag_context']


def parse_jsonl_lines(lines):
    """Decode an iterable of JSON-Lines strings.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than a JSON object, are skipped and counted instead of
    being dropped silently.

    Args:
        lines: Any iterable of strings (an open text file works).

    Returns:
        A tuple ``(records, skipped)`` where ``records`` is the list of decoded
        dicts in input order and ``skipped`` is the number of rejected lines.
    """
    records = []
    skipped = 0
    for raw in lines:
        text = raw.strip()
        if not text:
            continue
        try:
            obj = json.loads(text)
        except json.JSONDecodeError:
            skipped += 1
            continue
        if isinstance(obj, dict):
            records.append(obj)
        else:
            # e.g. a bare list or number: it has no 'question' field to read.
            skipped += 1
    return records, skipped


def question_key(record):
    """Return the whitespace-stripped question text used as the join key.

    Returns ``''`` when the 'question' field is missing, null or not a string,
    so callers never hit ``AttributeError`` on ``.strip()``.
    """
    text = record.get("question")
    return text.strip() if isinstance(text, str) else ""


def build_question_map(records):
    """Index option texts by question text.

    Args:
        records: Iterable of dev-set dicts (keys 'question', 'opa'..'opd').

    Returns:
        Dict mapping stripped question text to a dict with keys
        'option_A'..'option_D' (missing options become ''). Records without a
        usable question are ignored; if the same question text occurs more
        than once, the last occurrence wins.
    """
    mapping = {}
    for rec in records:
        key = question_key(rec)
        if key:
            mapping[key] = {out: rec.get(src, '') for out, src in OPTION_FIELDS.items()}
    return mapping


def select_rag_wins(records, question_map):
    """Keep results where the base answer is wrong and the RAG answer is right.

    The flags are compared by identity (``is False`` / ``is True``), so only
    real booleans qualify; 0/1 or missing flags are excluded. Each kept row is
    a copy of the result with the four option columns added. Rows whose
    question is not in ``question_map`` are still kept, with every option set
    to the placeholder string "Not Found".

    Args:
        records: Iterable of evaluation-result dicts.
        question_map: Mapping produced by ``build_question_map``.

    Returns:
        A tuple ``(selected, unmatched)``: the list of enriched rows and the
        number of them that received the "Not Found" placeholder.
    """
    placeholder = dict.fromkeys(OPTION_FIELDS, "Not Found")
    selected = []
    unmatched = 0
    for rec in records:
        if rec.get("is_base_correct") is False and rec.get("is_rag_correct") is True:
            options = question_map.get(question_key(rec))
            if options is None:
                unmatched += 1
                options = placeholder
            selected.append({**rec, **options})
    return selected, unmatched


# Fail with a readable message (same style as the parquet inspection cell)
# instead of a traceback when an input file is missing.
missing_inputs = [p for p in (dev_data_path, results_path) if not os.path.exists(p)]
inputs_ok = not missing_inputs

if inputs_ok:
    #  Step 1: Build Option Query Dictionary (from dev.json)
    print(f"Reading original data: {dev_data_path} ...")
    with open(dev_data_path, 'r', encoding='utf-8') as f:
        dev_records, dev_skipped = parse_jsonl_lines(f)
    question_map = build_question_map(dev_records)
    print(f"Original data loaded. Indexed {len(question_map)} questions.")
    if dev_skipped:
        print(f"[Warn] Skipped {dev_skipped} malformed line(s) in {dev_data_path}")

    #  Step 2: Filter Results and Concatenate Options
    print(f"Processing evaluation results: {results_path} ...")
    with open(results_path, 'r', encoding='utf-8') as f:
        result_records, results_skipped = parse_jsonl_lines(f)
    merged_data, unmatched_count = select_rag_wins(result_records, question_map)
    if results_skipped:
        print(f"[Warn] Skipped {results_skipped} malformed line(s) in {results_path}")
    if unmatched_count:
        print(f"[Warn] {unmatched_count} selected question(s) not found in {dev_data_path}; "
              f"options set to 'Not Found'.")
else:
    for missing_path in missing_inputs:
        print(f"[Error] File not found: {missing_path}")
    question_map = {}
    merged_data = []

#  Step 3: Convert to DataFrame and Display
df = pd.DataFrame(merged_data)

# Keep only the preferred columns that actually exist in the results
existing_cols = [c for c in cols if c in df.columns]
df = df[existing_cols]

if inputs_ok:
    print(f"\nFiltering complete! Found {len(df)} cases where 'Base is Wrong & RAG is Correct'.")

    # Save to CSV (utf-8-sig adds a BOM to the file)
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Results saved to: {output_csv}")

# Show the first 3 rows
df.head(3)

Reading original data: /LLM/data/medmcqa/dev.json ...
Original data loaded. Indexed 4183 questions.
Processing evaluation results: /LLM/PrimeKG/eval_results/gpt2_scoring_results_20251212_201700.jsonl ...

Filtering complete! Found 135 cases where 'Base is Wrong & RAG is Correct'.
Results saved to: filtered_rag_success_cases.csv


Unnamed: 0,id,question,option_A,option_B,option_C,option_D,ground_truth,base_prediction,rag_prediction,rag_context
0,2,A 29 yrs old woman with a pregnancy of 17 week...,No test is required now as her age is below 35...,Ultra sound at this point of time will definit...,Amniotic fluid samples plus chromosomal analys...,blood screening at this point of time will cle...,C,A,C,Reference: [treatment] What are the treatments...
1,3,Axonal transport is:,Antegrade,Retrograde,Antegrade and retrograde,,C,B,C,Reference: [genetic changes] What are the gene...
2,7,Which of the following are not a branch of ext...,Sphenopalatine aery,Anterior ethmoidal aery,Greater palatine aery,Septal branch of superior labial aery,B,A,B,Reference: [causes] What causes Inguinal Herni...


In [None]:
import pandas as pd
import os

# Location of the PubMedQA labeled split (Parquet)
file_path = r"/LLM/data/pubmedqa_hf/pqa_labeled/train-00000-of-00001.parquet"

print(f"[Info] Reading: {file_path}")

if not os.path.exists(file_path):
    # Nothing to inspect: report the missing file and stop here.
    print(f"[Error] File not found: {file_path}")
else:
    # Load the whole Parquet file into a DataFrame
    df = pd.read_parquet(file_path)

    # 1. Column names (look for 'question', 'final_decision', 'long_answer', etc.)
    print("\n=== 1. Column Names ===")
    column_names = df.columns.tolist()
    print(column_names)

    # 2. First three rows, to eyeball the text format
    print("\n=== 2. Sample Data (Top 3 Rows) ===")
    # Show every column, but cap cell width so long texts do not flood the output.
    # These are global pandas display settings and stay in effect after this cell.
    for option_name, option_value in (('display.max_columns', None),
                                      ('display.max_colwidth', 100)):
        pd.set_option(option_name, option_value)
    preview = df.head(3)
    print(preview)

    # 3. Key check: which distinct values does the label column hold (yes/no or 0/1?)
    target_col = "final_decision"
    if target_col not in df.columns:
        print(f"\n[Warn] Column '{target_col}' not found! Please check the column names above.")
    else:
        print(f"\n=== 3. Unique Values in '{target_col}' (Label Distribution) ===")
        label_values = df[target_col].unique()
        print(label_values)

    # 4. Data size
    print(f"\n=== 4. Total Rows: {len(df)} ===")

[Info] Reading: /LLM/data/pubmedqa_hf/pqa_labeled/train-00000-of-00001.parquet

=== 1. Column Names ===
['pubid', 'question', 'context', 'long_answer', 'final_decision']

=== 2. Sample Data (Top 3 Rows) ===
      pubid  \
0  21645374   
1  16418930   
2   9488747   

                                                                                     question  \
0  Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?   
1                        Landolt C and snellen e acuity: differences in strabismus amblyopia?   
2             Syncope during bathing in infants, a pediatric form of water-induced urticaria?   

                                                                                               context  \
0  {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. T...   
1  {'contexts': ['Assessment of visual acuity depends on the optotypes used for measurement. The ab...   
2  {'contexts': ['Apparent 