In [23]:
import os
import json
from pathlib import Path
from typing import List, Dict, Optional

## Load files and dataset overview

In [9]:
def load_json_if_exists(filepath: Path) -> Optional[dict]:
    """
    Parse a JSON file if it is present on disk.

    Args:
        filepath: Location of the JSON file; it is read as UTF-8 text.

    Returns:
        Whatever json.load produces for the file, or None when the path
        does not exist.
    """
    # Guard clause: a missing file is an expected case, not an error.
    if not filepath.exists():
        return None

    with open(filepath, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

In [10]:
def extract_v1_data(v1_path: Path) -> Optional[Dict]:
    """
    Collect the parsed paper and its reviews from a single v1 folder.

    Candidate files are tried in this order: paper.docling.json,
    paper.itg.json, paper.tei.json. The first one that loads to a non-empty
    value becomes the paper content. reviews.json is optional: when it is
    missing (or loads to an empty value) an empty list is used instead.

    NOTE(review): the v1 directory listing printed later in this notebook shows
    "paper.itg" and "paper.tei" without a ".json" suffix, so the two fallback
    names may never match a file on disk -- confirm the intended filenames.

    Args:
        v1_path: Directory that holds the v1 submission files.

    Returns:
        A dict with "paper_path", "paper_content" and "reviews", or None
        (after printing a warning) when no candidate yields content.
    """
    candidate_names = ("paper.docling.json", "paper.itg.json", "paper.tei.json")

    # load_json_if_exists already returns None for a missing file, so the lazy
    # generator below stops at the first candidate that produced real content.
    loaded_candidates = (load_json_if_exists(v1_path / name) for name in candidate_names)
    paper_content = next((doc for doc in loaded_candidates if doc), None)

    if not paper_content:
        print(f"[!] No parsed paper found in {v1_path}")
        return None

    found_reviews = load_json_if_exists(v1_path / "reviews.json")
    return {
        "paper_path": v1_path,
        "paper_content": paper_content,
        "reviews": found_reviews if found_reviews else [],
    }

In [11]:
def load_arr_emnlp_dataset(root_dir: str) -> List[Dict]:
    """
    Loads v1 content and reviews for all papers under the ARR-EMNLP structure.

    Every immediate sub-directory of root_dir is treated as one paper and its
    folder name becomes the "paper_id". A paper is skipped when it has no
    "v1" directory or when extract_v1_data finds no parsed paper in it.

    Sub-directories are visited in sorted order: Path.iterdir() yields entries
    in arbitrary order, so sorting keeps the returned list reproducible across
    runs and filesystems.

    Args:
        root_dir: Path to the ARR-EMNLP root (e.g., "./data/ARR-EMNLP").
            Anything accepted by pathlib.Path works, including a Path object.

    Returns:
        List of dicts, each with paper_id, parsed content, and reviews,
        ordered by paper folder name.
    """
    root = Path(root_dir)
    dataset = []

    # Plain files next to the paper folders are ignored.
    paper_dirs = sorted(entry for entry in root.iterdir() if entry.is_dir())

    for paper_dir in paper_dirs:
        v1_dir = paper_dir / "v1"
        # is_dir() rather than exists(): a stray file named "v1" is not a submission folder.
        if not v1_dir.is_dir():
            print(f"[!] Skipping {paper_dir.name}, no v1 folder found.")
            continue

        paper_entry = extract_v1_data(v1_dir)
        if paper_entry:
            paper_entry["paper_id"] = paper_dir.name
            dataset.append(paper_entry)

    print(f"✅ Loaded {len(dataset)} paper(s) with v1 submissions.")
    return dataset

In [21]:
# Root folder of the ARR-EMNLP data; the relative path is resolved against the
# notebook's current working directory.
dataset_path = Path("../../data/ARR-EMNLP")

In [22]:
dataset = load_arr_emnlp_dataset(dataset_path)

# Find all paper folders (e.g., 123ABC456). Sorted because Path.iterdir() yields
# entries in arbitrary order and "first paper" should be the same on every run.
paper_dirs = sorted(p for p in dataset_path.iterdir() if p.is_dir())

# Inspect the first paper that actually has a v1 submission. Fail with an
# explicit message instead of an IndexError (no folders) or a FileNotFoundError
# from iterdir() (first folder lacks v1).
first_paper_path = next((p for p in paper_dirs if (p / "v1").is_dir()), None)
if first_paper_path is None:
    raise FileNotFoundError(f"No paper folder with a v1 directory found under {dataset_path}")
v1_path = first_paper_path / "v1"

# List the files inside the v1 directory (name -> path), in sorted order so the
# printed listing is stable between runs.
v1_files = {file.name: file for file in sorted(v1_path.iterdir()) if file.is_file()}
v1_files_list = list(v1_files.keys())

print("📂 Paper folder:", first_paper_path.name)
print("📁 v1 files:", v1_files_list)

✅ Loaded 1 paper(s) with v1 submissions.
📂 Paper folder: 1
📁 v1 files: ['paper.pdf', 'reviews.json', 'paper.itg', 'paper.docling.json', 'paper.tei', 'meta.json']


## Meta.json overview

In [24]:
import json

# Load version-level metadata and print every top-level field.
meta_path = v1_files.get("meta.json")
if meta_path and meta_path.exists():
    # Read as UTF-8 explicitly: open() otherwise uses the platform's locale
    # encoding, which can mis-decode non-ASCII titles or author names.
    with open(meta_path, "r", encoding="utf-8") as f:
        meta_data = json.load(f)
    print("🗂️ Version Meta:")
    for k, v in meta_data.items():
        print(f"  {k}: {v}")
else:
    print("meta.json not found.")

🗂️ Version Meta:
  title: Impact of Co-occurrence on Factual Knowledge of Large Language Models
  authors: ['Cheongwoong Kang', 'Jaesik Choi']
  abstract: Large language models (LLMs) often make factually incorrect responses despite their success in various applications. In this paper, we hypothesize that relying heavily on simple co-occurrence statistics of the pre-training corpora is one of the main factors that cause factual errors. Our results reveal that LLMs are vulnerable to the co-occurrence bias, defined as preferring frequently co-occurred words over the correct answer. Consequently, LLMs struggle to recall facts whose subject and object rarely co-occur in the pre-training dataset although they are seen during finetuning. We show that co-occurrence bias remains despite scaling up model sizes or finetuning. Therefore, we suggest finetuning on a debiased dataset to mitigate the bias by filtering out biased samples whose subject-object co-occurrence count is high. Although debia

In [27]:
# Load review data and print each review's report sections, scores and
# reviewer confidence.
reviews_path = v1_files.get("reviews.json")
if reviews_path and reviews_path.exists():
    # Read as UTF-8 explicitly: open() otherwise uses the platform's locale
    # encoding, which can mis-decode non-ASCII review text.
    with open(reviews_path, "r", encoding="utf-8") as f:
        reviews_data = json.load(f)

    print("\n📝 Reviews:")
    if reviews_data:
        for i, review in enumerate(reviews_data):
            print(f"\n🔸 Review #{i + 1}:")
            print("Reviewer ID:", review.get("rid", "[unknown]"))

            # Extract report sections if available
            report = review.get("report", {})
            if report:
                for section, text in report.items():
                    print(f"\n📌 {section.replace('_', ' ').capitalize()}:")
                    print(text.strip() if text else "[empty]")
            else:
                print("No report found.")

            # Optional: print scores
            scores = review.get("scores", {})
            if scores:
                print("\n🧪 Scores:")
                for score_cat, score_val in scores.items():
                    print(f"  - {score_cat.capitalize()}: {score_val}")

            # Optional: reviewer confidence.
            # `or {}` also covers an explicit `"meta": null`, which the plain
            # .get default would pass through as None and crash on .get().
            confidence = (review.get("meta") or {}).get("reviewer_confidence")
            # Only skip when the value is absent; a confidence of 0 is still a value.
            if confidence not in (None, ""):
                print("\n💬 Reviewer Confidence:", confidence)
    else:
        print("No reviews found in file.")
else:
    print("reviews.json not found.")


📝 Reviews:

🔸 Review #1:
Reviewer ID: rfX6ne8ne4

📌 Paper topic and main contributions:
The paper investigates the effect of co-occurrence statistics on the ability of large language models to correctly answer simple factual questions (of the subject-relation-object form). The paper specifically checks whether simple co-occurrences between the subject and the object in the pretraining data, can lead the models to incorrectly answer factual questions where the co-occurrence diverges from the correct answer. As it is difficult to check this causal relation by direct manipulation (given the enormous costs of pretraining large language models), a correlation study was conducted. Results show correlation between the co-occurrence statistics of a triplet and the ability of the model to answer correctly questions on it. 
The paper further explores mitigation strategies to combat this bias. Two approaches are explored: debiased finetuning and knowledge editing. The former approach presents li

In [32]:
from pprint import pprint

# Diagnostic: Explore the actual docling file structure (top-level keys, an
# optional 'content' field, and a depth-1 view of the whole document).
docling_path = v1_files.get("paper.docling.json")
if docling_path and docling_path.exists():
    # Read as UTF-8 explicitly: open() otherwise uses the platform's locale
    # encoding, which can mis-decode non-ASCII characters in the paper text.
    with open(docling_path, "r", encoding="utf-8") as f:
        docling_data = json.load(f)

    print("🔍 Top-level keys in docling JSON:")
    pprint(docling_data.keys())

    print("\n🔍 Preview of the 'content' field (if exists):")
    if "content" in docling_data:
        pprint(docling_data["content"], depth=2)
    else:
        print("No 'content' field found.")

    # depth=1 collapses nested containers to {...} / [...] so only the shape shows.
    print("\n🔍 Raw document preview:")
    pprint(docling_data, depth=1)
else:
    print("paper.docling.json not found.")

🔍 Top-level keys in docling JSON:
dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'pages'])

🔍 Preview of the 'content' field (if exists):
No 'content' field found.

🔍 Raw document preview:
{'body': {...},
 'furniture': {...},
 'groups': [],
 'key_value_items': [],
 'name': 'paper',
 'origin': {...},
 'pages': {...},
 'pictures': [...],
 'schema_name': 'DoclingDocument',
 'tables': [...],
 'texts': [...],
 'version': '1.0.0'}


In [33]:
# Preview the parsed paper: document name, schema version, the first five text
# segments (each truncated to 500 characters) and the keys of the body node.
docling_path = v1_files.get("paper.docling.json")
if docling_path and docling_path.exists():
    # Read as UTF-8 explicitly: open() otherwise uses the platform's locale
    # encoding, which can mis-decode non-ASCII characters in the paper text.
    with open(docling_path, "r", encoding="utf-8") as f:
        docling_data = json.load(f)

    print("\n📄 Parsed Paper (docling):")

    # Show top-level metadata if available
    print("Document Name:", docling_data.get("name", "[no name]"))
    print("Schema Version:", docling_data.get("version", "[no version]"))

    # Display first few text segments
    texts = docling_data.get("texts", [])
    if texts:
        print("\n🧾 First Text Chunks:")
        for i, item in enumerate(texts[:5]):
            content = item.get("text", "[no text]")
            print(f"\n🔹 Text #{i+1}:")
            # Parentheses make the precedence explicit: the ellipsis is only
            # appended to the truncated form.
            print((content[:500] + "...") if len(content) > 500 else content)
    else:
        print("No texts found in document.")

    # Optionally preview body or furniture
    print("\n🧱 Structure Overview (body keys):")
    pprint(docling_data.get("body", {}).keys())
else:
    print("paper.docling.json not found.")


📄 Parsed Paper (docling):
Document Name: paper
Schema Version: 1.0.0

🧾 First Text Chunks:

🔹 Text #1:
Impact of Co-occurrence on Factual Knowledge of Large Language Models

🔹 Text #2:
Cheongwoong Kang

🔹 Text #3:
KAIST

🔹 Text #4:
cw.kang@kaist.ac.kr

🔹 Text #5:
Abstract

🧱 Structure Overview (body keys):
dict_keys(['self_ref', 'children', 'name', 'label'])


In [34]:
def reconstruct_full_paper(docling_data: dict) -> str:
    """
    Reconstruct the full paper text from the docling-formatted JSON.

    The "text" values of the entries under "texts" are joined, in their
    original order, with a blank line between them. Entries without usable
    text (missing "text" key, null, non-string, or whitespace-only) are
    skipped instead of raising.

    Args:
        docling_data (dict): Parsed JSON data from paper.docling.json.

    Returns:
        str: The full concatenated text of the paper, with leading and
        trailing whitespace removed. Empty string when there is no text.
    """
    segments = []
    # `or []` also covers an explicit `"texts": null`.
    for item in docling_data.get("texts") or []:
        text = item.get("text") if isinstance(item, dict) else None
        # A null or non-string "text" has no .strip(); treat it as "no text".
        if isinstance(text, str) and text.strip():
            segments.append(text)
    return "\n\n".join(segments).strip()

In [38]:
# Relies on docling_data having been loaded by an earlier cell.
full_paper_text = reconstruct_full_paper(docling_data)

print("🧾 Full paper preview (first 10000 characters):")
# Show at most 10000 characters; the ellipsis marks a truncated preview.
if len(full_paper_text) > 10000:
    print(full_paper_text[:10000] + "...")
else:
    print(full_paper_text)

🧾 Full paper preview (first 10000 characters):
Impact of Co-occurrence on Factual Knowledge of Large Language Models

Cheongwoong Kang

KAIST

cw.kang@kaist.ac.kr

Abstract

Large language models (LLMs) often make factually incorrect responses despite their success in various applications. In this paper, we hypothesize that relying heavily on simple cooccurrence statistics of the pre-training corpora is one of the main factors that cause factual errors. Our results reveal that LLMs are vulnerable to the co-occurrence bias, defined as preferring frequently co-occurred words over the correct answer. Consequently, LLMs struggle to recall facts whose subject and object rarely co-occur in the pre-training dataset although they are seen during finetuning. We show that co-occurrence bias remains despite scaling up model sizes or finetuning. Therefore, we suggest finetuning on a debiased dataset to mitigate the bias by filtering out biased samples whose subject-object co-occurrence count is hi