## **Configuration & Imports**

In [11]:
"""
Configuration & Imports
========================
Load all required modules and set pipeline configuration.
"""

# Standard Library
import os
import sys
import json
import pickle
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

# Third-party
from tqdm.auto import tqdm

# Project Modules
from matcher.reference_cleaner import clean_bibtex_entry, clean_arxiv_reference
from matcher.reference_matcher import find_best_match, compute_match_score
from scanner.dataset_scanner import scan_dataset

# Pipeline configuration: the locations the steps below read from and write to.
RAW_ROOT = "../30-paper"                        # dataset root handed to scan_dataset()
INTERMEDIATE_DIR = "intermediate"               # where step3_final_references.pkl is read from
OUTPUT_DIR = "output"
MANUAL_LABELS_FILE = "manual_groundtruth.json"  # optional hand-made labels, loaded in STEP 4.3

# Make sure the output folder exists; exist_ok keeps re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the effective settings so the captured output documents the run.
print(
    "All imports loaded successfully",
    f"Configuration: RAW_ROOT='{RAW_ROOT}', INTERMEDIATE_DIR='{INTERMEDIATE_DIR}'",
    sep="\n",
)

All imports loaded successfully
Configuration: RAW_ROOT='../30-paper', INTERMEDIATE_DIR='intermediate'


---

## **STEP 4.0 - Load References Data**

Load both the BibTeX references and the arXiv references (one `references.json` per publication).

In [12]:
"""
STEP 4.0: Load References for Matching
========================================
Load BibTeX and arXiv references.
"""

# --- BibTeX side: pickle written by the preprocessing notebook ---------------
bibtex_pickle_path = f"{INTERMEDIATE_DIR}/step3_final_references.pkl"
if not os.path.exists(bibtex_pickle_path):
    # Best-effort: report the problem and carry on with no BibTeX entries.
    print("ERROR: step3_final_references.pkl not found. Run 02_data_preprocessing.ipynb first.")
    bibtex_references = {}
else:
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load pickles produced by this project's own preprocessing step.
    with open(bibtex_pickle_path, "rb") as pickle_file:
        bibtex_references = pickle.load(pickle_file)

# --- arXiv side: one references.json per scanned publication -----------------
scan_result = scan_dataset(RAW_ROOT)

arxiv_references = {}
for pub_id in scan_result.keys():
    references_json = os.path.join(RAW_ROOT, pub_id, "references.json")
    if not os.path.exists(references_json):
        continue  # no references.json for this publication; leave it out

    with open(references_json, 'r', encoding='utf-8') as json_file:
        refs_by_arxiv_id = json.load(json_file)

    # Flatten {arxiv_id: fields} into a list of records that carry their own id.
    records = []
    for arxiv_id, ref_data in refs_by_arxiv_id.items():
        records.append({'arxiv_id': arxiv_id, **ref_data})
    arxiv_references[pub_id] = records

# --- Summary -----------------------------------------------------------------
total_bibtex = sum(map(len, bibtex_references.values()))
total_arxiv = sum(map(len, arxiv_references.values()))
common_pubs = len(set(bibtex_references.keys()).intersection(arxiv_references.keys()))

print(f"\nSTEP 4.0 Complete:")
print(f"  - BibTeX entries: {total_bibtex} from {len(bibtex_references)} publications")
print(f"  - arXiv entries: {total_arxiv} from {len(arxiv_references)} publications")
print(f"  - Publications with both: {common_pubs}")


STEP 4.0 Complete:
  - BibTeX entries: 1946 from 24 publications
  - arXiv entries: 795 from 30 publications
  - Publications with both: 24


---

## **STEP 4.1 - Data Cleaning**

Normalize and clean both BibTeX and arXiv references.

In [13]:
"""
STEP 4.1: Data Cleaning
========================
Normalize text for matching algorithms.
"""

# BibTeX side: clean every entry while keeping the per-publication grouping.
cleaned_bibtex = {}
for pub_id, refs in tqdm(bibtex_references.items(), desc="Cleaning BibTeX", leave=False):
    cleaned_bibtex[pub_id] = list(map(clean_bibtex_entry, refs))

# arXiv side: same shape, different cleaner.
cleaned_arxiv = {}
for pub_id, refs in tqdm(arxiv_references.items(), desc="Cleaning arXiv", leave=False):
    cleaned_arxiv[pub_id] = list(map(clean_arxiv_reference, refs))

# Report how many entries went through cleaning across both sources.
bibtex_cleaned_count = sum(map(len, cleaned_bibtex.values()))
arxiv_cleaned_count = sum(map(len, cleaned_arxiv.values()))
total_cleaned = bibtex_cleaned_count + arxiv_cleaned_count
print(f"\nSTEP 4.1 Complete: Cleaned {total_cleaned} entries")

Cleaning BibTeX:   0%|          | 0/24 [00:00<?, ?it/s]

Cleaning arXiv:   0%|          | 0/30 [00:00<?, ?it/s]


STEP 4.1 Complete: Cleaned 2741 entries


---

## **STEP 4.2 - Automatic Heuristic Matching**

Use string similarity algorithms to match references.

**Algorithms:**
- Levenshtein distance
- Jaccard similarity
- SequenceMatcher
- Author overlap
- Year matching

In [14]:
"""
STEP 4.2: Automatic Matching
=============================
Use string similarity to match BibTeX with arXiv.
"""

# pub_id -> list of match records; publications with no match are left out.
automatic_matches = {}

# Running counters. 'high_confidence' means score >= 0.8; every other accepted
# match (threshold 0.6 up to 0.8) is counted as 'medium_confidence'.
match_statistics = dict.fromkeys(
    ['total_bibtex', 'matched', 'unmatched', 'high_confidence', 'medium_confidence'],
    0,
)

for pub_id, bibtex_entries in tqdm(cleaned_bibtex.items(), desc="STEP 4.2: Automatic matching", leave=False):
    # Only publications that also have arXiv references can be matched.
    if pub_id not in cleaned_arxiv:
        continue

    arxiv_refs = cleaned_arxiv[pub_id]
    matches = []

    for bib_entry in bibtex_entries:
        match_statistics['total_bibtex'] += 1

        result = find_best_match(bib_entry, arxiv_refs, threshold=0.6)
        if not result:
            # A falsy result is counted as "no match" for this entry.
            match_statistics['unmatched'] += 1
            continue

        arxiv_id, score, breakdown = result
        matches.append(dict(
            bibtex_key=bib_entry['key'],
            bibtex_ref_id=bib_entry.get('ref_id', ''),
            arxiv_id=arxiv_id,
            confidence_score=score,
            score_breakdown=breakdown,
            bibtex_title=bib_entry['raw_title'],
            bibtex_authors=bib_entry['raw_author'],
            bibtex_year=bib_entry['raw_year'],
            match_type='automatic',
        ))

        match_statistics['matched'] += 1
        confidence_tier = 'high_confidence' if score >= 0.8 else 'medium_confidence'
        match_statistics[confidence_tier] += 1

    if matches:
        automatic_matches[pub_id] = matches

# Summary (the rate falls back to 0 when no entry was processed).
entries_seen = match_statistics['total_bibtex']
match_rate = match_statistics['matched'] / entries_seen * 100 if entries_seen > 0 else 0
print(f"\nSTEP 4.2 Complete:")
print(f"  - Matched: {match_statistics['matched']}/{match_statistics['total_bibtex']} ({match_rate:.1f}%)")
print(f"  - High confidence (≥0.8): {match_statistics['high_confidence']}")
print(f"  - Medium confidence (0.6-0.8): {match_statistics['medium_confidence']}")

STEP 4.2: Automatic matching:   0%|          | 0/24 [00:00<?, ?it/s]


STEP 4.2 Complete:
  - Matched: 83/1946 (4.3%)
  - High confidence (≥0.8): 33
  - Medium confidence (0.6-0.8): 50


---

## **STEP 4.3 - Manual Labeling Integration**

Load manual ground truth labels if available.

**Requirements:**
- ≥5 publications
- ≥20 labeled reference pairs

**Note:** If `manual_groundtruth.json` doesn't exist, you can:
1. Create it manually by editing the JSON file
2. Use the interactive labeling tool from main.ipynb
3. Generate sample labels from high-confidence automatic matches (for testing)

In [15]:
"""
STEP 4.3: Load Manual Labels
==============================
Load manually verified ground truth labels.
"""

manual_labels_list = []

if not os.path.exists(MANUAL_LABELS_FILE):
    # No ground-truth file: explain how to create one, then fall back to
    # auto-generated stand-ins so the rest of the notebook can still run.
    print(f"{MANUAL_LABELS_FILE} not found. Using automatic matches only.")
    print(f"  To create manual labels:")
    print(f"  1. Run interactive labeler in main.ipynb")
    print(f"  2. Or create {MANUAL_LABELS_FILE} manually")

    # Candidates: from the first 5 publications with automatic matches, the
    # first 5 matches per publication scoring at least 0.85.
    pseudo_candidates = []
    for pub_id, matches in list(automatic_matches.items())[:5]:
        strong_matches = [m for m in matches if m['confidence_score'] >= 0.85]
        pseudo_candidates.extend((pub_id, m) for m in strong_matches[:5])

    # Keep at most 20 of them, in the order they were collected.
    sample_labels = [
        {
            'pub_id': pub_id,
            'bibtex_key': match['bibtex_key'],
            'bibtex_ref_id': match['bibtex_ref_id'],
            'arxiv_id': match['arxiv_id'],
            'match_type': 'pseudo_manual',
            'confidence': 'auto_generated',
            'auto_score': match['confidence_score'],
            'notes': 'Auto-generated from high-confidence matches - NOT VERIFIED'
        }
        for pub_id, match in pseudo_candidates[:20]
    ]

    manual_labels_list = sample_labels
    print(f"  Generated {len(sample_labels)} pseudo-manual labels for testing")
else:
    with open(MANUAL_LABELS_FILE, 'r', encoding='utf-8') as labels_file:
        manual_labels_dict = json.load(labels_file)

    # The file is read as {pub_id: {bibtex_key: arxiv_id}}; flatten it into
    # one record per labeled pair.
    manual_labels_list.extend(
        {
            'pub_id': pub_id,
            'bibtex_key': bib_key,
            'arxiv_id': arxiv_id,
            'match_type': 'manual',
            'confidence': 'verified'
        }
        for pub_id, labels in manual_labels_dict.items()
        for bib_key, arxiv_id in labels.items()
    )

    manual_pubs = len(manual_labels_dict)
    manual_count = len(manual_labels_list)
    print(f"Loaded {manual_count} manual labels from {manual_pubs} publications")

# Summary: count the distinct publications that actually carry a label.
manual_pubs = len({label['pub_id'] for label in manual_labels_list})
print(f"\nSTEP 4.3 Complete: {len(manual_labels_list)} labels from {manual_pubs} publications")

manual_groundtruth.json not found. Using automatic matches only.
  To create manual labels:
  1. Run interactive labeler in main.ipynb
  2. Or create manual_groundtruth.json manually
  Generated 7 pseudo-manual labels for testing

STEP 4.3 Complete: 7 labels from 3 publications


---

## **STEP 4.4 - Combine All Matches**

Merge automatic and manual matches (manual labels override automatic).

In [16]:
"""
STEP 4.4: Combine Matches
==========================
Merge automatic and manual matches with priority.
"""

# Seed the result with per-publication copies of the automatic matches, so the
# lists held by automatic_matches are not touched by the overrides below.
final_matches = {
    pub_id: list(pub_matches)
    for pub_id, pub_matches in automatic_matches.items()
}

# Apply manual labels on top: a label replaces any entry with the same
# bibtex_key in its publication and is appended at the end of that list.
for label in manual_labels_list:
    pub_id = label['pub_id']
    overridden_key = label['bibtex_key']

    surviving = [
        entry
        for entry in final_matches.get(pub_id, [])
        if entry['bibtex_key'] != overridden_key
    ]
    surviving.append({
        'bibtex_key': overridden_key,
        'bibtex_ref_id': label.get('bibtex_ref_id', ''),
        'arxiv_id': label['arxiv_id'],
        'confidence_score': 1.0,
        'match_type': label.get('match_type', 'manual'),
        'confidence': label.get('confidence', 'verified'),
        'notes': label.get('notes', '')
    })
    final_matches[pub_id] = surviving

# Summary: split the combined entries by where they came from.
all_entries = [entry for entries in final_matches.values() for entry in entries]
total_matches = len(all_entries)
manual_count = sum(
    entry['match_type'] in ('manual', 'pseudo_manual') for entry in all_entries
)
auto_count = total_matches - manual_count

print(f"\nSTEP 4.4 Complete:")
print(f"  - Total matches: {total_matches}")
print(f"  - Manual: {manual_count}, Automatic: {auto_count}")
print(f"  - Publications: {len(final_matches)}")


STEP 4.4 Complete:
  - Total matches: 83
  - Manual: 7, Automatic: 76
  - Publications: 16


---

## **STEP 4.5 - Export Final Results**

Save matched references and manual labels.

In [17]:
"""
STEP 4.5: Save Final Results
=============================
Export matched references to JSON.
"""

# NOTE(review): all three files are written to the current working directory;
# OUTPUT_DIR from the configuration cell is not used here — confirm intended.
matched_pickle_path = "step4_matched_references.pkl"
matched_json_path = "step4_matched_references.json"
manual_labels_path = "step4_manual_labels.json"

# Combined matches as a pickle.
with open(matched_pickle_path, "wb") as pickle_out:
    pickle.dump(final_matches, pickle_out)

# Combined matches and the label list as JSON (non-ASCII characters are
# written as-is rather than escaped).
for json_path, payload in (
    (matched_json_path, final_matches),
    (manual_labels_path, manual_labels_list),
):
    with open(json_path, "w", encoding="utf-8") as json_out:
        json.dump(payload, json_out, indent=2, ensure_ascii=False)

print("Saved matched references to:")
for written_path in (matched_pickle_path, matched_json_path, manual_labels_path):
    print(f"  - {written_path}")

Saved matched references to:
  - step4_matched_references.pkl
  - step4_matched_references.json
  - step4_manual_labels.json


---

## **STEP 4.6 - Requirements Validation**

Check if all project requirements are met.

In [18]:
"""
STEP 4.6: Requirements Validation
===================================
Verify all project requirements are satisfied.
"""

# Inputs to the checks: label counts come from manual_labels_list (STEP 4.3),
# matching counts from match_statistics (STEP 4.2).
manual_pubs = len(set(l['pub_id'] for l in manual_labels_list))
manual_pairs = len(manual_labels_list)
# Labels produced by the STEP 4.3 fallback are tagged 'pseudo_manual'. They are
# still counted below, but flagged, because nobody verified them.
pseudo_pairs = sum(1 for l in manual_labels_list if l.get('match_type') == 'pseudo_manual')
total_refs = match_statistics['total_bibtex']
auto_matched = match_statistics['matched']
required_10pct = total_refs * 0.1
# total_refs is 0 when no BibTeX entry was processed (e.g. the STEP 4.0 pickle
# was missing); guard the ratio instead of raising ZeroDivisionError, the same
# way STEP 4.2 guards match_rate.
auto_matched_pct = auto_matched / total_refs * 100 if total_refs > 0 else 0.0

print(f"\n{'='*80}")
print(f"REFERENCE MATCHING PIPELINE - REQUIREMENT CHECK")
print(f"{'='*80}")

# Requirement 1: Data Cleaning
print(f"\n✓ 1. Data Cleaning: COMPLETED")
print(f"   - Normalized {total_bibtex} BibTeX + {total_arxiv} arXiv entries")

# Requirement 2: Manual Labeling (at least 5 publications and 20 labeled pairs)
req2_met = manual_pairs >= 20 and manual_pubs >= 5
status2 = '✓' if req2_met else '✗'
print(f"\n{status2} 2. Manual Labeling:")
print(f"   - Publications: {manual_pubs}/5 {'✓' if manual_pubs >= 5 else '✗'}")
print(f"   - Reference pairs: {manual_pairs}/20 {'✓' if manual_pairs >= 20 else '✗'}")
if pseudo_pairs:
    # Make it visible that some of the counted labels are stand-ins.
    print(f"   ⚠ {pseudo_pairs} of these labels are auto-generated (pseudo_manual) and NOT verified")
if not req2_met:
    print(f"   ⚠ Use interactive labeling tool or create {MANUAL_LABELS_FILE}")

# Requirement 3: Automatic Matching (at least 10% of processed BibTeX entries).
# With nothing processed, 0 >= 0.0 would pass vacuously, so require total_refs > 0.
req3_met = total_refs > 0 and auto_matched >= required_10pct
status3 = '✓' if req3_met else '✗'
print(f"\n{status3} 3. Automatic Matching:")
print(f"   - Total entries: {total_refs}")
print(f"   - Matched: {auto_matched} ({auto_matched_pct:.1f}%)")
print(f"   - Required (10%): {required_10pct:.0f} {'✓' if req3_met else '✗'}")

# Requirement 4: Algorithms (informational only; nothing is measured here)
print(f"\n✓ 4. Algorithms Implemented:")
print(f"   - Levenshtein distance, Jaccard similarity, SequenceMatcher")
print(f"   - Author overlap, Year matching, Combined scoring")

# Overall Status: only requirements 2 and 3 are actually evaluated.
all_met = req2_met and req3_met
print(f"\n{'='*80}")
if all_met:
    print(f"✓ ALL REQUIREMENTS MET")
else:
    print(f"⚠ SOME REQUIREMENTS NOT MET")
    if not req2_met:
        print(f"  - Need more manual labels")
    if not req3_met:
        print(f"  - Need more automatic matches")
print(f"{'='*80}")


REFERENCE MATCHING PIPELINE - REQUIREMENT CHECK

✓ 1. Data Cleaning: COMPLETED
   - Normalized 1946 BibTeX + 795 arXiv entries

✗ 2. Manual Labeling:
   - Publications: 3/5 ✗
   - Reference pairs: 7/20 ✗
   ⚠ Use interactive labeling tool or create manual_groundtruth.json

✗ 3. Automatic Matching:
   - Total entries: 1946
   - Matched: 83 (4.3%)
   - Required (10%): 195 ✗

✓ 4. Algorithms Implemented:
   - Levenshtein distance, Jaccard similarity, SequenceMatcher
   - Author overlap, Year matching, Combined scoring

⚠ SOME REQUIREMENTS NOT MET
  - Need more manual labels
  - Need more automatic matches


---

## **Pipeline Summary**

Final summary of the matching pipeline.

In [19]:
"""
Pipeline Summary
=================
"""

# Assemble the report line by line, then emit it with a single print call.
# Every value shown here was computed by an earlier step of this notebook.
summary_rule = '=' * 80
summary_lines = [
    f"\n{summary_rule}",
    "03_DATA_MODELING PIPELINE - SUMMARY",
    summary_rule,
    f"STEP 4.0: Loaded {total_bibtex} BibTeX + {total_arxiv} arXiv references",
    f"STEP 4.1: Cleaned {total_cleaned} entries",
    f"STEP 4.2: Automatic matching - {match_rate:.1f}% match rate",
    f"STEP 4.3: Loaded {len(manual_labels_list)} manual labels",
    f"STEP 4.4: Combined {total_matches} total matches",
    "STEP 4.5: Exported results to JSON",
    "STEP 4.6: Requirements validation complete",
    "\nAll outputs saved to current directory",
    summary_rule,
]
print("\n".join(summary_lines))


03_DATA_MODELING PIPELINE - SUMMARY
STEP 4.0: Loaded 1946 BibTeX + 795 arXiv references
STEP 4.1: Cleaned 2741 entries
STEP 4.2: Automatic matching - 4.3% match rate
STEP 4.3: Loaded 7 manual labels
STEP 4.4: Combined 83 total matches
STEP 4.5: Exported results to JSON
STEP 4.6: Requirements validation complete

All outputs saved to current directory


---

## **Optional: Interactive Labeling Tool**

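If you need to create manual ground truth labels, uncomment and run the cell below. By default it saves the labels to `manual_groundtruth.json`, which STEP 4.3 loads when the file exists.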

In [20]:
# """
# Interactive Labeling Tool
# ==========================
# Uncomment to use interactive labeling.
# """

# import random

# class InteractiveLabeler:
#     """Interactive labeling tool with auto-save and resume capability."""
#     
#     def __init__(self, 
#                  bibtex_refs: Dict[str, List[Dict]],
#                  arxiv_refs: Dict[str, List[Dict]],
#                  output_file: str = "manual_groundtruth.json"):
#         self.bibtex_refs = bibtex_refs
#         self.arxiv_refs = arxiv_refs
#         self.output_file = output_file
#         self.labels = {}
#         self._load_existing_labels()
#     
#     def _load_existing_labels(self):
#         if os.path.exists(self.output_file):
#             with open(self.output_file, 'r', encoding='utf-8') as f:
#                 self.labels = json.load(f)
#             total = sum(len(v) for v in self.labels.values())
#             print(f"Loaded {total} existing labels from {len(self.labels)} pubs")
#     
#     def _save_labels(self):
#         with open(self.output_file, 'w', encoding='utf-8') as f:
#             json.dump(self.labels, f, indent=2, ensure_ascii=False)
#         total = sum(len(v) for v in self.labels.values())
#         print(f"Saved {total} labels from {len(self.labels)} pubs")
#     
#     def _generate_candidates(self, bibtex_entry, arxiv_pool, top_k=5):
#         candidates = []
#         for arxiv_ref in arxiv_pool:
#             score = compute_match_score(bibtex_entry, arxiv_ref)
#             candidates.append((arxiv_ref, score, {}))
#         candidates.sort(key=lambda x: x[1], reverse=True)
#         return candidates[:top_k]
#     
#     def run(self, pub_ids=None, num_pubs=5, refs_per_pub=5):
#         if pub_ids is None:
#             valid_pubs = list(set(self.bibtex_refs.keys()) & set(self.arxiv_refs.keys()))
#             pub_ids = random.sample(valid_pubs, min(num_pubs, len(valid_pubs)))
#         
#         print(f"Starting interactive labeling for {len(pub_ids)} publications...")
#         print(f"Target: ~{refs_per_pub} references per publication")
#         
#         for pub_id in pub_ids:
#             print(f"\n{'='*80}")
#             print(f"Publication: {pub_id}")
#             print(f"{'='*80}")
#             
#             if pub_id not in self.labels:
#                 self.labels[pub_id] = {}
#             
#             bibtex_entries = self.bibtex_refs[pub_id][:refs_per_pub]
#             arxiv_pool = self.arxiv_refs[pub_id]
#             
#             for bib_entry in bibtex_entries:
#                 bib_key = bib_entry['key']
#                 if bib_key in self.labels[pub_id]:
#                     continue
#                 
#                 print(f"\nBibTeX: {bib_entry['raw_title'][:60]}...")
#                 candidates = self._generate_candidates(bib_entry, arxiv_pool)
#                 
#                 for i, (arxiv_ref, score, _) in enumerate(candidates, 1):
#                     print(f"[{i}] {score:.3f} - {arxiv_ref.get('arxiv_id', 'N/A')}")
#                 
#                 choice = input("Select [1-5], s=skip, q=quit: ").strip().lower()
#                 
#                 if choice == 'q':
#                     self._save_labels()
#                     return self.labels
#                 elif choice == 's':
#                     continue
#                 elif choice.isdigit() and 1 <= int(choice) <= len(candidates):
#                     arxiv_id = candidates[int(choice)-1][0]['arxiv_id']
#                     self.labels[pub_id][bib_key] = arxiv_id
#                     print(f"Labeled: {bib_key} -> {arxiv_id}")
#             
#             self._save_labels()
#         
#         return self.labels

# # Usage:
# # labeler = InteractiveLabeler(cleaned_bibtex, cleaned_arxiv)
# # manual_labels = labeler.run(num_pubs=5, refs_per_pub=5)