In [5]:
"""
LLM K/S/A Enhancement Pipeline - Stable Working Version
This version successfully produced 254 enhanced items with full provenance
"""

import json
import pandas as pd
import re
import hashlib
import os
from pathlib import Path
from datetime import datetime
import time
from typing import List, Dict, Tuple, Optional
from getpass import getpass

# pip install anthropic openai

import anthropic
from openai import OpenAI

# ============================================================================
# CONFIGURATION
# ============================================================================

# Paths
# NOTE: DATA_DIR is a hard-coded absolute path on one developer's Windows
# machine; every other path below is derived from it.
DATA_DIR = Path(r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction")
# CSV of base K/S/A extractions read at the start of the pipeline.
INPUT_FILE = DATA_DIR / "ksa_output_simple" / "ksa_extractions.csv"
# JSON-lines corpus: one source document (JSON object) per line.
CORPUS_FILE = DATA_DIR / "corpus_manual_dataset.jsonl"
# Destination for every file this script writes (CSVs, JSON exports, LLM cache).
OUTPUT_DIR = DATA_DIR / "ksa_enhanced"
# Import-time side effect: creates the output folder. Only the last path
# component is created, so DATA_DIR itself must already exist.
OUTPUT_DIR.mkdir(exist_ok=True)

# Model configuration (tool-agnostic); each value can be overridden through
# the environment variable named in the corresponding os.getenv call.
ENHANCER_PROVIDER = os.getenv('ENHANCER_PROVIDER', 'claude')  # 'claude' or 'openai'; any other value initializes both clients
CLAUDE_MODEL = os.getenv('CLAUDE_MODEL', 'claude-opus-4-1-20250805')  # Claude Opus 4.1
OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')  # Default OpenAI model
TEMPERATURE = float(os.getenv('LLM_TEMPERATURE', '0.1'))  # Low sampling temperature to keep outputs stable (not strictly deterministic)
SEED = int(os.getenv('LLM_SEED', '42'))  # Passed to OpenAI requests for best-effort reproducibility

# Processing options
USE_CACHE = True  # Intended toggle for the on-disk LLM response cache
MIN_INFERRED_CONFIDENCE = 0.45  # Confidence assigned to knowledge inferred from skills
MIN_EXPLICIT_CONFIDENCE = 0.82  # Confidence assigned to knowledge stated explicitly in a document
MAX_KNOWLEDGE_PER_AFSC = 15  # Upper bound on knowledge items kept per AFSC

# ============================================================================
# EVIDENCE-PRESERVING ENHANCER
# ============================================================================

class ProvenanceEnhancer:
    """Extract and infer knowledge items with an LLM while recording provenance.

    Two sources of knowledge are supported:

    * ``extract_knowledge_from_document`` - phrases explicitly stated in a
      corpus document (``source_method='document_explicit'``).
    * ``infer_knowledge_from_skills`` - phrases the LLM infers from already
      extracted skills (``source_method='skill_inferred'``).

    Results are memoized in ``self.cache`` and persisted as JSON in
    ``OUTPUT_DIR`` when ``USE_CACHE`` is true.
    """

    # File name of the on-disk response cache inside OUTPUT_DIR (stable name).
    CACHE_FILENAME = "llm_cache_v2.json"

    # Regexes that locate candidate "knowledge" passages; group 1 is the
    # passage handed to the LLM. Compiled once instead of on every call.
    _KNOWLEDGE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE | re.DOTALL)
        for pattern in (
            r"3\.1\.?\s*Knowledge[.\s]+(.+?)(?=\n3\.\d|\n2\.|$)",
            r"Knowledge[.\s]*(?:is mandatory of|includes?|requires?)[:\s]+(.+?)(?=\n\d\.|$)",
            r"(?:principles?|theory|concepts?)\s+of[:\s]+(.+?)(?=\n|\.{2,}|$)",
        )
    )

    def __init__(self):
        """Prompt for API key(s) and create the client(s) for the configured provider.

        ``ENHANCER_PROVIDER`` values ``'claude'`` and ``'openai'`` create one
        client each. Any other value creates both clients; in that case
        completions are served by the OpenAI client (see ``_complete``).
        """
        print("API Authentication Required")
        print("-" * 40)

        self.provider = ENHANCER_PROVIDER
        self.claude = None
        self.openai = None

        if self.provider == 'claude':
            claude_key = getpass("Enter your Claude API key: ")
            self.claude = anthropic.Anthropic(api_key=claude_key)
            print(f"✓ Claude ({CLAUDE_MODEL}) initialized\n")
        elif self.provider == 'openai':
            openai_key = getpass("Enter your OpenAI API key: ")
            self.openai = OpenAI(api_key=openai_key)
            print(f"✓ OpenAI ({OPENAI_MODEL}) initialized\n")
        else:
            # Unknown provider name: initialize both clients.
            claude_key = getpass("Enter your Claude API key: ")
            openai_key = getpass("Enter your OpenAI API key: ")
            self.claude = anthropic.Anthropic(api_key=claude_key)
            self.openai = OpenAI(api_key=openai_key)
            print("✓ Both providers initialized\n")

        self.cache = self.load_cache()

    def load_cache(self) -> dict:
        """Return the cached responses stored on disk.

        Returns an empty dict when caching is disabled, when no cache file
        exists, or when the file cannot be parsed (e.g. a run was interrupted
        while the file was being written).
        """
        if not USE_CACHE:
            return {}
        cache_file = OUTPUT_DIR / self.CACHE_FILENAME
        if not cache_file.exists():
            return {}
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except json.JSONDecodeError as exc:
            print(f"Warning: ignoring unreadable cache file {cache_file.name}: {exc}")
            return {}
        return cached if isinstance(cached, dict) else {}

    def save_cache(self):
        """Write the in-memory cache to disk (no-op when caching is disabled)."""
        if not USE_CACHE:
            return
        cache_file = OUTPUT_DIR / self.CACHE_FILENAME
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f, indent=2)

    def _make_cache_key(self, prefix: str, afsc: str, content: str) -> str:
        """Build a deterministic cache key ``<prefix>_<afsc>_<hash8>``.

        Only the first 500 characters of ``content`` are hashed, and neither
        the provider nor the model name is part of the key. The format is kept
        as-is so that existing cache files remain valid.
        """
        content_hash = hashlib.md5(content[:500].encode()).hexdigest()[:8]
        return f"{prefix}_{afsc}_{content_hash}"

    def _complete(self, prompt: str, max_tokens: int) -> Optional[str]:
        """Send ``prompt`` as a single user message and return the reply text.

        Uses the Claude client when the provider is ``'claude'``, otherwise the
        OpenAI client if one exists. Returns ``None`` when no usable client is
        configured, and ``''`` when the provider returned no text.
        """
        if self.provider == 'claude' and self.claude is not None:
            response = self.claude.messages.create(
                model=CLAUDE_MODEL,
                max_tokens=max_tokens,
                temperature=TEMPERATURE,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text if response.content else ''
        if self.openai is not None:
            response = self.openai.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=TEMPERATURE,
                max_tokens=max_tokens,
                seed=SEED,  # best-effort reproducibility
            )
            # message.content may be None (e.g. refusals); normalize to ''.
            return response.choices[0].message.content or ''
        return None

    @staticmethod
    def _parse_phrases(content: Optional[str]) -> List[str]:
        """Turn an LLM reply into a list of candidate knowledge phrases.

        Returns ``[]`` for an empty reply or when the reply contains the
        sentinel word NONE. The sentinel is matched as a whole word so that
        phrases such as "nonexplosive ordnance" do not void the whole reply.
        Lines starting with ``Quote:`` are skipped, list markers are stripped,
        and only phrases longer than 5 and shorter than 100 characters are kept.
        """
        if not content:
            return []
        if re.search(r'\bnone\b', content, re.IGNORECASE):
            return []

        phrases = []
        for raw_line in content.strip().split('\n'):
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('Quote:'):
                continue
            phrase = stripped.strip('- •*').strip()
            if 5 < len(phrase) < 100:
                phrases.append(phrase)
        return phrases

    def extract_knowledge_from_document(self, afsc: str, doc: Dict) -> List[Dict]:
        """Extract explicitly stated knowledge items from one corpus document.

        Args:
            afsc: AFSC code the document belongs to.
            doc: Corpus record; the keys ``text``, ``doc_id``, ``title``,
                ``category`` and ``page_start`` are read when present.

        Returns:
            Up to ``MAX_KNOWLEDGE_PER_AFSC`` de-duplicated item dicts, longest
            phrases first, each carrying evidence, section and document
            metadata. Results are cached per (afsc, document text).
        """
        text = doc.get('text') or ''
        doc_id = doc.get('doc_id', '')
        title = doc.get('title', '')

        # Cache key includes doc content
        cache_key = self._make_cache_key('doc_knowledge', afsc, text)
        if USE_CACHE and cache_key in self.cache:
            return self.cache[cache_key]

        # Only a positive numeric page is meaningful; anything else -> None.
        page = doc.get('page_start', 0)
        page = page if isinstance(page, (int, float)) and page > 0 else None

        knowledge_items = []

        for pattern in self._KNOWLEDGE_PATTERNS:
            for found in pattern.finditer(text):
                knowledge_text = found.group(1)[:1500]  # Limit length

                # The match object gives the true offset of the passage, even
                # when the same wording also occurs earlier in the document.
                section = self._identify_section(text, found.start(1))

                # Use LLM to structure with evidence requirement
                prompt = f"""Extract ONLY explicit knowledge requirements from the text below.

Text:
{knowledge_text}

Return ONLY knowledge items as plain lines.

Rules:
- Each item must be directly stated in the text (no inference).
- 3-6 words per line, noun phrases only (no leading verbs).
- Each item must be a complete, specific concept.
- No numbers, bullets, markdown, quotes, or explanations.
- No single words or fragments (bad: "flight", "aircraft operating").
- Prefer specific phrases (good: "aircraft operating procedures").
- Use lowercase, no trailing punctuation.
- Maximum 10 lines. No duplicates.
- If none are explicit, return: NONE

Examples (format and specificity):
aircraft operating procedures
air navigation principles
aviation meteorology
weapons system capabilities
mission planning procedures

Return ONLY the knowledge phrases, nothing else."""

                content = self._complete(prompt, max_tokens=300)
                if content is None:  # no usable client configured
                    continue

                for phrase in self._parse_phrases(content):
                    knowledge_items.append({
                        'text': phrase,
                        'type': 'knowledge',
                        'confidence': MIN_EXPLICIT_CONFIDENCE,
                        'source_method': 'document_explicit',
                        # Supporting evidence located in the original passage
                        'evidence_snippet': self._find_evidence(knowledge_text, phrase),
                        'doc_id': doc_id,
                        'title': title,
                        'category': doc.get('category', ''),
                        'afsc': afsc,
                        'section': section,
                        'page': page
                    })

        # Simple deduplication on case- and whitespace-normalized text
        seen_texts = set()
        unique_items = []
        for item in knowledge_items:
            normalized = ' '.join(item['text'].lower().split())
            if normalized not in seen_texts:
                seen_texts.add(normalized)
                unique_items.append(item)

        # Sort by text length (prefer specific over generic) and take top N
        unique_items.sort(key=lambda x: len(x['text']), reverse=True)
        unique_items = unique_items[:MAX_KNOWLEDGE_PER_AFSC]

        # Cache and return
        if USE_CACHE:
            self.cache[cache_key] = unique_items
            self.save_cache()
        return unique_items

    def infer_knowledge_from_skills(self, afsc: str, skills_df: pd.DataFrame, doc_meta: Dict) -> List[Dict]:
        """Infer knowledge with traceable provenance to source skills

        Args:
            afsc: AFSC code
            skills_df: DataFrame of skills for this AFSC
            doc_meta: Document metadata from corpus for proper linkage

        Returns:
            Item dicts with ``source_method='skill_inferred'``; each records
            the skills it was inferred from in ``parent_skills``. Empty when
            there are no skills, no client is configured, or the model
            answers NONE.
        """
        if skills_df.empty:
            return []
        doc_meta = doc_meta or {}

        # Take top 5 skills by confidence
        top_skills = skills_df.nlargest(5, 'confidence') if 'confidence' in skills_df else skills_df.head(5)
        source_skills = [row.get('text', row.get('Raw Skill', '')) for _, row in top_skills.iterrows()]
        skills_text = '\n'.join(f"- {skill}" for skill in source_skills)

        # Include evidence from source skills. Missing values read from CSV
        # are NaN (which is truthy), so they are filtered out explicitly.
        evidence_lines = []
        for _, row in top_skills.iterrows():
            snippet = row.get('evidence_snippet')
            if isinstance(snippet, str) and snippet:
                evidence_lines.append(f"- {snippet}")
            elif snippet is not None and not isinstance(snippet, str) and pd.notna(snippet) and snippet:
                evidence_lines.append(f"- {snippet}")
        skills_evidence = '\n'.join(evidence_lines)

        cache_key = self._make_cache_key('inferred', afsc, skills_text)
        if USE_CACHE and cache_key in self.cache:
            return self.cache[cache_key]

        prompt = f"""Given these verified Air Force skills for AFSC {afsc}:
{skills_text}

Supporting evidence from documents:
{skills_evidence if skills_evidence else 'N/A'}

What knowledge is NECESSARILY required to perform these skills?
- Return ONLY knowledge that is clearly implied by the skills
- If knowledge cannot be reliably inferred, return "NONE"
- Format: Brief theoretical concepts (3-7 words each)
- Maximum 3 items"""

        content = self._complete(prompt, max_tokens=200)
        if content is None:  # no usable client configured
            return []

        # Same for every inferred item, so build it once.
        evidence_note = f"Inferred from skills: {', '.join(str(skill) for skill in source_skills[:2])}"

        knowledge_items = [
            {
                'text': phrase,
                'type': 'knowledge',
                'confidence': MIN_INFERRED_CONFIDENCE,
                'source_method': 'skill_inferred',
                'evidence_snippet': evidence_note,
                'doc_id': doc_meta.get('doc_id', ''),  # Use corpus doc metadata
                'title': doc_meta.get('title', ''),
                'category': doc_meta.get('category', ''),
                'afsc': afsc,
                'section': 'inferred',
                'page': None,  # Use None instead of 0
                'parent_skills': source_skills
            }
            for phrase in self._parse_phrases(content)
        ]

        if USE_CACHE:
            self.cache[cache_key] = knowledge_items
            self.save_cache()
        return knowledge_items

    def _find_evidence(self, text: str, item: str) -> str:
        """Find supporting evidence - prefer shortest complete sentence with ≥2 key terms

        Key terms are the first three words of ``item`` longer than three
        characters. If no sentence contains at least two of them, a window
        around the first key term found in ``text`` is returned; failing that,
        the first 150 characters of ``text``.
        """
        key_terms = [w for w in item.lower().split() if len(w) > 3][:3]

        if not key_terms:
            return text[:150].strip() + "..."

        best_sentence = None
        best_score = None  # None (not 0) so very long sentences can still win

        for sent in re.split(r'(?<=[.!?])\s+', text):
            sent = sent.strip()
            if len(sent) < 20:  # Too short to be meaningful
                continue

            sent_lower = sent.lower()
            matches = sum(1 for term in key_terms if term in sent_lower)
            if matches < 2:
                continue

            # Score: more matches is better, shorter is better
            score = (matches * 1000) - len(sent)
            if best_score is None or score > best_score:
                best_score = score
                best_sentence = sent

        if best_sentence is not None:
            return best_sentence[:200]  # Cap length

        # Fallback: context around first key term match
        text_lower = text.lower()
        for term in key_terms:
            idx = text_lower.find(term)
            if idx != -1:
                start = max(0, idx - 50)
                end = min(len(text), idx + len(term) + 100)
                return "..." + text[start:end].strip() + "..."

        return text[:150].strip() + "..."

    def _identify_section(self, text: str, position: int) -> str:
        """Return the nearest section heading found in the 200 characters before ``position``.

        A heading looks like ``<digits>.<digits>`` followed by a capitalized
        word; the last one found is returned, truncated to 20 characters.
        Falls back to ``"3.1 Knowledge"`` when none is found or when
        ``position`` is negative (i.e. "not found").
        """
        default_section = "3.1 Knowledge"
        if position < 0:
            # A negative offset would otherwise be treated as a slice bound.
            return default_section

        # Look backwards for section marker
        window = text[:position][-200:]
        section_match = re.findall(r'(\d+\.\d+\.?\s*[A-Z][^.]+)', window)
        if section_match:
            return section_match[-1][:20]
        return default_section

# ============================================================================
# MAIN PIPELINE WITH PROVENANCE
# ============================================================================

def cap_k_per_afsc(df: pd.DataFrame, k: int = MAX_KNOWLEDGE_PER_AFSC) -> pd.DataFrame:
    """Limit every AFSC to its ``k`` highest-confidence knowledge rows.

    Rows are regrouped by AFSC in order of first appearance; within each
    group the kept knowledge rows come first, followed by all rows of other
    types (which are never capped). Rows whose ``afsc`` is missing are not
    part of any group and therefore do not appear in the result. When no
    group exists at all, ``df`` is returned unchanged.
    """
    capped_groups = []
    # sort=False keeps first-appearance order; missing keys are skipped.
    for _, group in df.groupby('afsc', sort=False):
        is_knowledge = group['type'] == 'knowledge'
        top_knowledge = (
            group[is_knowledge]
            .sort_values('confidence', ascending=False)
            .head(k)
        )
        remainder = group[~is_knowledge]
        capped_groups.append(pd.concat([top_knowledge, remainder], ignore_index=True))

    if not capped_groups:
        return df
    return pd.concat(capped_groups, ignore_index=True)

def _load_corpus(path: Path) -> Dict[str, Dict]:
    """Read a JSON-lines corpus into a dict keyed by each document's 'afsc' value.

    The file is decoded as UTF-8 explicitly (the platform default differs on
    Windows) and blank lines are skipped. When several documents share an
    AFSC, the last one read wins.
    """
    corpus = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            doc = json.loads(line)
            corpus[doc['afsc']] = doc
    return corpus


def enhance_with_provenance():
    """Run the full enhancement pipeline and write its outputs to OUTPUT_DIR.

    Steps: load the base extractions CSV and the source corpus, add
    LLM-extracted (explicit) and LLM-inferred knowledge per AFSC, normalize
    and de-duplicate the combined table, cap knowledge per AFSC, then write
    the enhanced CSV, a QC candidates CSV, a graph export and a statistics
    JSON file. Finishes with basic quality checks and a printed summary.

    Prompts interactively for API key(s) and calls the configured LLM
    provider. Returns nothing.
    """

    print("=" * 60)
    print("PROPOSAL-COMPLIANT K/S/A ENHANCEMENT")
    print("=" * 60)

    # Load base extractions
    print("\n1. Loading LAiSER extractions...")
    base_df = pd.read_csv(INPUT_FILE)

    # Standardize column names for compatibility
    base_df = base_df.rename(columns={
        'Raw Skill': 'text',
        'ksa_type': 'type',
        'Correlation Coefficient': 'confidence'
    })

    # Add source_method for existing items
    base_df['source_method'] = 'laiser'

    print(f"   Loaded {len(base_df)} items")

    # Load corpus
    print("\n2. Loading source corpus...")
    corpus = _load_corpus(CORPUS_FILE)
    print(f"   Loaded {len(corpus)} documents")

    # Initialize enhancer
    print("\n3. Initializing enhancement engine...")
    enhancer = ProvenanceEnhancer()

    # Process each AFSC
    all_items = []
    new_knowledge = []

    print("\n4. Extracting knowledge with provenance...")
    # Rows without an AFSC cannot be processed (and sorting NaN together with
    # strings raises TypeError), so they are left out up front.
    missing_afsc = int(base_df['afsc'].isna().sum())
    if missing_afsc:
        print(f"   Warning: skipping {missing_afsc} rows with no AFSC")

    for afsc in sorted(base_df['afsc'].dropna().unique()):
        print(f"\n   Processing {afsc}...")

        # Keep existing items
        afsc_base = base_df[base_df['afsc'] == afsc].copy()
        all_items.extend(afsc_base.to_dict('records'))

        # Extract explicit knowledge from document
        doc_knowledge = []
        if afsc in corpus:
            doc_knowledge = enhancer.extract_knowledge_from_document(afsc, corpus[afsc])
            new_knowledge.extend(doc_knowledge)
            if doc_knowledge:
                print(f"     Found {len(doc_knowledge)} explicit knowledge items")
            else:
                print("     No explicit knowledge found in document")
        else:
            print("     No corpus document found; skipping explicit knowledge extraction")

        # Infer knowledge from skills (only if needed)
        afsc_skills = afsc_base[afsc_base['type'] == 'skill'].copy()
        afsc_skills['confidence'] = pd.to_numeric(afsc_skills['confidence'], errors='coerce').fillna(0.0)

        if len(afsc_skills) > 0 and len(doc_knowledge) < 2:  # Conservative inference
            doc_meta = corpus.get(afsc, {})  # Pass document metadata
            inferred = enhancer.infer_knowledge_from_skills(afsc, afsc_skills, doc_meta)
            new_knowledge.extend(inferred)
            print(f"     Inferred {len(inferred)} knowledge items from skills")

        time.sleep(0.3)  # Rate limiting

    # Combine all items
    print("\n5. Combining and validating...")
    enhanced_df = pd.DataFrame(all_items + new_knowledge)

    # Ensure all required columns exist
    required_cols = ['text', 'type', 'confidence', 'source_method',
                     'evidence_snippet', 'doc_id', 'title', 'category', 'afsc']

    for col in required_cols:
        if col not in enhanced_df.columns:
            enhanced_df[col] = ''

    # Normalize types (lower-case; missing -> 'skill') and bound confidence
    enhanced_df['type'] = enhanced_df['type'].str.lower().fillna('skill')

    enhanced_df['confidence'] = pd.to_numeric(enhanced_df['confidence'], errors='coerce').clip(0, 1).fillna(0.5)

    # Fill other missing values appropriately
    enhanced_df['evidence_snippet'] = enhanced_df['evidence_snippet'].fillna('')
    enhanced_df['source_method'] = enhanced_df['source_method'].fillna('unknown')

    # Review status for QC workflow: LLM-produced rows start as 'pending'
    needs_review = enhanced_df['source_method'].isin(['skill_inferred', 'document_explicit'])
    enhanced_df['review_status'] = needs_review.map({True: 'pending', False: 'reviewed'})

    # Remove duplicates (type is part of the key)
    enhanced_df = enhanced_df.drop_duplicates(subset=['afsc', 'type', 'text']).reset_index(drop=True)

    # Apply defensive cap per AFSC (ensures no AFSC exceeds limit)
    enhanced_df = cap_k_per_afsc(enhanced_df, MAX_KNOWLEDGE_PER_AFSC)

    # Sort by AFSC, type, confidence
    enhanced_df = enhanced_df.sort_values(
        ['afsc', 'type', 'confidence'],
        ascending=[True, True, False]
    )

    # Save main output
    print("\n6. Saving outputs...")
    output_file = OUTPUT_DIR / "ksa_extractions_enhanced.csv"
    enhanced_df.to_csv(output_file, index=False)

    # Create QC candidates file for new knowledge (with review_status and origin)
    qc_candidates = pd.DataFrame(new_knowledge)
    if len(qc_candidates) > 0:
        qc_candidates['review_status'] = 'pending'
        qc_candidates['origin'] = 'enhancement'
    qc_file = OUTPUT_DIR / "qc_candidates.csv"
    qc_candidates.to_csv(qc_file, index=False)

    # Generate graph export
    graph_file = OUTPUT_DIR / "graph_export_enhanced.json"
    graph_data = export_graph(enhanced_df, graph_file)

    # Generate statistics
    mask_explicit = (enhanced_df['source_method'] == 'document_explicit')
    mask_inferred = (enhanced_df['source_method'] == 'skill_inferred')
    mask_laiser = (enhanced_df['source_method'] == 'laiser')

    stats = {
        'enhancement_date': datetime.now().isoformat(),
        'provider': ENHANCER_PROVIDER,
        'model': CLAUDE_MODEL if ENHANCER_PROVIDER == 'claude' else OPENAI_MODEL,
        'original_items': len(base_df),
        'enhanced_items': len(enhanced_df),
        # Counts below are taken before de-duplication and capping.
        'knowledge_added': len(new_knowledge),
        'explicit_knowledge': sum(1 for k in new_knowledge if k.get('source_method') == 'document_explicit'),
        'inferred_knowledge': sum(1 for k in new_knowledge if k.get('source_method') == 'skill_inferred'),
        'type_distribution': enhanced_df['type'].value_counts().to_dict(),
        'source_distribution': enhanced_df['source_method'].value_counts().to_dict(),
        'avg_confidence': {
            'overall': float(enhanced_df['confidence'].mean()),
            'explicit': float(enhanced_df.loc[mask_explicit, 'confidence'].mean()) if mask_explicit.any() else 0.0,
            'inferred': float(enhanced_df.loc[mask_inferred, 'confidence'].mean()) if mask_inferred.any() else 0.0,
            'laiser': float(enhanced_df.loc[mask_laiser, 'confidence'].mean()) if mask_laiser.any() else 0.0
        },
        'graph_stats': {
            'nodes': len(graph_data['nodes']),
            'edges': len(graph_data['edges'])
        }
    }

    stats_file = OUTPUT_DIR / "enhancement_stats.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    # QC checks
    print("\n7. Running quality checks...")
    problems = []

    # Check confidence bounds
    bad_conf = enhanced_df[(enhanced_df['confidence'] < 0) | (enhanced_df['confidence'] > 1)]
    if not bad_conf.empty:
        problems.append(("confidence_out_of_bounds", len(bad_conf)))

    # Check for empty text
    empty_text = enhanced_df[enhanced_df['text'].astype(str).str.strip() == ""]
    if not empty_text.empty:
        problems.append(("empty_text", len(empty_text)))

    # Check each AFSC has ≥3 items
    by_afsc = enhanced_df.groupby('afsc').size()
    low_afsc = by_afsc[by_afsc < 3]
    if not low_afsc.empty:
        problems.append(("afsc_with_lt3_items", low_afsc.to_dict()))

    if problems:
        print(f"   QC issues found: {problems}")
    else:
        print("   ✓ All quality checks passed")

    # Print summary
    print("\n" + "=" * 60)
    print("ENHANCEMENT COMPLETE")
    print("=" * 60)
    print(f"Original items: {stats['original_items']}")
    print(f"Enhanced items: {stats['enhanced_items']}")
    print(f"Knowledge added: {stats['knowledge_added']}")
    print(f"  - Explicit: {stats['explicit_knowledge']}")
    print(f"  - Inferred: {stats['inferred_knowledge']}")
    print("\nType distribution:")
    for t, count in stats['type_distribution'].items():
        print(f"  {t}: {count} ({count/len(enhanced_df)*100:.1f}%)")
    print("\nFiles saved:")
    print(f"  - {output_file.name} (main dataset)")
    print(f"  - {qc_file.name} (QC candidates)")
    print(f"  - {graph_file.name} (graph export)")
    print(f"  - {stats_file.name} (statistics)")

# ============================================================================
# GRAPH EXPORT FUNCTION
# ============================================================================

def _is_missing(value) -> bool:
    """Return True for None and for scalar NA values (NaN, pd.NA, NaT)."""
    return value is None or (pd.api.types.is_scalar(value) and pd.isna(value))


def _first_present(values):
    """Return the first value that is neither missing nor empty, else ''."""
    for value in values:
        if not _is_missing(value) and value:
            return value
    return ''


def export_graph(df: pd.DataFrame, output_path: Path) -> dict:
    """Export enhanced data as graph structure

    One ``AFSC`` node is created per distinct AFSC, one node per distinct
    (type, afsc, text) item, and one ``REQUIRES_<TYPE>`` edge per row.
    Missing cell values (NaN) are written as empty strings so the file is
    valid JSON, and rows without an AFSC are skipped because their edge would
    have no source node.

    Args:
        df: Enhanced dataframe with KSAs
        output_path: Path to save graph JSON

    Returns:
        Graph data dictionary with ``nodes``, ``edges`` and ``metadata``
    """
    nodes = []
    edges = []
    seen_nodes = set()

    # Create AFSC nodes
    for afsc in df['afsc'].unique():
        if pd.isna(afsc):
            continue
        afsc_rows = df[df['afsc'] == afsc]
        nodes.append({
            'id': str(afsc),
            'type': 'AFSC',
            'properties': {
                # The first row may lack title/category, so take the first
                # row that actually has one.
                'title': _first_present(afsc_rows.get('title', ())),
                'category': _first_present(afsc_rows.get('category', ()))
            }
        })
        seen_nodes.add(str(afsc))

    # Create KSA nodes and edges
    esco_columns = ('esco_tag', 'Skill Tag', 'ESCO', 'Esco Tag')
    for _, row in df.iterrows():
        if pd.isna(row['afsc']):
            continue

        # Create stable node ID using hash
        text_for_hash = f"{row['afsc']}_{row['text']}"
        node_id = f"{row['type']}_{hashlib.md5(text_for_hash.encode()).hexdigest()[:12]}"

        source_method = _first_present([row.get('source_method')]) or 'unknown'

        # Add node if not already seen
        if node_id not in seen_nodes:
            # Flexible ESCO column handling; NaN is truthy, so a plain
            # `or` chain would let it through.
            esco_tag = _first_present(row.get(col) for col in esco_columns)

            nodes.append({
                'id': node_id,
                'type': row['type'].upper(),
                'properties': {
                    'text': row['text'],
                    'confidence': float(row['confidence']),
                    'source_method': source_method,
                    'esco_tag': esco_tag
                }
            })
            seen_nodes.add(node_id)

        # Add edge from AFSC to KSA
        evidence = _first_present([row.get('evidence_snippet')])
        edges.append({
            'source': str(row['afsc']),
            'target': node_id,
            'relationship': f"REQUIRES_{row['type'].upper()}",
            'properties': {
                'confidence': float(row['confidence']),
                'evidence': str(evidence)[:200],
                'source_method': source_method
            }
        })

    graph_data = {
        'nodes': nodes,
        'edges': edges,
        'metadata': {
            'created': datetime.now().isoformat(),
            'total_nodes': len(nodes),
            'total_edges': len(edges),
            'afsc_count': len([n for n in nodes if n['type'] == 'AFSC']),
            'knowledge_count': len([n for n in nodes if n['type'] == 'KNOWLEDGE']),
            'skill_count': len([n for n in nodes if n['type'] == 'SKILL']),
            'ability_count': len([n for n in nodes if n['type'] == 'ABILITY'])
        }
    }

    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(graph_data, f, indent=2)

    return graph_data

# ============================================================================
# ENTRY POINT
# ============================================================================

# Run the full pipeline only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    enhance_with_provenance()

PROPOSAL-COMPLIANT K/S/A ENHANCEMENT

1. Loading LAiSER extractions...
   Loaded 73 items

2. Loading source corpus...
   Loaded 12 documents

3. Initializing enhancement engine...
API Authentication Required
----------------------------------------
✓ Claude (claude-opus-4-1-20250805) initialized


4. Extracting knowledge with provenance...

   Processing 11F3...
     Found 7 explicit knowledge items

   Processing 12B...
     Found 7 explicit knowledge items

   Processing 14F...
     Found 8 explicit knowledge items

   Processing 14N...
     Found 11 explicit knowledge items

   Processing 1A3X1...
     Found 15 explicit knowledge items

   Processing 1C3...
     Found 14 explicit knowledge items

   Processing 1N0...
     Found 10 explicit knowledge items

   Processing 1N4...
     Found 10 explicit knowledge items

   Processing 21A...
     Found 14 explicit knowledge items

   Processing 21M...
     No explicit knowledge found in document
     Inferred 3 knowledge items from skil