# LLM K/S/A – QC Checks (Shareable Notebook)

This notebook contains quick quality-control checks for the enhanced K/S/A dataset and the exported graph.

**What it does**
- Samples knowledge items for spot-checking format/length.
- Flags malformed/overlong items and writes a cleaned copy of the dataset (`ksa_extractions_enhanced_cleaned.csv`).
- Summarizes node/edge counts from `graph_export_enhanced.json`.
- Surfaces potential evidence issues for manual review by exporting a small sample of knowledge items to `evidence_audit.csv`.

**How to run**
1. Install: `pip install pandas`
2. Update the file paths in code cells to point to your local repo (they currently reference Windows absolute paths).
3. Run cells top-to-bottom.

> Note: This copy was sanitized for sharing: outputs cleared, execution counts removed.

In [None]:
import pandas as pd, re
# Location of the enhanced K/S/A extraction CSV (edit to match your checkout).
p = r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction\ksa_enhanced\ksa_extractions_enhanced.csv"
df = pd.read_csv(p)

# Every check below looks at knowledge rows only; .copy() detaches k from df.
k = df[df["type"]=="knowledge"].copy()

# Character count per item, computed once. Missing text yields NaN, which
# fails both comparisons below and therefore is never flagged.
text_len = k["text"].str.len()

# 1) Fragment/too-short or too-long (after the stricter prompt these should be few)
fragments = k[text_len.between(1, 6, inclusive="both")]
too_long = k[text_len > 60]  # long often means "phrase + rationale"
print("Fragments:", len(fragments), "Too long:", len(too_long))

# 2) Leading verbs (bad for knowledge)
LEADING_VERBS = r'^(apply|perform|conduct|operate|manage|lead|coordinate|supervise|implement|analyze|assess|develop)\b'
leading_verbs = k[k["text"].str.lower().str.match(LEADING_VERBS, na=False)]
print("Leading-verb items:", len(leading_verbs))

# 3) Generic catch-alls (tune list if you see false positives)
GENERIC = [
    r'\bpolicies and procedures\b', r'\bregulations\b', r'\bstandards\b',
    r'\bsafety practices\b', r'\brequirements\b', r'\bgeneral knowledge\b'
]
# The alternatives are OR-ed into a single pattern and matched by pandas.
# na=False treats rows with missing text as "not generic"; calling re.search
# on such a row directly would raise TypeError because the value is a float NaN.
generic_mask = k["text"].str.lower().str.contains("|".join(GENERIC), regex=True, na=False)
generics = k[generic_mask]
print("Generic items:", len(generics))

# 4) Duplicates after normalization (lower-case, collapsed whitespace)
norm = k["text"].str.lower().str.replace(r'\s+', ' ', regex=True).str.strip()
# duplicated() considers NaN equal to NaN, so rows without text are masked out
# explicitly; otherwise they would be reported as duplicates of one another.
dups = k[norm.notna() & norm.duplicated(keep=False)].sort_values(["afsc","text"])
print("Duplicate knowledge items:", dups.shape[0])


In [None]:
import pandas as pd

# Read the enhanced K/S/A extraction table
df = pd.read_csv(r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction\ksa_enhanced\ksa_extractions_enhanced.csv")

# Catch-all phrases that suggest a knowledge item is too vague to be useful
generic_terms = ['policies and procedures', 'applicable regulations', 'safety practices', 'standard procedures']

# Row mask shared by every AFSC below
is_knowledge = df['type'] == 'knowledge'

# Report, AFSC by AFSC, how many knowledge items exist and which look generic
print("Knowledge items per AFSC:")
for afsc in df['afsc'].unique():
    afsc_k = df[(df['afsc'] == afsc) & is_knowledge]
    print(f"\n{afsc}: {len(afsc_k)} knowledge items")

    for term in generic_terms:
        # Case-insensitive search; rows with missing text never match
        generic_items = afsc_k[afsc_k['text'].str.contains(term, case=False, na=False)]
        if generic_items.empty:
            continue
        print(f"  ⚠️ Found {len(generic_items)} items with '{term}'")
        # Show at most two examples, truncated to 50 characters
        for snippet in generic_items['text'].head(2):
            print(f"    - {snippet[:50]}...")

In [None]:
# Sample for evidence audit: pick a few AFSCs, take a handful of knowledge
# items from each, and export them with blank columns for a human reviewer.
# Relies on `df` having been loaded by an earlier cell.
sample_afscs = {
    'Operations': ['11F3', '12B'],      # Pick one from each
    'Intelligence': ['14N'],
    'Maintenance': ['21A'],
    'Inferred': ['21M']                 # The one with inferred items
}

# Columns handed to the reviewer, in spreadsheet order
AUDIT_COLUMNS = ['afsc', 'text', 'evidence_snippet', 'confidence', 'source_method']

audit_items = []
# The category labels only document the sampling plan; just the AFSC codes are used.
for afscs in sample_afscs.values():
    for afsc in afscs:
        afsc_k = df[(df['afsc'] == afsc) & (df['type'] == 'knowledge')]
        if afsc_k.empty:
            # Make a missing or misspelled AFSC visible instead of silently sampling nothing
            print(f"⚠️ No knowledge items found for AFSC {afsc}; nothing sampled")
            continue

        # Get 2 explicit + 1 inferred (if available)
        explicit = afsc_k[afsc_k['source_method'] == 'document_explicit'].head(2)
        inferred = afsc_k[afsc_k['source_method'] == 'skill_inferred'].head(1)

        # Only non-empty pieces are kept so pd.concat never receives empty frames
        audit_items.extend(part for part in (explicit, inferred) if not part.empty)

# Create audit spreadsheet
if audit_items:
    # .copy() gives an independent frame, so adding the review columns below
    # is not an assignment on a slice of the concatenated data
    audit_df = pd.concat(audit_items)[AUDIT_COLUMNS].copy()
else:
    # Nothing matched: still write a header-only file rather than crash in pd.concat
    audit_df = pd.DataFrame(columns=AUDIT_COLUMNS)
audit_df['is_theoretical'] = ''  # Add column for your review
audit_df['evidence_matches'] = ''  # Add column for your review
audit_df.to_csv('evidence_audit.csv', index=False)

print(f"Created evidence_audit.csv with {len(audit_df)} items to review")

In [None]:
import json
from pathlib import Path

from collections import Counter

# Location of the exported graph JSON
graph_file = Path(r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction\ksa_enhanced\graph_export_enhanced.json")

# Parse the export
with graph_file.open('r', encoding='utf-8') as fh:
    graph = json.load(fh)

# Split the node list by its 'type' label, keeping file order within each group
afsc_nodes, k_nodes, s_nodes, a_nodes = (
    [node for node in graph['nodes'] if node['type'] == label]
    for label in ('AFSC', 'KNOWLEDGE', 'SKILL', 'ABILITY')
)

print(f"AFSC nodes: {len(afsc_nodes)}")  # Should be 12
print(f"Knowledge nodes: {len(k_nodes)}")  # Should be ~161
print(f"Skill nodes: {len(s_nodes)}")      # Should be ~65
print(f"Ability nodes: {len(a_nodes)}")    # Should be ~8

# Tally edges by their 'source' endpoint and list the busiest sources first
edge_counts = Counter()
for edge in graph['edges']:
    edge_counts[edge['source']] += 1
print("\nEdges per AFSC:")
for afsc, count in edge_counts.most_common():
    print(f"  {afsc}: {count} edges")

# Knowledge items ranked from highest to lowest confidence
k_with_conf = sorted(
    ((node['properties']['text'], node['properties']['confidence']) for node in k_nodes),
    key=lambda pair: pair[1],
    reverse=True,
)
print("\nTop 5 knowledge items by confidence:")
for text, conf in k_with_conf[:5]:
    print(f"  {conf:.3f}: {text[:50]}...")

# Anything under the 0.82 cut-off should not be present in the export
low_conf = [node for node in k_nodes if node['properties']['confidence'] < 0.82]
if not low_conf:
    print("\n✓ All knowledge items meet confidence threshold")
else:
    print(f"\n⚠️ Found {len(low_conf)} knowledge items below 0.82 confidence")

In [None]:
import pandas as pd
import re

df = pd.read_csv(r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction\ksa_enhanced\ksa_extractions_enhanced.csv")

# Identify problematic patterns: LLM meta-commentary or a numbered-markdown
# prefix at the start of the text, or text that is suspiciously long.
# NOTE(review): df is not filtered by type here, so every row is checked,
# not only knowledge rows.
#
# The two conditions are built separately because `|` binds tighter than `>`:
# written inline as `contains(...) | len() > 100`, Python evaluates
# `(contains(...) | len()) > 100`, which throws the prefix check away.
# The group is non-capturing so pandas does not warn about unused match groups.
has_meta_prefix = df['text'].str.contains(
    r'^(?:Based on|Quote:|\d+\.\s*\*\*|Here are)', case=False, na=False
)
# Knowledge items shouldn't be this long; missing text (NaN length) compares False
is_too_long = df['text'].str.len() > 100
problems = df[has_meta_prefix | is_too_long]

print(f"Found {len(problems)} malformed knowledge items")
for _, row in problems.head(10).iterrows():
    print(f"  {row['afsc']}: {row['text'][:60]}...")

# Clean them
def clean_knowledge_text(text):
    """Strip LLM formatting artifacts from a single extracted item.

    Removes, in order: one leading meta-commentary or list-number prefix
    ("Based on ...:", "Quote:", "Here are ...:", "1. "), markdown bold
    markers, a leading double-quoted wrapper, and a trailing
    "- Exact quote: ..." reference. If the remaining text contains
    "Knowledge is mandatory of", only the phrase that follows it (up to the
    next period) is returned.

    Args:
        text: The raw item text. Non-string values (for example the float NaN
            pandas uses for a missing cell) are returned unchanged.

    Returns:
        The cleaned, whitespace-stripped string, or the input itself when it
        is not a string.
    """
    # A missing cell is not a str; re.sub would raise TypeError on it
    if not isinstance(text, str):
        return text

    # Remove LLM meta-commentary. Case-insensitive so it strips the same
    # prefixes that the case-insensitive detection step flags.
    text = re.sub(
        r'^(?:Based on.*?:|Quote:\s*|Here are.*?:|\d+\.\s*)', '', text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # Remove markdown bold
    text = re.sub(r'^"([^"]+)"', r'\1', text)  # Remove quotes
    text = re.sub(r'\s*-\s*Exact quote:.*', '', text)  # Remove quote references

    # Extract just the knowledge phrase if embedded
    if 'Knowledge is mandatory of' in text:
        match = re.search(r'Knowledge is mandatory of[:\s]+([^.]+)', text)
        if match:
            return match.group(1).strip()

    return text.strip()

# Run the cleaner over every row's text
df['text'] = df['text'].map(clean_knowledge_text)

# Write the cleaned table alongside the original file
cleaned_path = r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction\ksa_enhanced\ksa_extractions_enhanced_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print(f"Cleaned and saved {len(df)} items")

In [None]:
import pandas as pd

# Read the latest extraction output
df = pd.read_csv(r"C:\Users\Kyle\OneDrive\Desktop\Capstone\fall-2025-group6\src\Data\Manual Extraction\ksa_enhanced\ksa_extractions_enhanced.csv")

# Keep only knowledge rows and show the first ten for a visual spot-check
knowledge = df[df['type'] == 'knowledge']
print("Sample knowledge items:")
for afsc, text in zip(knowledge['afsc'].head(10), knowledge['text'].head(10)):
    print(f"  {afsc}: {text}")

# Character count per item, computed once and reused below
text_lengths = knowledge['text'].str.len()
print(f"\nText length stats:")
print(f"  Mean: {text_lengths.mean():.1f} chars")
print(f"  Max: {text_lengths.max()} chars")

# List a few items that are still longer than 50 characters, if any remain
long_items = knowledge[text_lengths > 50]
if not long_items.empty:
    print(f"\nItems over 50 chars: {len(long_items)}")
    for text in long_items['text'].head(3):
        print(f"  {text[:60]}...")