# Debug Field Patterns in JCC2 Data

This notebook inspects the field (column) names in the JCC2 data, section by section, to understand why app-specific effectiveness and suitability ratings aren't being detected.

In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
from jcc2_data_processor import create_processor
import pandas as pd

In [None]:
# Load data
# The CSV location can be overridden with the JCC2_DATA_FILE environment variable so the
# notebook is runnable outside the original author's machine. When the variable is not
# set, the original hard-coded path is used, so existing behaviour is unchanged.
data_file = os.environ.get(
    'JCC2_DATA_FILE',
    '/home/nathanjgaul/Downloads/DCDC UQ Combined_2025-07-30_pii_scrubbed.csv',
)
processor = create_processor(data_file)
# `df` is used below via len(df) and df.columns, and by the later cells in this notebook.
df = processor.load_data()
print(f"Data loaded: {len(df)} rows, {len(df.columns)} columns")

In [None]:
# Let's examine MOP 1.1.1 fields in detail
print("MOP 1.1.1 Fields:")
print("=" * 80)

# App identifiers searched for inside field names. Built once up front instead of
# being re-created on every iteration of the effectiveness loop below.
apps = ['a2it', 'cad', 'codex', 'crucible', 'cyber9line', 'dispatch',
        'jcc2cyberops', 'jcc2readiness', 'madss', 'rally', 'redmap',
        'sigact', 'threathub', 'triage', 'unity']

if 'mop_1_1_1' in processor.sections:
    mop_fields = processor.sections['mop_1_1_1']
    print(f"Total fields: {len(mop_fields)}\n")

    # Group fields by type. 'effective' is a substring of 'effectiveness', so one test
    # covers both spellings; 'suitable' and 'suitability' still need separate tests.
    # NOTE: these keyword tests are case-sensitive, unlike the app check below.
    effectiveness_fields = [f for f in mop_fields if 'effective' in f]
    suitability_fields = [f for f in mop_fields if 'suitability' in f or 'suitable' in f]

    print(f"Effectiveness fields: {len(effectiveness_fields)}")
    for field in sorted(effectiveness_fields):
        print(f"  - {field}")
        # Check which app this might be for. This is a plain substring match on the
        # lower-cased name, so every app whose name occurs anywhere in it is reported.
        field_lower = field.lower()
        for app in apps:
            if app in field_lower:
                print(f"    -> Contains app: {app}")

    print(f"\nSuitability fields: {len(suitability_fields)}")
    for field in sorted(suitability_fields):
        print(f"  - {field}")
else:
    # Report the missing section explicitly; previously the cell printed only the
    # header, which was indistinguishable from a section with nothing to show.
    print("Section 'mop_1_1_1' not found in processor.sections")

In [None]:
# Check all target sections for app-specific patterns
target_sections = [
    'mop_1_1_1', 'mos_1_1_2', 'mop_1_1_3', 'reporting_and_data_export'
]

apps = ['a2it', 'cad', 'codex', 'crucible', 'cyber9line', 'dispatch',
        'jcc2cyberops', 'jcc2readiness', 'madss', 'rally', 'redmap',
        'sigact', 'threathub', 'triage', 'unity']

# Substrings that mark a field as a rating field. Matching is case-sensitive
# (unlike the app / 'overall' checks below, which use the lower-cased name).
rating_keywords = ('effectiveness', 'effective', 'suitability', 'suitable')

for section in target_sections:
    print(f"\n{'='*80}")
    print(f"Section: {section}")
    print(f"{'='*80}")

    if section not in processor.sections:
        # Make a missing section visible instead of silently printing nothing
        # after the header.
        print("Section not found in processor.sections")
        continue

    section_fields = processor.sections[section]

    # Find all rating fields
    rating_fields = [f for f in section_fields if any(k in f for k in rating_keywords)]

    print(f"Rating fields found: {len(rating_fields)}")

    # Bucket each rating field: first matching app (in `apps` order), else
    # "overall", else unmatched.
    app_fields_found = {app: [] for app in apps}
    overall_fields = []
    unmatched_fields = []

    for field in rating_fields:
        field_lower = field.lower()

        # Substring match; only the first app that matches claims the field.
        matched_app = next((app for app in apps if app in field_lower), None)

        if matched_app is not None:
            app_fields_found[matched_app].append(field)
        elif 'overall' in field_lower:
            overall_fields.append(field)
        else:
            unmatched_fields.append(field)

    # Display results (listings are truncated: 5 overall, 2 per app, 5 unmatched)
    print(f"\nOverall fields: {len(overall_fields)}")
    for field in overall_fields[:5]:
        print(f"  - {field}")

    print(f"\nApp-specific fields found:")
    # Distinct loop names so the section's own field list is not shadowed.
    for app_name, app_fields in app_fields_found.items():
        if app_fields:
            print(f"  {app_name}: {len(app_fields)} fields")
            for app_field in app_fields[:2]:  # Show first 2
                print(f"    - {app_field}")

    if unmatched_fields:
        print(f"\nUnmatched fields: {len(unmatched_fields)}")
        for field in unmatched_fields[:5]:
            print(f"  - {field}")

In [None]:
# Find every DataFrame column whose name mentions 'threathub' (case-insensitive)
# and report which processor section lists it.
print("Searching for 'threathub' in all columns:")
print("=" * 80)

threathub_cols = list(filter(lambda name: 'threathub' in name.lower(), df.columns))
print(f"Found {len(threathub_cols)} columns containing 'threathub':\n")

# Only the first 20 (alphabetically) are listed
listed_cols = sorted(threathub_cols)[:20]
for column_name in listed_cols:
    print(f"  - {column_name}")
    # Lazily walk the sections and stop at the first one that lists this column
    owning_sections = (
        section_name
        for section_name, members in processor.sections.items()
        if column_name in members
    )
    for section_name in owning_sections:
        print(f"    -> In section: {section_name}")
        break

In [None]:
# Spot-check the raw values stored under a handful of hand-picked columns
print("Sample data values:")
print("=" * 80)

# Columns to probe; any that are absent from the DataFrame are reported as such
test_fields = [
    'mop_1_1_1.intelligence_data_provided_threathub',
    'mop_1_1_1.intelligence_data_overall_effectiveness',
    'reporting_and_data_export.overall_reporting_effectiveness'
]

for field in test_fields:
    if field not in df.columns:
        print(f"\nField '{field}' not found in columns")
        continue

    column = df[field]
    populated = column.notna().sum()
    print(f"\nField: {field}")
    print(f"Non-null values: {populated}")
    print("Value counts:")
    # head() keeps only the first five rows of the value_counts() result
    print(column.value_counts().head())

In [None]:
# Classify every rating column by where (if anywhere) an app name appears in it
print("Field naming pattern analysis:")
print("=" * 80)

# Rating columns: any column name containing one of these substrings (case-sensitive)
rating_keywords = ['effectiveness', 'effective', 'suitability', 'suitable']
rating_fields = [col for col in df.columns if any(keyword in col for keyword in rating_keywords)]

print(f"\nTotal rating fields: {len(rating_fields)}")

# One bucket per naming pattern; the two app buckets hold (field, app) pairs
patterns = {
    'ends_with_app': [],
    'contains_app_middle': [],
    'overall_fields': [],
    'other': []
}

apps = ['a2it', 'cad', 'codex', 'crucible', 'cyber9line', 'dispatch',
        'jcc2cyberops', 'jcc2readiness', 'madss', 'rally', 'redmap',
        'sigact', 'threathub', 'triage', 'unity']

for field in rating_fields:
    name = field.lower()

    # 'overall' takes precedence over any app match
    if 'overall' in name:
        patterns['overall_fields'].append(field)
        continue

    # First app (in list order) that the name ends with. A name ending in
    # '_<app>' necessarily ends in '<app>', so one suffix test is sufficient.
    suffix_app = next((app for app in apps if name.endswith(app)), None)
    if suffix_app is not None:
        patterns['ends_with_app'].append((field, suffix_app))
        continue

    # Otherwise, first app that occurs anywhere in the name
    inner_app = next((app for app in apps if app in name), None)
    if inner_app is not None:
        patterns['contains_app_middle'].append((field, inner_app))
        continue

    patterns['other'].append(field)

# Report each bucket's size followed by a short sample of its contents
suffix_matches = patterns['ends_with_app']
print(f"\nFields ending with app name: {len(suffix_matches)}")
for field, app in suffix_matches[:5]:
    print(f"  - {field} -> {app}")

middle_matches = patterns['contains_app_middle']
print(f"\nFields with app name in middle: {len(middle_matches)}")
for field, app in middle_matches[:5]:
    print(f"  - {field} -> {app}")

overall_matches = patterns['overall_fields']
print(f"\nOverall fields: {len(overall_matches)}")
for field in overall_matches[:5]:
    print(f"  - {field}")

leftover = patterns['other']
print(f"\nOther fields: {len(leftover)}")
for field in leftover[:10]:
    print(f"  - {field}")