In [2]:
#!/usr/bin/env python3
"""
Military Skills Taxonomy Creator

This script processes ESCO skills and O*NET Military Crosswalk data to create
a merged dataset for LLM-based military skills taxonomy creation.

It performs the following steps:
1. Process O*NET Military Crosswalk to extract Air Force Specialty Codes (AFSCs)
2. Process ESCO skills data to extract structured skill information
3. Create mappings between AFSCs and relevant skills
4. Generate a structured output for LLM taxonomy creation
"""

import os
import csv
import json
import pandas as pd
from typing import Dict, List, Any
import logging
from datetime import datetime

# Configure logging.
# Records at INFO level and above are sent to two handlers: a file named
# "taxonomy_creator.log" (a relative path, so it is resolved against the
# current working directory) and a stream handler for console output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("taxonomy_creator.log"),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)

# File paths
# NOTE: both directories are hard-coded absolute Windows paths; edit them
# before running on another machine.
INPUT_DIR = r"C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild"  # Adjust if needed
OUTPUT_DIR = r"C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild"
# Executed when this code is loaded: creates the output directory (and any
# missing parents) and does nothing if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Input files with updated filenames
# All inputs are CSV files expected directly inside INPUT_DIR.
MILITARY_CROSSWALK_FILE = os.path.join(INPUT_DIR, "milx0724.csv")
ESCO_ONET_CROSSWALK_FILE = os.path.join(INPUT_DIR, "Original Crosswalk ESCO and ONet.csv")
ESCO_SKILLS_FILE = os.path.join(INPUT_DIR, "skills_en.csv")
ESCO_HIERARCHY_FILE = os.path.join(INPUT_DIR, "skillsHierarchy_en.csv")
ESCO_OCCUPATIONS_SKILLS_FILE = os.path.join(INPUT_DIR, "occupationSkillRelations_en.csv")

# Output files
# JSON artifacts written by main(): the two intermediate dumps first, then
# the merged taxonomy.
PROCESSED_AFSC_FILE = os.path.join(OUTPUT_DIR, "processed_afsc_data.json")
PROCESSED_SKILLS_FILE = os.path.join(OUTPUT_DIR, "processed_skills_data.json")
MERGED_TAXONOMY_FILE = os.path.join(OUTPUT_DIR, "military_skills_taxonomy.json")

def process_military_crosswalk() -> List[Dict]:
    """
    Process the O*NET Military Crosswalk file to extract Air Force specialties.

    Reads ``MILITARY_CROSSWALK_FILE``, keeps the rows whose ``SVC`` column is
    ``'F'`` and whose ``STATUS`` column is ``'A'``, and turns each of them into
    a dictionary with these keys:

    * ``afscCode`` / ``afscTitle`` - the ``MOC`` and ``MOC_TITLE`` columns
    * ``category`` - ``'Enlisted'`` when ``MPC`` is ``'E'``, ``'Officer'`` when
      it is ``'O'``, ``'Warrant'`` for any other value
    * ``description`` - the title, the DoD occupation title (unless it is
      ``'-'``) and each distinct related civilian title, joined with ``". "``
    * ``onetCodes`` / ``onetTitles`` / ``socCodes`` / ``socTitles`` - the
      non-blank ``ONET1..4`` / ``SOC1..4`` mappings, whitespace-stripped
    * ``keywords`` - sorted, lower-cased words longer than three characters
      taken from the titles, with edge punctuation removed
    * ``source`` - the constant ``'O*NET Military Crosswalk'``

    Returns:
        List of dictionaries containing processed AFSC data

    Raises:
        Exception: any error raised while reading or transforming the file is
            logged and then re-raised unchanged.
    """
    logger.info(f"Processing Military Crosswalk from: {MILITARY_CROSSWALK_FILE}")

    # Words never accepted as keywords (the three-letter entries are also
    # excluded by the length filter; they are kept for documentation value).
    stop_words = frozenset(['and', 'the', 'for', 'with', 'all', 'other'])
    # Trimmed from both ends of a token so "Managers," and "Managers" yield
    # the same keyword.
    edge_punctuation = '.,;:!?()[]{}"\'/&-'
    category_by_mpc = {'E': 'Enlisted', 'O': 'Officer'}

    afscs = []

    try:
        # utf-8-sig decodes plain UTF-8 unchanged but also drops a leading
        # BOM, which would otherwise corrupt the first header name.
        # newline='' is what the csv module asks for so that quoted fields
        # containing line breaks are parsed correctly.
        with open(MILITARY_CROSSWALK_FILE, 'r', encoding='utf-8-sig', newline='') as file:
            rows = list(csv.DictReader(file))

        # Count records by service
        service_counts = {}
        for row in rows:
            svc = row.get('SVC', '')
            service_counts[svc] = service_counts.get(svc, 0) + 1

        logger.info(f"Service counts in crosswalk: {service_counts}")

        # Filter for Air Force entries (SVC = 'F')
        air_force_rows = [row for row in rows if row.get('SVC') == 'F']
        logger.info(f"Found {len(air_force_rows)} Air Force entries")

        # Transform to structured format
        for row in air_force_rows:
            # Skip inactive records
            if row.get('STATUS') != 'A':
                continue

            # Extract O*NET and SOC codes and titles (up to 4 mappings each).
            # Codes are stored stripped so they compare equal to the stripped
            # codes produced when the ESCO-O*NET crosswalk is loaded.
            onet_codes = []
            onet_titles = []
            soc_codes = []
            soc_titles = []

            for i in range(1, 5):
                onet_code = (row.get(f'ONET{i}') or '').strip()
                if onet_code:
                    onet_codes.append(onet_code)
                    onet_titles.append(row.get(f'ONET{i}_TITLE') or '')

                soc_code = (row.get(f'SOC{i}') or '').strip()
                if soc_code:
                    soc_codes.append(soc_code)
                    soc_titles.append(row.get(f'SOC{i}_TITLE') or '')

            # Create a combined description
            moc_title = row.get('MOC_TITLE') or ''
            dod_title = row.get('DODOCC_TITLE')

            description_elements = []
            if moc_title:
                description_elements.append(moc_title)
            if dod_title and dod_title != '-':
                description_elements.append(f"DoD Occupation: {dod_title}")

            # Track raw titles already mentioned. Testing membership against
            # description_elements alone cannot catch a repeated civilian
            # title, because those are stored with a prefix.
            mentioned = set(description_elements)
            for title in onet_titles:
                if title and title not in mentioned:
                    mentioned.add(title)
                    description_elements.append(f"Related to civilian role: {title}")

            description = ". ".join(description_elements)

            # Extract keywords from titles
            keywords = set()
            for title in [moc_title] + onet_titles + soc_titles:
                if not title:
                    continue
                for token in title.split():
                    word = token.strip(edge_punctuation).lower()
                    if len(word) > 3 and word not in stop_words:
                        keywords.add(word)

            # Create structured AFSC entry
            afscs.append({
                'afscCode': row.get('MOC', ''),
                'afscTitle': row.get('MOC_TITLE', ''),
                'category': category_by_mpc.get(row.get('MPC'), 'Warrant'),
                'description': description,
                'onetCodes': onet_codes,
                'onetTitles': onet_titles,
                'socCodes': soc_codes,
                'socTitles': soc_titles,
                # Sorted so repeated runs produce identical output files.
                'keywords': sorted(keywords),
                'source': 'O*NET Military Crosswalk'
            })

        logger.info(f"Processed {len(afscs)} active Air Force specialties")
        return afscs

    except Exception as e:
        logger.error(f"Error processing Military Crosswalk: {str(e)}")
        raise

def process_esco_onet_crosswalk() -> Dict[str, Any]:
    """
    Process the ESCO to O*NET crosswalk file.

    Loads ``ESCO_ONET_CROSSWALK_FILE`` with pandas, works out which column
    holds the ESCO identifier and which holds the O*NET identifier, and
    builds de-duplicated lookups in both directions.

    Column detection tries, in order:

    1. the first column whose lower-cased name contains both ``esco`` /
       ``onet`` and ``code``;
    2. the first column whose lower-cased name contains ``esco`` / ``onet``;
    3. the same two rules applied to the names with every non-alphanumeric
       character removed, so that headers spelled ``O*NET ...`` or
       ``O-NET ...`` are recognised. This step only runs when steps 1-2 find
       nothing, so files that were already recognised behave as before.

    Returns:
        Dictionary mapping between ESCO and O*NET concepts, with keys
        ``'esco_to_onet'`` and ``'onet_to_esco'``. Each maps an identifier
        string to a list of identifier strings. Both are empty when the
        columns cannot be identified or when any error occurs (the error is
        logged, not raised).
    """
    logger.info(f"Processing ESCO to O*NET crosswalk from: {ESCO_ONET_CROSSWALK_FILE}")

    def pick_columns(names, normalize):
        """Return ``(esco_column, onet_column)`` or ``(None, None)``."""
        keyed = [(name, normalize(name)) for name in names]
        esco = next((n for n, k in keyed if 'esco' in k and 'code' in k), None)
        onet = next((n for n, k in keyed if 'onet' in k and 'code' in k), None)
        if esco and onet:
            return esco, onet

        # No explicit "code" columns on both sides: fall back to the first
        # column mentioning each vocabulary.
        esco_any = [n for n, k in keyed if 'esco' in k]
        onet_any = [n for n, k in keyed if 'onet' in k]
        if esco_any and onet_any:
            return esco_any[0], onet_any[0]
        return None, None

    def alnum_lower(name):
        """Lower-case *name* and drop everything that is not a letter or digit."""
        return ''.join(ch for ch in name.lower() if ch.isalnum())

    try:
        # Initialize mapping dictionaries
        esco_to_onet = {}
        onet_to_esco = {}

        # Read the crosswalk file
        df = pd.read_csv(ESCO_ONET_CROSSWALK_FILE, encoding='utf-8')

        # Log column names to understand the structure
        logger.info(f"ESCO-O*NET crosswalk columns: {', '.join(df.columns)}")

        column_names = list(df.columns)
        esco_code_col, onet_code_col = pick_columns(column_names, str.lower)
        if esco_code_col is None:
            # "o*net" does not contain the substring "onet"; retry on
            # punctuation-free names before giving up.
            esco_code_col, onet_code_col = pick_columns(column_names, alnum_lower)

        if esco_code_col is None:
            logger.warning("Could not identify ESCO and O*NET columns in the crosswalk file")
            return {'esco_to_onet': {}, 'onet_to_esco': {}}

        # Create mappings. The two columns are iterated directly instead of
        # via iterrows(), which builds a Series per row and does not preserve
        # per-column dtypes.
        seen_pairs = set()
        for raw_esco, raw_onet in zip(df[esco_code_col], df[onet_code_col]):
            esco_code = str(raw_esco).strip()
            onet_code = str(raw_onet).strip()

            # Missing cells stringify to 'nan'; skip those and blank values.
            if not esco_code or not onet_code or esco_code == 'nan' or onet_code == 'nan':
                continue

            # One membership test covers both directions: a pair is present
            # in esco_to_onet exactly when it is present in onet_to_esco.
            pair = (esco_code, onet_code)
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)

            esco_to_onet.setdefault(esco_code, []).append(onet_code)
            onet_to_esco.setdefault(onet_code, []).append(esco_code)

        logger.info(f"Created mappings for {len(esco_to_onet)} ESCO codes to O*NET")
        logger.info(f"Created mappings for {len(onet_to_esco)} O*NET codes to ESCO")

        return {
            'esco_to_onet': esco_to_onet,
            'onet_to_esco': onet_to_esco
        }

    except Exception as e:
        # Best effort by design: the caller receives empty mappings and the
        # failure is recorded in the log.
        logger.error(f"Error processing ESCO-O*NET crosswalk: {str(e)}")
        return {'esco_to_onet': {}, 'onet_to_esco': {}}

def _read_esco_skills(path: str):
    """Parse the ESCO skills CSV at *path* into ``(skills_dict, skills_by_type)``.

    ``skills_dict`` maps each ``conceptUri`` to a skill entry dictionary;
    ``skills_by_type`` maps each ``skillType`` value to the list of URIs
    carrying it. Rows without a ``conceptUri`` are ignored.
    """
    stop_words = frozenset([
        'and', 'the', 'for', 'with', 'this', 'that', 'which', 'what',
        'where', 'when', 'who', 'how', 'such', 'from', 'their',
    ])
    # Trimmed from both ends of a token so "systems." and "systems" yield the
    # same keyword.
    edge_punctuation = '.,;:!?()[]{}"\'/&-'

    skills_dict = {}
    skills_by_type = {}

    with open(path, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.DictReader(file):
            skill_uri = row.get('conceptUri', '')
            if not skill_uri:
                continue

            # DictReader fills missing trailing cells with None; normalise
            # those to '' so the string operations below cannot fail.
            preferred_label = row.get('preferredLabel') or ''
            description = row.get('description') or ''
            skill_type = row.get('skillType') or ''
            reuse_level = row.get('reuseLevel') or ''

            # Verb = first word of the label, or of the description when the
            # label is empty.
            leading_words = (preferred_label or description).split()
            verb = leading_words[0].lower() if leading_words else ''

            # Alternate labels are stored one per line inside a single cell.
            alt_labels = [
                label.strip()
                for label in (row.get('altLabels') or '').split('\n')
                if label.strip()
            ]

            # Simple keyword extraction: description words longer than three
            # characters that are not stop words (order and repeats kept).
            keywords = []
            for token in description.lower().split():
                word = token.strip(edge_punctuation)
                if len(word) > 3 and word not in stop_words:
                    keywords.append(word)

            skills_dict[skill_uri] = {
                'skillUri': skill_uri,
                'skillName': preferred_label,
                'skillType': skill_type,
                'reuseLevel': reuse_level,
                'description': description,
                'altLabels': alt_labels,
                'verb': verb,
                'keywords': keywords
            }
            skills_by_type.setdefault(skill_type, []).append(skill_uri)

    return skills_dict, skills_by_type


def _read_esco_hierarchy(path: str):
    """Parse the ESCO hierarchy CSV at *path* into ``(hierarchy, skill_categories)``.

    ``hierarchy`` is a nested dictionary keyed by Level 0 URI; levels 0-2
    carry a ``children`` dictionary and level 3 nodes carry an empty
    ``skills`` list. ``skill_categories`` maps each Level 3 URI to the four
    ``{'uri', 'term'}`` pairs of its row plus the row's description.
    """
    hierarchy = {}
    skill_categories = {}

    with open(path, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.DictReader(file):
            levels = [
                (row.get(f'Level {depth} URI', ''),
                 row.get(f'Level {depth} preferred term', ''))
                for depth in range(4)
            ]
            description = row.get('Description', '')

            # Walk down the populated levels, creating nodes on first sight.
            siblings = hierarchy
            for depth, (uri, term) in enumerate(levels):
                if not uri:
                    break
                node = siblings.get(uri)
                if node is None:
                    if not term:
                        # The node cannot be registered without a term, so
                        # nothing can be attached beneath it. Stop here
                        # instead of raising KeyError on the missing parent.
                        break
                    node = {'uri': uri, 'term': term, 'description': description}
                    if depth < 3:
                        node['children'] = {}
                    else:
                        node['skills'] = []
                    siblings[uri] = node
                if depth == 3:
                    break
                siblings = node['children']

            # Map level 3 categories to their full ancestry
            level3_uri, level3_term = levels[3]
            if level3_uri and level3_term:
                category = {
                    f'level{depth}': {'uri': uri, 'term': term}
                    for depth, (uri, term) in enumerate(levels)
                }
                category['description'] = description
                skill_categories[level3_uri] = category

    return hierarchy, skill_categories


def _count_hierarchy_levels(hierarchy: Dict[str, Any]) -> Dict[str, int]:
    """Count the nodes present at each of the four hierarchy levels."""
    counts = {'level0': len(hierarchy), 'level1': 0, 'level2': 0, 'level3': 0}
    for level0 in hierarchy.values():
        counts['level1'] += len(level0['children'])
        for level1 in level0['children'].values():
            counts['level2'] += len(level1['children'])
            for level2 in level1['children'].values():
                counts['level3'] += len(level2['children'])
    return counts


def _read_esco_occupation_relations(path: str):
    """Parse the occupation-skill relations CSV at *path*.

    Returns ``(skill_to_occupations, occupation_to_skills)``; each maps a URI
    to a list of ``{<other side URI>, 'relationType'}`` dictionaries. Rows
    missing either URI are ignored.
    """
    skill_to_occupations = {}
    occupation_to_skills = {}

    with open(path, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.DictReader(file):
            occ_uri = row.get('occupationUri', '')
            skill_uri = row.get('skillUri', '')
            relation_type = row.get('relationType', '')  # essential or optional

            if not occ_uri or not skill_uri:
                continue

            skill_to_occupations.setdefault(skill_uri, []).append({
                'occupationUri': occ_uri,
                'relationType': relation_type
            })
            occupation_to_skills.setdefault(occ_uri, []).append({
                'skillUri': skill_uri,
                'relationType': relation_type
            })

    return skill_to_occupations, occupation_to_skills


def process_esco_skills() -> Dict[str, Any]:
    """
    Process ESCO skills and hierarchy data.

    Reads three CSV files - ``ESCO_SKILLS_FILE``, ``ESCO_HIERARCHY_FILE`` and
    ``ESCO_OCCUPATIONS_SKILLS_FILE`` - and combines the parsed results. All
    three are opened as ``utf-8-sig`` so a byte-order mark, if present, does
    not corrupt the first header name (``Level 0 URI`` and ``occupationUri``
    are looked up by name).

    Returns:
        Dictionary containing processed skills data and hierarchy, with keys
        ``skills``, ``hierarchy``, ``skillCategories``, ``skillsByType``,
        ``skillToOccupations`` and ``occupationToSkills``.

    Raises:
        Exception: any error raised while reading or parsing is logged and
            then re-raised unchanged.
    """
    logger.info(f"Processing ESCO skills from: {ESCO_SKILLS_FILE}")
    logger.info(f"Processing ESCO hierarchy from: {ESCO_HIERARCHY_FILE}")

    try:
        # Process skills
        skills_dict, skills_by_type = _read_esco_skills(ESCO_SKILLS_FILE)
        logger.info(f"Processed {len(skills_dict)} ESCO skills")
        logger.info(f"Skill types: {', '.join(skills_by_type.keys())}")

        # Process hierarchy
        hierarchy, skill_categories = _read_esco_hierarchy(ESCO_HIERARCHY_FILE)
        level_counts = _count_hierarchy_levels(hierarchy)
        logger.info(f"Processed hierarchy with counts: {level_counts}")

        # Process occupation-skill relations
        skill_to_occupations, occupation_to_skills = _read_esco_occupation_relations(
            ESCO_OCCUPATIONS_SKILLS_FILE
        )
        logger.info(f"Processed {len(skill_to_occupations)} skill-to-occupation mappings")
        logger.info(f"Processed {len(occupation_to_skills)} occupation-to-skill mappings")

        return {
            'skills': skills_dict,
            'hierarchy': hierarchy,
            'skillCategories': skill_categories,
            'skillsByType': skills_by_type,
            'skillToOccupations': skill_to_occupations,
            'occupationToSkills': occupation_to_skills
        }

    except Exception as e:
        logger.error(f"Error processing ESCO skills: {str(e)}")
        raise

def create_afsc_skill_mapping(afscs: List[Dict], esco_data: Dict, crosswalk_data: Dict) -> List[Dict]:
    """
    Create mappings between AFSCs and ESCO skills.

    Every skill in ``esco_data['skills']`` is scored against every AFSC:

    * +2 for each keyword the AFSC and the skill have in common
    * +3 when the skill's verb occurs in the AFSC title or description
    * +1 for each AFSC keyword occurring in the skill description
    * +1 for each skill keyword occurring in the AFSC description
    * +5 for each of the AFSC's O*NET codes that the crosswalk links to the
      skill. A code links to a skill when the skill's URI is listed for that
      code in ``crosswalk_data['onet_to_esco']``, or when one of the ESCO
      identifiers listed there is a key of ``esco_data['occupationToSkills']``
      whose relations include the skill.

    Skills scoring at least 3 become candidates; the 50 best (ties keep the
    iteration order of ``esco_data['skills']``) are attached to the AFSC.

    Args:
        afscs: List of processed AFSC data
        esco_data: Dictionary containing processed ESCO data
        crosswalk_data: Dictionary containing ESCO-O*NET crosswalk data

    Returns:
        List of AFSCs with mapped skills. Each element is a copy of the AFSC
        dictionary extended with ``matchedSkills``, ``skillsByVerb`` and
        ``totalMatches``.
    """
    logger.info("Creating AFSC to ESCO skill mappings")

    skills_dict = esco_data['skills']
    occupation_to_skills = esco_data.get('occupationToSkills', {})
    onet_to_esco = crosswalk_data.get('onet_to_esco', {})

    # Per-skill features do not depend on the AFSC, so derive them once
    # instead of once per AFSC x skill pair.
    skill_features = [
        (
            skill_uri,
            skill,
            {kw.lower() for kw in skill.get('keywords', [])},
            (skill.get('description') or '').lower(),
            (skill.get('verb') or '').lower(),
        )
        for skill_uri, skill in skills_dict.items()
    ]

    # O*NET code -> set of skill URIs the crosswalk links it to.
    linked_skills_by_onet = {}

    def linked_skills(onet_code):
        """Return the skill URIs reachable from *onet_code* via the crosswalk."""
        if onet_code not in linked_skills_by_onet:
            esco_ids = onet_to_esco.get(onet_code, [])
            # Direct hit: the crosswalk lists the skill URI itself.
            reachable = set(esco_ids)
            # Indirect hit: the crosswalk lists an occupation identifier,
            # and that occupation is related to the skill.
            # NOTE(review): assumes the crosswalk's ESCO side uses the same
            # identifiers as the keys of occupationToSkills; when it does
            # not, this loop simply finds nothing.
            for esco_id in esco_ids:
                for relation in occupation_to_skills.get(esco_id, []):
                    related_uri = relation.get('skillUri')
                    if related_uri:
                        reachable.add(related_uri)
            linked_skills_by_onet[onet_code] = reachable
        return linked_skills_by_onet[onet_code]

    # Process each AFSC and find matching skills
    afsc_skill_mappings = []

    for afsc in afscs:
        logger.info(f"Processing AFSC {afsc['afscCode']}: {afsc['afscTitle']}")

        # AFSC-side features, hoisted out of the per-skill loop.
        afsc_keywords = {kw.lower() for kw in afsc.get('keywords', [])}
        afsc_title = (afsc.get('afscTitle') or '').lower()
        afsc_desc = (afsc.get('description') or '').lower()
        # One entry per listed O*NET code, so each code contributes its own +5.
        crosswalk_sets = [linked_skills(code) for code in afsc.get('onetCodes', [])]

        candidate_skills = []
        for skill_uri, skill, skill_keywords, skill_desc, verb in skill_features:
            score = 2 * len(afsc_keywords & skill_keywords)

            if verb and (verb in afsc_title or verb in afsc_desc):
                score += 3

            score += sum(1 for kw in afsc_keywords if kw in skill_desc)
            score += sum(1 for kw in skill_keywords if kw in afsc_desc)

            own_uri = skill.get('skillUri')
            score += 5 * sum(1 for reachable in crosswalk_sets if own_uri in reachable)

            if score >= 3:  # Threshold for considering a match
                candidate_skills.append({
                    'skillUri': skill_uri,
                    'skillName': skill['skillName'],
                    'description': skill['description'],
                    'matchScore': score,
                    'matchMethod': 'keyword',
                    'verb': skill['verb'],
                    'skillType': skill['skillType']
                })

        # Best first; list.sort is stable, so equal scores keep input order.
        candidate_skills.sort(key=lambda x: x['matchScore'], reverse=True)

        # Limit to top matches
        top_candidates = candidate_skills[:50]

        # Group skills by verb for better organization
        skills_by_verb = {}
        for skill in top_candidates:
            verb = skill.get('verb', '') or 'other'
            skills_by_verb.setdefault(verb, []).append(skill)

        # Create the AFSC entry with mapped skills
        afsc_skill_mappings.append({
            **afsc,  # Include all original AFSC data
            'matchedSkills': top_candidates,
            'skillsByVerb': [
                {'verb': verb, 'skills': skills}
                for verb, skills in skills_by_verb.items()
            ],
            'totalMatches': len(top_candidates)
        })

    logger.info(f"Created skill mappings for {len(afsc_skill_mappings)} AFSCs")
    return afsc_skill_mappings

def create_taxonomy_data(afsc_skill_mappings: List[Dict], esco_data: Dict) -> Dict:
    """
    Assemble the final taxonomy document.

    AFSCs are grouped by their ``category`` value (groups appear in order of
    first occurrence) and sorted by ``afscCode`` inside each group. Top-level
    skill categories are the entries of ``esco_data['hierarchy']`` whose URI
    starts with ``http://data.europa.eu/esco/skill/``.

    Args:
        afsc_skill_mappings: List of AFSCs with mapped skills
        esco_data: Dictionary containing processed ESCO data

    Returns:
        Dictionary with the keys ``metadata``, ``afscCategories`` and
        ``skillHierarchy``
    """
    logger.info("Creating final taxonomy data structure")

    # Bucket the AFSCs per category, then order each bucket by code.
    grouped = {}
    for entry in afsc_skill_mappings:
        grouped.setdefault(entry['category'], []).append(entry)

    for members in grouped.values():
        members.sort(key=lambda item: item['afscCode'])

    # Collect the root nodes of the ESCO skill hierarchy.
    esco_skill_prefix = 'http://data.europa.eu/esco/skill/'
    top_level_categories = [
        {
            'uri': node_uri,
            'name': node['term'],
            'description': node['description']
        }
        for node_uri, node in esco_data['hierarchy'].items()
        if node_uri.startswith(esco_skill_prefix)
    ]

    metadata = {
        'title': 'Military Skills Taxonomy',
        'description': 'A taxonomy mapping Air Force Specialty Codes (AFSCs) to relevant skills',
        'creationDate': datetime.now().isoformat(),
        'sources': ['ESCO', 'O*NET Military Crosswalk', 'ESCO-O*NET Crosswalk'],
        'totalAfscs': len(afsc_skill_mappings),
        'afscCategories': list(grouped),
        'topSkillCategories': top_level_categories,
    }

    category_sections = []
    for category_name, members in grouped.items():
        category_sections.append({
            'categoryName': category_name,
            'afscs': members
        })

    skill_hierarchy_summary = {
        'topLevelCategories': top_level_categories,
        'structureDescription': 'Skills are organized in a 4-level hierarchy from broadest (Level 0) to most specific (Level 3)'
    }

    return {
        'metadata': metadata,
        'afscCategories': category_sections,
        'skillHierarchy': skill_hierarchy_summary
    }

def main():
    """Run the whole pipeline and write the three JSON output files.

    Order of work: load the AFSCs and save them, load the ESCO data and the
    ESCO-O*NET crosswalk and save both together, map skills onto AFSCs, build
    the taxonomy and save it. Any exception is logged and re-raised.
    """
    try:
        logger.info("Starting Military Skills Taxonomy creation process")

        # Step 1: AFSCs from the military crosswalk, persisted right away.
        afscs = process_military_crosswalk()

        with open(PROCESSED_AFSC_FILE, 'w', encoding='utf-8') as afsc_out:
            afsc_payload = {
                'processedAfscs': afscs,
                'metadata': {
                    'count': len(afscs),
                    'processingDate': datetime.now().isoformat()
                }
            }
            json.dump(afsc_payload, afsc_out, indent=2)
        logger.info(f"Saved processed AFSC data to {PROCESSED_AFSC_FILE}")

        # Step 2: ESCO skills first, then the ESCO-O*NET crosswalk.
        esco_data = process_esco_skills()
        crosswalk_data = process_esco_onet_crosswalk()

        with open(PROCESSED_SKILLS_FILE, 'w', encoding='utf-8') as skills_out:
            skills_payload = {
                'processedEscoData': esco_data,
                'crosswalkData': crosswalk_data,
                'metadata': {
                    'skillCount': len(esco_data['skills']),
                    'processingDate': datetime.now().isoformat()
                }
            }
            json.dump(skills_payload, skills_out, indent=2)
        logger.info(f"Saved processed ESCO skills data to {PROCESSED_SKILLS_FILE}")

        # Step 3: attach skills to AFSCs and wrap everything in the taxonomy.
        afsc_skill_mappings = create_afsc_skill_mapping(afscs, esco_data, crosswalk_data)
        taxonomy = create_taxonomy_data(afsc_skill_mappings, esco_data)

        with open(MERGED_TAXONOMY_FILE, 'w', encoding='utf-8') as taxonomy_out:
            json.dump(taxonomy, taxonomy_out, indent=2)
        logger.info(f"Saved merged taxonomy to {MERGED_TAXONOMY_FILE}")

        logger.info("Military Skills Taxonomy creation process completed successfully")

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise


# Run the pipeline only when executed as a script (or as a notebook cell).
if __name__ == "__main__":
    main()

2025-03-30 21:38:47,880 - INFO - Starting Military Skills Taxonomy creation process
2025-03-30 21:38:47,882 - INFO - Processing Military Crosswalk from: C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\milx0724.csv
2025-03-30 21:38:48,006 - INFO - Service counts in crosswalk: {'V': 4419, 'N': 7302, 'J': 809, 'C': 320, 'G': 1150, 'F': 9624, 'H': 4517, 'Y': 3418, 'U': 2377, 'A': 2086, 'X': 70, 'M': 2245, 'O': 20, 'S': 562, 'P': 208, 'D': 467, 'Q': 54, 'Z': 40, 'K': 343, 'L': 46}
2025-03-30 21:38:48,009 - INFO - Found 9624 Air Force entries
2025-03-30 21:38:48,057 - INFO - Processed 4020 active Air Force specialties
2025-03-30 21:38:48,124 - INFO - Saved processed AFSC data to C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\processed_afsc_data.json
2025-03-30 21:38:48,124 - INFO - Processing ESCO skills from: C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\skills_en.csv
2025-03-30 21:38:48,125 - INFO - Processing ESCO hierarchy from: C:\Users\Kyle\Desktop\Grad

In [4]:
# Load the existing taxonomy data
import json
import os

# Define the path to your existing taxonomy JSON
# INPUT_FILE has the same directory and file name as the taxonomy written by
# the previous cell; OUTPUT_FILE is not referenced in the statements below.
# Both are hard-coded absolute Windows paths.
INPUT_FILE = r"C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\military_skills_taxonomy.json"
OUTPUT_FILE = r"C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\enhanced_military_skills_taxonomy.json"

# Load the data
# The whole JSON document is parsed into memory as `taxonomy_data`.
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    taxonomy_data = json.load(f)

# 'afscCategories' is read as a top-level key here, so this prints the number
# of category groups, not the number of individual AFSCs.
print(f"Loaded taxonomy with {len(taxonomy_data['afscCategories'])} AFSC categories")

# Define all enhancement functions
def enhance_data_quality(afscs):
    """Filter out low-quality or irrelevant matches.

    For every AFSC dictionary that has a ``matchedSkills`` list, the list is
    replaced by the skills that pass both checks:

    * ``matchScore`` (missing counts as 0) is at least 4;
    * neither the description nor the skill name contains a word starting
      with a competing-domain term (ship, vessel, maritime, naval, ocean,
      seaman), unless ``matchScore`` is at least 10.

    Terms are matched at the start of a word, so "ships" and "shipboard" are
    caught while "leadership" or "relationship" are not.

    Args:
        afscs: list of AFSC dictionaries; modified in place.

    Returns:
        The same list object that was passed in.
    """
    # Imported locally so the function does not depend on an import that the
    # surrounding cell lacks.
    import re

    min_score = 4
    # Score at which a skill is kept even though it mentions a competing domain.
    domain_override_score = 10
    # \b anchors each term to the beginning of a word.
    competing_domain = re.compile(r'\b(?:ship|vessel|maritime|naval|ocean|seaman)')

    for afsc in afscs:
        if 'matchedSkills' not in afsc:
            continue

        kept = []
        for skill in afsc['matchedSkills']:
            score = skill.get('matchScore', 0)
            if score < min_score:
                continue

            # `or ''` guards against null descriptions/names in the JSON.
            description = (skill.get('description') or '').lower()
            skill_name = (skill.get('skillName') or '').lower()
            mentions_other_domain = bool(
                competing_domain.search(description) or competing_domain.search(skill_name)
            )
            if mentions_other_domain and score < domain_override_score:
                continue

            kept.append(skill)

        afsc['matchedSkills'] = kept

    return afscs

def enhance_military_context(afscs):
    """Add military-specific context to improve LLM understanding.

    Each AFSC dictionary receives:

    * ``serviceContext`` - its own copy of the Air Force mission, domains and
      key capabilities, so editing one record's context never changes
      another's;
    * ``afscCategory`` and ``afscCategoryDescription`` - only when the first
      non-blank character of ``afscCode`` is one of the prefixes ``1``-``9``
      listed below.

    Args:
        afscs: list of AFSC dictionaries; modified in place.

    Returns:
        The same list object that was passed in.
    """
    # Military service mapping
    service_context = {
        'Air Force': {
            'mission': 'Fly, fight and win in air, space and cyberspace',
            'domains': ['air', 'space', 'cyberspace'],
            'key_capabilities': ['air superiority', 'global strike', 'rapid mobility',
                                 'intelligence gathering', 'command and control']
        }
    }

    # AFSC prefix/category mapping
    afsc_categories = {
        '1': 'Operations',
        '2': 'Logistics and Maintenance',
        '3': 'Support',
        '4': 'Medical and Dental',
        '5': 'Legal and Chaplain',
        '6': 'Acquisition and Financial Management',
        '7': 'Special Investigations',
        '8': 'Special Duty and Reporting Identifiers',
        '9': 'Special Duty and Reporting Identifiers'
    }

    def fresh_context(service):
        """Return an independent copy of the context template for *service*."""
        return {
            key: list(value) if isinstance(value, list) else value
            for key, value in service_context[service].items()
        }

    for afsc in afscs:
        # Add service context (copied so records do not share mutable state)
        afsc['serviceContext'] = fresh_context('Air Force')

        # Add AFSC category context
        code = afsc.get('afscCode')
        if not code:
            continue

        # Ignore surrounding whitespace; [:1] yields '' for a blank code.
        prefix = str(code).strip()[:1]
        category = afsc_categories.get(prefix)
        if category is not None:
            afsc['afscCategory'] = category
            afsc['afscCategoryDescription'] = f"This AFSC belongs to the {category} category of Air Force specialties."

    return afscs

def enhance_verb_analysis(afscs):
    """Improve verb extraction and grouping for better action analysis.

    For every AFSC that has a ``matchedSkills`` key:

    * each skill gets a ``verbGroup`` - the canonical group its lower-cased
      ``verb`` belongs to, or ``'other'`` when the verb is missing, empty or
      not in any group;
    * the AFSC gets ``skillsByVerbGroup`` - one entry per group encountered
      (in first-seen order) holding the group name, the distinct verbs seen
      in that group, and the skills themselves.

    ``commonVerbs`` keeps the verbs exactly as written on the skills and
    lists them in first-seen order, so the generated output is reproducible
    from run to run.

    Args:
        afscs: List of AFSC dictionaries; AFSCs and their skills are mutated.

    Returns:
        The same ``afscs`` list that was passed in.
    """

    # Define verb synonyms for grouping related actions
    verb_groups = {
        'analyze': ['analyze', 'assess', 'evaluate', 'examine', 'inspect', 'investigate', 'review'],
        'manage': ['manage', 'administer', 'coordinate', 'direct', 'oversee', 'supervise'],
        'develop': ['develop', 'create', 'design', 'establish', 'formulate', 'generate'],
        'implement': ['implement', 'apply', 'deploy', 'execute', 'perform', 'use'],
        'communicate': ['communicate', 'advise', 'inform', 'instruct', 'present', 'report'],
        'operate': ['operate', 'control', 'drive', 'handle', 'navigate', 'pilot'],
    }

    # Create reverse mapping for quick lookup
    verb_to_group = {
        verb: group
        for group, verbs in verb_groups.items()
        for verb in verbs
    }

    for afsc in afscs:
        if 'matchedSkills' not in afsc:
            continue

        # Tag each skill with its verb group and bucket it in the same pass
        skills_by_verb_group = {}
        for skill in afsc['matchedSkills']:
            raw_verb = skill.get('verb')
            verb = raw_verb.lower() if raw_verb else ''

            group = verb_to_group.get(verb, 'other')
            skill['verbGroup'] = group
            skills_by_verb_group.setdefault(group, []).append(skill)

        # Add to AFSC data. dict.fromkeys de-duplicates while preserving
        # first-seen order; a plain set would yield an arbitrary order that
        # can change between interpreter runs.
        afsc['skillsByVerbGroup'] = [
            {
                'verbGroup': group,
                'commonVerbs': list(dict.fromkeys(
                    skill['verb'] for skill in skills if skill.get('verb')
                )),
                'skills': skills,
            }
            for group, skills in skills_by_verb_group.items()
        ]

    return afscs

def refine_relevance_scores(afscs):
    """Rescale match scores and label each matched skill with a confidence.

    For every AFSC carrying a ``matchedSkills`` key, each skill receives:

    * ``relevance`` - its ``matchScore`` (0 when absent) multiplied by ten
      and capped at 100;
    * ``confidenceLevel`` - ``"high"`` at 80 and above, ``"medium"`` at 50
      and above, otherwise ``"low"``.

    The skill list is then reordered in place, highest relevance first.

    Args:
        afscs: List of AFSC dictionaries; skills are mutated in place.

    Returns:
        The same ``afscs`` list that was passed in.
    """
    for entry in afscs:
        if 'matchedSkills' not in entry:
            continue

        matched = entry['matchedSkills']

        for item in matched:
            # Ten-fold scale-up with an upper cap of 100
            scaled = min(item.get('matchScore', 0) * 10, 100)

            # Bucket the scaled score into a coarse confidence label
            label = "high" if scaled >= 80 else "medium" if scaled >= 50 else "low"

            item.update(relevance=scaled, confidenceLevel=label)

        # Order by the freshly computed relevance, best first
        if matched:
            matched.sort(key=lambda item: item.get('relevance', 0), reverse=True)

    return afscs

def create_llm_optimized_taxonomy(taxonomy_data):
    """Build a condensed view of the taxonomy with clear relationships.

    The result holds a fixed ``metadata`` section plus ``afscCategories``,
    a mapping from category name to ``{"name", "afscs"}``. Each AFSC that
    has an ``afscCode`` key is reduced to its code, title, description,
    category, a fixed summary sentence, the first five verb groups from
    ``skillsByVerbGroup`` and its ten highest-ranked matched skills.

    Skills are ranked by ``relevance``, falling back to ``matchScore`` and
    then 0.

    Args:
        taxonomy_data: Dictionary with an ``afscCategories`` list whose items
            provide ``categoryName`` and ``afscs``.

    Returns:
        A new dictionary; ``taxonomy_data`` is only read.
    """

    def ranking_value(entry):
        # Prefer the normalized relevance; fall back to the raw match score
        return entry.get('relevance', entry.get('matchScore', 0))

    categories_out = {}

    # Walk the categories, collecting a simplified entry per usable AFSC
    for cat in taxonomy_data['afscCategories']:
        cat_name = cat['categoryName']
        afsc_entries = []
        categories_out[cat_name] = {"name": cat_name, "afscs": afsc_entries}

        for record in cat['afscs']:
            # Records without a code cannot be referenced, so leave them out
            if 'afscCode' not in record:
                continue

            verb_groups = record.get('skillsByVerbGroup', [])
            ranked_skills = sorted(
                record.get('matchedSkills', []),
                key=ranking_value,
                reverse=True,
            )

            afsc_entries.append({
                "code": record.get('afscCode', ''),
                "title": record.get('afscTitle', ''),
                "description": record.get('description', ''),
                "category": record.get('afscCategory', ''),
                "skillSummary": "This AFSC requires capabilities in the following areas:",
                # First five verb groups, in their stored order
                "coreVerbs": [vg['verbGroup'] for vg in verb_groups[:5]],
                # Ten best-ranked skills
                "coreSkills": [
                    {
                        "name": item.get('skillName', ''),
                        "action": item.get('verb', ''),
                        "relevance": item.get('relevance', 0),
                        "confidence": item.get('confidenceLevel', 'medium'),
                    }
                    for item in ranked_skills[:10]
                ],
            })

    return {
        "metadata": {
            "title": "Air Force Skills Taxonomy",
            "description": "A hierarchical taxonomy of skills for Air Force Specialty Codes (AFSCs)",
            "purpose": "This taxonomy maps military occupational specialties to their required skills and capabilities",
            "structure": "This taxonomy is organized in the following hierarchy: AFSC Categories > Individual AFSCs > Skill Categories > Skills",
        },
        "afscCategories": categories_out,
    }

# Function to apply all enhancements
def apply_enhancements(taxonomy_data):
    """Apply all enhancements to the taxonomy data.

    The input is deep-copied first. The enhancement steps mutate the nested
    category and AFSC dictionaries in place, so a shallow ``dict.copy()``
    would still modify the caller's ``taxonomy_data``.

    For each category the AFSC list is passed, in order, through
    ``enhance_data_quality``, ``enhance_military_context``,
    ``enhance_verb_analysis`` and ``refine_relevance_scores``. The condensed
    structure from ``create_llm_optimized_taxonomy`` is then stored under
    ``llm_optimized_structure``.

    Args:
        taxonomy_data: Dictionary with an ``afscCategories`` list whose items
            provide ``categoryName`` and ``afscs``. Not modified.

    Returns:
        A new, enhanced taxonomy dictionary.
    """
    # Imported locally so this function is self-contained regardless of the
    # module-level import block.
    import copy

    enhanced_data = copy.deepcopy(taxonomy_data)

    # Process each AFSC category
    for category in enhanced_data['afscCategories']:
        category_name = category['categoryName']
        print(f"Enhancing {category_name} category...")

        # Apply enhancements to each AFSC in this category
        category['afscs'] = enhance_data_quality(category['afscs'])
        category['afscs'] = enhance_military_context(category['afscs'])
        category['afscs'] = enhance_verb_analysis(category['afscs'])
        category['afscs'] = refine_relevance_scores(category['afscs'])

    # Create optimized structure
    enhanced_data['llm_optimized_structure'] = create_llm_optimized_taxonomy(enhanced_data)

    return enhanced_data

# Apply the enhancements
# NOTE(review): taxonomy_data and OUTPUT_FILE are not defined in this part of
# the file; presumably they are set earlier - confirm before running standalone.
enhanced_taxonomy = apply_enhancements(taxonomy_data)

# Save the enhanced taxonomy
# Written as UTF-8, pretty-printed JSON (2-space indent); mode 'w' replaces any
# existing file at OUTPUT_FILE.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(enhanced_taxonomy, f, indent=2)

print(f"Enhanced taxonomy saved to {OUTPUT_FILE}")

Loaded taxonomy with 3 AFSC categories
Enhancing Enlisted category...
Enhancing Officer category...
Enhancing Warrant category...
Enhanced taxonomy saved to C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\enhanced_military_skills_taxonomy.json
