In [None]:
#!/usr/bin/env python3
"""
ESCO Skills Data Processing Pipeline

This script replicates the n8n workflow for processing ESCO skills data:
1. Fetches raw ESCO data from GitHub
2. Processes and extracts essential skill information
3. Saves the processed data to a local file

Author: Kyle Hall
Date: March 2025
"""

import datetime
import json
import logging
import os
import re
from collections import Counter
from typing import Any, Dict, List, Optional

import requests

# Logging setup: INFO level, with one handler writing to esco_processor.log
# and one stream handler, both sharing the same record format
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler("esco_processor.log"),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_log_handlers)

# Module-level logger used by every function in this script
logger = logging.getLogger(__name__)

def fetch_esco_data(url: str, timeout: float = 30.0) -> List[Dict[str, Any]]:
    """
    Fetch ESCO data from GitHub or other URL

    Args:
        url: URL to the ESCO JSON data
        timeout: Seconds to wait for the server before giving up

    Returns:
        List of ESCO entries as dictionaries

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a 4xx/5xx HTTP status
        json.JSONDecodeError: If the response body is not valid JSON
    """
    logger.info(f"Fetching ESCO data from {url}")
    # Without an explicit timeout, requests will wait forever on a stalled server
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so json detects the UTF encoding itself, instead of
    # relying on the charset requests guesses for response.text
    data = json.loads(response.content)
    logger.info(f"Retrieved {len(data)} ESCO entries")
    return data

# Words excluded from keyword extraction (built once; set lookup instead of list scan)
_COMMON_WORDS = frozenset([
    'and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'such', 'as'
])
# Maximum number of keywords kept per skill
_MAX_KEYWORDS = 20

def _text_field(skill: Dict[str, Any], key: str) -> Any:
    """Return skill[key], mapping a missing key or a JSON null to an empty string."""
    value = skill.get(key)
    return '' if value is None else value

def _process_skill(skill: Dict[str, Any]) -> Dict[str, Any]:
    """Build the processed record (verb, labels, keywords, search text) for one raw skill entry."""
    skill_name = _text_field(skill, 'skillName')
    description = _text_field(skill, 'description')

    # Verb is the first whitespace-delimited word of the lower-cased name ('' if the name is empty)
    name_words = skill_name.lower().split()
    verb = name_words[0] if name_words else ''

    # Alternate labels arrive as one newline-separated string; blank lines are dropped
    alternate_labels = [
        label.strip()
        for label in _text_field(skill, 'alternateLabels').split('\n')
        if label.strip()
    ]

    # Search text is name + labels + description, lower-cased
    search_text = ' '.join([
        skill_name,
        ' '.join(alternate_labels),
        description
    ]).lower()

    # Keywords: most frequent words longer than 3 characters that are not stop words.
    # most_common() keeps first-seen order for words with equal counts.
    word_counts = Counter(
        word for word in search_text.split()
        if len(word) > 3 and word not in _COMMON_WORDS
    )
    keywords = [word for word, _ in word_counts.most_common(_MAX_KEYWORDS)]

    return {
        'skillId': skill.get('id', ''),
        'skillName': skill_name,
        'description': description,
        'verb': verb,
        'alternateLabels': alternate_labels,
        'keywords': keywords,
        'searchText': search_text
    }

def process_esco_skills(esco_data: List[Dict[str, Any]], batch_size: int = 1000) -> Dict[str, Any]:
    """
    Process ESCO data to extract skills information

    Only entries whose 'skillType' is 'skill/competence' are processed. Text
    fields that are missing or null are treated as empty strings, so such
    entries are still included in the output.

    Args:
        esco_data: Raw ESCO data from API
        batch_size: Number of items to process in each batch (must be >= 1)

    Returns:
        Dictionary containing processed skills and metadata

    Raises:
        ValueError: If batch_size is less than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Filter for skills
    skill_entries = [entry for entry in esco_data if entry.get('skillType') == 'skill/competence']
    logger.info(f"Found {len(skill_entries)} skill/competence entries")

    total_batches = (len(skill_entries) + batch_size - 1) // batch_size

    # Process in batches (batching only drives the progress log messages)
    processed_skills = []
    for batch_number, start in enumerate(range(0, len(skill_entries), batch_size), start=1):
        batch = skill_entries[start:start + batch_size]
        logger.info(f"Processing batch {batch_number}/{total_batches}: entries {start} to {start + len(batch) - 1}")

        for skill in batch:
            try:
                processed_skills.append(_process_skill(skill))
            except Exception as e:
                # Deliberate best-effort: one malformed entry (e.g. a non-string
                # field) is logged and skipped rather than aborting the run
                logger.error(f"Error processing skill '{skill.get('skillName', 'Unknown')}': {str(e)}")

    logger.info(f"ESCO processing complete. Processed {len(processed_skills)} skills.")

    return {
        'processedEscoSkills': processed_skills,
        'metadata': {
            'totalEntries': len(esco_data),
            'processedSkills': len(processed_skills),
            'processingDate': datetime.datetime.now().isoformat()
        }
    }

def save_to_file(processed_data: Dict[str, Any], output_dir: str = '.', file_path: Optional[str] = None) -> str:
    """
    Save processed data to a JSON file

    Args:
        processed_data: The processed data to save
        output_dir: Directory to save the file (used to build the file name
            when file_path is not given)
        file_path: Optional specific file path; when omitted, a timestamped
            name inside output_dir is generated

    Returns:
        Path to the saved file
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    if file_path is None:
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        file_path = os.path.join(output_dir, f"processed-esco-skills-{timestamp}.json")
    else:
        # An explicit path may point outside output_dir, so make sure its own
        # parent directory exists (dirname is '' for a bare file name)
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=2)

    logger.info(f"Data saved to {file_path}")
    return file_path

# Defaults used when main() is called without arguments
DEFAULT_ESCO_URL = "https://raw.githubusercontent.com/Kyleinexile/IS-Repo/refs/heads/main/ESCO.json"
DEFAULT_OUTPUT_DIR = "processed_data"

def main(url: str = DEFAULT_ESCO_URL, output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """
    Main entry point for the script

    Runs the full workflow: fetch the raw ESCO data, process it, and save the
    result as a JSON file.

    Args:
        url: URL to the ESCO JSON data
        output_dir: Directory the processed file is written to

    Raises:
        Exception: Any error from the fetch, process, or save step is logged
            and then re-raised unchanged
    """
    try:
        # Fetch data
        esco_data = fetch_esco_data(url)

        # Process data
        processed_data = process_esco_skills(esco_data)

        # Save to file
        save_to_file(processed_data, output_dir)

        logger.info("Workflow completed successfully")
    except Exception as e:
        # Log for the log file, then re-raise so the caller still sees the failure
        logger.error(f"Error in workflow: {str(e)}")
        raise

if __name__ == "__main__":
    main()

2025-03-27 18:58:07,303 - INFO - Fetching ESCO data from https://raw.githubusercontent.com/Kyleinexile/IS-Repo/refs/heads/main/ESCO.json
2025-03-27 18:58:08,218 - INFO - Retrieved 13939 ESCO entries
2025-03-27 18:58:08,220 - INFO - Found 10715 skill/competence entries
2025-03-27 18:58:08,221 - INFO - Processing batch 1/11: entries 0 to 999
2025-03-27 18:58:08,243 - INFO - Processing batch 2/11: entries 1000 to 1999
2025-03-27 18:58:08,252 - INFO - Processing batch 3/11: entries 2000 to 2999
2025-03-27 18:58:08,262 - INFO - Processing batch 4/11: entries 3000 to 3999
2025-03-27 18:58:08,272 - INFO - Processing batch 5/11: entries 4000 to 4999
2025-03-27 18:58:08,281 - INFO - Processing batch 6/11: entries 5000 to 5999
2025-03-27 18:58:08,290 - INFO - Processing batch 7/11: entries 6000 to 6999
2025-03-27 18:58:08,300 - INFO - Processing batch 8/11: entries 7000 to 7999
2025-03-27 18:58:08,309 - INFO - Processing batch 9/11: entries 8000 to 8999
2025-03-27 18:58:08,317 - INFO - Processin