In [5]:
#!/usr/bin/env python3
"""
ESCO Skills Data Processing Pipeline

Replicates the final n8n-style pipeline in a single Python script:
1. Fetches raw ESCO data from GitHub (JSON).
2. Filters for skillType = 'skill/competence'.
3. Maps conceptUri -> skillId, preferredLabel -> skillName.
4. Splits altLabels on newlines into an array.
5. Extracts a 'verb' (first word of the skillName).
6. Combines skillName, altLabels, description into 'searchText' (lowercased).
7. Generates up to 20 high-frequency keywords from searchText.
8. Saves the processed data into a 'processed_data' folder.

Usage:
    python esco_processor.py
"""

import datetime
import json
import logging
import os
import re
from collections import Counter
from typing import Any, Dict, List, Optional

import requests

# Logging setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger used by the pipeline functions in this script.
logger = logging.getLogger(__name__)

def fetch_esco_data(url: str, timeout: float = 30.0) -> List[Dict[str, Any]]:
    """
    Fetch ESCO data (JSON list) from a provided URL (GitHub raw link).

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the pipeline forever.

    Returns:
        The decoded JSON payload, a list of ESCO entry dictionaries.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status (via raise_for_status).
        json.JSONDecodeError: If the response body is not valid JSON.
        ValueError: If the decoded payload is not a JSON list.
    """
    logger.info(f"Fetching ESCO data from {url}")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the raw bytes: json.loads detects the UTF-8/16/32 encoding itself,
    # so the result does not depend on the charset guessed for response.text.
    data = json.loads(response.content)

    # The rest of the pipeline iterates over entries; fail early and clearly
    # if the document has a different top-level shape.
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON list of ESCO entries from {url}, got {type(data).__name__}"
        )

    logger.info(f"Retrieved {len(data)} ESCO entries")
    return data

# Only entries of this skillType are processed.
_SKILL_TYPE = 'skill/competence'

# Words excluded from keyword extraction. Tokens shorter than four characters
# are already removed by the length filter, so in practice only 'with' and
# 'such' can match; the full original list is kept unchanged.
_COMMON_WORDS = frozenset((
    'and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to',
    'for', 'with', 'by', 'of', 'such', 'as',
))
_MIN_KEYWORD_LENGTH = 4
_MAX_KEYWORDS = 20


def _as_text(value: Any) -> str:
    """
    Return a JSON field as a string, mapping None (JSON null) to ''.
    """
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def _split_alt_labels(value: Any) -> List[str]:
    """
    Turn the altLabels field into a list of non-empty, stripped labels.

    Accepts a newline-separated string, a list/tuple of labels, or None.
    """
    if isinstance(value, (list, tuple)):
        candidates = [_as_text(item) for item in value]
    else:
        candidates = _as_text(value).split('\n')
    stripped = (candidate.strip() for candidate in candidates)
    return [label for label in stripped if label]


def _extract_keywords(search_text: str) -> List[str]:
    """
    Return up to 20 of the most frequent words of at least four characters.

    Ties keep the order in which the words first appear in the text.
    """
    words = [
        word for word in search_text.split()
        if len(word) >= _MIN_KEYWORD_LENGTH and word not in _COMMON_WORDS
    ]
    return [word for word, _ in Counter(words).most_common(_MAX_KEYWORDS)]


def _build_skill_record(skill: Dict[str, Any]) -> Dict[str, Any]:
    """
    Map one raw ESCO entry to the processed skill structure.
    """
    skill_name = _as_text(skill.get('preferredLabel')).strip()

    # verb -> first word of the lowercased skillName ('' for an empty name)
    name_words = skill_name.lower().split()
    verb = name_words[0] if name_words else ''

    alternate_labels = _split_alt_labels(skill.get('altLabels'))
    description_text = _as_text(skill.get('description'))

    # searchText -> skillName + altLabels + description, lowercased
    search_text = ' '.join([
        skill_name,
        ' '.join(alternate_labels),
        description_text,
    ]).lower()

    return {
        'skillId': skill.get('conceptUri', ''),
        'skillType': skill.get('skillType', ''),
        'reuseLevel': skill.get('reuseLevel', ''),
        'skillName': skill_name,
        'description': description_text,
        'verb': verb,
        'alternateLabels': alternate_labels,
        'keywords': _extract_keywords(search_text),
        'searchText': search_text,
    }


def process_esco_skills(esco_data: List[Dict[str, Any]], batch_size: int = 1000) -> Dict[str, Any]:
    """
    Process ESCO data to match the n8n pipeline's final structure:
      - Keep conceptUri as skillId
      - Convert preferredLabel -> skillName
      - Preserve skillType, reuseLevel, description
      - altLabels -> array of lines
      - verb is the first word of skillName (lowercased)
      - searchText is skillName + altLabels + description
      - Up to 20 keywords, ignoring common short words

    Missing or null text fields are treated as empty strings, so an entry
    without altLabels or description is still processed instead of dropped.

    Args:
        esco_data: Raw ESCO entries; only those whose skillType is
            'skill/competence' are processed. Non-dict items are ignored.
        batch_size: Number of entries per logged progress batch. It only
            affects logging granularity, not the result.

    Returns:
        A dict with 'processedEscoSkills' (list of processed skill records)
        and 'metadata' (totalEntries, processedSkills, processingDate).

    Raises:
        ValueError: If batch_size is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Filter: only keep skillType = 'skill/competence'
    skill_entries = [
        entry for entry in esco_data
        if isinstance(entry, dict) and entry.get('skillType') == _SKILL_TYPE
    ]
    logger.info(f"Found {len(skill_entries)} skill/competence entries")

    total_batches = (len(skill_entries) + batch_size - 1) // batch_size
    processed_skills = []

    batch_starts = range(0, len(skill_entries), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_end = min(start + batch_size, len(skill_entries))
        logger.info(f"Processing batch {batch_number}/{total_batches}: entries {start} to {batch_end - 1}")

        for skill in skill_entries[start:batch_end]:
            # Best effort: one malformed entry must not abort the whole run.
            try:
                processed_skills.append(_build_skill_record(skill))
            except Exception as e:
                logger.error(f"Error processing skill '{skill.get('preferredLabel', 'Unknown')}': {str(e)}")

    logger.info(f"ESCO processing complete. Processed {len(processed_skills)} skills.")

    return {
        'processedEscoSkills': processed_skills,
        'metadata': {
            'totalEntries': len(esco_data),
            'processedSkills': len(processed_skills),
            'processingDate': datetime.datetime.now().isoformat()
        }
    }

def save_to_file(processed_data: Dict[str, Any], output_dir: str = 'processed_data', file_path: Optional[str] = None) -> str:
    """
    Save processed data to a JSON file.

    Args:
        processed_data: JSON-serializable structure to write.
        output_dir: Directory that receives the file when file_path is not
            given. It is always created if missing, as before.
        file_path: Explicit destination. If not provided, a timestamped
            filename inside output_dir is used. Missing parent directories
            of an explicit path are created as well.

    Returns:
        The path of the file that was written.

    Raises:
        OSError: If a directory or the file cannot be created or written.
        TypeError: If processed_data contains values json cannot serialize.
    """
    os.makedirs(output_dir, exist_ok=True)

    if file_path is None:
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        file_path = os.path.join(output_dir, f"processed-esco-skills-{timestamp}.json")
    else:
        # An explicit path may point outside output_dir; make sure its own
        # parent exists so open() does not fail with FileNotFoundError.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=2)

    logger.info(f"Data saved to {file_path}")
    return file_path

def main():
    """
    Run the whole ESCO pipeline: download, transform, write to disk.

    Any failure is logged and then re-raised to the caller.
    """
    # GitHub raw link to the ESCO source data
    source_url = "https://raw.githubusercontent.com/Kyleinexile/IS-Repo/refs/heads/main/ESCO.json"

    try:
        # Fetch -> process -> save into the processed_data folder
        raw_entries = fetch_esco_data(source_url)
        save_to_file(process_esco_skills(raw_entries), 'processed_data')
        logger.info("Workflow completed successfully")
    except Exception as exc:
        logger.error(f"Error in workflow: {str(exc)}")
        raise

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


2025-03-28 13:32:53,445 - INFO - Fetching ESCO data from https://raw.githubusercontent.com/Kyleinexile/IS-Repo/refs/heads/main/ESCO.json
2025-03-28 13:32:54,552 - INFO - Retrieved 13939 ESCO entries
2025-03-28 13:32:54,554 - INFO - Found 10715 skill/competence entries
2025-03-28 13:32:54,554 - INFO - Processing batch 1/11: entries 0 to 999
2025-03-28 13:32:54,573 - INFO - Processing batch 2/11: entries 1000 to 1999
2025-03-28 13:32:54,592 - INFO - Processing batch 3/11: entries 2000 to 2999
2025-03-28 13:32:54,613 - INFO - Processing batch 4/11: entries 3000 to 3999
2025-03-28 13:32:54,632 - INFO - Processing batch 5/11: entries 4000 to 4999
2025-03-28 13:32:54,652 - INFO - Processing batch 6/11: entries 5000 to 5999
2025-03-28 13:32:54,672 - INFO - Processing batch 7/11: entries 6000 to 6999
2025-03-28 13:32:54,692 - INFO - Processing batch 8/11: entries 7000 to 7999
2025-03-28 13:32:54,712 - INFO - Processing batch 9/11: entries 8000 to 8999
2025-03-28 13:32:54,732 - INFO - Processin