In [32]:
import pandas as pd

In [227]:
from transformers import pipeline
import spacy
import re
from collections import defaultdict

# Initialize models
# The spaCy model is loaded first because SkillNer's extractor is built on top of it.
nlp = spacy.load("en_core_web_lg")  # Load before SkillNer

# SkillNer is optional: when the package is not installed we fall back to the
# rule-based extraction and leave `skill_extractor` as None.
# SkillExtractor needs the skills database and spaCy's PhraseMatcher class in
# addition to the pipeline; calling it with `nlp` alone raises TypeError.
try:
    from spacy.matcher import PhraseMatcher
    from skillNer.general_params import SKILL_DB
    from skillNer.skill_extractor import SkillExtractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
except ImportError:
    # Fallback to alternative skill extraction
    print("SkillNer import failed, using alternative method")
    skill_extractor = None

# Initialize NER pipeline
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (dicts exposing 'word', 'entity_group' and 'score') instead of raw tokens.
ner_pipeline = pipeline("ner",
                       model="dslim/bert-base-NER",
                       aggregation_strategy="simple")




SkillNer import failed, using alternative method


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [239]:
import re
from typing import List, Dict, Optional, Union

# ----------------------
# Constants
# ----------------------
# Flat, lower-cased vocabulary of hard skills used for exact-match lookups of
# NER entities. Built as a set, so duplicates across sections are harmless.
# Every section must end with a trailing comma: adjacent string literals
# without one are silently concatenated into a single bogus entry.
SKILLS_LIST = {skill.lower() for skill in [
    # --- Programming Languages ---
    'python', 'java', 'javascript', 'typescript', 'c++', 'c#', 'go', 'golang', 'ruby', 'php',
    'swift', 'kotlin', 'rust', 'scala', 'r', 'dart', 'perl', 'bash', 'sql', 'html', 'css', 'Python',

    # --- Web & Frontend ---
    'react', 'angular', 'vue', 'svelte', 'next.js', 'nuxt.js', 'jquery', 'd3.js', 'redux',
    'webpack', 'babel', 'tailwind', 'bootstrap', 'sass', 'less', 'graphql', 'rest', 'api',

    # --- Backend & Frameworks ---
    'node.js', 'express', 'django', 'flask', 'spring', 'spring boot', 'laravel', 'ruby on rails',
    'asp.net', '.net core', 'fastapi', 'nestjs', 'hibernate', 'jpa', 'gin', 'echo',

    # --- Mobile & Game Dev ---
    'react native', 'flutter', 'xamarin', 'ionic', 'unity', 'unreal engine', 'cocos2d', 'godot',
    'opengl', 'directx', 'phaser', 'pixi.js',

    # --- DevOps & Cloud ---
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible',
    'jenkins', 'github actions', 'gitlab ci', 'circleci', 'argo cd', 'helm', 'prometheus',
    'grafana', 'splunk', 'new relic', 'datadog', 'istio', 'linkerd',

    # --- Databases ---
    'postgresql', 'mysql', 'mongodb', 'redis', 'cassandra', 'elasticsearch', 'dynamodb',
    'firebase', 'snowflake', 'bigquery', 'oracle', 'sql server', 'mariadb', 'neo4j', 'cosmos db',

    # --- Data Science & AI ---
    'pandas', 'numpy', 'scikit-learn', 'tensorflow', 'pytorch', 'keras', 'opencv', 'spark',
    'hadoop', 'hive', 'kafka', 'airflow', 'mlflow', 'ray', 'nltk', 'spacy', 'hugging face',
    'langchain', 'llama', 'gpt', 'bert', 'transformers', 'computer vision', 'nlp', 'ai',
    'reinforcement learning', 'supervised learning', 'unsupervised learning', 'AI management',

    # --- Data Engineering ---
    'etl', 'elt', 'data pipeline', 'data warehouse', 'data lake', 'delta lake', 'apache beam',
    'apache flink', 'dbt', 'prefect', 'dagster', 'redshift', 'databricks', 'tableau', 'power bi',
    'looker', 'metabase', 'qlik', 'dax', 'power query', 'ssis', 'ssas', 'ssrs', 'data management',

    # --- BI & Analytics ---
    'tableau', 'power bi', 'looker', 'qlikview', 'qliksense', 'microstrategy', 'sisense',
    'domo', 'matplotlib', 'seaborn', 'plotly', 'ggplot', 'excel', 'google sheets', 'vba',

    # --- Cybersecurity ---
    'owasp', 'penetration testing', 'metasploit', 'burp suite', 'wireshark', 'nmap',
    'siem', 'soc', 'ids', 'ips', 'firewall', 'vpn', 'zero trust', 'saml', 'oauth', 'openid',
    'iso 27001', 'gdpr', 'hipaa', 'pci dss', 'nist', 'cis controls',

    # --- QA & Testing ---
    'selenium', 'cypress', 'jest', 'mocha', 'junit', 'testng', 'pytest', 'postman',
    'soapui', 'jmeter', 'loadrunner', 'appium', 'cucumber', 'specflow', 'robot framework',

    # --- ERP & Business Systems ---
    'sap', 'salesforce', 'microsoft dynamics', 'oracle erp', 'netsuite', 'workday', 'peoplesoft',
    'sage', 'odoo', 'zoho', 'hubspot', 'microsoft power platform', 'sharepoint',

    # --- IT Support & Admin ---
    'active directory', 'windows server', 'linux', 'ubuntu', 'centos', 'red hat', 'macos',
    'vmware', 'hyper-v', 'citrix', 'azure ad', 'office 365', 'exchange', 'powershell',

    # --- Finance & Accounting ---
    'quickbooks', 'xero', 'sage 50', 'peachtree', 'freshbooks', 'oracle financials',
    'sap fico', 'hyperion', 'blackline', 'reconciliation', 'financial modeling', 'gaap',
    'ifrs', 'taxation', 'audit', 'treasury', 'risk management',

    # --- Project Management ---
    'agile', 'scrum', 'kanban', 'saFe', 'waterfall', 'prince2', 'pmp', 'jira', 'trello',
    'asana', 'monday.com', 'ms project', 'smartsheet', 'confluence', 'azure devops',

    # --- Design & Creative ---
    'figma', 'adobe xd', 'sketch', 'illustrator', 'photoshop', 'indesign', 'after effects',
    'premiere pro', 'maya', 'blender', 'autocad', 'solidworks', 'revit', '3d modeling',

    # --- Industry-Specific ---
    # Healthcare
    'hl7', 'fhir', 'epic', 'cerner', 'meditech', 'hipaa compliance', 'electronic health records',
    # Manufacturing
    'plc', 'scada', 'cnc', 'cad/cam', 'industry 4.0', 'iot', 'lean manufacturing', 'six sigma',
    # Logistics
    'wms', 'tms', 'supply chain', 'inventory management', 'sap mm', 'sap sd', 'sap ewm',
    # Legal
    'legal research', 'contract drafting', 'litigation', 'e-discovery', 'compliance', 'corporate law',
    # Education
    'lms', 'moodle', 'blackboard', 'e-learning', 'instructional design', 'curriculum development',
    # Marketing
    'seo', 'sem', 'ppc', 'google analytics', 'google tag manager', 'facebook ads', 'linkedin ads',
    'marketing automation', 'hubspot', 'marketo', 'salesforce marketing cloud', 'content marketing',
    'social media marketing', 'email marketing', 'crm', 'salesforce', 'zoho crm', 'hubspot crm', 'word',
    'MS Office', 'outlook', 'ms office suite',
]}

# Maps a category label to the skill names that belong to it. Lookups are done
# case-insensitively by get_skill_category; dict order matters because the
# first category containing a skill wins (e.g. 'api' appears in two lists).
# Keep a comma after every literal: adjacent strings are silently concatenated.
SKILL_CATEGORIES = {
    # --- Programming Languages ---
    'Programming Languages': [
        'python', 'java', 'javascript', 'typescript', 'c++', 'c#', 'go', 'golang',
        'ruby', 'php', 'swift', 'kotlin', 'rust', 'scala', 'r', 'dart', 'perl',
        'bash', 'html', 'css', 'Python', 'oop', 'object-oriented', 'object-oriented programming'
    ],

    # --- Web Development ---
    'Web Development': [
        'react', 'angular', 'vue', 'svelte', 'next.js', 'nuxt.js', 'jquery',
        'd3.js', 'redux', 'webpack', 'babel', 'tailwind', 'bootstrap', 'sass',
        'less', 'graphql', 'rest', 'api', 'web development', 'web dev', 'web developer'
    ],

    # --- Backend Development ---
    'Backend Development': [
        'node.js', 'express', 'django', 'flask', 'spring', 'spring boot',
        'laravel', 'ruby on rails', 'asp.net', '.net core', 'fastapi', 'nestjs',
        'hibernate', 'jpa', 'gin', 'echo', 'api', 'rest', 'rest api', 'restful'
    ],

    # --- Mobile & Game Development ---
    'Mobile & Game Dev': [
        'react native', 'flutter', 'xamarin', 'ionic', 'unity', 'unreal engine',
        'cocos2d', 'godot', 'opengl', 'directx', 'phaser', 'pixi.js'
    ],

    # --- Cloud & DevOps ---
    'Cloud & DevOps': [
        'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes',
        'terraform', 'ansible', 'jenkins', 'github actions', 'gitlab ci',
        'circleci', 'argo cd', 'helm', 'prometheus', 'grafana', 'splunk',
        'new relic', 'datadog', 'istio', 'linkerd', 'cloud', 'cloud computing'
    ],

    # --- Databases ---
    'Databases': [
        'sql', 'postgresql', 'mysql', 'mongodb', 'redis', 'cassandra', 'databases',
        'elasticsearch', 'dynamodb', 'firebase', 'snowflake', 'bigquery', 'relational',
        'oracle', 'sql server', 'mariadb', 'neo4j', 'cosmos db', 'database', 'non relational',
        'no sql', 'nosql', 'non-relational'
    ],

    # --- Data Science & AI ---
    'Data Science & AI': [
        'pandas', 'numpy', 'scikit-learn', 'tensorflow', 'pytorch', 'keras',
        'opencv', 'spark', 'hadoop', 'hive', 'kafka', 'airflow', 'mlflow',
        'ray', 'nltk', 'spacy', 'hugging face', 'langchain', 'llama', 'gpt',
        'bert', 'transformers', 'computer vision', 'nlp', 'ai',
        'reinforcement learning', 'supervised learning', 'unsupervised learning',
        'ai management', 'ai developement', 'llm', 'deep learning', 'tensorflow',
        'pytorch', 'data science',
    ],

    # --- Data Engineering ---
    'Data Engineering': [
        'etl', 'elt', 'data pipeline', 'data warehouse', 'data lake',
        'delta lake', 'apache beam', 'apache flink', 'dbt', 'prefect',
        'dagster', 'redshift', 'databricks', 'data strategy', 'data architecture',
        'data foundation', 'data management', 'data engineering', 'data engineer'
    ],

    # --- BI & Analytics ---
    'BI & Analytics': [
        'tableau', 'power bi', 'looker', 'qlikview', 'qliksense', 'bi tools', 'bi',
        'microstrategy', 'sisense', 'domo', 'matplotlib', 'seaborn', 'dashboards',
        'plotly', 'ggplot', 'excel', 'google sheets', 'vba', 'pbi', 'dashboard',
        'business intelligence', 'visualization', 'data analysis', 'data visualization'
    ],

    # --- Cybersecurity ---
    'Cybersecurity': [
        'owasp', 'penetration testing', 'metasploit', 'burp suite',
        'wireshark', 'nmap', 'siem', 'soc', 'ids', 'ips', 'firewall',
        'vpn', 'zero trust', 'saml', 'oauth', 'openid', 'iso 27001',
        'gdpr', 'hipaa', 'pci dss', 'nist', 'cis controls'
    ],

    # --- QA & Testing ---
    'QA & Testing': [
        'selenium', 'cypress', 'jest', 'mocha', 'junit', 'testng',
        'pytest', 'postman', 'soapui', 'jmeter', 'loadrunner',
        'appium', 'cucumber', 'specflow', 'robot framework'
    ],

    # --- Business Systems ---
    'Business Systems': [
        'sap', 'salesforce', 'microsoft dynamics', 'oracle erp',
        'netsuite', 'workday', 'peoplesoft', 'sage', 'odoo', 'zoho',
        'hubspot', 'microsoft power platform', 'sharepoint'
    ],

    # --- IT & Administration ---
    'IT & Admin': [
        'active directory', 'windows server', 'linux', 'ubuntu',
        'centos', 'red hat', 'macos', 'vmware', 'hyper-v', 'citrix',
        'azure ad', 'office 365', 'exchange', 'powershell'
    ],

    # --- Business & Management ---
    'Business & Management': [
        'control models', 'business relationships', 'financial modeling',
        'risk management', 'strategic planning', 'project management'
    ],

    # --- Design & Creative ---
    'Design & Creative': [
        'figma', 'adobe xd', 'sketch', 'illustrator', 'photoshop',
        'indesign', 'after effects', 'premiere pro', 'maya', 'blender',
        'autocad', 'solidworks', 'revit', '3d modeling'
    ],

    # --- Industry-Specific ---
    'Healthcare IT': [
        'hl7', 'fhir', 'epic', 'cerner', 'meditech', 'hipaa compliance',
        'electronic health records'
    ],
    'Manufacturing': [
        'plc', 'scada', 'cnc', 'cad/cam', 'industry 4.0', 'iot',
        'lean manufacturing', 'six sigma'
    ],
    'Logistics': [
        'wms', 'tms', 'supply chain', 'inventory management',
        'sap mm', 'sap sd', 'sap ewm'
    ],
    'Marketing Tech': [
        'seo', 'sem', 'ppc', 'google analytics', 'google tag manager',
        'facebook ads', 'linkedin ads', 'marketing automation',
        'content marketing', 'social media marketing', 'email marketing'
    ],
    'Office Tools': [
        'word', 'ms office', 'outlook', 'ms office suite', 'powerpoint',
        'excel', 'google docs', 'google sheets'
    ]
}

# Soft-skill phrases looked up (as lower-case substrings) in the posting text.
SOFT_SKILLS = [
    # traits
    'team player',
    'open-minded',
    'organized',
    'communication',
    # working style
    'adaptability',
    'collaboration',
    'initiative',
    'creativity',
    # thinking
    'analytical',
    'problem solving',
    'attention to detail',
    # organisation / leading
    'time management',
    'leadership',
    'management',
    # verb forms
    'communicate',
    'communicates',
]

# Spoken languages recognised in a posting (single capture group, whole words).
LANGUAGE_PATTERN = r'\b(' + '|'.join((
    'english', 'german', 'french', 'spanish',
    'italian', 'dutch', 'mandarin', 'japanese',
)) + r')\b'

# "<number>[+] years/yrs/months [of] [experience|work]" — one capture group.
EXPERIENCE_PATTERN = (
    r'(\d+\+?\s*'
    r'(?:years?|yrs?|months?)'
    r'\s*(?:of)?'
    r'\s*(?:experience|work)?)'
)

# Degree keywords (single capture group, whole words).
EDUCATION_PATTERN = (
    r'\b('
    r'bachelor(?:\'?s)?(?: of science)?|b\.sc'
    r'|master(?:\'?s)?(?: of science)?|m\.sc'
    r'|ph\.?d|diploma|degree|certification'
    r')\b'
)

# Fields of study (single capture group, no word boundaries).
EDUCATION_FIELD_PATTERN = (
    r'(economic|technical|it[\-\s]related'
    r'|computer science|engineering|business)'
)

# ----------------------
# Helper Functions
# ----------------------
def clean_entities(entities: List[Dict]) -> List[str]:
    """Collapse NER entity dicts into a list of plain strings.

    Each item must expose its surface form under the ``'text'`` key. Pieces
    that start with ``'##'`` are glued onto the string currently being built;
    any other piece closes that string and starts a new one. After every piece,
    if the string being built equals "ai" (case-insensitive) and the last
    finished string is "/", that slash is removed and ``'AI/'`` is prefixed to
    the string being built.

    Returns:
        The merged strings in order of appearance (empty list for no input).
    """
    merged: List[str] = []
    pending = ""

    for item in entities:
        piece = item['text']

        if not piece.startswith('##'):
            # A fresh word: flush whatever was being assembled.
            if pending:
                merged.append(pending)
            pending = piece
        else:
            # Continuation piece: drop the '##' marker and append.
            pending = pending + piece[2:]

        # Special compound case around a standalone slash.
        follows_slash = bool(merged) and merged[-1].lower() == '/'
        if pending.lower() == 'ai' and follows_slash:
            merged.pop()  # Remove the slash
            pending = 'AI/' + pending

    if pending:
        merged.append(pending)

    return merged

def get_skill_category(skill: str) -> str:
    """Return the first SKILL_CATEGORIES label whose list contains ``skill``.

    Matching is case-insensitive and is attempted against each listed skill
    both as written and after passing it through ``normalize_skill``.
    Anything that is not a string, or that matches no category, yields
    ``'other'``.
    """
    # Non-string input (None, numbers, ...) can never be categorised.
    if not isinstance(skill, str):
        return 'other'

    try:
        wanted = skill.lower()
        for category, members in SKILL_CATEGORIES.items():
            # Raw spelling first, normalised spelling second.
            if any(wanted == member.lower() for member in members):
                return category
            if any(wanted == normalize_skill(member).lower() for member in members):
                return category
    except AttributeError:
        return 'other'

    return 'other'

def normalize_skill(skill: str) -> str:
    """Return the display form of a skill name.

    The name is lower-cased and looked up in a small table of fixed spellings;
    names not in the table are returned with only the first character
    upper-cased (``str.capitalize``).
    """
    canonical = {
        'ai': 'AI',
        'ki': 'AI',
        'it': 'IT',
        'sql': 'SQL',
        'ai/data strategy': 'AI/Data Strategy'
    }
    key = skill.lower()
    if key in canonical:
        return canonical[key]
    return key.capitalize()

def extract_skills(text: str, entities: List[Union[str, Dict]],
                  extractor=None) -> Optional[List[Dict]]:
    """Collect hard skills from three sources and return them de-duplicated.

    Sources, in order:
      1. ``extractor.annotate(text)`` full matches, when an extractor is given.
      2. ``entities`` whose lower-cased text is an exact member of SKILLS_LIST.
      3. A fixed table of regex patterns run against the lower-cased text; each
         pattern contributes a fixed label (or one computed from the match).

    Args:
        text: Posting text to scan.
        entities: Strings, or dicts with ``'text'`` and optional
            ``'confidence'`` keys. Plain strings get confidence 0.9.
        extractor: Optional object exposing ``annotate(text)``; errors it
            raises are printed and otherwise ignored (best effort).

    Returns:
        A list of dicts with keys ``skill``, ``skill_type``, ``skill_category``,
        ``detection_method`` and ``confidence_score``, one per distinct
        (case-insensitive) skill name, or ``None`` when nothing was found.
    """
    skills = []
    text_lower = text.lower()

    # 1. Extract from skill extractor if available
    if extractor:
        try:
            matches = extractor.annotate(text)
            for match in matches.get('results', {}).get('full_matches', []):
                if not isinstance(match, dict):
                    continue

                skill_name = match.get('doc_node_value')
                if not isinstance(skill_name, str):
                    continue

                try:
                    skills.append({
                        'skill': normalize_skill(skill_name),
                        'type': 'hard',
                        'category': get_skill_category(skill_name),
                        'source': 'extractor',
                        'confidence': float(match.get('score', 0.8))
                    })
                except (AttributeError, ValueError) as e:
                    print(f"Skipping invalid skill match: {e}")
                    continue

        except Exception as e:
            # Best effort: a failing extractor must not abort the other sources.
            print(f"Skill extractor error: {e}")

    # 2. Extract from entities with validation
    for ent in entities:
        try:
            if isinstance(ent, dict):
                ent_text = ent.get('text', '')
                confidence = float(ent.get('confidence', 0.8))
            else:
                ent_text = str(ent)
                confidence = 0.9

            if not ent_text:
                continue

            ent_lower = ent_text.lower()

            if ent_lower in SKILLS_LIST:
                skills.append({
                    'skill': normalize_skill(ent_text),
                    'type': 'hard',
                    'category': get_skill_category(ent_text),
                    'source': 'entity',
                    'confidence': confidence
                })
        except (AttributeError, ValueError) as e:
            print(f"Skipping invalid entity: {e}")
            continue

    # 3. Enhanced compound skill patterns
    # Each entry is (regex, label); the label may be a callable taking the match.
    compound_skills = [
        # AI/Data Patterns
        (r'\b(ai|artificial intelligence|ml|machine learning|data)\s*(strategy|roadmap|vision)\b',
         'AI/Data Strategy'),
        (r'\bdata\s+(foundation|governance|management|architecture|platform)\b',
         'Data Foundation'),
        (r'\bdata\s+(templates?|models?|schemas?)\b',
         'Data Modeling'),
        (r'\b(regulatory|compliance)\s+data\b',
         'Regulatory Data'),

        # Cloud & Infrastructure
        (r'\bcloud\b.*\b(computing|architecture|services|migration|native)\b',
         'Cloud Computing'),
        (r'\b(on[-\s]*premise|on[-\s]*prem)\b',
         'On-Premise Systems'),
        (r'\b(hybrid|multi[-\s]*cloud)\b',
         'Hybrid/Multi Cloud'),
        (r'\b(kubernetes|k8s)\b',
         'Kubernetes'),
        (r'\b(docker|containerization|containers)\b',
         'Containerization'),
        (r'\b(cicd|ci/cd|continuous integration|continuous deployment)\b',
         'CI/CD'),

        # Cloud Providers
        (r'\b(aws|azure|gcp|google cloud platform)\b',
         lambda m: m.group(1).upper()),

        # Programming Languages
        # 'c++' and 'c#' end in a non-word character, so a trailing \b can only
        # match when a word character follows (never before a space or at end
        # of text). They are therefore matched without the trailing boundary;
        # the other names keep it. (?<!\w) is equivalent to the leading \b here.
        (r'(?<!\w)(?:(?:python|java|scala|r|sql|go|typescript|javascript|bash|shell|perl|php|rust|ruby|matlab)\b|c\+\+|c#)',
         'Programming Language'),

        # Data Engineering
        (r'\b(etl|elt|data\s*pipeline|airflow|data\s*integration|data\s*orchestration)\b',
         'Data Engineering'),
        (r'\bdbt\b',
         'dbt (Data Build Tool)'),
        (r'\b(nifi|luigi|kafka|flink|beam|streamsets|azkaban)\b',
         'Data Pipeline Tool'),

        # DevOps/MLOps
        (r'\b(devops|mlops|jenkins|gitlab\s*ci|argo\s*cd|kubeflow|mlflow|dvc|ml\s+ops)\b',
         'DevOps/MLOps'),
        (r'\b(infrastructure\s+as\s+code|terraform|ansible|cloudformation|pulumi)\b',
         'IaC'),

        # Data Visualization
        (r'\b(data|business)\s*dashboards?\b',
         'Data Dashboards'),
        (r'\b(power\s*bi|tableau|looker|qlik|superset|metabase|microstrategy|google\s*data\s*studio)\b',
         'BI Tools'),

        # Data Science
        (r'\b(statistical\s+analysis|predictive\s+modeling|data\s+science|descriptive\s+analytics)\b',
         'Data Science'),
        (r'\b(customer\s+segmentation|churn\s+prediction|forecasting|recommendation\s+system)\b',
         'Use Case'),
        (r'\b(a/b\s+testing|causal\s+inference|time\s+series)\b',
         'Analytics Techniques'),

        # Machine Learning
        (r'\b(supervised|unsupervised|reinforcement)\s+learning\b',
         'Machine Learning Type'),
        (r'\b(classification|regression|clustering|nlp|computer\s*vision|deep\s*learning)\b',
         'ML Techniques'),
        (r'\b(pytorch|tensorflow|sklearn|scikit-learn|keras|xgboost|lightgbm)\b',
         'ML/AI Framework'),

        # Big Data
        (r'\b(hadoop|spark|pyspark|hive|pig|flink|beam|storm)\b',
         'Big Data Tool'),

        # Databases
        (r'\b(sql|nosql|relational\s+database|mongodb|postgres|mysql|oracle|sqlite|mariadb)\b',
         'Database'),
        (r'\b(data\s*warehouse|dwh|snowflake|bigquery|redshift|databricks|synapse)\b',
         'Data Warehouse'),
        (r'\b(data\s+lake|lakehouse)\b',
         'Data Lake')
    ]

    # Process compound patterns
    for pattern, handler in compound_skills:
        try:
            for match in re.finditer(pattern, text_lower):
                if callable(handler):
                    skill_name = handler(match)
                else:
                    skill_name = handler

                if isinstance(skill_name, str):
                    skills.append({
                        'skill': skill_name,
                        'type': 'hard',
                        'category': get_skill_category(skill_name),
                        'source': 'pattern',
                        'confidence': 0.95
                    })
        except Exception as e:
            print(f"Error processing pattern {pattern}: {e}")
            continue

    # 4. Remove duplicates (keeping highest confidence version)
    # Keyed by lower-cased name; on equal confidence the first occurrence stays.
    unique_skills = {}
    for skill in skills:
        try:
            if not isinstance(skill, dict):
                continue

            name_lower = str(skill.get('skill', '')).lower()
            if not name_lower:
                continue

            current_conf = float(skill.get('confidence', 0))

            if (name_lower not in unique_skills or
                unique_skills[name_lower].get('confidence', 0) < current_conf):
                unique_skills[name_lower] = skill
        except Exception as e:
            print(f"Skipping invalid skill entry: {e}")
            continue

    # Convert to clean output format
    clean_skills = []
    for skill in unique_skills.values():
        try:
            clean_skills.append({
                'skill': str(skill.get('skill', '')),
                'skill_type': str(skill.get('type', 'hard')),
                'skill_category': str(skill.get('category', 'other')),
                'detection_method': str(skill.get('source', 'unknown')),
                'confidence_score': float(skill.get('confidence', 0.5))
            })
        except Exception as e:
            print(f"Skipping malformed skill: {e}")
            continue

    return clean_skills or None

def extract_education(text: str) -> Optional[Dict]:
    """Find degree keywords and fields of study mentioned in ``text``.

    Degrees are matched case-insensitively with EDUCATION_PATTERN and fields
    with EDUCATION_FIELD_PATTERN against the lower-cased text. Matches are
    lower-cased *before* de-duplication, so "Bachelor" and "bachelor" count
    once, and are sorted so the output is deterministic.

    Returns:
        ``None`` when neither degrees nor fields were found; otherwise a dict
        with ``'degrees'`` (lower-case strings) and ``'fields'`` (capitalized
        strings), each ``None`` when that part is empty.
    """
    degrees = sorted({d.lower() for d in re.findall(EDUCATION_PATTERN, text, re.IGNORECASE)})
    fields = sorted(set(re.findall(EDUCATION_FIELD_PATTERN, text.lower())))

    if not degrees and not fields:
        return None

    return {
        'degrees': degrees or None,
        'fields': [f.capitalize() for f in fields] if fields else None
    }

def extract_soft_skills(text: str) -> Optional[List[Dict]]:
    """Return the SOFT_SKILLS phrases that occur in ``text``.

    Matching is a case-insensitive substring test. A phrase ending in "s"
    whose form without the "s" is also in SOFT_SKILLS is reported under that
    shorter form (e.g. "communicates" -> "communicate"), and each resulting
    name is reported only once.

    Returns:
        A list of ``{'name': <Capitalized name>, 'type': 'soft_skill'}`` dicts
        in SOFT_SKILLS order, or ``None`` when nothing matched.
    """
    known = {s.lower() for s in SOFT_SKILLS}
    text_lower = text.lower()

    found = []
    seen = set()
    for skill in SOFT_SKILLS:
        skill_lower = skill.lower()
        if skill_lower not in text_lower:
            continue

        # Collapse the "-s" form onto its base form. The reverse mapping
        # (base -> "-s") is deliberately not applied: doing both merely swaps
        # the two spellings instead of unifying them.
        normalized = skill_lower
        if skill_lower.endswith('s') and skill_lower[:-1] in known:
            normalized = skill_lower[:-1]

        if normalized in seen:
            continue
        seen.add(normalized)

        found.append({
            'name': normalized.capitalize(),
            'type': 'soft_skill'
        })

    return found or None

# for cleaning the result dict before assigning it to df
def clean_extraction_results(results: dict) -> dict:
    """Flatten raw extraction output into a compact, normalized dict.

    Args:
        results: Dict that may contain the keys ``'skills'``, ``'languages'``,
            ``'experience'``, ``'education'`` and ``'soft_skills'``. Missing
            keys or values of an unexpected type are ignored.

    Returns:
        A dict with:
          * ``hard_skills``: ``{skill name: category}``, both lower-cased
            (empty dict when there are none);
          * ``languages``: sorted unique lower-case strings, or ``None``;
          * ``experience``: the first experience string, stripped, or ``None``;
          * ``education``: ``{name: 'degree' | 'field'}`` or ``None``;
          * ``soft_skills``: sorted unique lower-case names, or ``None``.
    """
    cleaned = {
        'hard_skills': {},      # {skill: category}
        'languages': None,      # List of languages
        'experience': None,     # First experience string
        'education': None,      # Dict: {degree/field: type}
        'soft_skills': None     # List of soft skill names
    }

    # --- Hard Skills ---
    skills = results.get('skills')
    if isinstance(skills, list):
        for skill in skills:
            if not isinstance(skill, dict):
                continue  # tolerate stray non-dict entries instead of crashing
            name = str(skill.get('skill') or '').strip().lower()
            category = str(skill.get('skill_category') or '').strip().lower()
            if name and category:
                cleaned['hard_skills'][name] = category
    elif isinstance(skills, dict):
        # If already in dict format
        cleaned['hard_skills'] = {
            k.strip().lower(): v.strip().lower()
            for k, v in skills.items()
        }

    # --- Languages ---
    languages = results.get('languages')
    if isinstance(languages, list):
        unique_languages = {lang.strip().lower() for lang in languages if isinstance(lang, str)}
        cleaned['languages'] = sorted(unique_languages) or None

    # --- Experience ---
    experience = results.get('experience')
    if isinstance(experience, list) and experience and isinstance(experience[0], str):
        cleaned['experience'] = experience[0].strip()

    # --- Education ---
    education = results.get('education')
    if isinstance(education, dict):
        # Either part may be None when only degrees or only fields were found;
        # treat that as "empty" rather than discarding the other part.
        degrees = education.get('degrees') or []
        fields = education.get('fields') or []
        combined = {
            deg.strip().lower(): 'degree'
            for deg in degrees if isinstance(deg, str)
        }
        combined.update({
            fld.strip().lower(): 'field'
            for fld in fields if isinstance(fld, str)
        })
        cleaned['education'] = combined or None

    # --- Soft Skills ---
    soft_skills = results.get('soft_skills')
    if isinstance(soft_skills, list):
        unique_softs = {
            s.get('name', '').strip().lower()
            for s in soft_skills
            if isinstance(s, dict) and s.get('name')
        }
        cleaned['soft_skills'] = sorted(unique_softs) or None

    return cleaned


# ----------------------
# Main Extraction Function
# ----------------------

def _extract_languages(text: str) -> Optional[List[str]]:
    """Return the sorted, lower-cased spoken languages matched by LANGUAGE_PATTERN, or None."""
    matches = re.findall(LANGUAGE_PATTERN, text, re.IGNORECASE)
    return sorted({lang.lower() for lang in matches}) or None


def _extract_experience(text: str) -> Optional[List[str]]:
    """Return every EXPERIENCE_PATTERN match (stripped, in text order), or None."""
    matches = re.findall(EXPERIENCE_PATTERN, text, re.IGNORECASE)
    return [m.strip() for m in matches] or None


def extract_job_requirements(text: str, ner_pipeline = ner_pipeline, skill_extractor=None) -> Dict:
    """Extract structured requirements from one job-posting text.

    The text is whitespace-normalized, run through the NER pipeline (person
    entities are dropped), and then scanned for hard skills, spoken languages,
    experience statements, education and soft skills. The raw findings are
    passed through ``clean_extraction_results`` before being returned.

    Args:
        text: Posting text.
        ner_pipeline: Callable returning entity dicts with ``'word'``,
            ``'entity_group'`` and ``'score'`` keys; defaults to the
            module-level pipeline bound when this function is defined.
        skill_extractor: Optional extractor forwarded to ``extract_skills``.
            It defaults to None, so the module-level SkillNer extractor is
            only used when passed explicitly.

    Returns:
        The dict produced by ``clean_extraction_results``.

    NOTE(review): language and experience extraction previously relied on
    ``extract_languages`` / ``extract_experience``, which are not defined in
    this notebook. The private helpers above rebuild them from
    LANGUAGE_PATTERN / EXPERIENCE_PATTERN — confirm they match the intended
    behaviour.
    """
    results = {
        'skills': None,
        'languages': None,
        'experience': None,
        'education': None,
        'soft_skills': None,
        'entities': None,
    }

    # Pre-process text
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = text.replace('AI / Data', 'AI/Data').replace('AI/ Data', 'AI/Data')

    # Named Entity Recognition
    try:
        raw_entities = ner_pipeline(text)
        filtered = [
            {'text': e['word'], 'type': e['entity_group'], 'confidence': float(e['score'])}
            for e in raw_entities if e['entity_group'] != 'PER'
        ]
        cleaned_entities = clean_entities(filtered)
        results['entities'] = cleaned_entities or None
    except Exception as e:
        # Best effort: the rule-based extractors below still run without entities.
        print(f"NER pipeline failed: {e}")
        cleaned_entities = []

    # Extract each category with enhanced processing
    results['skills'] = extract_skills(text, cleaned_entities, skill_extractor)
    results['languages'] = _extract_languages(text)
    results['experience'] = _extract_experience(text)
    results['education'] = extract_education(text)
    results['soft_skills'] = extract_soft_skills(text)

    return clean_extraction_results(results)

In [236]:
# Load the classified job postings and drop the first column (selected by
# position, removed by its label).
# NOTE(review): presumably that column is an index written by an earlier
# to_csv() call — confirm against the file.
df = pd.read_csv('job_postings_classified.csv')
df = df.drop(df.columns[0], axis=1)

In [241]:
# Convert all values to strings first, then run extract_job_requirements on
# every posting and store the returned dict in a new "requirements_extracted"
# column. Only the text is passed, so the function's default ner_pipeline is
# used and skill_extractor stays None.
# NOTE(review): astype(str) turns missing values into the literal text "nan",
# which is then processed like any other posting — confirm this is acceptable.
df["requirements_extracted"] = df["requirements"].astype(str).apply(extract_job_requirements)

In [243]:
# Persist the enriched postings. No `index` argument is given, so pandas'
# default applies and the DataFrame index is written as the first CSV column.
df.to_csv("jobs_with_requirements.csv")