In [None]:
import pandas as pd
import os
import json
import re
import spacy # Import spaCy

print("--- Loading spaCy model (this may take a moment) ---")
# The small English pipeline is sufficient: only its linguistic annotations
# are consulted further down.
nlp = spacy.load("en_core_web_sm")
print("spaCy model loaded.")


print("--- Loading the skills dataset from skills_list.csv ---")
processed_dir = os.path.join('..', 'data', 'processed')
input_file_path = os.path.join(processed_dir, 'skills_list.csv')
# Rows without a 'job_skills' value carry nothing to extract, so they are
# discarded right after loading.
df = pd.read_csv(input_file_path).dropna(subset=['job_skills'])

print("--- Processing and applying advanced cleaning ---")
# Accumulates every distinct skill string that survives validation.
all_skills_set = set()

# --- ADVANCED: Define rules using spaCy ---
# Characters that disqualify a candidate outright: any digit or currency sign.
_FORBIDDEN_CHARS_RE = re.compile(r'[\d$€£]')
# Phrases whose presence anywhere in the text marks it as job-posting noise.
_NOISE_PHRASES = ('salary', 'hourly', 'bonus', 'mindset', 'per hour', 'benefits', 'ability to')
# Company suffixes must match as whole words; a plain substring test would
# also reject real skills such as "incident management" or "zinc plating".
_COMPANY_SUFFIX_RE = re.compile(r'\b(?:llc|inc)\b')
# Coarse part-of-speech tags accepted for the first token of a skill.
_ALLOWED_FIRST_POS = frozenset({'NOUN', 'PROPN', 'ADJ'})


def is_valid_skill_advanced(skill_text: str, nlp_doc) -> bool:
    """Decide whether a cleaned skill string should be kept.

    Args:
        skill_text: The candidate skill. Matching is case-sensitive and all
            noise phrases are lower-case, so pass lower-cased text.
        nlp_doc: The spaCy document produced for ``skill_text``. Only
            indexing, ``len()``, the first token's ``pos_`` and the ``ents``
            entries' ``label_`` are used.

    Returns:
        ``True`` when the candidate passes every rule below, else ``False``.

    Rules:
        1. Text filters: length between 2 and 50 characters, no digits or
           currency symbols, no noise phrase, no standalone company suffix
           ("llc", "inc").
        2. The first token's coarse part-of-speech must be NOUN, PROPN or
           ADJ, which rejects candidates that start with a verb.
        3. No entity in the document may be labelled GPE (geopolitical
           entity, e.g. "united states").
    """
    # Rule 1: cheap text filters.
    if not 2 <= len(skill_text) <= 50:
        return False
    if _FORBIDDEN_CHARS_RE.search(skill_text):
        return False
    if any(phrase in skill_text for phrase in _NOISE_PHRASES):
        return False
    if _COMPANY_SUFFIX_RE.search(skill_text):
        return False

    # An empty document has no first token to inspect; treat it as invalid
    # instead of letting the index below raise IndexError.
    if len(nlp_doc) == 0:
        return False

    # Rule 2: the candidate must open with a noun, proper noun or adjective.
    if nlp_doc[0].pos_ not in _ALLOWED_FIRST_POS:
        return False

    # Rule 3: place names are not skills.
    return not any(ent.label_ == 'GPE' for ent in nlp_doc.ents)

# Step 1: collect every distinct cleaned candidate from the 'job_skills'
# column. The same skill string recurs across many rows, and running spaCy is
# by far the most expensive step, so duplicates are collapsed *before* any
# linguistic processing instead of re-parsing each mention.
strip_chars_re = re.compile(r'[\\"\'\(\)\[\]]')
candidate_skills = set()
for skills_string in df['job_skills']:
    for raw_skill in skills_string.split(','):
        # Normalise case/whitespace, drop quotes, backslashes and brackets,
        # then strip again in case removing them exposed padding.
        cleaned_skill = strip_chars_re.sub('', raw_skill.strip().lower()).strip()
        if cleaned_skill:
            candidate_skills.add(cleaned_skill)

# Step 2: annotate each unique candidate exactly once. nlp.pipe streams the
# texts through the pipeline in batches and yields documents in input order,
# so zipping with the input list pairs every text with its own document.
unique_candidates = sorted(candidate_skills)
for cleaned_skill, doc in zip(unique_candidates, nlp.pipe(unique_candidates)):
    # Only keep the skill if it passes the advanced validation rules.
    if is_valid_skill_advanced(cleaned_skill, doc):
        all_skills_set.add(cleaned_skill)

# Convert to a final sorted list (sorted() accepts the set directly).
master_skill_list = sorted(all_skills_set)

print(f"\nCreated a master dictionary with {len(master_skill_list)} unique, ADVANCED CLEANED skills.")
print("\nHere is a sample of the new cleaned skills:")
# A slice past the end of the list simply prints fewer (or no) entries.
print(master_skill_list[1500:1510])

# --- Save the final clean dictionary ---
output_path = os.path.join('..', 'data', 'processed', 'master_skills.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump({"skills": master_skill_list}, f, indent=4)

print(f"\nFinal master skill dictionary saved to: {output_path}")

--- Loading spaCy model (this may take a moment) ---
✅ spaCy model loaded.
--- Loading the skills dataset from skills_list.csv ---
--- Processing and applying advanced cleaning ---

✅ Created a master dictionary with 50811 unique, ADVANCED CLEANED skills.

Here is a sample of the new cleaned skills:
['analyse', 'analyses', 'analysis', 'analysis activities', 'analysis and integration', 'analysis and presentation of data', 'analysis and refinement of system requirements', 'analysis and reporting', 'analysis and validation of physical models', 'analysis driven problem solving']

✅ Final master skill dictionary saved to: ..\data\processed\master_skills.json
