In [104]:
import json
import re
import random
import html
import spacy
from spacy.matcher import PhraseMatcher

In [105]:
# Blank English pipeline. In this notebook it is only used to tokenize text
# (via nlp(...) and nlp.make_doc(...)) so that matched spans fall on spaCy token boundaries.
nlp = spacy.blank("en")
# attr="LOWER" makes the matcher compare lowercased token text, so matching is case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [None]:
# Path to the LinkedIn skills list (read as UTF-8 text, one skill per line).
# The path is relative, so it resolves against the notebook's working directory.
skill_file = "../data/linkedin_skills.txt"

In [107]:
# Read the skills file: strip every line and keep only the non-empty results.
with open(skill_file, "r", encoding="utf-8") as f:
    skill_list = list(filter(None, map(str.strip, f)))

In [None]:
# Function to clean skills (fix HTML entities, ampersands and hyphens; mark single letters)
def clean_skill(skill):
    """Normalize a raw skill string into the lowercase form used for matching.

    The steps are ordered so that the function is idempotent: cleaning an
    already-cleaned skill returns it unchanged. This matters because skills
    are cleaned once before being registered as matcher patterns and again
    when sentences are generated.

    Args:
        skill: Raw skill text, possibly containing HTML entities, tabs,
            hyphens, ampersands or typographic apostrophes.

    Returns:
        The cleaned, lowercase skill string (may be empty).
    """
    skill = html.unescape(skill)  # Convert HTML entities (&amp; -> &)

    # Normalize typographic apostrophes to the plain ASCII apostrophe.
    skill = skill.replace("’", "'")

    # Spell out ampersands as a separate word. The surrounding spaces keep
    # "R&D" from collapsing into "randd"; extra spaces are removed below.
    skill = skill.replace("&", " and ")

    # Convert hyphens to spaces for better tokenization.
    skill = skill.replace("-", " ")

    # Collapse all whitespace (tabs included) AFTER the replacements above,
    # since those can introduce runs of spaces (e.g. "a - b").
    skill = re.sub(r"\s+", " ", skill).strip()

    # Mark single-letter words (e.g. "v" in "hyper v") by appending "_".
    # Note: a lone letter after an apostrophe (the "s" in "women's") is also marked.
    skill = re.sub(r"\b([a-zA-Z])\b", r"\1_", skill)

    return skill.lower()  # Lowercase for case-insensitive matching

In [None]:
# Sentence templates used to synthesize job-posting style sentences.
# Each template contains a single "{}" placeholder that is filled with a
# cleaned skill via str.format when training data is generated.
# This is a temporary approach until we get live data coming in.
sentence_templates = [
    "{} is a required skill for this position.",
    "Candidates should have expertise in {}.",
    "Proficiency in {} is essential for this job.",
    "A background in {} is strongly preferred.",
    "We are looking for applicants skilled in {}.",
    "Experience with {} is a significant advantage.",
    "{} knowledge is a key qualification.",
    "An ideal candidate must be proficient in {}.",
    "Knowledge of {} is required for this role.",
    "{} is an important skill for candidates.",
    "Candidates should demonstrate strong {} skills.",
    "Proficiency in {} is preferred by employers.",
    "Having a solid background in {} will be beneficial.",
    "{} is a valuable skill for this position.",
    "Having experience in {} will be beneficial.",
    "Familiarity with {} is a must-have for this role.",
    "Candidates should be well-versed in {}.",
    "Employers seek professionals with {} expertise.",
    "A solid foundation in {} is required.",
    "{} is highly valued in this industry.",
    "Strong {} skills are necessary for success in this role.",
    "Understanding {} is crucial for this job.",
    "Applicants must demonstrate competency in {}.",
    "A candidate must be experienced in {}.",
    "{} proficiency will help you excel in this role.",
    "The ability to work with {} is a must-have for this position.",
    "We are prioritizing candidates familiar with {}.",
    "Employers value professionals who are skilled in {}."
]

In [110]:
# Normalize every loaded skill with clean_skill
skill_list = list(map(clean_skill, skill_list))

# Tokenize each cleaned skill into a Doc and register all of them
# with the PhraseMatcher under the "SKILL" key
skill_patterns = list(map(nlp.make_doc, skill_list))
matcher.add("SKILL", skill_patterns)

In [None]:
def _drop_overlapping_spans(char_spans):
    """Keep the longest non-overlapping (start, end) spans, ordered by start offset."""
    kept = []
    # Visit longest spans first so a longer match wins over any shorter one it overlaps.
    for start, end in sorted(set(char_spans), key=lambda s: (s[0] - s[1], s[0])):
        if all(end <= kept_start or start >= kept_end for kept_start, kept_end in kept):
            kept.append((start, end))
    return sorted(kept)


def generate_training_data(skill_list, sentence_templates, num_sentences=5000):
    """Generates labeled training data for spaCy's NER model.

    Every (skill, template) combination is visited at most once, in random
    order, so the function always terminates, even when fewer unique
    sentences exist than were requested or when an input list is empty.

    Relies on the module-level ``nlp`` pipeline, the ``matcher`` PhraseMatcher
    and ``clean_skill``.

    Args:
        skill_list: Skill strings; each one is passed through ``clean_skill``.
            Skills shorter than two characters after cleaning are ignored.
        sentence_templates: Format strings with a single ``{}`` placeholder.
        num_sentences: Maximum number of labeled sentences to produce.

    Returns:
        A list of ``(sentence, {"entities": [(start_char, end_char, "SKILL"), ...]})``
        tuples. Entities within a sentence never overlap. The list may be
        shorter than ``num_sentences`` if the inputs cannot yield enough
        unique, matchable sentences.
    """
    training_data = []
    if num_sentences <= 0:
        return training_data

    # Clean once up front, drop unusable skills, and de-duplicate (order preserved).
    skills = list(dict.fromkeys(
        cleaned for cleaned in map(clean_skill, skill_list) if len(cleaned) >= 2
    ))
    templates = list(dict.fromkeys(sentence_templates))
    total_pairs = len(skills) * len(templates)

    used_sentences = set()

    # A random permutation of all pair indices replaces the unbounded
    # "pick at random and retry" loop, which could spin forever.
    for pair_index in random.sample(range(total_pairs), total_pairs):
        if len(training_data) >= num_sentences:
            break

        skill_index, template_index = divmod(pair_index, len(templates))
        skill = skills[skill_index]
        sentence = templates[template_index].format(skill)

        # Different pairs can still render the same sentence; keep sentences unique.
        if sentence in used_sentences:
            continue
        used_sentences.add(sentence)

        # Tokenize with spaCy so matched spans fall on token boundaries.
        doc = nlp(sentence)

        # Keep only matches that contain the skill inserted into this sentence.
        candidate_spans = []
        for _match_id, start, end in matcher(doc):
            span = doc[start:end]
            if skill in span.text.lower():
                candidate_spans.append((span.start_char, span.end_char))

        if not candidate_spans:
            print(f"Skipping misaligned skill: '{skill}' in sentence: '{sentence}'")
            continue  # Skip misaligned skill

        # spaCy rejects overlapping entity annotations, so resolve them here.
        entities = [
            (start_char, end_char, "SKILL")
            for start_char, end_char in _drop_overlapping_spans(candidate_spans)
        ]
        training_data.append((sentence, {"entities": entities}))

    if len(training_data) < num_sentences:
        print(
            f"Only {len(training_data)} of {num_sentences} requested sentences "
            "could be generated from the given skills and templates."
        )

    return training_data

In [None]:
# Generate training data
# Seed the RNG first so that Restart & Run All reproduces the same dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
training_data = generate_training_data(skill_list, sentence_templates, num_sentences=20000)

Skipping misaligned skill: 'ima' in sentence: 'Applicants must demonstrate competency in ima.'


In [None]:
# Write the labeled examples to disk as indented JSON
# (tuples are serialized as JSON arrays)
output_file = "../data/spacy_training_data.json"
with open(output_file, "w") as f:
    f.write(json.dumps(training_data, indent=4))

In [None]:
# Report where the data was written and how many labeled sentences it contains
print(f"Labeled training data saved to {output_file} with {len(training_data)} sentences!")

✅ Labeled training data saved to ../data/spacy_training_data.json with 20000 sentences!
