In [2]:
import sys

# Make the pipx-managed spaCy installation importable from this kernel.
# NOTE(review): this is a machine-specific absolute path; installing spaCy into
# the kernel's own environment would make the notebook portable — TODO confirm.
SPACY_SITE_PACKAGES = '/home/nyto/.local/pipx/venvs/spacy/lib/python3.11/site-packages'

# Guarded so that re-running this cell does not pile up duplicate entries.
# To undo, use sys.path.remove(SPACY_SITE_PACKAGES).
if SPACY_SITE_PACKAGES not in sys.path:
    sys.path.append(SPACY_SITE_PACKAGES)

In [27]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

def load_skill_keywords(skillset_filepath, file_type='csv'):
    """Load the skill-keyword table from a CSV or Excel file.

    Args:
        skillset_filepath: Path of the keyword file to read.
        file_type: ``'csv'`` reads the file with ``pd.read_csv``; any other
            value reads it with ``pd.read_excel``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when reading fails for
        any reason (the error is printed instead of being raised).
    """
    # Choose the reader first so both formats share one success message.
    reader = pd.read_csv if file_type == 'csv' else pd.read_excel

    try:
        skills_df = reader(skillset_filepath)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error loading skill keywords: {e}")
        return None

    # Previously this message was only printed on the Excel branch.
    print("Skill keywords loaded successfully!")
    return skills_df

def create_skill_matcher(nlp, skills_df):
    """Build a PhraseMatcher with one rule per column of ``skills_df``.

    Each column name becomes a match key, and every non-null cell of that
    column becomes one phrase pattern registered under that key.

    Args:
        nlp: Loaded spaCy pipeline; its vocab backs the matcher and its
            tokenizer is used to build the patterns.
        skills_df: DataFrame whose columns are skill categories and whose
            cells are the phrases to look for.

    Returns:
        The configured ``PhraseMatcher``.

    Raises:
        ValueError: If ``skills_df`` is ``None`` (for example because the
            keyword file could not be loaded).
    """
    if skills_df is None:
        raise ValueError("skills_df is None; the skill keywords could not be loaded")

    matcher = PhraseMatcher(nlp.vocab)
    for column in skills_df.columns:
        # Cells may hold non-string values or padded text; normalise to
        # stripped strings and drop anything that ends up empty.
        phrases = (str(cell).strip() for cell in skills_df[column].dropna())
        # make_doc only tokenizes, which is all the default (verbatim text)
        # PhraseMatcher needs; running the full pipeline here is wasted work.
        skill_patterns = [nlp.make_doc(phrase) for phrase in phrases if phrase]
        if skill_patterns:
            # Patterns are passed as a list (current spaCy call form) rather
            # than the legacy ``add(key, None, *docs)`` signature.
            matcher.add(column, skill_patterns)

    return matcher

def extract_skills(text, nlp, matcher):
    """Find skill phrases in ``text`` and group them by match key.

    Args:
        text: The text to scan. Non-string values (e.g. ``None`` or ``NaN``
            coming from an empty DataFrame cell) yield an empty result.
        nlp: Callable spaCy pipeline used to turn ``text`` into a Doc; its
            vocab is used to resolve match ids back to key names.
        matcher: Callable applied to the Doc that yields
            ``(match_id, start, end)`` tuples.

    Returns:
        A dict mapping each matched key name to the list of matched text
        spans, in match order. A phrase is listed once per occurrence.
    """
    # Passing a non-string to nlp would raise and abort a whole DataFrame
    # apply; treat it as "no text, no skills" instead.
    if not isinstance(text, str):
        return {}

    doc = nlp(text)

    extracted_skills = {}
    for match_id, start, end in matcher(doc):
        # match_id is the hashed key; the string store maps it back to the name.
        skill_name = nlp.vocab.strings[match_id]
        extracted_skills.setdefault(skill_name, []).append(doc[start:end].text)

    return extracted_skills

def main(skills_path="../input/Skills_Keywords.xlsx",
         jobs_path="../input/jobs_jumble.json",
         output_path='../output/jobs_with_skills.json'):
    """Extract skills from every job description and save the result.

    Loads the spaCy model and the skill-keyword table, builds the phrase
    matcher, adds a ``skills`` column to the job data and writes the frame
    back out as JSON.

    Args:
        skills_path: Skill-keyword file; read as CSV when the suffix is
            ``.csv``, otherwise as Excel.
        jobs_path: JSON file with the job postings; must contain a
            ``description`` column.
        output_path: Destination JSON file; its parent directory is created
            if it does not exist.

    Raises:
        RuntimeError: If the skill-keyword file could not be loaded.
    """
    # Local import keeps this block self-contained; pathlib is stdlib.
    from pathlib import Path

    nlp = spacy.load('en_core_web_lg')

    file_type = 'csv' if Path(skills_path).suffix.lower() == '.csv' else 'excel'
    skill_keywords_df = load_skill_keywords(skills_path, file_type=file_type)
    if skill_keywords_df is None:
        # The loader reports failures by returning None; stop here with a
        # clear message instead of failing later inside the matcher setup.
        raise RuntimeError(f"Could not load skill keywords from {skills_path}")
    skill_matcher = create_skill_matcher(nlp, skill_keywords_df)

    job_data = pd.read_json(jobs_path)
    tqdm.pandas(desc="Traitement en cours")

    # Postings without a description are scanned as empty text so a single
    # missing value cannot abort the whole run.
    descriptions = job_data['description'].fillna("")
    job_data['skills'] = descriptions.progress_apply(extract_skills, args=(nlp, skill_matcher))

    # Create the output folder up front; otherwise the write fails only
    # after all descriptions have been processed.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    job_data.to_json(output_path)
    print(f'Les données ont été enregistrées dans {output_path}')
    
# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Skill keywords loaded successfully!


Traitement en cours: 100%|█████████████████████████████████| 223/223 [00:11<00:00, 20.13it/s]

Les données ont été enregistrées dans ../output/jobs_with_skills.json





In [14]:
# NOTE(review): `data` is not assigned in any cell visible in this notebook
# (main() keeps its frame in the local `job_data`), and this cell's execution
# count is lower than the cell above, so it presumably relies on leftover
# kernel state. TODO: define `data` explicitly (e.g. by loading the saved
# output file) so the notebook survives Restart & Run All.
data

Unnamed: 0,job_id,title,company_name,location,via,description,skills
0,eyJqb2JfdGl0bGUiOiJSZW1vdGUgU2VydmljZU5vdyBBZG...,"Remote ServiceNow Administration with Python, ...",Cube Hub Inc.,"Irvine, Californie",via SaluteMyJob,Position Description: ServiceNow Developer for...,"{'Développement Web': ['Python', 'HTML', 'CSS'..."
1,eyJqb2JfdGl0bGUiOiJMZWFkIEF1dG9tYXRlZCBQeXRob2...,Lead Automated Python Tester,Lumen,États-Unis,via Lumen Jobs,About Lumen\nLumen is guided by our belief tha...,"{'Développement Web': ['Python'], 'Programmati..."
2,eyJqb2JfdGl0bGUiOiJQeXRob24gRGV2ZWxvcGVyIiwiaH...,Python Developer,Neotech Solutions,"San José, Californie",via Indeed,Job Role: Python Developer\n\nLocation: San Jo...,"{'Développement Web': ['Python', 'Python', 'Py..."
3,eyJqb2JfdGl0bGUiOiJQeXRob24gRW5naW5lZXLigJMgKE...,Python Engineer– (Manufacturing Software Tools...,"Zodiac Solutions, Inc","San José, Californie",via LinkedIn,"Hello,\n\ni Hope you are doing great, my name ...","{'Développement Web': ['client', 'Python', 'Py..."
4,eyJqb2JfdGl0bGUiOiJHQ1AgIExlYWQgd2l0aCBTdHJvbm...,GCP Lead with Strong Python & SQL coding expe...,ClifyX,"Boston, Massachusetts",via SaluteMyJob,"1252334\nGCP(GCS, BigQuery, BQ SQL, Python, Da...","{'Programmation': ['SQL', 'Python', 'Python', ..."
...,...,...,...,...,...,...,...
218,eyJqb2JfdGl0bGUiOiJKci4gUHl0aG9uIChEamFuZ28sIE...,"Jr. Python (Django, Flask) and JavaScript (Rea...",S.i. Systems,"Toronto, ON, Canada",via Talent.com,"Jr. Python (Django, Flask) and JavaScript (Rea...","{'Développement Web': ['Python', 'Django', 'cl..."
219,eyJqb2JfdGl0bGUiOiJOb2RlLkpTIERldmVsb3BlciIsIm...,Node.JS Developer,Mindpal,"Ottawa, ON, Canada",via Talent.com,We are currently seeking a talented and experi...,"{'DevOps': ['Node.js', 'Node.js', 'Node.js', '..."
220,eyJqb2JfdGl0bGUiOiJKci4gUHl0aG9uIiwiaHRpZG9jaW...,Jr. Python,S.i. Systems,"Montreal, QC, Canada",via BeBee Canada,"Jr. Python (Django, Flask) and JavaScript (Rea...","{'Développement Web': ['Python', 'Django', 'cl..."
221,eyJqb2JfdGl0bGUiOiJKci4gUHl0aG9uIChEamFuZ28sIE...,"Jr. Python (Django, Flask) and JavaScript (Rea...",S.i. Systems,"Edmonton, AB, Canada",via Talent.com,"Jr. Python (Django, Flask) and JavaScript (Rea...","{'Développement Web': ['Python', 'Django', 'cl..."
