In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

# Shared spaCy pipeline; the helper functions in this cell read it as a global.
nlp = spacy.load("en_core_web_sm")

# Read the complete resume dataset from disk.
df_resume = pd.read_csv("/home/koala/github/NLP/NLP-A4/data/Resume.csv")

# Draw a random sample of at most 500 resumes: permute the row labels,
# reorder the frame accordingly, keep the leading 500 rows and renumber them
# from 0. No seed is set in this cell, so the sample differs between runs.
shuffled_index = np.random.permutation(df_resume.index)
df_resume = df_resume.reindex(shuffled_index).iloc[:500].reset_index(drop=True)

def preprocessing(sentence):
    """Reduce raw text to a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    are discarded when they are stop words or when their coarse part-of-speech
    tag is ``PUNCT``, ``SYM`` or ``SPACE``.

    Args:
        sentence: The text to clean.

    Returns:
        The lemma of every surviving token, lower-cased and stripped of
        surrounding whitespace, joined by single spaces.
    """
    skipped_pos = {'PUNCT', 'SYM', 'SPACE'}
    clean_tokens = []
    for token in nlp(sentence):
        # STOP_WORDS is already a set (O(1) lookups) with lowercase entries,
        # so compare the lower-cased token text; otherwise capitalised stop
        # words such as "The" would slip through the filter.
        if token.text.lower() in STOP_WORDS or token.pos_ in skipped_pos:
            continue
        lemma = token.lemma_.lower().strip()
        # Skip lemmas that are empty after stripping so the joined result
        # never contains doubled or leading/trailing spaces.
        if lemma:
            clean_tokens.append(lemma)
    return " ".join(clean_tokens)

def get_skills(text):
    """Return the text of every entity whose label begins with ``SKILL``.

    The text is processed with the module-level spaCy pipeline ``nlp`` and the
    resulting ``doc.ents`` are filtered by label prefix.

    NOTE(review): this only yields results if the pipeline produces labels
    starting with ``SKILL`` -- presumably via a custom NER component or entity
    ruler that is not set up in this cell; confirm before relying on it.

    Args:
        text: The text to scan for skill entities.

    Returns:
        A list of entity strings in document order (empty if none match).
    """
    return [
        entity.text
        for entity in nlp(text).ents
        if re.match(r'^SKILL', entity.label_)
    ]

def get_job_titles(text):
    """Return the text of every entity labelled exactly ``JOB_TITLE``.

    The text is processed with the module-level spaCy pipeline ``nlp`` and the
    resulting ``doc.ents`` are filtered by label.

    NOTE(review): this assumes ``JOB_TITLE`` is an entity label the loaded
    pipeline actually emits; no component defining it is added in this cell,
    so verify the model before relying on the output.

    Args:
        text: The text to scan for job-title entities.

    Returns:
        A list of entity strings in document order (empty if none match).
    """
    return [
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == 'JOB_TITLE'
    ]

# Clean every resume once and keep the result as its own column.
cleaned_resumes = df_resume['Resume_str'].apply(preprocessing)
df_resume['Clean_resume'] = cleaned_resumes

# Skill extraction is currently disabled; to re-enable it, run:
#   df_resume['Skills'] = df_resume['Clean_resume'].apply(get_skills)

# Extract job titles from the cleaned text. The quality of this column depends
# entirely on how `get_job_titles` identifies titles -- adjust that function
# as necessary.
# NOTE(review): entity recognition is run on the cleaned (lower-cased,
# lemmatised, stop-word-free) text rather than the raw resume; confirm that
# this is intended.
df_resume['Job_Titles'] = cleaned_resumes.apply(get_job_titles)

# Write the sampled rows to a new CSV. Only the Job_Titles column is written
# (no skills column exists while skill extraction is disabled).
df_resume.to_csv(
    "extracted_skills_job_titles.csv",
    columns=['Job_Titles'],
    index=False,
)


  from .autonotebook import tqdm as notebook_tqdm
