In [1]:
import spacy
from spacy.training.example import Example

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
import pandas as pd

df = pd.read_csv('Data.csv')

In [None]:
print(df.info())

In [None]:
print(df.describe())

In [None]:
print(df.head())

In [None]:
print(df['Description'].value_counts())

In [None]:
# Access a specific column value from the first row
first_description = df.loc[0, 'Description']

# Display the value
print(first_description)

In [4]:
import re

def preprocess_text(text):
    # Remove extra whitespaces
    text = ' '.join(text.split())

    # Remove unwanted characters
    text = re.sub(r"[^a-zA-Z0-9.,!?']", ' ', text)

    # Ensure proper spacing after punctuation
    text = re.sub(r'(?<=[.,!?])(?=[^\s])', ' ', text)

    return text


def preprocess(text):
    doc = nlp(text)
    return ' '.join([token.text.lower() for token in doc if not token.is_punct])


df['processed_text'] = df['Description'].apply(preprocess_text)
df['clean_text'] = df['processed_text'].apply(preprocess)

In [5]:
print(df['processed_text'])

0     Company DescriptionAt Lakeshore, we create inn...
1     DescriptionPosition Title  Store ManagerDepart...
2     Company OverviewKREWE is an independent high f...
3     Company DescriptionVuori is re defining what a...
4     ABOUT US Founded in 1945, Pierre Balmain's epo...
                            ...                        
88    Surgical  Orthopedic 5 South   Certified Nurse...
89    GUARDIAN ANGEL SENIOR SERVICES We no longer re...
90    Dietary Aide is responsible for running the tr...
91    Get paid daily with DailyPay! RequirementsCNA ...
92    Livingston Hills Nursing and rehab is actively...
Name: processed_text, Length: 93, dtype: object


In [6]:
ner = nlp.get_pipe("ner")
ner.add_label("JOB_REQUIREMENT")

1

In [7]:
train_data = [
    ("we are seeking a skilled software engineer with a strong background in python development", {"entities": [(25, 42, "JOB_REQUIREMENT")]}),
    ("the candidate should have experience working with machine learning frameworks like tensorflow and pytorch", {"entities": [(50, 66, "JOB_REQUIREMENT"), (83, 105, "JOB_REQUIREMENT")]}),
    ("excellent problem-solving skills and the ability to work in a fast-paced environment are required", {"entities": [(0, 32, "JOB_REQUIREMENT"), (41, 84, "JOB_REQUIREMENT")]}),
    ("familiarity with cloud computing platforms such as aws or azure is a plus", {"entities": [(17, 32, "JOB_REQUIREMENT"), (51, 63, "JOB_REQUIREMENT")]}),
    ("the ideal candidate will possess a bachelor's degree in computer science or a related field", {"entities": [(35, 72, "JOB_REQUIREMENT")]}),
    ("strong communication and interpersonal skills are essential for this role", {"entities": [(0, 20, "JOB_REQUIREMENT"), (25, 45, "JOB_REQUIREMENT")]}),
    ("experience in project management and the ability to lead a team are key requirements", {"entities": [(14, 32, "JOB_REQUIREMENT"), (41, 63, "JOB_REQUIREMENT")]}),
    ("the candidate must be proficient in sql and have a deep understanding of database management", {"entities": [(36, 39, "JOB_REQUIREMENT"), (73, 92, "JOB_REQUIREMENT")]}),
    ("candidates with certifications such as pmp or scrum master are highly desirable", {"entities": [(39, 42, "JOB_REQUIREMENT"), (46, 58, "JOB_REQUIREMENT")]}),
    ("proven experience in designing and implementing scalable and reliable systems is a must", {"entities": [(21, 77, "JOB_REQUIREMENT")]}),
    ("the successful candidate will be responsible for collaborating with cross-functional teams", {"entities": [(49, 90, "JOB_REQUIREMENT")]}),
]

In [None]:
spacy.training.offsets_to_biluo_tags(nlp.make_doc("the ideal candidate will possess a bachelor's degree in computer science or a related field"), [(35, 72, "JOB_REQUIREMENT")])

In [8]:
for _, annotations in train_data:
    for ent in annotations.get("entities", []):
        ner.add_label(ent[2])

In [26]:
n_iter = 90
for _ in range(n_iter):
    for text, annotations in train_data:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.5)

In [27]:
nlp.to_disk("job_requirement_ner_model")

In [28]:
loaded_nlp = spacy.load("job_requirement_ner_model")

In [29]:
def extract_job_requirements(text):
    doc = loaded_nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "JOB_REQUIREMENT"]

In [30]:
df['job_requirements'] = df['clean_text'].apply(extract_job_requirements)

In [31]:
print(df[['Description', 'job_requirements']])

                                          Description  \
0   Company DescriptionAt Lakeshore, we create inn...   
1   DescriptionPosition Title: Store ManagerDepart...   
2   Company OverviewKREWE is an independent high-f...   
3   Company DescriptionVuori is re-defining what a...   
4   ABOUT US:Founded in 1945, Pierre Balmain's epo...   
..                                                ...   
88  Surgical /Orthopedic 5 South / Certified Nurse...   
89  GUARDIAN ANGEL SENIOR SERVICES We no longer re...   
90  Dietary Aide is responsible for running the tr...   
91  Get paid daily with DailyPay!RequirementsCNA c...   
92  Livingston Hills Nursing and rehab is actively...   

                                     job_requirements  
0   [(company descriptionat, JOB_REQUIREMENT), (in...  
1   [(descriptionposition title, JOB_REQUIREMENT),...  
2   [(company overviewkrewe, JOB_REQUIREMENT), (ey...  
3   [(company descriptionvuori, JOB_REQUIREMENT), ...  
4   [(us founded, JOB_REQUIREMENT),

In [33]:
print(df['job_requirements'])

0     [(company descriptionat, JOB_REQUIREMENT), (in...
1     [(descriptionposition title, JOB_REQUIREMENT),...
2     [(company overviewkrewe, JOB_REQUIREMENT), (ey...
3     [(company descriptionvuori, JOB_REQUIREMENT), ...
4     [(us founded, JOB_REQUIREMENT), (balmain 's ep...
5     [(80 years dollar general helps, JOB_REQUIREME...
6     [(job descriptionare, JOB_REQUIREMENT), (kansa...
7     [(job detailsdescriptionabout, JOB_REQUIREMENT...
8     [(job detailsdescriptionabout, JOB_REQUIREMENT...
9     [(store manager, JOB_REQUIREMENT), (store mana...
10    [(80 years dollar general helps, JOB_REQUIREME...
11    [(vibes    bold style inspired, JOB_REQUIREMEN...
12    [(store manager, JOB_REQUIREMENT), (helping yo...
13    [(onsitelocation   onsite, JOB_REQUIREMENT), (...
14    [(store manager, JOB_REQUIREMENT), (collection...
15    [(shop manager, JOB_REQUIREMENT), (development...
16    [(retail manager, JOB_REQUIREMENT), (leadershi...
17    [(retail store, JOB_REQUIREMENT), (manager

In [32]:
pd.set_option('display.max_rows', None)

In [25]:
pd.reset_option('display.max_rows')