# Job Type NuNER-Zero KeyBERT Jaro-Winkler Classification

## Load Data

In [1]:
import pandas as pd

fileName = 'Test_Results_Translated'
excel_file = '../' + fileName + '.xlsx'

df = pd.read_excel(excel_file)

In [2]:
columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation', 'Descriptions']
existing_columns = [col for col in columns if col in df.columns]

descriptions = df[existing_columns].copy().apply(lambda x: '. '.join(x.dropna().astype(str)), axis=1).tolist()

## NuNER-Zero

### Load Model

In [3]:
import torch
from gliner import GLiNER

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

model = GLiNER.from_pretrained("numind/NuNerZero")
model = model.to(device)

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]



### Merge Entities Function

In [4]:
def merge_entities(entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['label'] == current['label'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['text'] = text[current['start']: next_entity['end']].strip()
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    # Append the last entity
    merged.append(current)
    return merged

### Evaluate

In [5]:
def split_text(text, max_length=384):
    segments = []
    start = 0

    while start < len(text):
        end = min(start + max_length, len(text))
        split_point = text.rfind('.', start, end)

        if split_point == -1 or end == len(text):
            segments.append(text[start:end])
            break
        else:
            segments.append(text[start:split_point + 1])
            start = split_point + 1

    return segments

In [6]:
# NuZero requires labels to be lower-cased!
labels = ["Field", "Job Type", "Task"]
labels = [l.lower() for l in labels]

textFields = []

for text in descriptions:
    result = split_text(text)

    for segment in result:
        entities = model.predict_entities(segment, labels)
        entities = merge_entities(entities)

        fields = []

        for entity in entities:
            fields.append(entity["text"])

    textFields.append(fields)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Prepare Data

### Tokenization

In [7]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
descriptions_token = []

for description in descriptions:
    tokens = word_tokenize(description)
    descriptions_token.append(tokens)

### Remove Hyphens

In [9]:
def remove_hyphens(text):
    return [item.strip('-') for item in text]

In [10]:
rem_hyphens = []

for description in descriptions_token:
    tokens = remove_hyphens(description)
    rem_hyphens.append(tokens)

### Lowcase all the text

In [11]:
def lowercase_text(text):
    return [item.lower() for item in text]

In [12]:
lowcase_descriptions = []

for description in rem_hyphens:
    tokens = lowercase_text(description)
    lowcase_descriptions.append(tokens)

### Remove casing

In [13]:
def remove_case(text):
    return [item.strip('[]') for item in text]

In [14]:
rem_casing = []

for description in lowcase_descriptions:
    tokens = remove_case(description)
    rem_casing.append(tokens)

### Remove stopwords

In [15]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
stop_words = set(stopwords.words("english"))

In [17]:
def remove_stopwords(text):
    return [word for word in text if word.casefold() not in stop_words]

In [18]:
rem_stopwords = []

for description in rem_casing:
    tokens = remove_stopwords(description)
    rem_stopwords.append(tokens)

### Remove unicode symbols

In [19]:
import re

In [20]:
unicode_regex = re.compile(r'[^\x00-\x7F]')

In [21]:
def remove_unicode(text):
    return [word for word in text if not unicode_regex.search(word)]

In [22]:
rem_unicode = []

for description in rem_stopwords:
    tokens = remove_unicode(description)
    rem_unicode.append(tokens)

### Remove numbers and digits

In [23]:
number_regex = re.compile(r'\d')

In [24]:
def remove_number(text):
    return [word for word in text if not number_regex.search(word)]

In [25]:
rem_number = []

for description in rem_unicode:
    tokens = remove_number(description)
    rem_number.append(tokens)

### Remove Special Words Mixed with String, Numbers, Punctuation, and Other Symbols

In [26]:
special_regex = re.compile(r'^(?=.*\d)(?=.*[A-Za-z])|(?=.*[A-Za-z])(?=.*[\W_])|(?=.*\d)(?=.*[\W_])')

In [27]:
def remove_special(text):
    return [word for word in text if not special_regex.search(word)]

In [28]:
rem_special = []

for description in rem_number:
    tokens = remove_special(description)
    rem_special.append(tokens)

### Remove URL

In [29]:
url_regex = re.compile(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

In [30]:
def remove_url(text):
    return [word for word in text if not url_regex.search(word)]

In [31]:
rem_url = []

for description in rem_special:
    tokens = remove_url(description)
    rem_url.append(tokens)

### Remove Punctuation

In [32]:
import string

def remove_punct(text):
    return [word for word in text if word not in string.punctuation]

In [33]:
rem_punct = []

for description in rem_url:
    tokens = remove_punct(description)
    rem_punct.append(tokens)

### Remove Repetitions

In [34]:
def remove_duplicates(arr):
    return list(set(arr))

In [35]:
rem_dupl = []

for description in rem_punct:
    tokens = remove_duplicates(description)
    rem_dupl.append(tokens)

### Remove single characters and non-english words

In [36]:
from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [37]:
english_words = set(words.words())

In [38]:
def remove_non_eng(text):
    return [word for word in text if word in english_words]

In [39]:
def remove_single_char_words(arr):
    return list(filter(lambda word: len(word) > 1, arr))

In [40]:
rem_noEnglish = []

for description in rem_dupl:
    tokens = remove_non_eng(description)
    tokens = remove_single_char_words(tokens)
    rem_noEnglish.append(tokens)

### Remove spelling mistakes

In [41]:
from spellchecker import SpellChecker

def remove_misspelled_words(arr):
    spell = SpellChecker()
    return [word for word in arr if word in spell]

In [42]:
rem_misspelled = []

for description in rem_noEnglish:
    tokens = remove_misspelled_words(description)
    rem_misspelled.append(tokens)

### Lemmatization

In [43]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [44]:
def lematize(arr):
    return [wnl.lemmatize(word) for word in arr]

In [45]:
cleaned_texts = []

for description in rem_misspelled:
    tokens = lematize(description)
    cleaned_texts.append(tokens)

## KeyBERT

In [46]:
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

model = SentenceTransformer("all-MiniLM-L6-v2")
model = model.to(device)

kw_model = KeyBERT(model)

In [47]:
def glue_texts(array):
    return [' '.join(tokens) for tokens in array]

In [48]:
final_texts = glue_texts(cleaned_texts)

In [49]:
keywords_per_text = []
for text in final_texts:
    keywords = kw_model.extract_keywords(text)
    keywords_per_text.append([item[0] for item in keywords])

## Jaro-Winkler

In [50]:
category_map = {
    "Undefined": [],
    "Management": [
        "General Manager", "CEO", "Executive Director", "Operations Manager", "Project Leader",
        "Program Director", "Department Head", "Executive Officer", "Division Head", "Branch Director",
        "Regional Director", "Country Director", "Site Director", "Organizational Development",
        "Business Strategy", "Leadership Development", "Management Consulting", "Team Leadership",
        "Performance Management", "Employee Development", "Talent Management", "Succession Planning",
        "Conflict Resolution", "Staff Management", "Workforce Planning", "Goal Setting", "Performance Appraisal",
        "HR Management", "Financial Management", "Marketing Management", "Sales Management",
        "Product Management", "IT Management", "Quality Management", "Supply Chain Management",
        "Risk Management", "Compliance Management", "Customer Service Management", "Facilities Management",
        "Administrative Management", "Corporate Governance", "Business Operations", "Stakeholder Management",
        "Process Improvement", "Business Continuity", "KPI Management", "Resource Allocation", "Crisis Management",
        "Innovation Management", "Knowledge Management", "Strategic Initiatives", "Organizational Behavior",
        "Change Leadership", "Strategic Execution", "Leadership Communication", "Decision Making",
        "Resource Optimization"
    ],
    "Sales and Marketing": [
        "Sales Representative", "Sales Manager", "Sales Director", "Account Executive",
        "Business Development Representative", "Sales Specialist", "Marketing Manager", "Product Manager",
        "Marketing Director", "Marketing Specialist", "Digital Marketing Manager", "Content Marketing Manager",
        "SEO Specialist", "Business Development", "Market Research", "Advertising", "Sales Strategies",
        "Customer Relationship Management", "Digital Marketing", "Social Media Marketing", "Public Relations",
        "Content Marketing", "Email Marketing", "Lead Generation", "Campaign Management", "Market Segmentation",
        "Sales Forecasting", "Customer Engagement", "Sales Operations", "Promotional Strategies", "Sales Enablement",
        "Marketing Communications", "Consumer Behavior", "Sales Analytics", "Market Analysis", "Brand Management",
        "Customer Insights", "Campaign Strategy", "Content Strategy", "Digital Advertising", "Sales Optimization",
        "Customer Experience Management", "Influencer Marketing", "Growth Hacking", "Performance Marketing"
    ],
    "Engineering and Science": [
        "Software Engineer", "Mechanical Engineer", "Electrical Engineer", "Civil Engineer", "Structural Engineer",
        "Chemical Engineer", "Environmental Engineer", "Industrial Engineer", "Systems Engineer", "Hardware Engineer",
        "IT Support Specialist", "Data Analyst", "Systems Analyst", "Network Engineer", "Database Administrator",
        "Cybersecurity Specialist", "Cloud Computing Specialist", "DevOps Engineer", "AI Developer", "Software Developer",
        "IT Project Manager", "Software Development", "Network Security", "Cloud Services", "Data Management",
        "Systems Integration", "Quality Assurance", "Data Engineering", "IT Management", "Software Architecture",
        "Digital Transformation", "Application Development", "Hardware Design", "Systems Engineering",
        "Network Architecture", "Cybersecurity Protocols", "Data Analytics", "Cloud Infrastructure",
        "Emerging Technologies", "Geospatial Analysis", "Spatial Data Management", "Remote Sensing",
        "Artificial Intelligence", "Machine Learning", "Robotics", "Big Data Analytics", "IoT", "Computational Science",
        "Mathematician", "Statistician", "Data Scientist", "Quantitative Analyst", "Actuary",
        "Mathematical Modeler", "Operations Research Analyst", "Physicist", "Chemist", "Astronomer",
        "Geophysicist", "Environmental Scientist", "Biochemist", "Research Scientist", "Laboratory Technician",
        "Climate Scientist", "Mathematical Analysis", "Data Modeling", "Statistical Analysis", "Experimental Research",
        "Environmental Analysis", "Scientific Research", "Applied Mathematics", "Physical Chemistry",
        "Organic Chemistry", "Inorganic Chemistry", "Astrophysics", "Quantum Mechanics",
        "Photonics", "Laser Technology", "Quantum Computing", "Scientific Data Analysis", "Experimental Physics",
        "Statistical Mechanics", "Spatial Statistics", "Geostatistics", "Environmental Modeling",
        "Neuroscience", "Computational Biology", "Bioinformatics", "Complex Systems", "Neurorehabilitation",
        "Trauma Care", "Psychiatric Medicine", "Geriatric Medicine", "Telemedicine", "PhD", "Doctorate",
        "Postdoctoral Research", "Academic Research", "Research Fellow", "Principal Investigator",
        "Research Associate", "Assistant Professor", "Associate Professor", "Professor", "Lecturer", "Microbiology",
        "Biology", "Geosciences"
    ],
    "Finance": [
        "Accountant", "Financial Analyst", "Controller", "Auditor", "Budget Analyst", "Investment Analyst",
        "Treasury Analyst", "Risk Manager", "Tax Advisor", "Credit Analyst", "Cash Flow Manager",
        "Financial Records", "Financial Analysis", "Compliance", "Office Management", "Budget Management",
        "Payroll Management", "Accounts Payable", "Accounts Receivable", "Financial Planning", "Corporate Finance",
        "Cost Accounting", "Management Accounting", "Financial Reporting", "Internal Audit", "Business Valuation",
        "Asset Management", "Credit Management", "Financial Risk Management", "Regulatory Compliance", "Treasury Management",
        "Financial Modeling", "Risk Assessment", "Budget Planning", "Audit Procedures", "Financial Strategy",
        "Administrative Operations", "Expense Management", "Financial Technology", "Fintech Innovations",
        "Equity Research Analyst", "Portfolio Manager", "Investment Banker", "Financial Consultant",
        "Hedge Fund Manager", "Private Equity Analyst", "Venture Capital Analyst", "Corporate Treasurer",
        "Financial Advisor", "Commercial Banker", "Loan Officer", "Trade Analyst", "Merger and Acquisition Analyst",
        "Forensic Accountant", "Tax Planner", "Estate Planner", "Credit Risk Analyst", "Quantitative Finance Analyst",
        "Personal Financial Advisor", "Insurance Underwriter", "Pension Fund Manager", "Real Estate Finance Specialist"
    ],
    "Administration": [
        "Administrative Assistant", "Office Manager", "Executive Assistant", "Office Administrator",
        "Receptionist", "Clerical Assistant", "Data Entry Clerk", "Office Coordinator",
        "Administrative Coordinator", "Secretary", "Personal Assistant", "Administrative Manager",
        "Front Desk Coordinator", "Customer Service Administrator", "Records Manager", "Office Support Specialist",
        "Mailroom Clerk", "File Clerk", "Virtual Assistant", "Scheduling Coordinator", "Office Operations Manager",
        "Office Support Supervisor", "Executive Secretary", "Administrative Support Specialist", "Administrative Officer",
        "Office Supervisor", "Administrative Director", "Facilities Coordinator", "Document Controller",
        "Administrative Receptionist", "Office Assistant", "Operations Administrator"
    ]
}

In [51]:
from jaro import jaro_winkler_metric

In [52]:
def classify_text(extracted_keywords, category_map):
    similarity_scores = {}

    def compute_max_similarity(keywords, category_keywords):
        max_similarity = 0
        for kw in keywords:
            for cat_kw in category_keywords:
                similarity = jaro_winkler_metric(kw, cat_kw)
                if similarity > max_similarity:
                    max_similarity = similarity
        return max_similarity

    for category, keywords in category_map.items():
        similarity = compute_max_similarity(extracted_keywords, keywords)
        similarity_scores[category] = similarity

    best_category = max(similarity_scores, key=similarity_scores.get)
    return best_category, similarity_scores

In [53]:
finalKeywords = []

for i in range(0, len(keywords_per_text)):
    finalKeywords.append(keywords_per_text[i] + textFields[i])

## Zero Shot

### Load the pre-trained zeroShot RoBERTa tokenizer and model

In [54]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

### Create a Zero-Shot Classification pipeline

In [55]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

## Classification & Evaluation

In [56]:
import datetime

# Initialize a dictionary to keep track of counts for each category
category_counts = {category: 0 for category in category_map.keys()}
total_count = len(finalKeywords)  # Total number of offers
accuracy = 0
jobTypes = []

for keywords, text in zip(finalKeywords, descriptions):
    best_category, similarity_scores = classify_text(keywords, category_map)
    matched_categories = []

    for category, score in similarity_scores.items():
        if score >= 0.6:
            matched_categories.append(category)

    finalResult = ''
    if not matched_categories:
        finalResult = "Undefined"
    elif len(matched_categories) ==  1:
        finalResult = matched_categories[0]
    else:
        result = classifier(text, candidate_labels=matched_categories)
        finalResult = result['labels'][0]
    category_counts[finalResult] += 1
    jobTypes.append(finalResult)
    
df['JobType'] = jobTypes
df.to_excel('../' + fileName + '_JobType.xlsx', index=False, engine='openpyxl')

# Print percentage of offers in each category
for category, count in category_counts.items():
    if (category == "Undefined"):
        percentage = (count / total_count) * 100
        print(f"Classified: {(100 - percentage):.2f}%")
        print(f"{category}: {percentage:.2f}%")
        print("\n ----------------------------- \n")
    else:
        percentage = (count / total_count) * 100
        print(f"Category '{category}': {percentage:.2f}%")

print("\n ----------------------------- \n")

# Calculate accuracy
true_categories = df['Type'].tolist()
predicted_categories = df['JobType'].tolist()

# Count correct predictions
correct_predictions = sum(1 for true, pred in zip(true_categories, predicted_categories) if true == pred)

# Calculate percentage accuracy
accuracy = (correct_predictions / total_count) * 100

# Print the result
print(f"Accuracy: {accuracy:.2f}%")

print("\n ----------------------------- \n")
print("Document Annotated")

Classified: 100.00%
Undefined: 0.00%

 ----------------------------- 

Category 'Management': 4.44%
Category 'Sales and Marketing': 0.00%
Category 'Engineering and Science': 91.11%
Category 'Finance': 0.00%
Category 'Administration': 4.44%

 ----------------------------- 

Accuracy: 91.11%

 ----------------------------- 

Document Annotated
