# Job Type KeyBERT Jaro-Winkler Classification

### Load Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

excel_file = './Euraxess_GNSS_Keywords.xlsx'
df = pd.read_excel(excel_file)

_, sampled_df = train_test_split(df, test_size=0.01, random_state=42)

### Prepare Data

In [2]:
sampled_df['Concatenated'] = sampled_df[['Title', 'OfferDescription', 'Requirements', 'AdditionalInformation']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

descriptions = sampled_df['Concatenated'].tolist()

### Tokenization

In [3]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
descriptions_token = []

for description in descriptions:
    tokens = word_tokenize(description)
    descriptions_token.append(tokens)

### Remove Hyphens

In [5]:
def remove_hyphens(text):
    return [item.strip('-') for item in text]

In [6]:
rem_hyphens = []

for description in descriptions_token:
    tokens = remove_hyphens(description)
    rem_hyphens.append(tokens)

### Lowcase all the text

In [7]:
def lowercase_text(text):
    return [item.lower() for item in text]

In [8]:
lowcase_descriptions = []

for description in rem_hyphens:
    tokens = lowercase_text(description)
    lowcase_descriptions.append(tokens)

### Remove casing

In [9]:
def remove_case(text):
    return [item.strip('[]') for item in text]

In [10]:
rem_casing = []

for description in lowcase_descriptions:
    tokens = remove_case(description)
    rem_casing.append(tokens)

### Remove stopwords

In [11]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
stop_words = set(stopwords.words("english"))

In [13]:
def remove_stopwords(text):
    return [word for word in text if word.casefold() not in stop_words]

In [14]:
rem_stopwords = []

for description in rem_casing:
    tokens = remove_stopwords(description)
    rem_stopwords.append(tokens)

### Remove unicode symbols

In [15]:
import re

In [16]:
unicode_regex = re.compile(r'[^\x00-\x7F]')

In [17]:
def remove_unicode(text):
    return [word for word in text if not unicode_regex.search(word)] 

In [18]:
rem_unicode = []

for description in rem_stopwords:
    tokens = remove_unicode(description)
    rem_unicode.append(tokens)

### Remove numbers and digits

In [19]:
number_regex = re.compile(r'\d')

In [20]:
def remove_number(text):
    return [word for word in text if not number_regex.search(word)] 

In [21]:
rem_number = []

for description in rem_unicode:
    tokens = remove_number(description)
    rem_number.append(tokens)

### Remove Special Words Mixed with String, Numbers, Punctuation, and Other Symbols

In [22]:
special_regex = re.compile(r'^(?=.*\d)(?=.*[A-Za-z])|(?=.*[A-Za-z])(?=.*[\W_])|(?=.*\d)(?=.*[\W_])')

In [23]:
def remove_special(text):
    return [word for word in text if not special_regex.search(word)] 

In [24]:
rem_special = []

for description in rem_number:
    tokens = remove_special(description)
    rem_special.append(tokens)

### Remove URL

In [25]:
url_regex = re.compile(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

In [26]:
def remove_url(text):
    return [word for word in text if not url_regex.search(word)]

In [27]:
rem_url = []

for description in rem_special:
    tokens = remove_url(description)
    rem_url.append(tokens)

### Remove Punctuation

In [28]:
import string

def remove_punct(text):
    return [word for word in text if word not in string.punctuation]

In [29]:
rem_punct = []

for description in rem_url:
    tokens = remove_punct(description)
    rem_punct.append(tokens)

### Remove Repetitions

In [30]:
def remove_duplicates(arr):
    return list(set(arr))

In [31]:
rem_dupl = []

for description in rem_punct:
    tokens = remove_duplicates(description)
    rem_dupl.append(tokens)

### Remove single characters and non-english words

In [32]:
from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [33]:
english_words = set(words.words())

In [34]:
def remove_non_eng(text):
    return [word for word in text if word in english_words]

In [35]:
def remove_single_char_words(arr):
    return list(filter(lambda word: len(word) > 1, arr))

In [36]:
rem_noEnglish = []

for description in rem_dupl:
    tokens = remove_non_eng(description)
    tokens = remove_single_char_words(tokens)
    rem_noEnglish.append(tokens)

### Remove spelling mistakes

In [37]:
from spellchecker import SpellChecker

def remove_misspelled_words(arr):
    spell = SpellChecker()
    return [word for word in arr if word in spell]

In [38]:
rem_misspelled = []

for description in rem_noEnglish:
    tokens = remove_misspelled_words(description)
    rem_misspelled.append(tokens)

### Lemmatization

In [39]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [40]:
def lematize(arr):
    return [wnl.lemmatize(word) for word in arr]

In [41]:
cleaned_texts = []

for description in rem_misspelled:
    tokens = lematize(description)
    cleaned_texts.append(tokens)

## KeyBERT

In [42]:
from keybert import KeyBERT

kw_model = KeyBERT()

  from tqdm.autonotebook import tqdm, trange


In [43]:
def glue_texts(array):
    return [' '.join(tokens) for tokens in array]

In [44]:
final_texts = glue_texts(cleaned_texts)

In [45]:
keywords_per_text = []
for text in final_texts:
    keywords = kw_model.extract_keywords(text)
    keywords_per_text.append([item[0] for item in keywords])

## Jaro-Winkler

In [54]:
category_map = {
    "Management": [
        "General Manager", "CEO", "Executive Director", "Operations Manager", "Project Leader",
        "Program Director", "Department Head", "Executive Officer", "Division Head", "Branch Director",
        "Regional Director", "Country Director", "Site Director", "Organizational Development",
        "Business Strategy", "Leadership Development", "Management Consulting", "Team Leadership",
        "Performance Management", "Employee Development", "Talent Management", "Succession Planning",
        "Conflict Resolution", "Staff Management", "Workforce Planning", "Goal Setting", "Performance Appraisal",
        "HR Management", "Financial Management", "Marketing Management", "Sales Management",
        "Product Management", "IT Management", "Quality Management", "Supply Chain Management",
        "Risk Management", "Compliance Management", "Customer Service Management", "Facilities Management",
        "Administrative Management", "Corporate Governance", "Business Operations", "Stakeholder Management",
        "Process Improvement", "Business Continuity", "KPI Management", "Resource Allocation", "Crisis Management",
        "Innovation Management", "Knowledge Management", "Strategic Initiatives", "Organizational Behavior",
        "Change Leadership", "Strategic Execution", "Leadership Communication", "Decision Making",
        "Resource Optimization"
    ],
    "Sales and Marketing": [
        "Sales Representative", "Sales Manager", "Sales Director", "Account Executive",
        "Business Development Representative", "Sales Specialist", "Marketing Manager", "Product Manager",
        "Marketing Director", "Marketing Specialist", "Digital Marketing Manager", "Content Marketing Manager",
        "SEO Specialist", "Business Development", "Market Research", "Advertising", "Sales Strategies",
        "Customer Relationship Management", "Digital Marketing", "Social Media Marketing", "Public Relations",
        "Content Marketing", "Email Marketing", "Lead Generation", "Campaign Management", "Market Segmentation",
        "Sales Forecasting", "Customer Engagement", "Sales Operations", "Promotional Strategies", "Sales Enablement",
        "Marketing Communications", "Consumer Behavior", "Sales Analytics", "Market Analysis", "Brand Management",
        "Customer Insights", "Campaign Strategy", "Content Strategy", "Digital Advertising", "Sales Optimization",
        "Customer Experience Management", "Influencer Marketing", "Growth Hacking", "Performance Marketing"
    ],
    "Engineering and Science": [
        "Software Engineer", "Mechanical Engineer", "Electrical Engineer", "Civil Engineer", "Structural Engineer",
        "Chemical Engineer", "Environmental Engineer", "Industrial Engineer", "Systems Engineer", "Hardware Engineer",
        "IT Support Specialist", "Data Analyst", "Systems Analyst", "Network Engineer", "Database Administrator",
        "Cybersecurity Specialist", "Cloud Computing Specialist", "DevOps Engineer", "AI Developer", "Software Developer",
        "IT Project Manager", "Software Development", "Network Security", "Cloud Services", "Data Management",
        "Systems Integration", "Quality Assurance", "Data Engineering", "IT Management", "Software Architecture",
        "Digital Transformation", "Application Development", "Hardware Design", "Systems Engineering",
        "Network Architecture", "Cybersecurity Protocols", "Data Analytics", "Cloud Infrastructure",
        "Emerging Technologies", "Geospatial Analysis", "Spatial Data Management", "Remote Sensing",
        "Artificial Intelligence", "Machine Learning", "Robotics", "Big Data Analytics", "IoT", "Computational Science",
        "Mathematician", "Statistician", "Data Scientist", "Quantitative Analyst", "Actuary",
        "Mathematical Modeler", "Operations Research Analyst", "Physicist", "Chemist", "Astronomer",
        "Geophysicist", "Environmental Scientist", "Biochemist", "Research Scientist", "Laboratory Technician",
        "Climate Scientist", "Mathematical Analysis", "Data Modeling", "Statistical Analysis", "Experimental Research",
        "Environmental Analysis", "Scientific Research", "Applied Mathematics", "Physical Chemistry",
        "Organic Chemistry", "Inorganic Chemistry", "Astrophysics", "Quantum Mechanics",
        "Photonics", "Laser Technology", "Quantum Computing", "Scientific Data Analysis", "Experimental Physics",
        "Statistical Mechanics", "Spatial Statistics", "Geostatistics", "Environmental Modeling",
        "Neuroscience", "Computational Biology", "Bioinformatics", "Complex Systems", "Neurorehabilitation",
        "Trauma Care", "Psychiatric Medicine", "Geriatric Medicine", "Telemedicine", "PhD", "Doctorate",
        "Postdoctoral Research", "Academic Research", "Research Fellow", "Principal Investigator",
        "Research Associate", "Assistant Professor", "Associate Professor", "Professor", "Lecturer", "Microbiology"
    ],
    "Finance": [
        "Accountant", "Financial Analyst", "Controller", "Auditor", "Budget Analyst", "Investment Analyst",
        "Treasury Analyst", "Risk Manager", "Tax Advisor", "Credit Analyst", "Cash Flow Manager",
        "Financial Records", "Financial Analysis", "Compliance", "Office Management", "Budget Management",
        "Payroll Management", "Accounts Payable", "Accounts Receivable", "Financial Planning", "Corporate Finance",
        "Cost Accounting", "Management Accounting", "Financial Reporting", "Internal Audit", "Business Valuation",
        "Asset Management", "Credit Management", "Financial Risk Management", "Regulatory Compliance", "Treasury Management",
        "Financial Modeling", "Risk Assessment", "Budget Planning", "Audit Procedures", "Financial Strategy",
        "Administrative Operations", "Expense Management", "Financial Technology", "Fintech Innovations",
        "Equity Research Analyst", "Portfolio Manager", "Investment Banker", "Financial Consultant",
        "Hedge Fund Manager", "Private Equity Analyst", "Venture Capital Analyst", "Corporate Treasurer",
        "Financial Advisor", "Commercial Banker", "Loan Officer", "Trade Analyst", "Merger and Acquisition Analyst",
        "Forensic Accountant", "Tax Planner", "Estate Planner", "Credit Risk Analyst", "Quantitative Finance Analyst",
        "Personal Financial Advisor", "Insurance Underwriter", "Pension Fund Manager", "Real Estate Finance Specialist"
    ],
    "Administration": [
        "Administrative Assistant", "Office Manager", "Executive Assistant", "Office Administrator",
        "Receptionist", "Clerical Assistant", "Data Entry Clerk", "Office Coordinator",
        "Administrative Coordinator", "Secretary", "Personal Assistant", "Administrative Manager",
        "Front Desk Coordinator", "Customer Service Administrator", "Records Manager", "Office Support Specialist",
        "Mailroom Clerk", "File Clerk", "Virtual Assistant", "Scheduling Coordinator", "Office Operations Manager",
        "Office Support Supervisor", "Executive Secretary", "Administrative Support Specialist", "Administrative Officer",
        "Office Supervisor", "Administrative Director", "Facilities Coordinator", "Document Controller",
        "Administrative Receptionist", "Office Assistant", "Operations Administrator"
    ],
    "Technical Works": [
        "Electrician", "Plumber", "Carpenter", "Welder", "Machinist", "Machine Operator", "HVAC Technician",
        "Automotive Technician", "Diesel Mechanic", "Industrial Mechanic", "Maintenance Technician",
        "Construction Worker", "Heavy Equipment Operator", "Line Installer", "Cable Technician", "Elevator Installer",
        "Boilermaker", "Roofer", "Painter", "Bricklayer", "Forklift Operator", "CNC Operator", "Tool and Die Maker",
        "Sheet Metal Worker", "Pipefitter", "Insulation Worker", "Crane Operator", "Paving Equipment Operator",
        "Rig Operator", "Power Plant Operator", "Wind Turbine Technician", "Solar Panel Installer", "Glazier",
        "Lineman", "Telecommunications Technician", "Security System Installer", "Locksmith",
        "Septic Tank Servicer", "Gas Appliance Technician", "Aircraft Mechanic", "Marine Mechanic",
        "Railroad Technician", "Signal and Track Switch Repairer", "Nuclear Technician", "Petroleum Technician",
        "Textile Worker", "Printing Press Operator", "Quality Control Inspector", "Warehouse Associate",
        "Logistics Coordinator", "Material Handler", "Production Worker", "Assembler"
    ]
}

In [47]:
from jaro import jaro_winkler_metric

In [48]:
def classify_text(extracted_keywords, category_map):
    similarity_scores = {}
    
    def compute_max_similarity(keywords, category_keywords):
        max_similarity = 0
        for kw in keywords:
            for cat_kw in category_keywords:
                similarity = jaro_winkler_metric(kw, cat_kw)
                if similarity > max_similarity:
                    max_similarity = similarity
        return max_similarity

    for category, keywords in category_map.items():
        similarity = compute_max_similarity(extracted_keywords, keywords)
        similarity_scores[category] = similarity
    
    best_category = max(similarity_scores, key=similarity_scores.get)
    return best_category, similarity_scores

In [55]:
similarCount = 0

for keywords in keywords_per_text:
    best_category, similarity_scores = classify_text(keywords, category_map)
    if (similarity_scores[best_category] >= 0.7):
        similarCount += 1
        print (keywords, best_category, similarity_scores[best_category])
print ("Similar offers", similarCount)
print ("Total offers", len(keywords_per_text))
print (similarCount/len(keywords_per_text) * 100, "%")

['recruiter', 'internship', 'training', 'graduate', 'hire'] Engineering and Science 0.8366402116402116
['excellence', 'collaborate', 'comprehensive', 'architect', 'collaboration'] Management 0.7657407407407407
['biotic', 'recruiting', 'cultivated', 'agriculture', 'departmental'] Management 0.8333333333333334
['qualification', 'career', 'dissertation', 'qualified', 'apply'] Technical Works 0.7962962962962963
['engineering', 'postdoctoral', 'chemistry', 'environmental', 'porosity'] Engineering and Science 0.8596491228070176
['study', 'researcher', 'cellulosic', 'utilization', 'recruitment'] Sales and Marketing 0.7548821548821548
['recruitment', 'increasingly', 'career', 'academic', 'education'] Technical Works 0.7962962962962963
['letter', 'employment', 'employer', 'postdoctoral', 'office'] Finance 0.8333333333333334
['molecule', 'researcher', 'molecular', 'biological', 'biology'] Engineering and Science 0.7658730158730159
['recruitment', 'employment', 'undergraduate', 'supplemental', 'd