# Classify by Required Degree using NuNER-Zero & JaroW

## Load Data

In [54]:
import pandas as pd

fileName = 'LinkedIn_GNSS_08-01-2024'
excel_file = '../Results/' + fileName + '.xlsx'
requirementsOnly = False

df = pd.read_excel(excel_file)

### Prepare Data

In [55]:
input = []

columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation', 'Job_title', 'Job_description']
existing_columns = [col for col in columns if col in df.columns]

if 'Requirements' in df.columns:
    requirements = df['Requirements'].tolist()
else:
    requirements = ['-' for _ in range(len(df))]

if requirementsOnly:
    input = requirements
else:
    descriptions = df[existing_columns].copy().apply(lambda x: '. '.join(x.dropna().astype(str)), axis=1).tolist()
    input = [
        req if isinstance(req, str) and req != "-" else desc
        for req, desc in zip(requirements, descriptions)
    ]

## Evaluate

### Load Model

In [56]:
from gliner import GLiNER

model = GLiNER.from_pretrained("numind/NuNerZero")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]



### Merge Entities Function

In [57]:
def merge_entities(entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['label'] == current['label'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['text'] = text[current['start']: next_entity['end']].strip()
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    # Append the last entity
    merged.append(current)
    return merged

### Get Required Degrees

In [58]:
def split_text(text, max_length=384):
    segments = []
    start = 0

    while start < len(text):
        end = min(start + max_length, len(text))
        split_point = text.rfind('.', start, end)

        if split_point == -1 or end == len(text):
            segments.append(text[start:end])
            break
        else:
            segments.append(text[start:split_point + 1])
            start = split_point + 1

    return segments

In [59]:
import re

def extract_degrees(text):
    # Define regex patterns for degrees
    phd_pattern = r'\b(?:phd|p\.h\.d|doctorate|postdoctoral)\b'
    masters_pattern = r'\b(?:master|masters|master\'s|ma|msc)\b'
    bachelors_pattern = r'\b(?:bachelor|bachelors|bachelor\'s|ba|bsc|undergraduate)\b'
    less_than_degree_pattern = r'\b(?:high school|vocational)\b'

    # Combine patterns into one regex
    combined_pattern = f"{phd_pattern}|{masters_pattern}|{bachelors_pattern}|{less_than_degree_pattern}"

    # Compile the regex
    regex = re.compile(combined_pattern, re.IGNORECASE)

    # Find all matches
    matches = regex.findall(text)

    return matches

In [60]:
# NuZero requires labels to be lower-cased!
labels = ["Degree"]
labels = [l.lower() for l in labels]

requiredDegrees = []

for text in input:
    degrees = []

    if (isinstance(text, str)):
        regexResults = extract_degrees(text)
        for regexResult in regexResults:
            degrees.append(regexResult)

        result = split_text(text)

        for segment in result:
            entities = model.predict_entities(segment, labels)
            entities = merge_entities(entities)

            for entity in entities:
                degrees.append(entity["text"])

    requiredDegrees.append(degrees)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Clean Data

### Lowcase all the text

In [61]:
def lowercase_text(text):
    return [item.lower() for item in text]

In [62]:
lowcase_descriptions = []

for description in requiredDegrees:
    tokens = lowercase_text(description)
    lowcase_descriptions.append(tokens)

### Remove casing

In [63]:
def remove_case(text):
    return [item.strip('[]') for item in text]

In [64]:
rem_casing = []

for description in lowcase_descriptions:
    tokens = remove_case(description)
    rem_casing.append(tokens)

### Remove unicode symbols

In [65]:
unicode_regex = re.compile(r'[^\x00-\x7F]')

In [66]:
def remove_unicode(text):
    return [word for word in text if not unicode_regex.search(word)]

In [67]:
rem_unicode = []

for description in rem_casing:
    tokens = remove_unicode(description)
    rem_unicode.append(tokens)

### Remove Repetitions

In [68]:
def remove_duplicates(arr):
    return list(set(arr))

In [69]:
rem_dupl = []

for description in rem_unicode:
    tokens = remove_duplicates(description)
    rem_dupl.append(tokens)

### Remove 'Degree'

In [70]:
def remove_degree_keyword(strings):
    return [s.replace('degree', '').replace('degrees', '').replace("degree's", '') for s in strings]

In [71]:
rem_degree = []

for description in rem_dupl:
    tokens = remove_degree_keyword(description)
    rem_degree.append(tokens)

### Remove empty strings

In [72]:
def remove_single_char_words(arr):
    return list(filter(lambda word: len(word) > 1, arr))

In [73]:
def split_and_clean(arr):
    return [word for s in arr for word in s.split() if word]

In [74]:
requiredDegrees = []

for description in rem_degree:
    tokens = split_and_clean(description)
    tokens = remove_single_char_words(tokens)
    requiredDegrees.append(tokens)

## Classify

In [75]:
from jaro import jaro_winkler_metric

In [76]:
def classify_text(extracted_keywords, category_map):
    similarity_scores = {}

    def compute_max_similarity(keywords, category_keywords):
        max_similarity = 0
        for kw in keywords:
            for cat_kw in category_keywords:
                similarity = jaro_winkler_metric(kw, cat_kw)
                if similarity > max_similarity:
                    max_similarity = similarity
        return max_similarity

    for category, keywords in category_map.items():
        similarity = compute_max_similarity(extracted_keywords, keywords)
        similarity_scores[category] = similarity

    best_category = max(similarity_scores, key=similarity_scores.get)
    return best_category, similarity_scores

I could add "university" to the bachelor category, but despite increasing the coverage it may lead to failures in cases such as 'University Master's Degree'

In [77]:
category_map = {
    "Undefined": [],
    "Less Than a Degree": ["high school", "vocational"],
    "Bachelor's Degree": ["bachelor", "bachelors", "bachelor's", "ba", "bsc", "undergraduate"],
    "Master's Degree": ["master", "masters", "master's", "ma", "msc"],
    "PhD or Doctorate": ["phd", "p.h.d", "doctorate", "postdoctoral"],
}

In [78]:
import datetime

# Initialize a dictionary to keep track of counts for each category
category_counts = {category: 0 for category in category_map.keys()}
total_count = len(requiredDegrees)  # Total number of offers
new_categories = []

for keywords in requiredDegrees:
    best_category, similarity_scores = classify_text(keywords, category_map)
    if (similarity_scores[best_category] >= 0.8):
        category_counts[best_category] += 1
        new_categories.append(best_category)
    else:
        category_counts["Undefined"] += 1
        new_categories.append('Undefined')
        
df['RequiredDegree'] = new_categories
df.to_excel('../Results/' + fileName + '.xlsx', index=False, engine='openpyxl')

# Print percentage of offers in each category
for category, count in category_counts.items():
    if (category == "Undefined"):
        percentage = (count / total_count) * 100
        print(f"Classified: {(100 - percentage):.2f}%")
        print(f"{category}: {percentage:.2f}%")
        print("\n ----------------------------- \n")
    else:
        percentage = (count / total_count) * 100
        print(f"Category '{category}': {percentage:.2f}%")

print("\n ----------------------------- \n")
print("Document Annotated")

Classified: 23.03%
Undefined: 76.97%

 ----------------------------- 

Category 'Less Than a Degree': 0.78%
Category 'Bachelor's Degree': 10.30%
Category 'Master's Degree': 10.97%
Category 'PhD or Doctorate': 0.98%

 ----------------------------- 

Document Annotated
