# Segment NuNER-Zero KeyBERT Jaro-Winkler Classification

## Load Data

In [4]:
import pandas as pd

fileName = 'Test_Results_Translated'
excel_file = '../' + fileName + '.xlsx'

df = pd.read_excel(excel_file)

In [5]:
columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation', 'Abstract', 'Descriptions']
existing_columns = [col for col in columns if col in df.columns]

descriptions = df[existing_columns].copy().apply(lambda x: '. '.join(x.dropna().astype(str)), axis=1).tolist()

## NuNER-Zero

### Load Model

In [6]:
from gliner import GLiNER

model = GLiNER.from_pretrained("numind/NuNerZero")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]



### Merge Entities Function

In [7]:
def merge_entities(entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['label'] == current['label'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['text'] = text[current['start']: next_entity['end']].strip()
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    # Append the last entity
    merged.append(current)
    return merged

### Evaluate

In [8]:
def split_text(text, max_length=384):
    segments = []
    start = 0

    while start < len(text):
        end = min(start + max_length, len(text))
        split_point = text.rfind('.', start, end)

        if split_point == -1 or end == len(text):
            segments.append(text[start:end])
            break
        else:
            segments.append(text[start:split_point + 1])
            start = split_point + 1

    return segments

In [9]:
# NuZero requires labels to be lower-cased!
labels = ["Market Segment"]
labels = [l.lower() for l in labels]

textFields = []

for text in descriptions:
    result = split_text(text)

    for segment in result:
        entities = model.predict_entities(segment, labels)
        entities = merge_entities(entities)

        fields = []

        for entity in entities:
            fields.append(entity["text"])

    textFields.append(fields)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Prepare Data

### Tokenization

In [10]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
descriptions_token = []

for description in descriptions:
    tokens = word_tokenize(description)
    descriptions_token.append(tokens)

### Remove Hyphens

In [12]:
def remove_hyphens(text):
    return [item.strip('-') for item in text]

In [13]:
rem_hyphens = []

for description in descriptions_token:
    tokens = remove_hyphens(description)
    rem_hyphens.append(tokens)

### Lowcase all the text

In [14]:
def lowercase_text(text):
    return [item.lower() for item in text]

In [15]:
lowcase_descriptions = []

for description in rem_hyphens:
    tokens = lowercase_text(description)
    lowcase_descriptions.append(tokens)

### Remove casing

In [16]:
def remove_case(text):
    return [item.strip('[]') for item in text]

In [17]:
rem_casing = []

for description in lowcase_descriptions:
    tokens = remove_case(description)
    rem_casing.append(tokens)

### Remove stopwords

In [18]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
stop_words = set(stopwords.words("english"))

In [20]:
def remove_stopwords(text):
    return [word for word in text if word.casefold() not in stop_words]

In [21]:
rem_stopwords = []

for description in rem_casing:
    tokens = remove_stopwords(description)
    rem_stopwords.append(tokens)

### Remove unicode symbols

In [22]:
import re

In [23]:
unicode_regex = re.compile(r'[^\x00-\x7F]')

In [24]:
def remove_unicode(text):
    return [word for word in text if not unicode_regex.search(word)]

In [25]:
rem_unicode = []

for description in rem_stopwords:
    tokens = remove_unicode(description)
    rem_unicode.append(tokens)

### Remove numbers and digits

In [26]:
number_regex = re.compile(r'\d')

In [27]:
def remove_number(text):
    return [word for word in text if not number_regex.search(word)]

In [28]:
rem_number = []

for description in rem_unicode:
    tokens = remove_number(description)
    rem_number.append(tokens)

### Remove Special Words Mixed with String, Numbers, Punctuation, and Other Symbols

In [29]:
special_regex = re.compile(r'^(?=.*\d)(?=.*[A-Za-z])|(?=.*[A-Za-z])(?=.*[\W_])|(?=.*\d)(?=.*[\W_])')

In [30]:
def remove_special(text):
    return [word for word in text if not special_regex.search(word)]

In [31]:
rem_special = []

for description in rem_number:
    tokens = remove_special(description)
    rem_special.append(tokens)

### Remove URL

In [32]:
url_regex = re.compile(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

In [33]:
def remove_url(text):
    return [word for word in text if not url_regex.search(word)]

In [34]:
rem_url = []

for description in rem_special:
    tokens = remove_url(description)
    rem_url.append(tokens)

### Remove Punctuation

In [35]:
import string

def remove_punct(text):
    return [word for word in text if word not in string.punctuation]

In [36]:
rem_punct = []

for description in rem_url:
    tokens = remove_punct(description)
    rem_punct.append(tokens)

### Remove Repetitions

In [37]:
def remove_duplicates(arr):
    return list(set(arr))

In [38]:
rem_dupl = []

for description in rem_punct:
    tokens = remove_duplicates(description)
    rem_dupl.append(tokens)

### Remove single characters and non-english words

In [39]:
from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [40]:
english_words = set(words.words())

In [41]:
def remove_non_eng(text):
    return [word for word in text if word in english_words]

In [42]:
def remove_single_char_words(arr):
    return list(filter(lambda word: len(word) > 1, arr))

In [43]:
rem_noEnglish = []

for description in rem_dupl:
    tokens = remove_non_eng(description)
    tokens = remove_single_char_words(tokens)
    rem_noEnglish.append(tokens)

### Remove spelling mistakes

In [44]:
from spellchecker import SpellChecker

def remove_misspelled_words(arr):
    spell = SpellChecker()
    return [word for word in arr if word in spell]

In [45]:
rem_misspelled = []

for description in rem_noEnglish:
    tokens = remove_misspelled_words(description)
    rem_misspelled.append(tokens)

### Lemmatization

In [46]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [47]:
def lematize(arr):
    return [wnl.lemmatize(word) for word in arr]

In [48]:
cleaned_texts = []

for description in rem_misspelled:
    tokens = lematize(description)
    cleaned_texts.append(tokens)

## KeyBERT

In [49]:
from keybert import KeyBERT

kw_model = KeyBERT()

In [50]:
def glue_texts(array):
    return [' '.join(tokens) for tokens in array]

In [51]:
final_texts = glue_texts(cleaned_texts)

In [52]:
keywords_per_text = []
for text in final_texts:
    keywords = kw_model.extract_keywords(text)
    keywords_per_text.append([item[0] for item in keywords])

## Jaro-Winkler

In [53]:
category_map = {
    "Undefined": [],
    "Agriculture": [
        "Digital Farming", "Precision Agriculture", "Farm Management Technology", "Agriculture Value Chains", "Agriculture R&D", "Data-Driven Farming",
        "Crop Monitoring", "Fertiliser Optimization", "Sustainable Agriculture", "Farm Profitability", "Regenerative Agriculture",
        "Agricultural Cooperatives", "AgTech Innovation", "AI in Agriculture", "Machine Guidance", "Livestock Tracking", "Eco-schemes",
    ],
    "Aviation and Drones": [
        "Aviation Growth", "Air Traffic Management", "Urban Air Mobility", "Performance Based Navigation", "Flight Planning", "Airspace Design",
        "EGNOS Procedures", "Dual Frequency Multi Constellation (DFMC)", "Unmanned Aviation", "Drone Positioning", "BVLOS Operations", "Autonomous Flights",
        "SBAS Position Augmentation", "Ash Cloud Prediction", "Aerodrome Accessibility", "Aviation and Drones Value Chains",
        "Electronic Conspicuity", "U-space", "Drone Potential", "EGNSS", "Copernicus Usage", "Aviation Research and Innovation"
    ],
    "Climate, Environment, and Biodiversity": [
        "Environmental Monitoring", "Climate Change Mitigation", "Environmental Resource Management", "Water Management", "Ocean Monitoring",
        "Coastal Monitoring", "Atmosphere Monitoring", "Land Monitoring", "Human Impact Assessment", "Environmental Audits", "Environmental Impact Assessments",
        "ESG Reports", "Climate Modelling", "Climate Forecasting", "Snow and Ice Monitoring", "Sea-level Rise", "Greenhouse Gas Monitoring", "Biodiversity Monitoring",
        "Ecosystem Health", "Soil Health", "Water Quality", "Coral Reefs Health", "Flora and Fauna Monitoring", "GNSS Animal Tracking", "Climate Services", "Environmental Policies",
        "Natural Capital", "Niche Capabilities", "Technology Integration", "Environmental Impact Monitoring"
    ],
    "Consumer Solutions, Tourism and Health": [
        "EO-enabled Consumer Solutions", "GNSS-enabled Consumer Solutions", "Health and Lifestyle Apps", "Tourism Applications", "Smartphones", "Wearables",
        "Personal Tracking Devices", "Digital Cameras", "Portable Computers", "Internet of Things (IoT)", "Consumer Robotics", "Commercialisation of EO Data", 
        "Geo-location Revenues", "Sustainable Tourism", "SME and Start-up Support", "Collaborative Decision-making", "Inclusivity and Diversity"
    ],
    "Emergency Management and Humanitarian Aid": [
        "Emergency Management", "Humanitarian Aid", "Disaster Preparedness", "Disaster Response", "Disaster Recovery", "SAR Operations",
        "Drought Response", "Earthquake Response", "Flood Response", "Natural Disaster Management", "Search and Rescue Beacons", "EO Data for Crisis Management", 
        "Humanitarian Assistance", "EO Data for Prevention", "EO Data for Mitigation", "Coordination Centres", "National Governments", "International Organisations", 
        "NGOs", "Dedicated Agencies", "Humanitarian Situation Monitoring"
    ],
    "Energy and Raw Materials": [
        "Energy Sector", "Energy Project Developers", "Utility Companies", "Energy Asset Manufacturers", "Energy Traders", "Supply Chain Managers",
        "Space Data for Energy", "Renewable Energy Forecasting", "Solar Energy", "Wind Energy", "Hydropower", "Energy Grid Stability", 
        "Real-time Renewable Energy Production", "Energy Transmission Networks", "Phasor Measurement Units", "Green Energy Transition", "Raw Materials Sector", "Mining Companies",
        "Mining Associations", "Commodity Traders", "Exploration and Site Identification", "Mine Safety", "Tailings Slope Stability", "Illegal Mining Detection",
        "Post-operational Mine Management", "Site Clean-up", "Rehabilitation and Waste Management", "Climate Impact on Renewable Energy", 
        "Environmental Impact Assessment"
    ],
    "Fisheries and Aquaculture": [
        "Fisheries", "Aquaculture", "Sustainable Catch", "Satellite Data in Fisheries", "Fish Stock Location", "Fishing Effort Optimization",
        "Fishing Vessel Tracking", "Illegal, Unreported, and Unregulated (IUU) Fishing", "Safety at Sea",
        "Site Selection for Fish Farms", "Maritime Spatial Planning", "Aquaculture Operations Optimization", "Inland Aquaculture",
        "Digitalisation of Aquaculture", "Fish Stock Modelling", "Blue Economy"
    ],
    "Forestry": [
        "Forestry Management", "Forest Cultivation", "Forest Maintenance", "Forest Development", "Remote Monitoring of Forests",
        "Forest Health Assessment", "Illegal Logging Detection", "Deforestation Monitoring", "GNSS in Forestry", "Precision Forestry",
        "On-tree Health Sensors", "EU Forest Strategy 2030", "Forest Information System for Europe", "Forest Regeneration"
    ],
    "Infrastructure": [
        "Infrastructure Management", "Construction Companies", "Utility Operators", "Heavy Machinery",
        "Telecommunication Networks", "Data Centres", "Cloud Services", "Industrial Production Systems",
        "Construction Monitoring", "Post-Construction Operations", "Ground Deformation Monitoring",
        "Resilient Infrastructure Design", "Maintenance Optimization", "Buildings", "Bridges", "Roads", "Railway Lines", "Pipelines", "Dams", "Factories",
        "Power Plants", "Telecommunication Networks", "Infrastructure Resilience", "EO-Based Construction Progress Monitoring"
    ],
    "Insurance and Finance": [
        "Insurance Sector", "Financial Services", "Insurance Companies", "Re-Insurers", "Financial Institutions", "Private Banks", "Commercial Banks", "Stock Exchanges", "Traders",
        "Claims Management", "Parametric Products", "Index Production", "Risk Modelling", "Risk Assessments", "Investment Screening"
        "Shipping Monitoring", "Supply and Demand Prediction",
        "Investment Strategies", "Sustainable Investing", "Transaction Time-Stamping",
        "Pre-Event Analysis", "Post-Event Analysis"
    ],
    "Maritime and Inland Waterways": [
        "Maritime Sector", "Inland Waterways", "Vessel Emissions Reduction",
        "Ecosystem Preservation", "Port Security", "Maritime Safety", "Autonomous Vessels", "Semi-Autonomous Vessels",
        "Vessel Operators", "Port Authorities", "Recreational Boaters", "Maritime Efficiency",
        "Digitalisation of Vessels", "Automation of Ports", "Sustainable Blue Economy",
        "Enhanced Maritime Devices", "Advanced Data for Maritime Performance", "Maritime Value Chains"
    ],
    "Rail": [
        "Rail Sector", "Private Railway Operators",
        "Railway Manufacturers", "Train Operations", "Railway Infrastructure Safety", "Track Deformation Monitoring",
        "Ground Motion Projects for Rail", "Rail GNSS & EO Value Chains", "Railway Applications", "European Systems for Rail",
        "European Projects in Rail Sector"
    ],
    "Road and Automotive": [
        "Transport and Mobility", "Automotive Industry", "Mobility Services", "Road Transport Network",
        "Connected Cars", "Automated Cars", "Emergency Assistance Services", "eCall", "Road User Charging", "Smart Tachographs",
        "Vehicle Asset Management", "Traffic Information Collection", "Driving Experience Improvement", "Transport Infrastructure",
        "Traffic Management Services", "Road Infrastructure Planning", "HD Maps", "Connected Driving", "Automated Driving",
        "In-Vehicle Systems", "Smart Mobility", "HAS and OSNMA", "Safe and Secure Road Transport"
    ],
    "Space": [
        "GNSS in Space", "Low Earth Orbit (LEO)", "GNSS Receivers on Satellites", "Multi-GNSS Space Service Volume (SSV)", "Cislunar Economy", "Spacecraft Navigation",
        "Spaceborne GNSS Data", "Space-Based Services", "Mega-Constellations", "Space Users"
    ],
    "Urban Development and Cultural Heritage": [
        "Smart Cities", "Connected Cities", "Climate-Neutral Cities", "Urban Planning",
        "Monitoring Informal Dwellings", "Urban Greening", "Air Quality Monitoring", "Greenhouse Gas Emissions", "Cultural Heritage Sites", "Ground Subsidence", "Built Environment", 
        "Urban Resilience", "Urban Simulation", "Cultural Heritage Preservation", "Urban Areas Planning"
    ]
};

In [54]:
from jaro import jaro_winkler_metric

In [55]:
def classify_text(extracted_keywords, category_map):
    similarity_scores = {}

    def compute_max_similarity(keywords, category_keywords):
        max_similarity = 0
        for kw in keywords:
            for cat_kw in category_keywords:
                similarity = jaro_winkler_metric(kw, cat_kw)
                if similarity > max_similarity:
                    max_similarity = similarity
        return max_similarity

    for category, keywords in category_map.items():
        similarity = compute_max_similarity(extracted_keywords, keywords)
        similarity_scores[category] = similarity

    best_category = max(similarity_scores, key=similarity_scores.get)
    return best_category, similarity_scores

In [56]:
finalKeywords = []

for i in range(0, len(keywords_per_text)):
    finalKeywords.append(keywords_per_text[i] + textFields[i])

## Zero Shot

### Load the pre-trained zeroShot RoBERTa tokenizer and model

In [57]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Create a Zero-Shot Classification pipeline

In [58]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

## Classification & Evaluation

In [59]:
import datetime

# Initialize a dictionary to keep track of counts for each category
category_counts = {category: 0 for category in category_map.keys()}
total_count = len(finalKeywords)  # Total number of offers
accuracy = 0
jobTypes = []

for keywords, text in zip(finalKeywords, descriptions):
    best_category, similarity_scores = classify_text(keywords, category_map)
    matched_categories = []

    for category, score in similarity_scores.items():
        if score >= 0.6:
            matched_categories.append(category)

    finalResult = ''
    if not matched_categories:
        finalResult = "Undefined"
    elif len(matched_categories) ==  1:
        finalResult = matched_categories[0]
    else:
        result = classifier(text, candidate_labels=matched_categories)
        finalResult = result['labels'][0]
    category_counts[finalResult] += 1
    jobTypes.append(finalResult)
    
df['MarketSegment'] = jobTypes
df.to_excel('../' + fileName + '_Segment.xlsx', index=False, engine='openpyxl')

# Print percentage of offers in each category
for category, count in category_counts.items():
    if (category == "Undefined"):
        percentage = (count / total_count) * 100
        print(f"Classified: {(100 - percentage):.2f}%")
        print(f"{category}: {percentage:.2f}%")
        print("\n ----------------------------- \n")
    else:
        percentage = (count / total_count) * 100
        print(f"Category '{category}': {percentage:.2f}%")
        
print("\n ----------------------------- \n")

# Calculate accuracy
true_categories = df['Segment'].tolist()
predicted_categories = df['MarketSegment'].tolist()

# Count correct predictions
correct_predictions = sum(1 for true, pred in zip(true_categories, predicted_categories) if true == pred)

# Calculate percentage accuracy
accuracy = (correct_predictions / total_count) * 100

# Print the result
print(f"Accuracy: {accuracy:.2f}%")

print("\n ----------------------------- \n")
print("Document Annotated")

Classified: 100.00%
Undefined: 0.00%

 ----------------------------- 

Category 'Agriculture': 0.00%
Category 'Aviation and Drones': 0.00%
Category 'Climate, Environment, and Biodiversity': 12.57%
Category 'Consumer Solutions, Tourism and Health': 7.43%
Category 'Emergency Management and Humanitarian Aid': 0.57%
Category 'Energy and Raw Materials': 9.14%
Category 'Fisheries and Aquaculture': 0.00%
Category 'Forestry': 0.57%
Category 'Infrastructure': 40.57%
Category 'Insurance and Finance': 5.14%
Category 'Maritime and Inland Waterways': 12.57%
Category 'Rail': 5.14%
Category 'Road and Automotive': 1.14%
Category 'Space': 5.14%
Category 'Urban Development and Cultural Heritage': 0.00%

 ----------------------------- 

Accuracy: 16.57%

 ----------------------------- 

Document Annotated
