## EQF Classification with Masking DistilRoBERTa

### Load the pre-trained zeroShot RoBERTa tokenizer and model

In [43]:
from transformers import pipeline

# Load the model and tokenizer
model_name = "distilbert-base-uncased"  # or "distilroberta-base"
fill_mask = pipeline("fill-mask", model=model_name)

### Load Data

In [44]:
import pandas as pd

excel_file = './job_descriptions.csv'
df = pd.read_csv(excel_file)

sampled_df = df.head(1000).copy()

### Prepare Data

In [45]:
sampled_df['Concatenated'] = sampled_df[['Job Title', 'Job Description', 'skills', 'Responsibilities']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

texts = sampled_df['Concatenated'].tolist()
true_labels = sampled_df['Qualifications'].tolist()

candidate_labels = list(set(true_labels))

### Classify

In [52]:
from difflib import SequenceMatcher

predictions = []
mask = []

semantic_fields = {
    "Bachelor's Degree": ["bachelor", "undergraduate", "baccalaureate", "college", "university", "BS", "BA", "AB", "regular", "common", "basic"],
    "Master’s Degree": ["master", "graduate", "postgraduate", "MSc", "MA", "MS", "MPhil", "MBA", "high", "advanced", "specialized"],
    "PhD or Doctorate": ["phd", "doctorate", "doctoral", "Ph.D.", "research", "DPhil", "EdD", "very high", "expert", "specialist"]
}

def calculate_similarity(predicted_token, semantic_field):
    max_similarity = 0.0
    for token in semantic_field:
        similarity = SequenceMatcher(None, predicted_token, token).ratio()
        if similarity > max_similarity:
            max_similarity = similarity
    return max_similarity

for text in texts:
    # Append the masked sentence to the job offer text
    masked_text = text + ". A [MASK] degree is preferred for this role."
    
    # Get predictions for the masked token
    pred = fill_mask(masked_text)
    
    # Extract predictions and calculate similarity
    best_label = None
    best_mask = None
    max_similarity = 0.0
    
    for prediction in pred:
        predicted_token = prediction['token_str']
        confidence_score = prediction['score']
        
        # Calculate similarity with each semantic field
        for category, semantic_field in semantic_fields.items():
            similarity = calculate_similarity(predicted_token, semantic_field)
            if similarity > max_similarity:
                max_similarity = similarity
                best_label = category
                best_mask = predicted_token
    
    # Add predicted label with highest similarity to predictions array
    predictions.append(best_label)
    mask.append(best_mask)

### Evaluate

In [53]:
correct = 0
total = len(true_labels)

for true_label, pred_label in zip(true_labels, predictions):
    if pred_label in candidate_labels and pred_label == true_label:
        correct += 1

accuracy = correct / total
print(f"Accuracy: {accuracy:.4f}")

# Optional: Print some examples of predictions vs true labels
for i in range(8):
    print(f"Job Offer: {texts[i]}")
    print(f"True Label: {true_labels[i]}, Predicted: {predictions[i]}, Mask: {mask[i]}")
    print("-----")

Accuracy: 0.1860
Job Offer: Digital Marketing Specialist Social Media Managers oversee an organizations social media presence. They create and schedule content, engage with followers, and analyze social media metrics to drive brand awareness and engagement. Social media platforms (e.g., Facebook, Twitter, Instagram) Content creation and scheduling Social media analytics and insights Community engagement Paid social advertising Manage and grow social media accounts, create engaging content, and interact with the online community. Develop social media content calendars and strategies. Monitor social media trends and engagement metrics.
True Label: Master’s Degree, Predicted: PhD or Doctorate, Mask: doctorate
-----
Job Offer: Web Developer Frontend Web Developers design and implement user interfaces for websites, ensuring they are visually appealing and user-friendly. They collaborate with designers and backend developers to create seamless web experiences for users. HTML, CSS, JavaScript