## EQF Classification with Zero-Shot DeBERTa

### Load the pre-trained zero-shot DeBERTa tokenizer and model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier handed to both loaders below, so the tokenizer and the
# model weights come from the same checkpoint.
# NOTE(review): the id names a DeBERTa-v3 zero-shot checkpoint — confirm this
# is the intended model for this notebook.
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"
# Tokenizer belonging to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model loaded from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Create a Zero-Shot Classification pipeline

In [2]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a zero-shot-classification
# pipeline. `classifier` is the callable the classification cell invokes with
# a text and a list of candidate labels.
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

### Load Data

In [13]:
import pandas as pd

# Spreadsheet with the job offers; the path is relative to the directory the
# notebook is run from.
excel_file = './Euraxess_GNSS_Keywords.xlsx'
df = pd.read_excel(excel_file)

# A row is usable only if 'Position' (the ground-truth label) is present and
# is not just whitespace. For NaN cells .str.strip() yields NaN and
# `NaN != ''` is True, so the notna() term is what actually excludes them.
has_label = df['Position'].notna() & (df['Position'].str.strip() != '')

# Keep the first 100 labelled rows (file order — this is not a random sample).
# .copy() gives sampled_df its own data, so adding columns to it later is a
# plain assignment rather than a write to a slice of `df` (the pattern that
# triggers pandas' SettingWithCopyWarning).
sampled_df = df.loc[has_label].head(100).copy()

### Prepare Data

In [14]:
# Build one free-text document per offer by joining the non-missing text
# fields with single spaces; missing (NaN) fields are dropped instead of being
# rendered as the string "nan".
text_columns = ['Title', 'OfferDescription', 'Requirements', 'AdditionalInformation']
sampled_df['Concatenated'] = sampled_df[text_columns].apply(
    lambda row: ' '.join(row.dropna().astype(str)), axis=1
)

# Parallel lists: texts[i] is the document whose ground-truth label is
# true_labels[i].
texts = sampled_df['Concatenated'].tolist()
true_labels = sampled_df['Position'].tolist()

# Candidate labels are the distinct ground-truth labels present in the sample.
# They are sorted because the iteration order of a set of strings can differ
# between interpreter runs (hash randomisation); sorting makes the label list
# reproducible. key=str keeps the sort well-defined even if a label is not a
# string.
candidate_labels = sorted(set(true_labels), key=str)

### Classify

In [15]:
# Classify each document on its own and keep a single label per document:
# the first entry of the returned 'labels' list, which the pipeline orders
# best-first (per the Transformers zero-shot pipeline docs).
predicted_labels = [
    classifier(document, candidate_labels=candidate_labels)['labels'][0]
    for document in texts
]

### Evaluate

In [17]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Score the predictions against the ground-truth labels.
# `f1` is the support-weighted average of the per-class F1 scores.
accuracy = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
f1 = f1_score(y_true=true_labels, y_pred=predicted_labels, average='weighted')
report = classification_report(y_true=true_labels, y_pred=predicted_labels)

# Print each metric under its caption, in the same order and format as before.
for caption, value in (
    ("Accuracy:", accuracy),
    ("F1-Score:", f1),
    ("\nClassification Report:\n", report),
):
    print(caption, value)

Accuracy: 0.44
F1-Score: 0.5190058479532164

Classification Report:
                    precision    recall  f1-score   support

 Master Positions       1.00      0.29      0.44        28
  Other Positions       0.00      0.00      0.00         1
    PhD Positions       0.70      0.49      0.58        67
Postdoc Positions       0.09      0.75      0.17         4

         accuracy                           0.44       100
        macro avg       0.45      0.38      0.30       100
     weighted avg       0.75      0.44      0.52       100

