In [None]:
import pandas as pd
from datasets import ClassLabel, Dataset, DatasetDict
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import classification_report

In [None]:
# Load both CSVs and normalise their column names in a single chained step.
# Training frame: "predicted_seniority" becomes "label".
# NOTE(review): no column is renamed to "text" for the training frame, so this
# presumably relies on the CSV already containing a "text" column - confirm.
train_df = pd.read_csv("../final/classified_output_v3.csv").rename(
    columns={"predicted_seniority": "label"}
)

# Evaluation frame: "seniority" becomes "label" and "position" becomes "text".
test_df = pd.read_csv("../final/highest_active_jobs.csv").rename(
    columns={"seniority": "label", "position": "text"}
)

In [None]:
# Wrap each DataFrame as a Hugging Face dataset and group them as named splits.
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train": train_ds,
    "test": test_ds
})

# Encode BOTH splits against one shared label vocabulary.
# DatasetDict.class_encode_column() encodes every split independently from that
# split's own sorted unique values, so whenever the train and test label sets
# differ, the same class ends up with different integer ids in each split and
# the evaluation below silently compares misaligned ids.
# NOTE(review): assumes the "label" column holds strings in both splits; an
# integer-typed column would be interpreted as ready-made class ids - confirm.
label_names = sorted(
    {
        str(value)
        for split in dataset.values()
        for value in split.unique("label")
        if value is not None
    }
)
dataset = dataset.cast_column("label", ClassLabel(names=label_names))

# Sentence-transformer backbone; the label names are handed to the model so it
# knows the class vocabulary. The commented-out id is an alternative backbone.
model = SetFitModel.from_pretrained(
    #"BAAI/bge-m3",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    labels=label_names,
    device="cuda"  # requires a CUDA-capable GPU
)

# Train on the "train" split, evaluate on the "test" split using accuracy.
trainer = SetFitTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    metric="accuracy",
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
)

trainer.train()
metrics = trainer.evaluate()
print(f"Metrics: {metrics}")


In [None]:
print("Generating report...")

# Ground truth: translate the test split's integer class ids back into the
# label strings stored on its "label" feature.
label_names = dataset["test"].features["label"].names
y_true = [label_names[class_id] for class_id in dataset["test"]["label"]]

# Predictions for the same rows, in the same order.
# NOTE(review): presumably returned as label strings because the model was
# created with labels=...; confirm against the installed SetFit version.
preds = model.predict(dataset["test"]["text"])

# Per-class precision / recall / F1 summary.
print(classification_report(y_true, preds))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

# Axis labels for the matrix: the known label names first, followed by any
# label the model predicted that is missing from that list. Restricting the
# matrix to label_names alone would silently drop such predictions and
# overstate the remaining proportions in each row.
predicted_values = set(np.asarray(preds).tolist())
extra_labels = sorted(predicted_values - set(label_names), key=str)
cm_labels = list(label_names) + extra_labels

cm = confusion_matrix(y_true, preds, labels=cm_labels)

# Row-normalise so each row shows how the true samples of that class were
# distributed over the predicted classes. Rows with no true samples stay at
# zero instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Draw on an explicit figure/axes pair rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 8), dpi=150)
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=cm_labels, yticklabels=cm_labels,
            cbar_kws={'label': 'Proportion'},
            annot_kws={'size': 13, 'weight': 'bold'},
            ax=ax)
ax.set_title('Confusion Matrix - SetFit Model Seniority', fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Raw (un-normalised) counts for reference.
print(f"\nConfusion Matrix:\n{cm}")