In [28]:
import json

import joblib
import numpy as np
import pandas as pd
from setfit import SetFitModel
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder

#### Test Data -> annotated.json

In [29]:
class Preprocessing_CSV():
    """Load a labelled text CSV and expose its text and label columns.

    The file is read as soon as the object is constructed. It must contain
    the columns ``text`` and ``label``.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path

        # Everything below is populated by read_csv():
        # the full frame, the untouched string columns ...
        self.df: pd.DataFrame | None = None
        self.X_raw: pd.Series | None = None
        self.y_raw: pd.Series | None = None
        # ... and the model inputs (cleaned text, string labels).
        self.X: pd.Series = None
        self.y: pd.Series = None

        self.read_csv()

    @staticmethod
    def clean_text(text: str) -> str:
        """Return ``str(text)`` lowercased and stripped, with '-' and '/' turned into spaces."""
        cleaned = str(text).lower().strip()
        for separator in ("-", "/"):
            cleaned = cleaned.replace(separator, " ")
        return cleaned

    def read_csv(self):
        """Read ``self.file_path`` and fill ``df``, ``X_raw``, ``y_raw``, ``X`` and ``y``.

        Raises:
            ValueError: if the file lacks the ``text`` or ``label`` column.
        """
        frame = pd.read_csv(self.file_path)
        self.df = frame

        if not {"text", "label"} <= set(frame.columns):
            raise ValueError("Wrong file mate :( Expected columns: text, label")

        # Both columns are forced to str before anything else touches them.
        raw_text, raw_label = (frame[column].astype(str) for column in ("text", "label"))
        self.X_raw = raw_text
        self.y_raw = raw_label

        # Only the text is cleaned; labels stay exactly as read (as strings).
        self.X = raw_text.apply(self.clean_text)
        self.y = raw_label

    def label_distribution(self) -> pd.Series:
        """Return the row count per label, or an empty Series when no labels are loaded."""
        labels = self.y
        return pd.Series(dtype=int) if labels is None else labels.value_counts()

In [30]:
# NOTE(review): absolute, machine-specific location. Change DATA_DIR when
# running this notebook on another machine.
DATA_DIR = "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data"

# Annotated set: Preprocessing_CSV requires the columns `text` and `label`.
data = Preprocessing_CSV(f"{DATA_DIR}/seniority-v2.csv")

X = data.X
y = data.y

# Second source (columns used: `position`, `seniority`), cleaned with the
# same clean_text as the annotated set so both text columns are comparable.
not_annotated_data = pd.read_csv(f"{DATA_DIR}/labeled_not_annotated.csv")
X_not_annotated = not_annotated_data["position"].astype(str).apply(data.clean_text)
y_not_annotated = not_annotated_data["seniority"].astype(str)

X_train = pd.concat([X, X_not_annotated], ignore_index=True)
y_train_raw = pd.concat([y, y_not_annotated], ignore_index=True)
assert len(X_train) == len(y_train_raw), "texts and labels must stay row-aligned"

# `encoder` was not created in any cell of this notebook, so a fresh kernel
# failed here with NameError. It is rebuilt with the category order this
# notebook prints as "Encoder order".
# NOTE(review): reconstructed from that printed order. If the original fitted
# encoder is persisted somewhere, load that one instead.
SENIORITY_ORDER = ["Junior", "Professional", "Senior", "Lead", "Management", "Director"]
encoder = OrdinalEncoder(categories=[SENIORITY_ORDER])
encoder.fit(np.array(SENIORITY_ORDER, dtype=object).reshape(-1, 1))

# transform() expects a 2-D column; flatten back to one code per row.
y_train = encoder.transform(y_train_raw.values.reshape(-1,1)).flatten()

#### Pipeline:

In [31]:
# BOW + Logistic Regression model, restored from its joblib dump.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
bow_model_path = "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/models/bow_seniority.joblib"
bow_lr_model = joblib.load(bow_model_path)

# Bare expression so the notebook renders the loaded estimator.
bow_lr_model

In [32]:
# Huggingface Model: SetFit model read from a local directory only
# (local_files_only=True).
# NOTE(review): this load prints a tokenizer warning that recommends
# `fix_mistral_regex=True`. Confirm whether it applies to this model before
# acting on it.
setfit_model_dir = "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/models/setfit_seniority"
model_hf = SetFitModel.from_pretrained(setfit_model_dir, local_files_only=True)

The tokenizer you are loading from '/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/models/setfit_seniority' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [33]:
# The encoder's class order is the reference; the SetFit model reports its own.
enc_order = list(encoder.categories_[0])
hf_labels = list(model_hf.labels)

# Real validation: both vocabularies must contain exactly the same labels,
# each once. The previous final assert could never fail, because the
# reordered list is built from enc_order lookups. An extra or duplicated HF
# label would have gone unnoticed, and reordering probability columns by
# `idx` would then silently drop or repeat a class.
assert len(set(hf_labels)) == len(hf_labels), f"Duplicate HF labels: {hf_labels}"
assert set(hf_labels) == set(enc_order), (
    "Label mismatch between SetFit model and encoder: "
    f"only in HF={set(hf_labels) - set(enc_order)}, "
    f"only in encoder={set(enc_order) - set(hf_labels)}"
)

# Position of every encoder class inside the HF label list.
idx = [hf_labels.index(c) for c in enc_order]

aligned_label_order = [hf_labels[i] for i in idx]
print("HF labels (original):", hf_labels)
print("HF labels (reordered):", aligned_label_order)
print("Encoder order:", enc_order)

# Kept as a final sanity check on the index construction itself.
assert aligned_label_order == enc_order, "Alignment failed: reordered HF labels != encoder order"


HF labels (original): ['Director', 'Junior', 'Lead', 'Management', 'Professional', 'Senior']
HF labels (reordered): ['Junior', 'Professional', 'Senior', 'Lead', 'Management', 'Director']
Encoder order: ['Junior', 'Professional', 'Senior', 'Lead', 'Management', 'Director']


In [34]:
# Label vocabulary in encoder order, in two forms: an object ndarray (so
# argmax indices can be mapped back to labels by fancy indexing) and a plain
# list (for the `labels=` / `target_names=` arguments of the reports).
enc_classes = np.array(encoder.categories_[0], dtype=object)
enc_order = enc_classes.tolist()

def setfit_proba_aligned_to_encoder(model_hf, texts, enc_order):
    """
    Return SetFit class probabilities with columns reordered to ``enc_order``.

    SetFit predict_proba columns are in model_hf.labels order; this selects
    those columns in the order given by ``enc_order``.

    Args:
        model_hf: object exposing ``labels`` and ``predict_proba(texts)``.
        texts: inputs, forwarded unchanged to ``model_hf.predict_proba``.
        enc_order: class labels in the desired column order.

    Returns:
        np.ndarray with one row per input and one column per entry of
        ``enc_order``.

    Raises:
        ValueError: if the model exposes no labels, if a label of
            ``enc_order`` is unknown to the model, or if the probability
            matrix is not 2-D with one column per model label.
    """
    if model_hf.labels is None:
        raise ValueError("model_hf.labels is None; cannot align probability columns")
    hf_labels = list(model_hf.labels)

    # First occurrence wins, matching list.index semantics.
    position = {}
    for i, label in enumerate(hf_labels):
        position.setdefault(label, i)

    # Validate before running inference so a label mismatch fails fast.
    missing = [c for c in enc_order if c not in position]
    if missing:
        raise ValueError(
            f"Labels {missing} from enc_order are not in model_hf.labels {hf_labels}"
        )
    idx = [position[c] for c in enc_order]

    raw = model_hf.predict_proba(texts)
    # Tensor-like outputs are moved to host memory explicitly. np.asarray
    # cannot convert tensors that live on an accelerator or track gradients.
    if hasattr(raw, "detach"):
        raw = raw.detach().cpu().numpy()
    P = np.asarray(raw)

    if P.ndim != 2 or P.shape[1] != len(hf_labels):
        raise ValueError(
            f"Expected probabilities of shape (n, {len(hf_labels)}), got {P.shape}"
        )
    return P[:, idx]

def soft_vote_predict(model_bow, model_hf, X_texts, enc_classes, w_bow=0.2, w_hf=0.8):
    """
    Weighted soft vote between the bag-of-words model and the SetFit model.

    Args:
        model_bow: object exposing ``predict_proba(list_of_texts)``.
        model_hf: SetFit-style model passed on to
            ``setfit_proba_aligned_to_encoder``.
        X_texts: iterable of input texts (materialised into a list once).
        enc_classes: class labels in the column order shared by both
            probability matrices; any sequence or ndarray is accepted.
        w_bow: weight of the bag-of-words probabilities.
        w_hf: weight of the SetFit probabilities.

    Returns:
        (y_pred, P_ens): predicted labels taken from ``enc_classes`` and the
        weighted-average probability matrix.

    Raises:
        ValueError: if the weights sum to zero, or the two probability
            matrices / the class list disagree in shape.
    """
    total_weight = w_bow + w_hf
    if total_weight == 0:
        # Dividing by zero would yield NaN rows, and argmax would then
        # silently return class 0 for every sample.
        raise ValueError("w_bow + w_hf must be non-zero")

    X_texts = list(X_texts)
    # Accept plain lists too: indexing with an index array needs an ndarray.
    classes = np.asarray(enc_classes)

    # NOTE(review): assumes model_bow.predict_proba columns already follow
    # the enc_classes order -- TODO confirm against model_bow.classes_.
    P_bow = np.asarray(model_bow.predict_proba(X_texts))
    P_hf  = np.asarray(setfit_proba_aligned_to_encoder(model_hf, X_texts, list(enc_classes)))

    # Refuse to broadcast: mismatched shapes mean the models disagree on the
    # class set, and a blended score would be meaningless.
    if P_bow.shape != P_hf.shape:
        raise ValueError(
            f"Probability shapes differ: BOW {P_bow.shape} vs SetFit {P_hf.shape}"
        )
    if P_hf.shape[-1] != len(classes):
        raise ValueError(
            f"Got {P_hf.shape[-1]} probability columns for {len(classes)} classes"
        )

    P_ens = (w_bow * P_bow + w_hf * P_hf) / total_weight
    y_pred = classes[np.argmax(P_ens, axis=1)]
    return y_pred, P_ens

# NOTE(review): X_test and y_test_raw are not created in any cell visible in
# this notebook. They must already exist in the kernel, so a fresh
# "Restart & Run All" will stop here. TODO add the cell that loads the test set.
X_test_texts = X_test.squeeze().astype(str).tolist()
y_test_str = y_test_raw.to_numpy()

# --- Baseline: SetFit model on its own -------------------------------------
y_pred_hf = np.array(model_hf.predict(X_test_texts), dtype=object)
hf_accuracy = accuracy_score(y_test_str, y_pred_hf)
hf_macro_f1 = f1_score(y_test_str, y_pred_hf, average="macro")

print("HF Accuracy:", hf_accuracy)
print("HF Macro F1:", hf_macro_f1)

# --- Ensemble: weighted soft vote of BOW and SetFit probabilities -----------
y_pred_soft, P_ens = soft_vote_predict(
    bow_lr_model,
    model_hf,
    X_test_texts,
    enc_classes,
    w_bow=0.2,
    w_hf=0.8,
)
soft_accuracy = accuracy_score(y_test_str, y_pred_soft)
soft_macro_f1 = f1_score(y_test_str, y_pred_soft, average="macro")

print("\nSoftVote Accuracy:", soft_accuracy)
print("SoftVote Macro F1:", soft_macro_f1)

# Per-class breakdown, rows listed in encoder order.
soft_report = classification_report(
    y_test_str,
    y_pred_soft,
    labels=enc_order,
    target_names=enc_order,
    zero_division=0,
)
print("\nSoftVote report:")
print(soft_report)



HF Accuracy: 0.7789934354485777
HF Macro F1: 0.7348907574575904

SoftVote Accuracy: 0.7768052516411379
SoftVote Macro F1: 0.7317220812316791

SoftVote report:
              precision    recall  f1-score   support

      Junior       0.60      0.30      0.40        10
Professional       0.70      0.86      0.78       154
      Senior       0.96      0.69      0.81        39
        Lead       0.85      0.72      0.78        97
  Management       0.78      0.77      0.78       133
    Director       0.87      0.83      0.85        24

    accuracy                           0.78       457
   macro avg       0.80      0.70      0.73       457
weighted avg       0.79      0.78      0.78       457



Compare with the Hugging Face (SetFit) model alone:

In [35]:
# SetFit-only reference run: predictions are recomputed here, then scored
# with the same metrics and per-class report layout as the soft-vote cell.
y_pred_hf = np.array(model_hf.predict(X_test_texts), dtype=object)

hf_accuracy = accuracy_score(y_test_str, y_pred_hf)
hf_macro_f1 = f1_score(y_test_str, y_pred_hf, average="macro")
print("HF Accuracy:", hf_accuracy)
print("HF Macro F1:", hf_macro_f1)

# Rows listed in encoder order; classes without predictions score 0.
hf_report = classification_report(
    y_test_str,
    y_pred_hf,
    labels=enc_order,
    target_names=enc_order,
    zero_division=0,
)
print(hf_report)


HF Accuracy: 0.7789934354485777
HF Macro F1: 0.7348907574575904
              precision    recall  f1-score   support

      Junior       0.60      0.30      0.40        10
Professional       0.70      0.88      0.78       154
      Senior       1.00      0.69      0.82        39
        Lead       0.88      0.71      0.79        97
  Management       0.78      0.77      0.78       133
    Director       0.87      0.83      0.85        24

    accuracy                           0.78       457
   macro avg       0.81      0.70      0.73       457
weighted avg       0.79      0.78      0.78       457

