In [52]:
import numpy as np
import joblib
from setfit import SetFitModel
from sklearn.metrics import f1_score, classification_report, accuracy_score, confusion_matrix
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd
import json
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#### Test Data -> annotated.json

In [14]:
class Preprocessing_JSON_annotated_Seniority:
    """
    Reader for an annotated JSON export: a list of persons, each person being
    a list of job dicts.

    For every person only the most recent job with status "ACTIVE" (ordered by
    its 'YYYY-MM' startDate) is kept. Persons without such a job, or whose
    selected job has no position or no seniority, are skipped.

    Attributes populated by read_json():
      - self.df: DataFrame with columns ["text", "label"]
      - self.X:  pd.Series of cleaned position strings
      - self.y:  pd.Series of raw (NOT encoded) seniority labels as strings
    """

    def __init__(self, path: str):
        self.path = path
        self.df: pd.DataFrame | None = None
        self.X: pd.Series | None = None
        self.y: pd.Series | None = None

        # Loading happens eagerly so the instance is usable right away.
        self.read_json()

    @staticmethod
    def _parse_year_month(s):
        """Turn a 'YYYY-MM' string into (year, month); anything else gives None."""
        if isinstance(s, str) and len(s) >= 7:
            try:
                # Exactly two dash-separated integer fields are accepted.
                year_txt, month_txt = s.split("-")
                return int(year_txt), int(month_txt)
            except Exception:
                pass
        return None

    @staticmethod
    def clean_text(text: str) -> str:
        """Lowercase and strip the text, then turn '-' and '/' into spaces."""
        cleaned = str(text).lower().strip()
        for separator in ("-", "/"):
            cleaned = cleaned.replace(separator, " ")
        return cleaned

    def _latest_active_job(self, person_jobs):
        """Return the ACTIVE job dict with the greatest start date, or None."""
        dated = []
        for entry in person_jobs:
            if isinstance(entry, dict) and entry.get("status") == "ACTIVE":
                started = self._parse_year_month(entry.get("startDate"))
                if started is not None:
                    dated.append((started, entry))

        if not dated:
            return None
        # (year, month) tuples compare chronologically; ties keep the first entry.
        return max(dated, key=lambda pair: pair[0])[1]

    def read_json(self):
        """Load the file at self.path and fill self.df, self.X and self.y.

        Raises:
            ValueError: if no person yields a usable (position, seniority) pair.
        """
        with open(self.path, "r", encoding="utf-8") as handle:
            persons = json.load(handle)

        records = []
        for person_jobs in persons:
            if not isinstance(person_jobs, list):
                continue

            latest = self._latest_active_job(person_jobs)
            if latest is None:
                continue

            title, level = latest.get("position"), latest.get("seniority")
            if title and level:
                records.append(
                    {"text": self.clean_text(title), "label": str(level)}
                )

        frame = pd.DataFrame(records)
        self.df = frame
        if frame.empty:
            raise ValueError("No valid samples found in JSON")

        self.X = frame["text"].astype(str)
        self.y = frame["label"].astype(str)

        print(f"[JSON] Loaded {len(self.df)} samples from {self.path}")

In [22]:
# Test set: one (position, seniority) sample per person from the annotated JSON.
# NOTE(review): absolute, machine-specific paths - consider a configurable data dir.
data = Preprocessing_JSON_annotated_Seniority(
    path="/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json"
)
X_test, y_test_raw = data.X, data.y

# Label encoder restored from disk (presumably the one fitted at training time - verify).
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
encoder = joblib.load(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/task_six/models/encoder_seniority.joblib"
)

# The encoder is fed a single-column 2-D array; the result is flattened back to 1-D.
y_test = encoder.transform(
    y_test_raw.values.reshape(-1, 1)
).flatten()

[JSON] Loaded 457 samples from /Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json


#### Pipeline:

In [None]:
# Restore the serialized BoW + Logistic Regression model.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
bow_lr_model = joblib.load(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/task_six/models/bow_seniority.joblib"
)

# Bare last expression: shows the loaded estimator as the cell output.
bow_lr_model

In [7]:
# Restore the serialized TF-IDF + Logistic Regression model.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
tf_idf_lr_model = joblib.load(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/task_six/models/tf_idf_seniority.joblib"
)

# Bare last expression: shows the loaded estimator as the cell output.
tf_idf_lr_model

In [29]:
class SoftVotingEnsemble(BaseEstimator, ClassifierMixin):
    """Weighted soft-voting wrapper around already-fitted classifiers.

    The wrapped models are never refitted here: ``fit`` only records the class
    labels of the first model, and ``predict_proba`` returns the weighted
    average of every model's ``predict_proba`` output.

    Parameters
    ----------
    models : sequence
        Fitted classifiers exposing ``predict_proba``. The first one must
        expose ``classes_``; it defines the column order of the output.
    weights : sequence of float or None, default=None
        One weight per model. ``None`` means equal weights. Weights are
        normalized to sum to 1, so only their ratios matter.
    """

    def __init__(self, models, weights=None):
        # Stored unmodified so get_params()/clone() keep working.
        self.models = models
        self.weights = weights

    def fit(self, X=None, y=None):
        """Record the class labels; ``X`` and ``y`` are ignored.

        Raises
        ------
        ValueError
            If there are no models, or a model exposes a different label set
            than the first model.
        """
        if len(self.models) == 0:
            raise ValueError("SoftVotingEnsemble needs at least one model")

        reference = np.asarray(self.models[0].classes_)
        expected = set(reference.tolist())
        for position, model in enumerate(self.models[1:], start=1):
            other = getattr(model, "classes_", None)
            if other is not None and set(np.asarray(other).tolist()) != expected:
                raise ValueError(
                    f"Model {position} exposes a different label set than model 0"
                )

        self.classes_ = reference
        return self

    def _reference_classes(self):
        """Class order used for the output columns, or None if unknown."""
        reference = getattr(self, "classes_", None)
        if reference is None:
            # predict_proba may be called without fit(); fall back to model 0.
            reference = getattr(self.models[0], "classes_", None)
        return None if reference is None else np.asarray(reference)

    def _normalized_weights(self):
        """Return the per-model weights scaled to sum to 1."""
        n_models = len(self.models)
        if self.weights is None:
            raw = np.ones(n_models, dtype=float)
        else:
            raw = np.asarray(self.weights, dtype=float)
            if raw.shape != (n_models,):
                raise ValueError(
                    f"Expected {n_models} weights (one per model), got shape {raw.shape}"
                )

        total = raw.sum()
        # A zero (or non-finite) sum would silently turn every probability into NaN.
        if not np.isfinite(total) or total == 0:
            raise ValueError("weights must have a finite, non-zero sum")
        return raw / total

    @staticmethod
    def _aligned_proba(model, X, reference):
        """``model.predict_proba(X)`` with columns reordered to ``reference``."""
        proba = np.asarray(model.predict_proba(X))
        own = getattr(model, "classes_", None)
        if reference is None or own is None:
            return proba

        own = np.asarray(own)
        if np.array_equal(own, reference):
            return proba

        # Same labels in a different order: permute columns so that averaging
        # combines probabilities of the same class.
        column_of = {label: col for col, label in enumerate(own.tolist())}
        try:
            order = [column_of[label] for label in reference.tolist()]
        except KeyError as exc:
            raise ValueError(
                f"Model has no probability column for class {exc.args[0]!r}"
            ) from exc
        return proba[:, order]

    def predict_proba(self, X):
        """Return the weighted mean of the models' class probabilities for ``X``."""
        reference = self._reference_classes()
        w = self._normalized_weights()
        P = np.stack(
            [self._aligned_proba(m, X, reference) for m in self.models], axis=0
        )
        # Contract the model axis: (n_models,) x (n_models, n_samples, n_classes).
        return np.tensordot(w, P, axes=(0, 0))

    def predict(self, X):
        """Return the label with the highest averaged probability (requires ``fit``)."""
        P = self.predict_proba(X)
        return self.classes_[np.argmax(P, axis=1)]


In [12]:
# Averaging probabilities column by column is only valid if both models use
# the same class order, so show both orders and fail loudly on a mismatch.
bow_classes = bow_lr_model.classes_
tfidf_classes = tf_idf_lr_model.classes_

print("BoW classes:  ", bow_classes)
print("TFIDF classes:", tfidf_classes)

assert np.array_equal(bow_classes, tfidf_classes), "Class order differs between models!"

BoW classes:   [0. 1. 2. 3. 4. 5.]
TFIDF classes: [0. 1. 2. 3. 4. 5.]


In [30]:
# Equal-weight soft vote over the two base models; other splits such as
# [0.7, 0.3] can be tried by changing `weights`.
ensemble = SoftVotingEnsemble([bow_lr_model, tf_idf_lr_model], weights=[0.5, 0.5])
ensemble.fit()

# Averaged class probabilities and the resulting hard predictions on the test set.
y_proba = ensemble.predict_proba(X_test)
y_pred = ensemble.predict(X_test)


In [34]:
# Headline macro F1 for the ensemble, followed by the per-class breakdown.
print("Macro F1:", f1_score(y_true=y_test, y_pred=y_pred, average="macro"))
print(classification_report(y_true=y_test, y_pred=y_pred))

Macro F1: 0.43231288547136054
              precision    recall  f1-score   support

         0.0       0.19      0.50      0.27        10
         1.0       0.30      0.02      0.04       154
         2.0       0.23      0.82      0.35        39
         3.0       0.44      0.66      0.53        97
         4.0       0.86      0.62      0.72       133
         5.0       0.55      0.88      0.68        24

    accuracy                           0.46       457
   macro avg       0.43      0.58      0.43       457
weighted avg       0.50      0.46      0.41       457



In [32]:
# Grid search over the BoW weight (TF-IDF gets the remainder) in 0.1 steps.
# NOTE(review): the weights are selected on X_test/y_test, i.e. the same data
# used for reporting, so the best score found here is optimistically biased.
best = (-1, None)
for w in np.linspace(0, 1, 11):
    ens = SoftVotingEnsemble([bow_lr_model, tf_idf_lr_model], weights=[w, 1-w])
    ens.fit()
    pred = ens.predict(X_test)
    score = f1_score(y_test, pred, average="macro")
    if score <= best[0]:
        # Not an improvement: keep the earlier (first) best candidate.
        continue
    best = (score, [float(w), float(1-w)])

print("Best (macro F1, weights):", best)


Best (macro F1, weights): (0.43401150199437377, [0.8, 0.19999999999999996])


In [39]:
# f1_score comes from the notebook's import cell; it is not re-imported here
# so that all imports stay in one place.

# Hard predictions of each base model on the test set, for comparison with
# the soft-voting ensemble (y_pred).
pred_bow = bow_lr_model.predict(X_test)
pred_tfidf = tf_idf_lr_model.predict(X_test)

print("Macro F1 BoW:  ", f1_score(y_test, pred_bow, average="macro"))
print("Macro F1 TFIDF:", f1_score(y_test, pred_tfidf, average="macro"))
print("Macro F1 ENS:  ", f1_score(y_test, y_pred, average="macro"))


Macro F1 BoW:   0.4301127825285708
Macro F1 TFIDF: 0.4249048591774159
Macro F1 ENS:   0.43231288547136054


In [40]:

def proba_aligned(model, X, target_classes):
    """Return ``model.predict_proba(X)`` with columns ordered like ``target_classes``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X: Samples passed through to ``model.predict_proba``.
        target_classes: Iterable of class labels giving the desired column order.

    Returns:
        The probability matrix restricted to / reordered by ``target_classes``.

    Raises:
        ValueError: If a requested class is not among ``model.classes_``.
    """
    P = model.predict_proba(X)

    # Build the label -> column lookup once instead of rebuilding and scanning
    # list(model.classes_) for every target class. setdefault keeps the first
    # occurrence, matching list.index semantics.
    column_of = {}
    for col, label in enumerate(model.classes_):
        column_of.setdefault(label, col)

    missing = [c for c in target_classes if c not in column_of]
    if missing:
        raise ValueError(f"model has no probability column for classes: {missing}")

    idx = [column_of[c] for c in target_classes]
    return P[:, idx]

# Column order shared by both probability matrices.
target_classes = bow_lr_model.classes_

# Compute each model's probabilities once so the sweep below is a cheap
# weighted sum instead of re-predicting for every candidate weight.
P_bow, P_tfidf = (
    proba_aligned(base_model, X_test, target_classes)
    for base_model in (bow_lr_model, tf_idf_lr_model)
)

# Sweep the BoW weight in 0.05 steps; TF-IDF gets the remainder.
# NOTE(review): the weight is selected on X_test/y_test, so the best score
# found here is optimistically biased.
best = (-1, None)
for w in np.linspace(0, 1, 21):
    P = w * P_bow + (1 - w) * P_tfidf
    pred = target_classes[P.argmax(axis=1)]
    score = f1_score(y_test, pred, average="macro")
    # Strict '>' keeps the first weight that reaches the top score.
    best = (score, w) if score > best[0] else best

print("Best macro F1:", best[0], "Best w_for_bow:", best[1])


Best macro F1: 0.43401179902223874 Best w_for_bow: 0.75


#### Prepare Stacking

In [42]:
class Preprocessing_CSV():
    """Loads a labelled CSV with the columns ``text`` and ``label``.

    Rows with a missing text or label are excluded from X / y (and from the
    raw variants): casting them with ``astype(str)`` would otherwise create
    samples whose text or label is the literal string "nan".

    Attributes populated by read_csv():
      - self.df:    the DataFrame exactly as read from disk (all rows)
      - self.X_raw: pd.Series of text as strings, uncleaned
      - self.y_raw: pd.Series of labels as strings
      - self.X:     pd.Series of cleaned text
      - self.y:     pd.Series of labels as strings (same object as y_raw)
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.df: pd.DataFrame | None = None

        # Label and Text
        self.X: pd.Series | None = None
        self.y: pd.Series | None = None

        # Optional: keep raw versions too
        self.X_raw: pd.Series | None = None
        self.y_raw: pd.Series | None = None

        # Loading happens eagerly so the instance is usable right away.
        self.read_csv()

    @staticmethod
    def clean_text(text: str) -> str:
        """Lowercase, strip, replace - and / with spaces."""
        return str(text).lower().strip().replace("-", " ").replace("/", " ")

    def read_csv(self):
        """Reads the CSV and exposes X (cleaned text) and y (raw text labels).

        Raises:
            ValueError: if the file lacks a ``text`` or ``label`` column.
        """
        self.df = pd.read_csv(self.file_path)

        required_cols = {"text", "label"}
        if not required_cols.issubset(self.df.columns):
            raise ValueError("Wrong file mate :( Expected columns: text, label")

        # Keep only complete rows. The original row index is preserved so
        # X / y can still be traced back to rows of self.df.
        complete = self.df.dropna(subset=["text", "label"])

        # Raw
        self.X_raw = complete["text"].astype(str)
        self.y_raw = complete["label"].astype(str)

        # Cleaned + labels as strings
        self.X = self.X_raw.apply(self.clean_text)
        self.y = self.y_raw

    def label_distribution(self) -> pd.Series:
        """Quick check of label counts (empty Series if nothing is loaded)."""
        if self.y is None:
            return pd.Series(dtype=int)
        return self.y.value_counts()

In [45]:
# Labelled training CSV (columns: text, label).
# NOTE(review): `data` previously held the JSON test loader and is rebound here;
# the paths are absolute and machine-specific.
data = Preprocessing_CSV(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/seniority-v2.csv"
)

X = data.X
y = data.y

# Additional labelled rows (columns: position, seniority).
not_annotated_data = pd.read_csv("/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/labeled_not_annotated.csv")

# Drop incomplete rows before casting: astype(str) would otherwise turn a
# missing position or seniority into the literal string "nan".
not_annotated_complete = not_annotated_data.dropna(subset=["position", "seniority"])
X_not_annotated = not_annotated_complete["position"].astype(str).apply(data.clean_text)
y_not_annotated = not_annotated_complete["seniority"].astype(str)

# Stack both sources; ignore_index gives a fresh 0..n-1 index for the result.
X_train = pd.concat([X, X_not_annotated], ignore_index=True)
y_train_raw = pd.concat([y, y_not_annotated], ignore_index=True)
assert len(X_train) == len(y_train_raw), "features and labels are misaligned"

# The encoder is fed a single-column 2-D array; the result is flattened back to 1-D.
y_train = encoder.transform(y_train_raw.values.reshape(-1,1)).flatten()

#### Stacking

In [None]:
# Stacked model: a logistic-regression meta-learner on top of the class
# probabilities of the BoW and TF-IDF models.
# NOTE(review): per the scikit-learn docs, fit() retrains the base estimators
# on X_train rather than reusing the loaded, already-fitted ones - confirm
# this is intended.
base_estimators = [("bow", bow_lr_model), ("tfidf", tf_idf_lr_model)]
meta_learner = LogisticRegression(max_iter=2000)

stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    stack_method="predict_proba",
    n_jobs=-1,
)

# fit() returns the estimator itself, so prediction can be chained onto it.
pred_stack = stack.fit(X_train, y_train).predict(X_test)


In [54]:
# 3) Metrics
def eval_model(name, y_true, y_pred, classes=None):
    """Print accuracy, macro/weighted F1 and a classification report.

    Args:
        name: Heading printed above the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Optional label order for the report. The labels may be of
            any type; they are converted to strings for the row names.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(name)
    print(banner)

    # zero_division=0 reports the same 0.0 as the default for classes that are
    # never predicted, but without emitting an undefined-metric warning each time.
    print("Accuracy:  ", accuracy_score(y_true, y_pred))
    print("Macro F1:  ", f1_score(y_true, y_pred, average="macro", zero_division=0))
    print("Weighted F1:", f1_score(y_true, y_pred, average="weighted", zero_division=0))
    print("\nReport:")
    if classes is not None:
        labels = list(classes)
        # target_names must be strings; numeric labels (e.g. 0.0 ... 5.0) would
        # otherwise break the report formatting.
        print(
            classification_report(
                y_true,
                y_pred,
                labels=labels,
                target_names=[str(label) for label in labels],
                zero_division=0,
            )
        )
    else:
        print(classification_report(y_true, y_pred, zero_division=0))

# Class order taken from the BoW model (None if it exposes no classes_);
# used below to fix the row/column order of the confusion matrix.
classes = None
if hasattr(bow_lr_model, "classes_"):
    classes = list(bow_lr_model.classes_)

eval_model("BoW + LR", y_test, pred_bow)
eval_model("TF-IDF + LR", y_test, pred_tfidf)
eval_model("STACK (BoW + TF-IDF)", y_test, pred_stack)

# 4) Confusion matrix (optional)
# labels=None lets confusion_matrix derive the label order from the data.
cm = confusion_matrix(y_test, pred_stack, labels=classes)
print("\nConfusion matrix (STACK):")
print(cm)

# 5) How often stacking changes the decision (optional)
# np.mean of the boolean mask is a 0-1 fraction; scale by 100 so the printed
# value matches the "%" in the label.
print("\n% predictions where STACK differs from BoW:",
      100 * np.mean(pred_stack != pred_bow))
print("% predictions where STACK differs from TF-IDF:",
      100 * np.mean(pred_stack != pred_tfidf))


BoW + LR
Accuracy:   0.45295404814004375
Macro F1:   0.4301127825285708
Weighted F1: 0.40770136200294804

Report:
              precision    recall  f1-score   support

         0.0       0.19      0.50      0.27        10
         1.0       0.36      0.03      0.05       154
         2.0       0.24      0.82      0.37        39
         3.0       0.42      0.68      0.52        97
         4.0       0.90      0.60      0.72       133
         5.0       0.54      0.83      0.66        24

    accuracy                           0.45       457
   macro avg       0.44      0.58      0.43       457
weighted avg       0.53      0.45      0.41       457


TF-IDF + LR
Accuracy:   0.4420131291028446
Macro F1:   0.4249048591774159
Weighted F1: 0.3999750147326863

Report:
              precision    recall  f1-score   support

         0.0       0.17      0.50      0.25        10
         1.0       0.18      0.02      0.04       154
         2.0       0.22      0.85      0.35        39
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
from sklearn.model_selection import StratifiedKFold

# Use a shuffled, seeded 5-fold stratified split for the stacker's internal CV.
# NOTE(review): this only changes the configuration of `stack`; presumably it
# takes effect on the next stack.fit(...) call - no refit is visible in this
# part of the notebook.
shuffled_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stack.set_params(cv=shuffled_cv)