In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from preprocessing.preprocessing_csv import Preprocessing_CSV_Seniority
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from preprocessing.preprocessing_json import Preprocessing_JSON_Seniority
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import json
import numpy as np

#### CSV Class

In [33]:
class Preprocessing_CSV_Seniority():
    def __init__(self, file_path):
        self.file_path = file_path
        self.df: pd.DataFrame = None
        self.X = None
        self.y = None
        self.y_str = None

        # Da Labels eine Ordnung haben OrdinalEncoder anstatten LabelEncoder
        #self.label_encoder = LabelEncoder()
        self.ordninal_labels = [
            "Junior",
            #"Professional",
            "Senior",
            "Lead",
            "Management",
            "Director"
        ]
        self.label_encoder = OrdinalEncoder(categories=[self.ordninal_labels],
                                            handle_unknown="use_encoded_value",
                                            unknown_value=-1)

        self.read_csv()

    def clean_text(self, text: str):
        """
        Removes - and / and replaces with <space>
        """
        text = text.lower().strip().replace("-", " ").replace("/", " ")
        return text

    def read_csv(self):
        """
        Reads CSV file and saves them in class properties
        """
        self.df = pd.read_csv(self.file_path)

        # Check if correct file is given
        requiered_cols = {"text", "label"}
        if not requiered_cols.issubset(self.df.columns):
            raise ValueError(
                f"Wrong file mate :("
            )

        self.X = self.df["text"].astype(str).apply(self.clean_text)

        # Für OrdinalEncoder
        labels = self.df["label"].values.reshape(-1,1)
        self.y = self.label_encoder.fit_transform(labels).flatten()

        # Für LabelEncoder
        #self.y_str = self.df["label"].astype(str)
        #self.y = self.label_encoder.fit_transform(self.y_str)
        #self.X = self.df["text"]

    def labels(self):
        """
        Just quick check, can be removed
        """
        return {
            i: label for i, label in enumerate(self.label_encoder.categories_[0])
        }



#### JSON class

In [46]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder


class Preprocessing_JSON_Seniority:
    """
    JSON preprocessing ONLY (no prediction logic).

    Output (analog zur CSV-Pipeline):
        self.df         -> pd.DataFrame (text, label)
        self.X          -> pd.Series (cleaned text)
        self.y          -> np.ndarray (ordinal labels, incl. Professional)
        self.y_reduced  -> np.ndarray (ordinal labels, Professional merged into Junior)
    """

    def __init__(self, json_path: str):
        self.json_path = json_path

        # Full ordinal scale (ground truth)
        self.ordinal_labels = [
            "Junior",
            "Professional",
            "Senior",
            "Lead",
            "Management",
            "Director"
        ]

        self.label_encoder = OrdinalEncoder(
            categories=[self.ordinal_labels],
            handle_unknown="use_encoded_value",
            unknown_value=-1
        )

        self.df: pd.DataFrame = None
        self.X: pd.Series = None
        self.y: np.ndarray = None
        self.y_reduced: np.ndarray = None

        self._read_json()

    # ----------------------------
    # Helpers
    # ----------------------------
    @staticmethod
    def _parse_year_month(s):
        """Expects 'YYYY-MM'. Returns (year, month) or None."""
        if not isinstance(s, str) or len(s) < 7:
            return None
        try:
            year, month = s.split("-")
            return int(year), int(month)
        except Exception:
            return None

    @staticmethod
    def clean_text(text: str) -> str:
        return str(text).lower().strip().replace("-", " ").replace("/", " ")

    # ----------------------------
    # Core preprocessing
    # ----------------------------
    def _read_json(self):
        with open(self.json_path, "r") as f:
            data = json.load(f)

        rows = []

        for person in data:
            active_jobs = []

            for job in person:
                # Rule 1: only ACTIVE jobs
                if job.get("status") != "ACTIVE":
                    continue

                start = self._parse_year_month(job.get("startDate"))
                if start is None:
                    continue

                active_jobs.append((start, job))

            # Rule 2: no ACTIVE job → skip person
            if not active_jobs:
                continue

            # Rule 3: newest ACTIVE job wins
            _, job = max(active_jobs, key=lambda x: x[0])

            position = job.get("position")
            seniority = job.get("seniority")

            if not position or not seniority:
                continue

            rows.append({
                "text": self.clean_text(position),
                "label": seniority
            })

        self.df = pd.DataFrame(rows)

        if self.df.empty:
            raise ValueError("No valid samples found in JSON")

        # Output analog zur CSV pipeline
        self.X = self.df["text"]

        # ---- Full ordinal encoding (WITH Professional) ----
        self.y = self.label_encoder.fit_transform(
            self.df["label"].values.reshape(-1, 1)
        ).flatten()

        # ---- Reduced ordinal encoding (Professional → Junior) ----
        # Mapping indices:
        # Junior=0, Professional=1, Senior=2, Lead=3, Management=4, Director=5
        self.y_reduced = np.where(
            self.y <= 1,        # Junior or Professional
            0,                  # → Junior
            self.y - 1          # shift everything above down by 1
        )

        print(
            f"[JSON] Loaded {len(self.df)} samples | "
            f"Professional: {(self.df['label'] == 'Professional').sum()} | "
            f"Unknown: {(self.y == -1).sum()}"
        )


#### Pipeline

In [22]:
# 1. Load and prepare data
data = Preprocessing_CSV_Seniority(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/seniority-v2.csv"
)

X = data.X
y = data.y

In [23]:
# 2. Train/Test Split Data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [24]:
# 3. Train
bow = CountVectorizer(ngram_range=(1,2))
X_train_vec = bow.fit_transform(X_train)
X_test_vec = bow.transform(X_test)

ros = RandomOverSampler(random_state=123)
X_train_balanced, y_train_balanced = ros.fit_resample(X_train_vec, y_train)

print(f"Original Shape: {X_train_vec.shape}\nBalanced Shape: {X_train_balanced.shape}")

Original Shape: (7542, 12302)
Balanced Shape: (15085, 12302)


In [25]:
parameters = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["liblinear"],
    "class_weight": [None, "balanced"]
}


logistic_reg = LogisticRegression(max_iter=1000)

grid = GridSearchCV(
    estimator=logistic_reg,
    param_grid=parameters,
    cv=5,
    scoring="f1_weighted",
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train_balanced, y_train_balanced)
best_model = grid.best_estimator_

print(f"Best params: {grid.best_params_}")
print(f"Best CV score: {grid.best_score_}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params: {'C': 100, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV score: 0.9956186140968739


In [26]:
# 4. Test on CSV
test_csv_prediction = best_model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, test_csv_prediction))
print(
    classification_report(
        y_test,
        test_csv_prediction,
        target_names=data.label_encoder.categories_[0])
)

Accuracy: 0.9856839872746553
              precision    recall  f1-score   support

      Junior       0.98      1.00      0.99        87
      Senior       0.99      1.00      0.99       716
        Lead       0.99      0.98      0.99       728
  Management       0.96      0.93      0.95       167
    Director       0.99      0.99      0.99       188

    accuracy                           0.99      1886
   macro avg       0.98      0.98      0.98      1886
weighted avg       0.99      0.99      0.99      1886



#### Handling Professional so, dass es klassifiziert wird wenn sich der Klassifikator weniger als 60% sicher auf Junior und Senior ist.

In [52]:
# 5. Test on annotated.json
annotated_data = Preprocessing_JSON_Seniority(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json",
)

X_test_json = bow.transform(annotated_data.X)
y_json_reduced = annotated_data.y_reduced
prediction_json = best_model.predict(X_test_json)

print("Accuracy:", accuracy_score(y_json_reduced, prediction_json))
print(classification_report(y_json_reduced, prediction_json, target_names=data.label_encoder.categories[0]))

[JSON] Loaded 457 samples | Professional: 154 | Unknown: 0
Accuracy: 0.4573304157549234
              precision    recall  f1-score   support

      Junior       0.92      0.07      0.12       164
      Senior       0.25      0.82      0.39        39
        Lead       0.34      0.71      0.46        97
  Management       0.97      0.56      0.71       133
    Director       0.55      0.92      0.69        24

    accuracy                           0.46       457
   macro avg       0.61      0.62      0.48       457
weighted avg       0.74      0.46      0.42       457



In [53]:
probabilities = best_model.predict_proba(X_test_json)

In [79]:
def predict_with_professional(prob_row, threshold=0.85):
    max_prob = np.max(prob_row)
    reduced_class = np.argmax(prob_row)

    # Junior (0) or Senior (1) AND low confidence → Professional
    if reduced_class in [0,1,2,3,4] and max_prob < threshold:
        return 1  # Professional (FULL space)

    # Otherwise map reduced → full
    # reduced: 0,1,2,3,4
    # full:    0,2,3,4,5
    return reduced_class + 1 if reduced_class >= 1 else 0

y_pred_full = np.array(
    [predict_with_professional(p) for p in probabilities]
)
# Wie oft wurde Professional vorhergesagt?
print("Predicted Professional:", np.sum(y_pred_full == 1))

# Wie oft ist Professional tatsächlich vorhanden?
print("True Professional:", np.sum(annotated_data.y == 1))


Predicted Professional: 30
True Professional: 154


In [80]:

label_names = [
    "Junior",
    "Professional",
    "Senior",
    "Lead",
    "Management",
    "Director"
]

print(
    classification_report(
        annotated_data.y,
        y_pred_full,
        target_names=label_names,
        zero_division=0
    )
)

              precision    recall  f1-score   support

      Junior       0.27      0.30      0.29        10
Professional       0.10      0.02      0.03       154
      Senior       0.27      0.82      0.40        39
        Lead       0.36      0.70      0.48        97
  Management       0.97      0.52      0.68       133
    Director       0.53      0.83      0.65        24

    accuracy                           0.43       457
   macro avg       0.42      0.53      0.42       457
weighted avg       0.45      0.43      0.38       457



In [81]:
from sklearn.metrics import confusion_matrix
import pandas as pd

cm = confusion_matrix(
    annotated_data.y,
    y_pred_full,
    labels=list(range(len(label_names)))
)

cm_df = pd.DataFrame(
    cm,
    index=[f"True_{l}" for l in label_names],
    columns=[f"Pred_{l}" for l in label_names]
)

cm_df


Unnamed: 0,Pred_Junior,Pred_Professional,Pred_Senior,Pred_Lead,Pred_Management,Pred_Director
True_Junior,3,1,2,4,0,0
True_Professional,7,3,55,89,0,0
True_Senior,0,1,32,5,0,1
True_Lead,1,3,23,68,2,0
True_Management,0,20,8,19,69,17
True_Director,0,2,0,2,0,20
