In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/seniority-v2.csv")

X = df["text"]
y = df["label"]

label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

print(label_encoder.classes_)


['Director' 'Junior' 'Lead' 'Management' 'Senior']


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    min_df=1,
)


In [15]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
    max_iter=1000,
    C=1.0,
    class_weight="balanced"
)


In [16]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("bow", vectorizer),
    ("clf", clf),
])

pipeline.fit(X, y_enc)


In [17]:
test_titles = [
    "Senior Application Engineer",
    "Chef de projet IT",
    "BI Analyst",
]

pred = pipeline.predict(test_titles)
pred_labels = label_encoder.inverse_transform(pred)

for t, p in zip(test_titles, pred_labels):
    print(t, "→", p)


Senior Application Engineer → Senior
Chef de projet IT → Lead
BI Analyst → Junior


In [18]:
import numpy as np

feature_names = pipeline.named_steps["bow"].get_feature_names_out()
coefs = pipeline.named_steps["clf"].coef_

for class_idx, class_name in enumerate(label_encoder.classes_):
    top = np.argsort(coefs[class_idx])[-10:]
    print(f"\nTop features for {class_name}:")
    for i in top:
        print(feature_names[i])



Top features for Director:
housing directorate
housing
directorate
managing directors
projektdirektor
verkaufsdirektor
abteilungsdirektor
vertriebsdirektor
directors
director

Top features for Junior:
analystin
assistent
associate
assistentin
mitarbeiterin
referentin
referent
mitarbeiter
analyst
junior

Top features for Lead:
vertriebsleitung
bereichsleiter
projektleiter
abteilungsleiter
teamleiter
leiterin
geschäftsleitung
vertriebsleiter
leiter
leitung

Top features for Management:
svp
cio
chief
geschäftsführerin
owner
founder
geschäftsführung
ceo
geschäftsführer
vp

Top features for Senior:
assistante
engineer
executive
assistant
responsable
managerin
consultant
management
senior
manager


In [19]:
import json
import pandas as pd

with open("/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json", "r") as f:
    data = json.load(f)

rows = []

LABEL_MAP = {
    "Junior": 0,
    "Senior": 1,
    "Lead": 2,
    "Management": 3,
    "Director": 4,
}


for person in data:
    for job in person:
        if job.get("status") == "ACTIVE":
            pos = job.get("position")
            sen = job.get("seniority")

            if pos and sen and sen in LABEL_MAP:
                rows.append({
                    "text": pos,
                    "label": LABEL_MAP[sen]
                })

df_test = pd.DataFrame(rows)

print(df_test.head())
print("Samples:", len(df_test))


                             text  label
0                       Prokurist      3
1                             CFO      3
2                     Prokuristin      3
3                             CFO      3
4  Director expansión de negocio.      4
Samples: 407


In [20]:
def normalize_title(title):
    return (
        title.lower()
        .strip()
        .replace("-", " ")
        .replace("/", " ")
    )

df_test["text"] = df_test["text"].apply(normalize_title)


In [21]:
KNOWN_LABELS = {0, 2, 3}  # example

df_test = df_test[df_test["label"].isin(KNOWN_LABELS)]


In [23]:
from sklearn.metrics import accuracy_score, classification_report

X_test = df_test["text"]
y_test = df_test["label"]

y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_
))


Accuracy: 0.6382978723404256
              precision    recall  f1-score   support

    Director       0.00      0.00      0.00        12
      Junior       0.00      0.00      0.00         0
        Lead       0.66      0.74      0.70       125
  Management       0.98      0.61      0.75       192
      Senior       0.00      0.00      0.00         0

    accuracy                           0.64       329
   macro avg       0.33      0.27      0.29       329
weighted avg       0.83      0.64      0.71       329



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
