In [1]:
from preprocessing_bow import PreprocessingBOW
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
prep = PreprocessingBOW(
    annotated_json_link="/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json",
    seniority_label_list_csv="/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/seniority-v2.csv",  # optional
    include_history=True,   # extension: use previous jobs too
    include_org=True
)

X = prep.X   # sparse TF-IDF matrix
Y = prep.Y   # label ids

8 duplicates removed
6 seniority labels found:
['director', 'junior', 'lead', 'management', 'professional', 'senior']
Dropped (no ACTIVE job or missing startDate): 152
Dropped (missing seniority label): 0
X shape: (457, 1489)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

clf = LogisticRegression(max_iter=500, n_jobs=None)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
print(classification_report(y_test, pred, target_names=prep.label_categories))

              precision    recall  f1-score   support

    director       0.50      0.20      0.29         5
      junior       0.00      0.00      0.00         2
        lead       0.45      0.26      0.33        19
  management       0.70      0.78      0.74        27
professional       0.56      0.81      0.66        31
      senior       0.33      0.12      0.18         8

    accuracy                           0.58        92
   macro avg       0.42      0.36      0.37        92
weighted avg       0.54      0.58      0.54        92



In [6]:
merge = {"director": "management", "junior": "professional"}  # common and defensible

prep_bow_merged = PreprocessingBOW(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json",
    label_col="Seniority (Label 1)",
    merge_labels=merge,
    include_history=True,
    include_org=True
)

X_merged = prep_bow_merged.X
Y_merged = prep_bow_merged.Y

8 duplicates removed
4 seniority labels found:
['lead', 'management', 'professional', 'senior']
Dropped (no ACTIVE job or missing startDate): 152
Dropped (missing seniority label): 0
X shape: (457, 1560)


In [9]:
X_train_merged, X_test_merged, y_train_merged, y_test_merged = train_test_split(
    X_merged, Y_merged, test_size=0.2, random_state=42, stratify=Y
)

clf_merged = LogisticRegression(max_iter=500, n_jobs=None)
clf_merged.fit(X_train_merged, y_train_merged)

pred_merged = clf_merged.predict(X_test_merged)
print(classification_report(y_test_merged, pred_merged, target_names=prep_bow_merged.label_categories))

              precision    recall  f1-score   support

        lead       0.62      0.26      0.37        19
  management       0.81      0.78      0.79        32
professional       0.59      0.91      0.71        33
      senior       0.50      0.12      0.20         8

    accuracy                           0.66        92
   macro avg       0.63      0.52      0.52        92
weighted avg       0.66      0.66      0.63        92

