In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from preprocessing.preprocessing_csv import Preprocessing_CSV_Seniority
from preprocessing.preprocessing_department_json import Preprocessing_JSON_annotated_Department
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
import json
from imblearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# 1. Load and Prepare CSV Data
data = Preprocessing_CSV_Seniority(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/department-v2.csv"
)

X = data.X
y = data.y

In [12]:
# 2. Load and Prepare not-annotated.json Data
not_annotated_data = pd.read_csv("/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/seniority_with_departments.csv")
X_not_annotated = not_annotated_data["position"].astype(str).apply(data.clean_text)
y_not_annotated = not_annotated_data["department"].astype(str)

In [13]:
# 3. Concatenate and Train/Test split
X_concat = pd.concat([X, X_not_annotated], ignore_index=True)
y_concat = pd.concat([y, y_not_annotated], ignore_index=True)

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_concat)

print("Combined X:", X_concat.shape)
print("Combined y:", y_encoded.shape)
print("Classes:", encoder.classes_)
print(y_concat.value_counts())

Combined X: (10449,)
Combined y: (10449,)
Classes: ['Administrative' 'Business Development' 'Consulting' 'Customer Support'
 'Human Resources' 'Information Technology' 'Marketing' 'Other'
 'Project Management' 'Purchasing' 'Sales']
Marketing                 4307
Sales                     3351
Information Technology    1327
Business Development       640
Project Management         214
Other                      210
Consulting                 178
Administrative              89
Customer Support            47
Purchasing                  45
Human Resources             41
Name: count, dtype: int64


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_concat, y_encoded, test_size=0.2, stratify=y_encoded)

In [15]:
# 4. BOW and Logistic Regression with Hyperparam Search and 5 fold CV

pipe = Pipeline(steps=[
    ("bow", CountVectorizer(ngram_range=(1, 2))),
    ("ros", RandomOverSampler(random_state=123)),
    ("clf", LogisticRegression(max_iter=2000))
])

parameters = {
    "bow__min_df": [1, 2, 3],
    "bow__max_df": [0.9, 0.95, 1.0],
    "clf__C": [0.1, 1, 10],
    "clf__solver": ["liblinear"],
    "clf__class_weight": [None, "balanced"]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=parameters,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best params: {'bow__max_df': 0.9, 'bow__min_df': 1, 'clf__C': 10, 'clf__class_weight': None, 'clf__solver': 'liblinear'}
Best CV score: 0.8655701783106681


In [16]:
# 5. Test on test-set

csv_prediction = best_model.predict(X_test) # Auch predict_proba() probieren
print("Accuracy:", accuracy_score(y_test, csv_prediction))
print(
    classification_report(
        y_test,
        csv_prediction,
        target_names=encoder.classes_)
)

Accuracy: 0.9535885167464114
                        precision    recall  f1-score   support

        Administrative       0.85      0.94      0.89        18
  Business Development       0.96      0.97      0.96       128
            Consulting       0.89      0.89      0.89        36
      Customer Support       0.40      0.44      0.42         9
       Human Resources       0.88      0.88      0.88         8
Information Technology       0.94      0.94      0.94       265
             Marketing       0.99      0.97      0.98       862
                 Other       0.48      0.86      0.62        42
    Project Management       0.86      0.86      0.86        43
            Purchasing       1.00      1.00      1.00         9
                 Sales       0.99      0.96      0.97       670

              accuracy                           0.95      2090
             macro avg       0.84      0.88      0.86      2090
          weighted avg       0.96      0.95      0.96      2090



In [19]:
# 6. Test on annotated.json

annotated_json_data = Preprocessing_JSON_annotated_Department(
    "/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json",
)

X_annotated = annotated_json_data.X
y_annotated = annotated_json_data.y

y_annotated_encoded = encoder.transform(y_annotated)

[JSON-Department] Loaded 457 samples from /Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/linkedin-cvs-annotated.json


In [20]:
predict_annotated = best_model.predict(X_annotated)

print(f"Accuracy on annotated.json: {accuracy_score(y_annotated_encoded, predict_annotated)}")

print(
    classification_report(
        y_annotated_encoded,
        predict_annotated,
        target_names=encoder.classes_,
        zero_division=0
    )
)

Accuracy on annotated.json: 0.6542669584245077
                        precision    recall  f1-score   support

        Administrative       0.00      0.00      0.00        10
  Business Development       0.29      0.31      0.30        16
            Consulting       0.94      0.59      0.73        27
      Customer Support       1.00      0.17      0.29         6
       Human Resources       0.80      0.50      0.62        16
Information Technology       0.63      0.33      0.43        52
             Marketing       0.50      0.47      0.49        19
                 Other       0.67      0.85      0.75       232
    Project Management       0.56      0.47      0.51        30
            Purchasing       0.75      0.50      0.60        12
                 Sales       0.68      0.68      0.68        37

              accuracy                           0.65       457
             macro avg       0.62      0.44      0.49       457
          weighted avg       0.65      0.65      0.63  

In [23]:
print(f"Confusion Matrix\n{confusion_matrix(y_annotated_encoded, predict_annotated)}")

Confusion Matrix
[[  0   0   0   0   0   1   1   7   1   0   0]
 [  0   5   0   0   0   0   0  10   1   0   0]
 [  0   2  16   0   1   0   1   5   2   0   0]
 [  0   0   0   1   0   2   0   2   0   0   1]
 [  1   0   0   0   8   0   0   6   0   0   1]
 [  0   2   0   0   0  17   0  31   1   0   1]
 [  0   0   0   0   0   2   9   7   0   0   1]
 [  1   8   1   0   1   5   4 198   5   2   7]
 [  0   0   0   0   0   0   1  14  14   0   1]
 [  0   0   0   0   0   0   1   5   0   6   0]
 [  0   0   0   0   0   0   1  10   1   0  25]]
