In [2]:
from preprocessing.preprocessing_csv import Preprocessing_CSV_Seniority
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from preprocessing.preprocessing_json import Preprocessing_JSON_Seniority
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

In [3]:
dataset = Preprocessing_CSV_Seniority("/Users/jonas/Documents/Master_Vorlesungen/Semester_02/Practical Data Science/Final/PDS_Final/data/department-v2.csv")

X = dataset.X          # pd.Series of texts
y = dataset.y          # numpy array of encoded labels
y_str = dataset.y_str  # pd.Series of original labels

In [8]:
y_str.unique()

array(['Marketing', 'Project Management', 'Administrative',
       'Business Development', 'Consulting', 'Human Resources',
       'Information Technology', 'Other', 'Purchasing', 'Sales',
       'Customer Support'], dtype=object)

In [9]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams + bigrams
    min_df=2,             # ignore very rare terms
    max_df=0.95           # ignore very common terms
)

# Fit on training data and transform
X_vectorized = vectorizer.fit_transform(X)

In [10]:
y_train = dataset.y  # encoded labels

# Step 2: Oversample
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_vectorized, y_train)

In [11]:
print("Original shape:", X_vectorized.shape)
print("Balanced shape:", X_train_bal.shape)

Original shape: (10145, 3931)
Balanced shape: (47245, 3931)


In [12]:
#------- Verringert Accuracy -------

param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],          # use l1 if you want sparsity
    "solver": ["liblinear"],    # liblinear works with small/medium datasets
    "class_weight": [None, "balanced"]
}

# Initialize model
logistic = LogisticRegression(max_iter=1000)

# Grid search with 5-fold CV
grid = GridSearchCV(
    estimator=logistic,
    param_grid=param_grid,
    cv=5,
    scoring="f1_weighted",  # or 'accuracy'
    n_jobs=-1,
    verbose=1
)

# Fit on **oversampled training data**
grid.fit(X_train_bal, y_train_bal)

# Best hyperparameters
print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# Use best estimator
logistic_reg = grid.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params: {'C': 100, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV score: 0.9871973964214862


In [None]:
### Test Data ....