In [1]:
import os
os.chdir(r"c:\Users\britt\Desktop\YH\Applicerad AI\job_discrimination_sandbox")
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pickle
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from unidecode import unidecode

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
with open("data/cleaned_data/cross_val_split.pkl", "rb") as read_file:
    folds = pickle.load(read_file)

In [27]:
def objective(trial):
    C = trial.suggest_float("C", 1e-7, 10.0, log=True)
    # tol = trial.suggest_float("tol", 1e-7, 10.0, log=True)
    c_weight = trial.suggest_categorical("c_weight", ["balanced", None])
    #max_iter = trial.suggest_int("max_iter", 50, 200)

    clf = LogisticRegression(
        # tol=tol, 
        C=C, 
        class_weight=c_weight, 
        #max_iter=max_iter
        )
    scores = []
    for fold in folds:
        clf.fit(fold["Train X Tfidf"], fold["Train y classes"])
        scores.append(clf.score(fold["Test X Tfidf"], fold["Test y classes"]))

    return np.mean(scores)

In [28]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500)

[32m[I 2023-01-30 11:51:41,634][0m A new study created in memory with name: no-name-a96cf0a0-93ae-438c-9eb5-04e8b34ea96a[0m
[32m[I 2023-01-30 11:51:42,092][0m Trial 0 finished with value: 0.7967320261437909 and parameters: {'C': 0.0006879309377172401, 'c_weight': 'balanced'}. Best is trial 0 with value: 0.7967320261437909.[0m
[32m[I 2023-01-30 11:51:42,781][0m Trial 1 finished with value: 0.5931372549019608 and parameters: {'C': 1.00226111167485e-05, 'c_weight': None}. Best is trial 0 with value: 0.7967320261437909.[0m
[32m[I 2023-01-30 11:51:43,401][0m Trial 2 finished with value: 0.6267973856209149 and parameters: {'C': 0.1941071033868064, 'c_weight': None}. Best is trial 0 with value: 0.7967320261437909.[0m
[32m[I 2023-01-30 11:51:43,661][0m Trial 3 finished with value: 0.5356209150326797 and parameters: {'C': 6.136728515032025e-07, 'c_weight': 'balanced'}. Best is trial 0 with value: 0.7967320261437909.[0m
[32m[I 2023-01-30 11:51:44,357][0m Trial 4 finished with va

In [29]:
study.best_params

{'C': 2.60692940700584, 'c_weight': 'balanced'}

In [30]:
study.best_value

0.8486928104575163

In [33]:
def detailed_objective(trial):
    # Use same code objective to reproduce the best model
    C = trial.suggest_float("C", 1e-7, 10.0, log=True)
    #tol = trial.suggest_float("tol", 1e-7, 10.0, log=True)
    c_weight = trial.suggest_categorical("c_weight", ["balanced", None])
    # max_iter = trial.suggest_int("max_iter", 50, 200)

    clf = LogisticRegression(
        #tol=tol, 
        C=C, 
        class_weight=c_weight, 
        # max_iter=max_iter
        )
    acc_scores = []
    recall_scores = []
    precision_scores = []
    f1_scores = []
    for fold in folds:
        X_train = fold["Train X Tfidf"]
        y_train = fold["Train y classes"]
        X_test = fold["Test X Tfidf"]
        y_test = fold["Test y classes"]
        
        clf.fit(X_train, y_train)
        # calculate more evaluation metrics
        pred = clf.predict(X_test)
        
        acc_scores.append(accuracy_score(pred, y_test))
        recall_scores.append(recall_score(pred, y_test, average="weighted"))
        precision_scores.append(precision_score(pred, y_test, average="weighted"))
        f1_scores.append(f1_score(pred, y_test, average="weighted"))
    
    acc = np.mean(acc_scores)
    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1 = np.mean(f1_scores)

    return acc, f1, recall, precision

In [34]:
eval_metrics = detailed_objective(study.best_trial)  # calculate acc, f1, recall, and precision

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
print(f"Score: {round(study.best_value, 3)}")
print(f"Best parameters: {study.best_params}")
print(f"F1: {round(eval_metrics[1], 3)}")
print(f"Recall: {round(eval_metrics[2], 3)}")
print(f"Precision: {round(eval_metrics[3], 3)}")

Score: 0.849
Best parameters: {'C': 2.60692940700584, 'c_weight': 'balanced'}
F1: 0.867
Recall: 0.849
Precision: 0.901
