In [3]:
import os
# Make the project folder the working directory so the relative "data/..."
# paths opened in later cells resolve against it.
# NOTE(review): this is a machine-specific absolute path; the cell fails on any
# other machine. Consider a configurable project directory instead.
os.chdir(r"c:\Users\britt\Desktop\YH\Applicerad AI\job_discrimination_sandbox")
import re
# import warnings

import numpy as np
import pickle
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from unidecode import unidecode

In [4]:
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Args:
        treebank_tag: POS tag string; only its leading letter is inspected.

    Returns:
        ``wordnet.ADJ`` / ``VERB`` / ``NOUN`` / ``ADV`` for tags starting with
        "J" / "V" / "N" / "R" respectively, otherwise the empty string.
    """
    # Attribute names are resolved lazily so `wordnet` is only touched on a hit.
    prefix_to_attribute = (
        ("J", "ADJ"),
        ("V", "VERB"),
        ("N", "NOUN"),
        ("R", "ADV"),
    )
    for prefix, attribute_name in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute_name)
    return ""

def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Thin alias for ``get_wordnet_pos``; the result is passed through unchanged,
    including the empty string for tags that have no mapping.
    """
    wordnet_pos = get_wordnet_pos(tag)
    return wordnet_pos

def preprocess_document(doc):
    """Normalise a raw document into a single string of lemmas.

    Steps, in order: drop ``http...`` URLs and `` name.com`` tokens, replace
    every run of non-letters with a space, collapse whitespace, lower-case,
    tokenize, POS-tag, drop English stop words and lemmatize (using the POS
    when the tag maps to a WordNet category).

    Args:
        doc: Raw text of the document.

    Returns:
        A one-element list holding the space-joined lemmas. The list form is
        what the calling cell passes to ``vectorizer.transform``.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word lookup once; previously the corpus list was re-read
    # and scanned linearly for every single token.
    english_stopwords = set(stopwords.words("english"))

    remove_https = re.sub(r"http\S+", "", doc)
    # The character class used to be [A-Za-a] (typo), which matched only
    # upper-case letters and "a", so lower-case " name.com" tokens survived.
    # NOTE(review): if the pickled vectorizer was fitted on text cleaned with
    # the old pattern, apply the same fix to the training pipeline.
    remove_com = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
    remove_numbers_punctuations = re.sub(r"[^a-zA-Z]+", " ", remove_com)
    pattern = re.compile(r"\s+")
    remove_extra_whitespaces = re.sub(pattern, " ", remove_numbers_punctuations)
    # NOTE(review): only ASCII letters and spaces are left at this point, so
    # unidecode appears to have nothing to transliterate here. Running it
    # before the non-letter filter would keep accented words intact, but that
    # changes the tokens, so it is left as-is pending a retrain decision.
    only_ascii = unidecode(remove_extra_whitespaces)
    doc = only_ascii.lower()

    list_of_tokens = word_tokenize(doc)
    list_of_tokens_pos = pos_tag(list_of_tokens)
    list_of_tokens_wn_pos = [
        (token, penn_to_wn(tag))
        for token, tag in list_of_tokens_pos
        if token not in english_stopwords
    ]
    # An empty POS means "no WordNet mapping": fall back to the default lemmatizer POS.
    list_of_lemmas = [
        lemmatizer.lemmatize(token, wn_pos) if wn_pos != "" else lemmatizer.lemmatize(token)
        for token, wn_pos in list_of_tokens_wn_pos
    ]
    cleaned_text = [" ".join(list_of_lemmas)]

    return cleaned_text

def get_n_most_important_words_clf(weights, vocabulary, n):
    """Return the ``n`` highest-weighted vocabulary entries, largest first.

    Args:
        weights: 1-D sequence of per-feature weights (converted with
            ``np.asarray``).
        vocabulary: Sequence indexable by feature position, aligned with
            ``weights``.
        n: Number of entries wanted. Values larger than ``len(weights)`` are
            clamped; values <= 0 yield empty results instead of raising.

    Returns:
        Tuple ``(words, top_weights)`` of equal-length lists, ordered by
        descending weight, with each weight rounded to 5 decimals.
    """
    weights = np.asarray(weights)
    n = min(n, len(weights))
    if n <= 0:
        # argpartition would raise (kth out of bounds) and [-0:] would select
        # everything, so short-circuit the "nothing requested" case.
        return [], []

    # argpartition isolates the n largest values; only those n are then sorted.
    top_indices = np.argpartition(weights, len(weights) - n)[-n:]
    ascending_order = np.argsort(weights[top_indices])
    descending_indices = top_indices[ascending_order][::-1]

    words = [vocabulary[i] for i in descending_indices]
    top_weights = [round(weights[i], 5) for i in descending_indices]

    return words, top_weights

def get_25_most_important_words_single_text(text, label, clf_vocabulary, clf_weights, n=25):
    """Rank the distinct words of one cleaned text by classifier weight.

    Args:
        text: Sequence whose first element is the cleaned, space-separated text.
        label: Row index into ``clf_weights`` selecting which weights to use.
        clf_vocabulary: Feature names aligned with the columns of ``clf_weights``.
        clf_weights: 2-D weight array indexed as ``clf_weights[label][feature]``.
        n: Maximum number of words to return (defaults to 25).

    Returns:
        List of ``(word, weight)`` tuples sorted by descending weight (ties
        broken alphabetically), weights rounded to 5 decimals. Words missing
        from ``clf_vocabulary`` are skipped.
    """
    # A direct word -> weight lookup is enough; sorting the entire vocabulary
    # first (as before) did not affect the resulting mapping.
    weight_by_word = {
        word: round(weight, 5)
        for word, weight in zip(clf_vocabulary, clf_weights[label])
    }

    text_weights = [
        (word, weight_by_word[word])
        for word in set(text[0].split())
        if word in weight_by_word  # out-of-vocabulary words carry no weight
    ]
    # The secondary key keeps tie order independent of set iteration order.
    text_weights.sort(key=lambda pair: (-pair[1], pair[0]))

    return text_weights[:n]

def print_predict_data(file_name, pred_probas, predicted_label, prediction, weights):
    """Print a human-readable summary of one prediction.

    Args:
        file_name: Name of the advertisement, echoed in the first line.
        pred_probas: Nested sequence; ``pred_probas[0][label]`` is the
            probability of ``label`` (labels 0, 1 and 2 are reported).
        predicted_label: The predicted label, used to pick the headline
            probability and to skip that label in the trailing list.
        prediction: Sequence whose first element is the predicted label; a
            value above 0 selects the "more than 70%" wording.
        weights: Iterable of ``(word, weight)`` pairs printed one per line.

    Returns:
        None. Everything is written to standard output.
    """
    group_names = {1: "female", 2: "male"}
    class_probabilities = pred_probas[0]

    def as_percent(class_label):
        # Probability of the given label as a percentage with two decimals.
        return round(class_probabilities[class_label] * 100, 2)

    headline_percent = as_percent(predicted_label)
    print(f"The job advertisement {file_name} will most likely (with {headline_percent}% probability) get", end=" ")
    if prediction[0] > 0:
        print(f"more than 70% {group_names[predicted_label]} applicants.")
    else:
        print("around as many female as male applicants")

    print("The 25 most important words in the advertisement for predicting the above was:")
    for word, weight in weights:
        print(f"{word}: {weight}")

    # Report the probabilities of the labels that were not predicted.
    for class_label in (0, 1, 2):
        if class_label == predicted_label:
            continue
        percent = as_percent(class_label)
        if class_label == 0:
            print(f"The probability for the advertisement getting around as many female as male applicants is {percent}%")
        else:
            sex = group_names[class_label]
            print(f"The probability for the advertisement getting more than 70% {sex} applicants is {percent}%")

In [5]:
# Load the pickled classifier; later cells use it as `clf`
# (presumably a logistic-regression model trained on TF-IDF features, going by
# the file name — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/models/log_reg_tfidf.pkl", "rb") as read_file:
    clf = pickle.load(read_file)

In [6]:
# Load the pickled vectorizer; later cells call its `transform` and
# `get_feature_names_out` methods (presumably a fitted TF-IDF vectorizer, going
# by the file name — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/vectorizers/tfidf.pkl", "rb") as read_file:
    vectorizer = pickle.load(read_file)

In [7]:
# Classifier weights (indexed by label, then feature) and the vectorizer's
# feature names, used below to explain the prediction.
clf_weights = clf.coef_
clf_vocabulary = vectorizer.get_feature_names_out()
file_name = "ASBESTOS WORKER 3435 100518.txt"

# Read the raw job bulletin that is going to be scored.
with open(f"data/original_data/job_bulletins/{file_name}", "r", encoding="utf-8") as f:
    application = f.read()

# Clean -> vectorize -> predict. `text` is a one-element list holding the
# cleaned document, which is the form passed to `vectorizer.transform`.
text = preprocess_document(application)
vectorized_text = vectorizer.transform(text)
prediction = clf.predict(vectorized_text)
pred_probas = clf.predict_proba(vectorized_text)

# NOTE(review): predicted_label is later used both as a row index into
# clf_weights and as a column index into pred_probas[0]; that assumes the
# classifier's labels are 0, 1, 2 in that order — confirm against clf.classes_.
predicted_label = prediction[0]
# NOTE(review): these three names are not used again in this cell.
neutr_prob, female_prob, male_prob = pred_probas[0]
weights = get_25_most_important_words_single_text(text, predicted_label, clf_vocabulary, clf_weights)

print_predict_data(file_name, pred_probas, predicted_label, prediction, weights)

The job advertisement ASBESTOS WORKER 3435 100518.txt will most likely (with 66.67% probability) get more than 70% male applicants.
The 25 most important words in the advertisement for predicting the above was:
equipment: 1.20576
safety: 0.50725
certificate: 0.47912
tool: 0.43513
material: 0.42955
california: 0.34695
attach: 0.34616
flat: 0.34122
rat: 0.33921
work: 0.3323
license: 0.28845
use: 0.28407
read: 0.26467
copy: 0.26212
prior: 0.26204
require: 0.26148
apprenticeship: 0.26089
time: 0.25197
cal: 0.25046
valid: 0.23466
hazardous: 0.23416
instruction: 0.22242
regulation: 0.21052
osha: 0.20628
safely: 0.20119
The probability for the advertisement getting around as many female as male applicants is 20.45%
The probability for the advertisement getting more than 70% female applicants is 12.89%
