In [5]:
import os
# NOTE(review): machine-specific absolute path. The relative "data/..." paths used
# further down in this notebook are resolved against this working directory, so the
# notebook only runs as-is on a machine that has this exact folder.
os.chdir(r"c:\Users\britt\Desktop\YH\Applicerad AI\job_discrimination_sandbox")
import re

import gensim
import numpy as np
import pandas as pd
import pickle
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode

In [6]:
# Load pretrained word vectors stored in binary word2vec format (the GoogleNews
# 300-dimension set, judging by the file name). This is a large file, so the cell is slow.
# NOTE(review): hardcoded absolute path into a user's Downloads folder.
google_model = gensim.models.KeyedVectors.load_word2vec_format("c:/Users/britt/Downloads/GoogleNews-vectors-negative300.bin.gz", binary=True)

In [16]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag yields an
    empty string.
    """
    prefix_to_attribute = (
        ("J", "ADJ"),
        ("V", "VERB"),
        ("N", "NOUN"),
        ("R", "ADV"),
    )
    for prefix, attribute in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            # Resolved lazily so wordnet is only touched for a matching tag.
            return getattr(wordnet, attribute)
    return ""

def penn_to_wn(tag):
    """Alias for get_wordnet_pos: convert a Penn Treebank tag to a WordNet POS."""
    wordnet_pos = get_wordnet_pos(tag)
    return wordnet_pos

def preprocess_document(doc):
    """Clean a raw document and return its lemmatised form.

    Steps, in order: strip URLs starting with "http", strip " <letters>.com"
    fragments, replace every run of non-letters with a space, collapse
    whitespace, lowercase, tokenise, POS-tag, drop English stop words and
    lemmatise each remaining token (using its WordNet POS when one is known).

    Parameters
    ----------
    doc : str
        Raw text of the document.

    Returns
    -------
    list of str
        A one-element list holding the space-joined lemmas.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once instead of re-reading the word list for every token.
    english_stopwords = set(stopwords.words("english"))

    without_urls = re.sub(r"http\S+", "", doc)
    # Fixed character class: it used to be [A-Za-a], which only matched uppercase
    # letters and "a", so lowercase domains such as " example.com" were never removed.
    # NOTE(review): if the cleaned corpus was produced with the old pattern,
    # regenerate it so that training and inference texts are cleaned the same way.
    without_domains = re.sub(r"\ [A-Za-z]*\.com", " ", without_urls)
    letters_only = re.sub(r"[^a-zA-Z]+", " ", without_domains)
    single_spaced = re.sub(r"\s+", " ", letters_only)
    # NOTE(review): at this point only ASCII letters and spaces remain, so unidecode
    # has nothing left to transliterate. Accented characters were already replaced
    # by spaces above. Order kept as-is to stay compatible with the existing corpus.
    doc = unidecode(single_spaced).lower()

    tagged_tokens = pos_tag(word_tokenize(doc))

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        if token in english_stopwords:
            continue
        wn_pos = penn_to_wn(treebank_tag)
        if wn_pos != "":
            lemmas.append(lemmatizer.lemmatize(token, wn_pos))
        else:
            # No WordNet POS available: fall back to the lemmatizer's default POS.
            lemmas.append(lemmatizer.lemmatize(token))

    return [" ".join(lemmas)]

def print_predict_data_classifiers(file_name, pred_probas, predicted_label):
    """Print a readable summary of a three-class applicant-share prediction.

    Label 0 is reported as "around as many female as male applicants",
    label 1 as more than 70% female and label 2 as more than 70% male.
    The predicted class is printed first with its probability, followed by
    one line for each of the other two classes.

    Parameters
    ----------
    file_name : name of the advertisement, used only in the printed text.
    pred_probas : indexable whose first row holds the probabilities for labels 0, 1 and 2.
    predicted_label : the predicted class label.
    """
    sex_by_label = {1: "female", 2: "male"}
    probabilities = pred_probas[0]

    def as_percent(label):
        # Probability of `label` as a percentage rounded to two decimals.
        return round(probabilities[label] * 100, 2)

    print(
        f"The job advertisement {file_name} will most likely (with {as_percent(predicted_label)}% probability) get",
        end=" ",
    )
    if predicted_label > 0:
        print(f"more than 70% {sex_by_label[predicted_label]} applicants.")
    else:
        print("around as many female as male applicants")

    for label in (0, 1, 2):
        if label == predicted_label:
            # Already reported in the headline above.
            continue
        if label == 0:
            print(f"The probability for the advertisement getting around as many female as male applicants is {as_percent(label)}%")
        else:
            print(f"The probability for the advertisement getting more than 70% {sex_by_label[label]} applicants is {as_percent(label)}%")

def job_ad_to_doc_vector(file_name, word_vectors=None):
    """Turn one job bulletin into an IDF-weighted average word-embedding vector.

    A TfidfVectorizer is fitted on the "Cleaned text" column of
    data/cleaned_data/bulletins_labels_share_content.csv to obtain IDF weights.
    The bulletin is read from data/original_data/job_bulletins/, cleaned with
    preprocess_document, and every token that is present both in the word-vector
    model and in the TF-IDF vocabulary contributes its embedding multiplied by
    its IDF weight. The result is the element-wise mean of those contributions.
    Both data paths are relative to the current working directory.

    Parameters
    ----------
    file_name : str
        File name of the bulletin inside data/original_data/job_bulletins/.
    word_vectors : optional
        Word-vector model exposing ``key_to_index`` and ``model[word]`` lookup.
        When omitted, the notebook-level ``google_model`` is reused if it has
        been loaded. Only if it has not is the word2vec file read from disk,
        so the multi-gigabyte model is no longer reloaded on every call.

    Returns
    -------
    numpy.ndarray
        Mean of the IDF-scaled embeddings of the bulletin's tokens.

    Raises
    ------
    ValueError
        If no token of the bulletin is known to both the word-vector model and
        the TF-IDF vocabulary (previously this silently produced NaN).
    """
    if word_vectors is None:
        try:
            # Reuse the model loaded once at notebook level.
            word_vectors = google_model
        except NameError:
            word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
                "c:/Users/britt/Downloads/GoogleNews-vectors-negative300.bin.gz",
                binary=True,
            )

    df = pd.read_csv("data/cleaned_data/bulletins_labels_share_content.csv", dtype={'ID': object})
    corpus = list(df["Cleaned text"])
    tfidf_vectorizer = TfidfVectorizer()
    # Only the fitted vocabulary and IDF weights are needed, not the TF-IDF matrix.
    tfidf_vectorizer.fit(corpus)
    # idf_ is aligned with get_feature_names_out(), so zipping them yields a
    # word -> IDF lookup that replaces a full vocabulary scan per token.
    idf_by_word = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

    with open(f"data/original_data/job_bulletins/{file_name}", "r", encoding="utf-8") as f:
        application = f.read()

    tokens = preprocess_document(application)[0].split()

    # Tokens unknown to either model are skipped explicitly. The original relied
    # on a broadcasting ValueError to drop words missing from the TF-IDF vocabulary.
    # Promote to float64 so the weighting is done in double precision.
    scaled_embeddings = [
        word_vectors[word].astype(np.float64) * idf_by_word[word]
        for word in tokens
        if word in word_vectors.key_to_index and word in idf_by_word
    ]

    if not scaled_embeddings:
        raise ValueError(
            f"No token of {file_name!r} is present in both the word-vector model "
            "and the TF-IDF vocabulary; cannot build a document vector."
        )

    return np.average(scaled_embeddings, axis=0)


In [9]:
# Bulletin to score. job_ad_to_doc_vector reads it from data/original_data/job_bulletins/
# and returns its document vector, which the model cells below consume.
file_name = "ASBESTOS WORKER 3435 100518.txt"
doc_vector = job_ad_to_doc_vector(file_name)

In [17]:
# Load the pickled regressor (a CatBoost model, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code. Only load model files you trust.
with open("data/models/cat_boost_regr.pkl", "rb") as read_file:
    regressor = pickle.load(read_file)

# NOTE(review): doc_vector is passed as a 1-D array here, whereas the classifier cell
# reshapes it to (1, -1). Confirm the regressor is meant to receive a single 1-D sample.
cat_boost_prediction = regressor.predict(doc_vector)
cat_boost_prediction

array([0.82054963, 0.17006495])

In [13]:
# Load the pickled classifier (logistic regression on scaled embeddings, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code. Only load model files you trust.
with open("data/models/log_reg_scaled_emb.pkl", "rb") as read_file:
    clf = pickle.load(read_file)

# Reshape once to a single-row 2-D array and reuse it for both calls.
doc_features = doc_vector.reshape(1, -1)
predicted_label = clf.predict(doc_features)[0]
pred_probas = clf.predict_proba(doc_features)
# Fixed: this used to call print_predict_data, which is not defined anywhere in the
# visible notebook (NameError). The reporting helper is print_predict_data_classifiers.
print_predict_data_classifiers(file_name, pred_probas, predicted_label)