# Load libraries

In [1]:
# Import necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
import os
import spacy

import csv
import json
import itertools

from pathlib import Path

import csv
import json
import pandas as pd
import spacy
import math


from bs4 import BeautifulSoup
from spacy import displacy
from pathlib import Path
from cairosvg import svg2png

## Load and clean job descriptions

Load and clean the indeed.com job advertisement data.

In [2]:
# Get current directory for constructing file paths
current_directory = os.getcwd()

In [3]:
# Load job data
path = Path(os.path.join(current_directory, "data/jobs_raw.zip"))
df = pd.read_csv(path)

# Get just job descriptions
descriptions = df['Job Description']

In [4]:
# Function to tidy job description data
def pre_process_text(description, html=True):

    text = description

    if(html):
        soup = BeautifulSoup(description, "html.parser")
        text = soup.get_text(separator=' ')

        # Replace newline characters with spaces
        text = text.replace('\n', ' ')


    # set to lower
    text = text.lower()


    return text

In [5]:
# Tidy and save descriptions - taking a random sample of 1000 descriptions
result = descriptions.sample(n=1000, random_state=2)
result = result.apply(pre_process_text)

result.to_csv("data/descriptions_clean.csv")

## Create training data

Create training data to be used for training ner model.

In [6]:
# Define the file paths for the CSV and JSONL files
csv_file_path = Path(os.path.join(current_directory, "data/descriptions_clean.csv"))
jsonl_file_path = Path(os.path.join(current_directory, "data/data.jsonl"))
masculine_file_path = Path(os.path.join(current_directory, "data/masculine.csv"))
feminine__file_path = Path(os.path.join(current_directory, "data/feminine.csv"))


# Load word lists
with open(masculine_file_path, newline='') as f:
    masculine_words = [row[0] for row in csv.reader(f)]
with open(feminine__file_path, newline='') as f:
    feminine_words = [row[0] for row in csv.reader(f)]

# Create dictionary of word lists
wordlists = {
    "MAS" : masculine_words,
    "FEM" : feminine_words
}

In [7]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

In [8]:
# Define helper functions

# Set index of matches in a given text in a spacy document - returns a list of indices that show where word matches appear
def match_words(text):
    # Create an empty list to store the matches
    matches = []
    # Create a spaCy Doc object from the text
    doc = nlp(text)
    # Loop through the word lists and check for matches
    for list_name, word_list in wordlists.items():
        for word in word_list:
            for token in doc:
                if token.text.lower() == word:
                    matches.append(
                        [
                            token.idx,
                            token.idx + len(token.text),
                            list_name
                        ]
                    )
    # Return the matches
    return matches


# Process each row of a CSV file to identify matches - returns a dictionary of the text and the matches
def process_row(row):
    # Get the text description from the row
    text = row["Job Description"]
    # Match the words in the text and get the label attributes
    matches = match_words(text)
    # Return a dictionary with the text and label attributes
    return {"text": text, "label": matches}

In [9]:
# Read in the CSV file and create a list of processed rows
processed_rows = []
with open(csv_file_path) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        processed_rows.append(process_row(row))

In [10]:
# Write the processed rows to a JSONL file - JSONL files can be loaded to doccano for review
with open(jsonl_file_path, "w") as outfile:
    for row in processed_rows:
        json.dump(row, outfile)
        outfile.write("\n")

In [11]:
# Define the file paths for where train and test datasets will be saved
train_file_path = Path(os.path.join(current_directory, "ner_model/assets/train.json"))
test_file_path = Path(os.path.join(current_directory, "ner_model/assets/dev.json"))


# Create train and dev versions the job description data
result = []

with open(jsonl_file_path, 'r') as f:
    for line in f.readlines():
        data = json.loads(line)

        labels = data['label']
        labels.sort()
        labels = list(l for l,_ in itertools.groupby(labels))

        entry = [data['text'], {"entities" : labels}]

        result.append(entry)
    
    n = len(result)
    n_train = int(n * 0.7)

    train = result[0:n_train]
    dev = result[n_train+1:n]

    with open(train_file_path, 'w') as g:
        json.dump(train, g)

    with open(test_file_path, 'w') as h:
        json.dump(dev, h)


## Train and evaluate model

Train and evaluate the Named Entity Recognition model.

In [12]:
# Train the NER model - all model settings and parameters are defined in ner_model/project.yml and ner_model/configs/config.cfg - these are used by spacy for model training/testing

!python -m spacy project run all ./ner_model

ℹ Running workflow 'all'
[1m
ℹ Skipping 'convert': nothing changed
[1m
ℹ Skipping 'train': nothing changed
[1m
ℹ Skipping 'evaluate': nothing changed
[1m
ℹ Skipping 'package': nothing changed


In [13]:
# Function to get spacy doc summary stats
def get_doc_stats(doc):

    total_tokens = doc.__len__()

    total_masculine = len([ent.text for ent in doc.ents if ent.label_ == "MAS"])
    total_feminine = len([ent.text for ent in doc.ents if ent.label_ == "FEM"])

    n = total_feminine + total_masculine

    if (n == 0):
        gt_score = 0.5
    else:
        x = (1/n)*(total_feminine - total_masculine)
        gt_score = 1/(1+math.exp(-x))


    result = {
        'total': total_tokens,
        'total_masculine': total_masculine,
        'total_feminine': total_feminine,
        'pct_masculine': round(total_masculine/total_tokens, 4),
        'pct_feminine': round(total_feminine/total_tokens, 4),
        'masc_ents': [ent.text for ent in doc.ents if ent.label_ == "MAS"],
        'fem_ents' : [ent.text for ent in doc.ents if ent.label_ == "FEM"],
        'gt_score' : gt_score
    }

    return result

In [14]:
# Load NER model
model_path = Path(os.path.join(current_directory, "ner_model/training/model-best"))

nlp = spacy.load(model_path)

In [15]:
# Get random sample of job descriptions
description_eval_sample = pd.DataFrame(descriptions.sample(n=1000, random_state=99))

# Create clean description
description_eval_sample["description_clean"] = description_eval_sample['Job Description'].apply(pre_process_text)

# Create spacy Doc object for each job description
description_eval_sample['spacy'] = description_eval_sample['description_clean'].apply(lambda row: nlp(row))

# Get summary statistics for Doc object
description_eval_sample['token_summary'] = description_eval_sample['spacy'].apply(lambda row: get_doc_stats(row))

description_eval_sample = pd.concat([description_eval_sample.drop(['token_summary'], axis=1), description_eval_sample['token_summary'].apply(pd.Series)], axis=1)

## Scoring advertisements

Get summary statistics for evaluating solution performance

In [16]:
# Total and percentage masculine and feminine job descriptions across the entire corpus
total_masculine_corpus = len(description_eval_sample[description_eval_sample['total_masculine'] != 0])
pct_masculine_corpus = total_masculine_corpus/len(description_eval_sample)
total_feminine_corpus = len(description_eval_sample[description_eval_sample['total_feminine'] != 0])
pct_feminine_corpus = total_feminine_corpus/len(description_eval_sample)

# Mean percent of masculine and feminine words in a job advertisement across the entire corpus
mean_pct_masculine_corpus = description_eval_sample['pct_masculine'].mean()
mean_pct_feminine_corpus = description_eval_sample['pct_feminine'].mean()

# Mean gender target score across the entire corpus
mean_gt_score_corpus = description_eval_sample['gt_score'].mean()

# Top masculine entities and their percentage
all_masc_ents = description_eval_sample['masc_ents'].to_list()
all_masc_ents = pd.DataFrame([item for all_masc_ents in all_masc_ents for item in all_masc_ents])

top_masc = all_masc_ents.value_counts(normalize=True) * 100

# Top feminine entities and their percentage
all_fem_ents = description_eval_sample['fem_ents'].to_list()
all_fem_ents = pd.DataFrame([item for all_fem_ents in all_fem_ents for item in all_fem_ents])

top_fem = all_fem_ents.value_counts(normalize=True) * 100


print("There are %d job descriptions with masculine language (%2f percent)" % (total_masculine_corpus, pct_masculine_corpus))
print("There are %d job descriptions with feminine language (%2f percent)" % (total_feminine_corpus, pct_feminine_corpus))
print("There is an average of %2f masculine words per job advertisement" % (mean_pct_masculine_corpus))
print("There is an average of %2f feminine words per job advertisement" % (mean_pct_feminine_corpus))
print("The average gender target score across the entire corpus is %2f" % mean_gt_score_corpus)

print(top_masc[0:5])
print(top_fem[0:5])

There are 997 job descriptions with masculine language (0.997000 percent)
There are 996 job descriptions with feminine language (0.996000 percent)
There is an average of 0.020398 masculine words per job advertisement
There is an average of 0.015232 feminine words per job advertisement
The average gender target score across the entire corpus is 0.469476
ability      18.169380
strong        8.174853
data          6.430428
able          4.562868
excellent     4.501300
dtype: float64
work             27.459469
service          16.037214
responsible       6.632277
understanding     4.587325
communicate       3.491157
dtype: float64


In [17]:
description_eval_sample

Unnamed: 0,Job Description,description_clean,spacy,total,total_masculine,total_feminine,pct_masculine,pct_feminine,masc_ents,fem_ents,gt_score
10842,"<div id=""jobDescriptionText"" class=""jobsearch-...",scale ai is accelerating the development of ai...,"(scale, ai, is, accelerating, the, development...",504,8,3,0.0159,0.0060,"[ability, challenging, objectives, decision, l...","[communicate, feel, work]",0.388281
3154,"<div id=""jobDescriptionText"" class=""jobsearch-...",overview: pandadoc is one of the fastest gr...,"( , overview, :, , pandadoc, is, one, of, th...",656,16,6,0.0244,0.0091,"[analytical, driven, individual, independent, ...","[work, understanding, interpersonal, organizat...",0.388281
21640,"<div id=""jobDescriptionText"" class=""jobsearch-...",the inside sales representative role require...,"( , the, inside, sales, representative, role,...",463,3,4,0.0065,0.0086,"[individual, ability, unique]","[dedicated, class, work, committed]",0.535654
8616,"<div id=""jobDescriptionText"" class=""jobsearch-...",ulta beauty takes great pride in continually...,"( , ulta, beauty, takes, great, pride, in, co...",556,13,11,0.0234,0.0198,"[unique, individual, exceptional, determine, i...","[responsible, service, service, present, busy,...",0.479179
12789,"<div id=""jobDescriptionText"" class=""jobsearch-...",company description our client is the num...,"( , company, description, , our, client, i...",421,9,4,0.0214,0.0095,"[he, capable, individual, ability, conceptuall...","[responsible, she, responsible, communicate]",0.405014
...,...,...,...,...,...,...,...,...,...,...,...
9583,"<div id=""jobDescriptionText"" class=""jobsearch-...",job details description so what does a...,"( , job, details, , description, , so, ...",1085,17,19,0.0157,0.0175,"[finding, objectives, result, expert, strong, ...","[dedicated, share, class, work, work, together...",0.513885
18288,"<div id=""jobDescriptionText"" class=""jobsearch-...",overview hudson valley federal credit uni...,"( , overview, , hudson, valley, federal, c...",840,7,13,0.0083,0.0155,"[strong, contribute, individual, principles, b...","[service, service, service, service, service, ...",0.574443
13699,"<div id=""jobDescriptionText"" class=""jobsearch-...","we are jackson hewitt, working hard for the ha...","(we, are, jackson, hewitt, ,, working, hard, f...",887,16,15,0.0180,0.0169,"[most, excellent, exceptional, determine, dete...","[service, sensitive, thorough, service, servic...",0.491936
28286,"<div id=""jobDescriptionText"" class=""jobsearch-...",specific information related to the posi...,"( , specific, information, related, to, t...",1027,12,17,0.0117,0.0166,"[superior, determined, superior, active, deter...","[work, responsible, service, responsible, resp...",0.542997


### Manual Inspection

Complete a manual inspection of the most masculine and most feminine job advertisements.

In [18]:
# Get top ads sorted by masculine pct and feminine pct
top_masc  = description_eval_sample.sort_values(['pct_masculine', 'pct_feminine'], ascending=[False,False]).iloc[0]
top_fem  = description_eval_sample.sort_values(['pct_feminine', 'pct_masculine'], ascending=[False,False]).iloc[0]

# Get top ads sorted by gender target score
top_masc_gt  = description_eval_sample.sort_values(['total_masculine'], ascending=False).iloc[0]
top_fem_gt  = description_eval_sample.sort_values(['total_feminine'], ascending=False).iloc[0]

In [19]:
# Settings for producing displacy images
colors = {"MAS": "linear-gradient(120deg, #a1c4fd, #a1c4fd)", "FEM": "linear-gradient(120deg, #fdcbf1, #fdcbf1)"}
options = {"ents": ["MAS", "FEM"], "colors": colors}

In [20]:
# Highest masc and fem pct
displacy.render(top_masc['spacy'], style='ent', options=options)

In [21]:
# Highest fem and masc pct
displacy.render(top_fem['spacy'], style='ent', options=options)

In [22]:
# Top masculine gender target score
displacy.render(top_masc_gt['spacy'], style='ent', options=options)

In [23]:
displacy.render(top_fem_gt['spacy'], style='ent', options=options)

## Generative AI Assessment

Assess model performance on generative AI examples

In [24]:
# Load generative AI created job advertisements
ai_descriptions_file_path = Path(os.path.join(current_directory, "data/generative_ai_advertisements.csv"))
ai_descriptions = pd.read_csv(ai_descriptions_file_path)

In [25]:
ai_descriptions['description_clean'] = ai_descriptions["description"].apply(pre_process_text)

# Create spacy Doc object for each job description
ai_descriptions['spacy'] = ai_descriptions['description_clean'].apply(lambda row: nlp(row))

# Get summary statistics for Doc object
ai_descriptions['token_summary'] = ai_descriptions['spacy'].apply(lambda row: get_doc_stats(row))

ai_descriptions = pd.concat([ai_descriptions.drop(['token_summary'], axis=1), ai_descriptions['token_summary'].apply(pd.Series)], axis=1)

In [26]:
ai_descriptions

Unnamed: 0,position,type,description,description_clean,spacy,total,total_masculine,total_feminine,pct_masculine,pct_feminine,masc_ents,fem_ents,gt_score
0,real_estate_agent,masculine,Company: Elite Homes Realty\r\nDescription: El...,company: elite homes realty\r description: eli...,"(company, :, elite, homes, realty, \r , descri...",472,13,8,0.0275,0.0169,"[skilled, expertise, strong, unique, research,...","[connect, understand, present, service, collab...",0.440756
1,real_estate_agent,feminine,\r\nCompany: Stellar Properties Group\r\nDescr...,\r company: stellar properties group\r descrip...,"(\r , company, :, stellar, properties, group, ...",639,16,16,0.025,0.025,"[leading, skilled, exceptional, strong, resear...","[inclusive, dedicated, service, responsible, s...",0.5
2,real_estate_agent,neutral,Company: Dream Homes Realty\r\nDescription: Dr...,company: dream homes realty\r description: dre...,"(company, :, dream, homes, realty, \r , descri...",452,16,9,0.0354,0.0199,"[exceptional, finding, skilled, exceptional, r...","[service, service, understand, present, servic...",0.430454
3,mechanic,masculine,Company: TurboFix Automotive\r\nDescription: T...,company: turbofix automotive\r description: tu...,"(company, :, turbofix, automotive, \r , descri...",405,12,10,0.0296,0.0247,"[leading, skilled, confidence, expertise, exce...","[work, responsible, reliability, collaborate, ...",0.477288
4,mechanic,feminine,Company: AutoPro Repair Services\r\nDescriptio...,company: autopro repair services\r description...,"(company, :, autopro, repair, services, \r , d...",657,13,15,0.0198,0.0228,"[skilled, expertise, exceptional, outstanding,...","[inclusive, service, work, collaborate, organi...",0.51785
5,mechanic,neutral,Company: AutoFix Repair Shop\r\nDescription: A...,company: autofix repair shop\r description: au...,"(company, :, autofix, repair, shop, \r , descr...",643,12,20,0.0187,0.0311,"[skilled, expertise, outstanding, strong, cont...","[reliable, inclusive, work, service, work, col...",0.562177
6,early_childhood_teacher,masculine,Company: Little Learners Preschool\r\nDescript...,company: little learners preschool\r descripti...,"(company, :, little, learners, preschool, \r ,...",365,6,9,0.0164,0.0247,"[strong, best, principles, best, strong, ability]","[care, dedicated, nurturing, inclusive, collab...",0.549834
7,early_childhood_teacher,feminine,Company: Sunshine Kids Preschool\r\nDescriptio...,company: sunshine kids preschool\r description...,"(company, :, sunshine, kids, preschool, \r , d...",351,8,13,0.0228,0.037,"[active, unique, best, principles, best, excel...","[nurturing, dedicated, inclusive, warm, nurtur...",0.559244
8,early_childhood_teacher,neutral,Company: Bright Beginnings Childcare Center\r\...,company: bright beginnings childcare center\r ...,"(company, :, bright, beginnings, childcare, ce...",447,12,18,0.0268,0.0403,"[bright, bright, skilled, bright, unique, acti...","[inclusive, nurturing, dedicated, care, inclus...",0.549834


# Explainable AI

In [21]:
import spacy
import pandas as pd
from pathlib import Path
import os

In [15]:
ner_spacy = spacy.load("ner_model\\training\\model-best")
tokenizer_spacy = spacy.tokenizer.Tokenizer(ner_spacy.vocab)
classes = list(ner_spacy.get_pipe("ner").labels)

In [16]:
classes

['FEM', 'MAS']

In [81]:
# Define a function to predict
def predict(texts):

    # convert texts to bare strings
    texts = [str(text) for text in texts]

    results = []
    for doc in ner_spacy.pipe(texts):
        # results.append([{'label': cat, 'score': doc.cats[cat]} for cat in doc.cats])

        print(doc.ents)

        results.append([doc.ents[ent] for ent in classes])

    print(results)

    return results

In [82]:
# Create a function to create a transformers-like tokenizer to match shap's expectations
def tok_wrapper(text, return_offsets_mapping=False):
    doc = tokenizer_spacy(text)

    out = {"input_ids": [tok.norm for tok in doc]}

    if return_offsets_mapping:
        out["offset_mapping"] = [(tok.idx, tok.idx + len(tok)) for tok in doc]
    return out

In [83]:
import shap
# Create the Shap Explainer
# - predict is the "model" function, adapted to a transformers-like model
# - masker is the masker used by shap, which relies on a transformers-like tokenizer
# - algorithm is set to permutation, which is the one used for transformers models
# - output_names are the classes (although it is not propagated to the permutation explainer currently, which is why plots do not have the labels)
# - max_evals is set to a high number to reduce the probability of cases where the explainer fails because there are too many tokens
explainer = shap.Explainer(
    predict,
    masker=shap.maskers.Text(tok_wrapper),
    algorithm="permutation",
    output_names=classes,
    max_evals=1500,
)

In [84]:
# Get current directory for constructing file paths
current_directory = os.getcwd()

# Load generative AI created job advertisements
ai_descriptions_file_path = Path(os.path.join(current_directory, "data/generative_ai_advertisements.csv"))
ai_descriptions = pd.read_csv(ai_descriptions_file_path)

In [85]:
ai_descriptions[:1]["description"]

0    Company: Elite Homes Realty\r\nDescription: El...
Name: description, dtype: object