In [76]:
# Import necessary libraries

import pandas as pd
from bs4 import BeautifulSoup
import os
import spacy

import csv
import json
import itertools

from pathlib import Path

import csv
import json
import pandas as pd
import spacy
import math


from bs4 import BeautifulSoup
from spacy import displacy
from pathlib import Path
from cairosvg import svg2png

## Load and clean job descriptions

Load and clean the indeed.com job advertisement data.

In [77]:
# Get current directory for constructing file paths
current_directory = os.getcwd()

In [78]:
# Load job data
path = Path(os.path.join(current_directory, "data/jobs_raw.zip"))
df = pd.read_csv(path)

# Get just job descriptions
descriptions = df['Job Description']

In [79]:
# Function to tidy job description data
def pre_process_text(description, html=True):

    text = description

    if(html):
        soup = BeautifulSoup(description, "html.parser")
        text = soup.get_text(separator=' ')

        # Replace newline characters with spaces
        text = text.replace('\n', ' ')


    # set to lower
    text = text.lower()


    return text

In [80]:
# Tidy and save descriptions - taking a random sample of 1000 descriptions
result = descriptions.sample(n=1000, random_state=2)
result = result.apply(pre_process_text)

result.to_csv("data/descriptions_clean.csv")

## Create training data

Create training data to be used for training ner model.

In [81]:
# Define the file paths for the CSV and JSONL files
csv_file_path = Path(os.path.join(current_directory, "data/descriptions_clean.csv"))
jsonl_file_path = Path(os.path.join(current_directory, "data/data.jsonl"))
masculine_file_path = Path(os.path.join(current_directory, "data/masculine.csv"))
feminine__file_path = Path(os.path.join(current_directory, "data/feminine.csv"))


# Load word lists
with open(masculine_file_path, newline='') as f:
    masculine_words = [row[0] for row in csv.reader(f)]
with open(feminine__file_path, newline='') as f:
    feminine_words = [row[0] for row in csv.reader(f)]

# Create dictionary of word lists
wordlists = {
    "MAS" : masculine_words,
    "FEM" : feminine_words
}

In [82]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

In [83]:
# Define helper functions

# Set index of matches in a given text in a spacy document - returns a list of indices that show where word matches appear
def match_words(text):
    # Create an empty list to store the matches
    matches = []
    # Create a spaCy Doc object from the text
    doc = nlp(text)
    # Loop through the word lists and check for matches
    for list_name, word_list in wordlists.items():
        for word in word_list:
            for token in doc:
                if token.text.lower() == word:
                    matches.append(
                        [
                            token.idx,
                            token.idx + len(token.text),
                            list_name
                        ]
                    )
    # Return the matches
    return matches


# Process each row of a CSV file to identify matches - returns a dictionary of the text and the matches
def process_row(row):
    # Get the text description from the row
    text = row["Job Description"]
    # Match the words in the text and get the label attributes
    matches = match_words(text)
    # Return a dictionary with the text and label attributes
    return {"text": text, "label": matches}

In [84]:
# Read in the CSV file and create a list of processed rows
processed_rows = []
with open(csv_file_path) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        processed_rows.append(process_row(row))

In [85]:
# Write the processed rows to a JSONL file - JSONL files can be loaded to doccano for review
with open(jsonl_file_path, "w") as outfile:
    for row in processed_rows:
        json.dump(row, outfile)
        outfile.write("\n")

In [86]:
# Define the file paths for where train and test datasets will be saved
train_file_path = Path(os.path.join(current_directory, "ner_model/assets/train.json"))
test_file_path = Path(os.path.join(current_directory, "ner_model/assets/dev.json"))


# Create train and dev versions the job description data
result = []

with open(jsonl_file_path, 'r') as f:
    for line in f.readlines():
        data = json.loads(line)

        labels = data['label']
        labels.sort()
        labels = list(l for l,_ in itertools.groupby(labels))

        entry = [data['text'], {"entities" : labels}]

        result.append(entry)
    
    n = len(result)
    n_train = int(n * 0.7)

    train = result[0:n_train]
    dev = result[n_train+1:n]

    with open(train_file_path, 'w') as g:
        json.dump(train, g)

    with open(test_file_path, 'w') as h:
        json.dump(dev, h)


## Train and evaluate model

Train and evaluate the Named Entity Recognition model.

In [87]:
# Train the NER model - all model settings and parameters are defined in ner_model/project.yml and ner_model/configs/config.cfg - these are used by spacy for model training/testing

!python -m spacy project run all ./ner_model

ℹ Running workflow 'all'
[1m
Running command: 'c:\Users\chris\anaconda3\python.exe' scripts/convert.py en assets/train.json corpus/train.spacy
Running command: 'c:\Users\chris\anaconda3\python.exe' scripts/convert.py en assets/dev.json corpus/dev.spacy
[1m
Running command: 'c:\Users\chris\anaconda3\python.exe' -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.patience 50 --gpu-id -1
ℹ Saving to output directory: training
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    399.20    0.00    0.00    0.00    0.00
  0      10          6.40   3582.42    0.00    0.00    0.00    0.00
  0      20         10.37    499.48    0.00    0.00    0.00    0.00
  0      30          4.4

[2023-04-23 18:23:02,849] [INFO] Set up nlp object from config
[2023-04-23 18:23:02,863] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-04-23 18:23:02,870] [INFO] Created vocabulary
[2023-04-23 18:23:02,885] [INFO] Finished initializing nlp object
[2023-04-23 18:23:12,099] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [88]:
# Function to get spacy doc summary stats
def get_doc_stats(doc):

    total_tokens = doc.__len__()

    total_masculine = len([ent.text for ent in doc.ents if ent.label_ == "MAS"])
    total_feminine = len([ent.text for ent in doc.ents if ent.label_ == "FEM"])

    n = total_feminine + total_masculine

    if (n == 0):
        gt_score = 0.5
    else:
        x = (1/n)*(total_feminine - total_masculine)
        gt_score = 1/(1+math.exp(-x))


    result = {
        'total': total_tokens,
        'total_masculine': total_masculine,
        'total_feminine': total_feminine,
        'pct_masculine': round(total_masculine/total_tokens, 4),
        'pct_feminine': round(total_feminine/total_tokens, 4),
        'masc_ents': [ent.text for ent in doc.ents if ent.label_ == "MAS"],
        'fem_ents' : [ent.text for ent in doc.ents if ent.label_ == "FEM"],
        'gt_score' : gt_score
    }

    return result

In [89]:
# Load NER model
model_path = Path(os.path.join(current_directory, "ner_model/training/model-best"))

nlp = spacy.load(model_path)

In [90]:
# Get random sample of job descriptions
description_eval_sample = pd.DataFrame(descriptions.sample(n=1000, random_state=99))

# Create clean description
description_eval_sample["description_clean"] = description_eval_sample['Job Description'].apply(pre_process_text)

# Create spacy Doc object for each job description
description_eval_sample['spacy'] = description_eval_sample['description_clean'].apply(lambda row: nlp(row))

# Get summary statistics for Doc object
description_eval_sample['token_summary'] = description_eval_sample['spacy'].apply(lambda row: get_doc_stats(row))

description_eval_sample = pd.concat([description_eval_sample.drop(['token_summary'], axis=1), description_eval_sample['token_summary'].apply(pd.Series)], axis=1)

### Summary Statistics

Get summary statistics for evaluating solution performance

In [91]:
# Total and percentage masculine and feminine job descriptions across the entire corpus
total_masculine_corpus = len(description_eval_sample[description_eval_sample['total_masculine'] != 0])
pct_masculine_corpus = total_masculine_corpus/len(description_eval_sample)
total_feminine_corpus = len(description_eval_sample[description_eval_sample['total_feminine'] != 0])
pct_feminine_corpus = total_feminine_corpus/len(description_eval_sample)

# Mean percent of masculine and feminine words in a job advertisement across the entire corpus
mean_pct_masculine_corpus = description_eval_sample['pct_masculine'].mean()
mean_pct_feminine_corpus = description_eval_sample['pct_feminine'].mean()

# Mean gender target score across the entire corpus
mean_gt_score_corpus = description_eval_sample['gt_score'].mean()

# Top masculine entities and their percentage
all_masc_ents = description_eval_sample['masc_ents'].to_list()
all_masc_ents = pd.DataFrame([item for all_masc_ents in all_masc_ents for item in all_masc_ents])

top_masc = all_masc_ents.value_counts(normalize=True) * 100

# Top feminine entities and their percentage
all_fem_ents = description_eval_sample['fem_ents'].to_list()
all_fem_ents = pd.DataFrame([item for all_fem_ents in all_fem_ents for item in all_fem_ents])

top_fem = all_fem_ents.value_counts(normalize=True) * 100


print("There are %d job descriptions with masculine language (%2f percent)" % (total_masculine_corpus, pct_masculine_corpus))
print("There are %d job descriptions with feminine language (%2f percent)" % (total_feminine_corpus, pct_feminine_corpus))
print("There is an average of %2f masculine words per job advertisement" % (mean_pct_masculine_corpus))
print("There is an average of %2f feminine words per job advertisement" % (mean_pct_feminine_corpus))
print("The average gender target score across the entire corpus is %2f" % mean_gt_score_corpus)

print(top_masc[0:5])
print(top_fem[0:5])

There are 997 job descriptions with masculine language (0.997000 percent)
There are 996 job descriptions with feminine language (0.996000 percent)
There is an average of 0.020398 masculine words per job advertisement
There is an average of 0.015232 feminine words per job advertisement
The average gender target score across the entire corpus is 0.469476
ability      18.169380
strong        8.174853
data          6.430428
able          4.562868
excellent     4.501300
dtype: float64
work             27.459469
service          16.037214
responsible       6.632277
understanding     4.587325
communicate       3.491157
dtype: float64


### Manual Inspection

Complete a manual inspection of the most masculine and most feminine job advertisements.

In [92]:
# Get top three masculine pct and feminine pct job advertisements
top_fem  = description_eval_sample.sort_values('gt_score', ascending=False).iloc[0:3]
top_masc = description_eval_sample.sort_values('gt_score', ascending=False).iloc[-3:]

In [93]:
# Settings for producing displacy images
colors = {"MAS": "linear-gradient(120deg, #a1c4fd, #a1c4fd)", "FEM": "linear-gradient(120deg, #fdcbf1, #fdcbf1)"}
options = {"ents": ["MAS", "FEM"], "colors": colors}

In [94]:
displacy.render(top_masc['spacy'].iloc[0], style='ent', options=options)

In [95]:
displacy.render(top_masc['spacy'].iloc[1], style='ent', options=options)

In [96]:
displacy.render(top_masc['spacy'].iloc[2], style='ent', options=options)

In [97]:
displacy.render(top_fem['spacy'].iloc[0], style='ent', options=options)

In [98]:
displacy.render(top_fem['spacy'].iloc[1], style='ent', options=options)

In [99]:
displacy.render(top_fem['spacy'].iloc[2], style='ent', options=options)

## Generative AI Assessment

Assess model performance on generative AI examples

In [100]:
# Load generative AI created job advertisements
ai_descriptions_file_path = Path(os.path.join(current_directory, "data/generative_ai_advertisements.csv"))
ai_descriptions = pd.read_csv(ai_descriptions_file_path)

In [101]:
ai_descriptions['description_clean'] = ai_descriptions["description"].apply(pre_process_text)

# Create spacy Doc object for each job description
ai_descriptions['spacy'] = ai_descriptions['description_clean'].apply(lambda row: nlp(row))

# Get summary statistics for Doc object
ai_descriptions['token_summary'] = ai_descriptions['spacy'].apply(lambda row: get_doc_stats(row))

ai_descriptions = pd.concat([ai_descriptions.drop(['token_summary'], axis=1), ai_descriptions['token_summary'].apply(pd.Series)], axis=1)

In [102]:
ai_descriptions

Unnamed: 0,position,type,description,description_clean,spacy,total,total_masculine,total_feminine,pct_masculine,pct_feminine,masc_ents,fem_ents,gt_score
0,real_estate_agent,masculine,Company: Elite Homes Realty\r\nDescription: El...,company: elite homes realty\r description: eli...,"(company, :, elite, homes, realty, \r , descri...",472,13,8,0.0275,0.0169,"[skilled, expertise, strong, unique, research,...","[connect, understand, present, service, collab...",0.440756
1,real_estate_agent,feminine,\r\nCompany: Stellar Properties Group\r\nDescr...,\r company: stellar properties group\r descrip...,"(\r , company, :, stellar, properties, group, ...",639,16,16,0.025,0.025,"[leading, skilled, exceptional, strong, resear...","[inclusive, dedicated, service, responsible, s...",0.5
2,real_estate_agent,neutral,Company: Dream Homes Realty\r\nDescription: Dr...,company: dream homes realty\r description: dre...,"(company, :, dream, homes, realty, \r , descri...",452,16,9,0.0354,0.0199,"[exceptional, finding, skilled, exceptional, r...","[service, service, understand, present, servic...",0.430454
3,mechanic,masculine,Company: TurboFix Automotive\r\nDescription: T...,company: turbofix automotive\r description: tu...,"(company, :, turbofix, automotive, \r , descri...",405,12,10,0.0296,0.0247,"[leading, skilled, confidence, expertise, exce...","[work, responsible, reliability, collaborate, ...",0.477288
4,mechanic,feminine,Company: AutoPro Repair Services\r\nDescriptio...,company: autopro repair services\r description...,"(company, :, autopro, repair, services, \r , d...",657,13,15,0.0198,0.0228,"[skilled, expertise, exceptional, outstanding,...","[inclusive, service, work, collaborate, organi...",0.51785
5,mechanic,neutral,Company: AutoFix Repair Shop\r\nDescription: A...,company: autofix repair shop\r description: au...,"(company, :, autofix, repair, shop, \r , descr...",643,12,20,0.0187,0.0311,"[skilled, expertise, outstanding, strong, cont...","[reliable, inclusive, work, service, work, col...",0.562177
6,early_childhood_teacher,masculine,Company: Little Learners Preschool\r\nDescript...,company: little learners preschool\r descripti...,"(company, :, little, learners, preschool, \r ,...",365,6,9,0.0164,0.0247,"[strong, best, principles, best, strong, ability]","[care, dedicated, nurturing, inclusive, collab...",0.549834
7,early_childhood_teacher,feminine,Company: Sunshine Kids Preschool\r\nDescriptio...,company: sunshine kids preschool\r description...,"(company, :, sunshine, kids, preschool, \r , d...",351,8,13,0.0228,0.037,"[active, unique, best, principles, best, excel...","[nurturing, dedicated, inclusive, warm, nurtur...",0.559244
8,early_childhood_teacher,neutral,Company: Bright Beginnings Childcare Center\r\...,company: bright beginnings childcare center\r ...,"(company, :, bright, beginnings, childcare, ce...",447,12,18,0.0268,0.0403,"[bright, bright, skilled, bright, unique, acti...","[inclusive, nurturing, dedicated, care, inclus...",0.549834
