In [61]:
# Import necessary libraries

import pandas as pd
from bs4 import BeautifulSoup
import os
import spacy

import csv
import json
import itertools

from pathlib import Path

import csv
import json
import pandas as pd
import spacy
import math


from bs4 import BeautifulSoup
from spacy import displacy
from pathlib import Path
from cairosvg import svg2png

## Load and clean job descriptions

Load and clean the indeed.com job advertisement data.

In [62]:
# Get current directory for constructing file paths
current_directory = os.getcwd()

In [63]:
# Load job data
path = Path(os.path.join(current_directory, "data/jobs_raw.zip"))
df = pd.read_csv(path)

# Get just job descriptions
descriptions = df['Job Description']

In [64]:
# Function to tidy job description data
def pre_process_text(description, html=True):

    text = description

    if(html):
        soup = BeautifulSoup(description, "html.parser")
        text = soup.get_text(separator=' ')

        # Replace newline characters with spaces
        text = text.replace('\n', ' ')


    # to lower
    text = text.lower()

    # remove URLS

    return text

In [65]:
# Tidy and save descriptions - taking just the top 1000 results
result = descriptions.sample(n=1000, random_state=2)
result = result.apply(pre_process_text)

result.to_csv("data/descriptions_clean.csv")

## Create training data

Create training data to be used for training ner model.

In [66]:
# Define the file paths for the CSV and JSONL files
csv_file_path = Path(os.path.join(current_directory, "data/descriptions_clean.csv"))
jsonl_file_path = Path(os.path.join(current_directory, "data/data.jsonl"))
masculine_file_path = Path(os.path.join(current_directory, "data/masculine.csv"))
feminine__file_path = Path(os.path.join(current_directory, "data/feminine.csv"))


# Load word lists
with open(masculine_file_path, newline='') as f:
    masculine_words = [row[0] for row in csv.reader(f)]
with open(feminine__file_path, newline='') as f:
    feminine_words = [row[0] for row in csv.reader(f)]


wordlists = {
    "MAS" : masculine_words,
    "FEM" : feminine_words
}

In [67]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

In [68]:
# Define helper functions

# Set index of matches in for a given text in a spacy document
def match_words(text):
    # Create an empty list to store the matches
    matches = []
    # Create a spaCy Doc object from the text
    doc = nlp(text)
    # Loop through the word lists and check for matches
    for list_name, word_list in wordlists.items():
        for word in word_list:
            for token in doc:
                if token.text.lower() == word:
                    matches.append(
                        [
                            token.idx,
                            token.idx + len(token.text),
                            list_name
                        ]
                    )
    # Return the matches
    return matches


# Process each row of a CSV file
def process_row(row):
    # Get the text description from the row
    text = row["Job Description"]
    # Match the words in the text and get the label attributes
    matches = match_words(text)
    # Return a dictionary with the text and label attributes
    return {"text": text, "label": matches}

In [69]:
# Read in the CSV file and create a list of processed rows
processed_rows = []
with open(csv_file_path) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        processed_rows.append(process_row(row))

In [70]:
# Write the processed rows to a JSONL file
with open(jsonl_file_path, "w") as outfile:
    for row in processed_rows:
        json.dump(row, outfile)
        outfile.write("\n")

In [71]:
# Define the file paths for train and test
train_file_path = Path(os.path.join(current_directory, "ner_demo/assets/train.json"))
test_file_path = Path(os.path.join(current_directory, "ner_demo/assets/test.json"))


# Create train and dev versions the job description data
result = []

with open(jsonl_file_path, 'r') as f:
    for line in f.readlines():
        data = json.loads(line)

        labels = data['label']
        labels.sort()
        labels = list(l for l,_ in itertools.groupby(labels))

        entry = [data['text'], {"entities" : labels}]

        result.append(entry)
    
    n = len(result)
    n_train = int(n * 0.7)

    train = result[0:n_train]
    dev = result[n_train+1:n]

    with open(train_file_path, 'w') as g:
        json.dump(train, g)

    with open(test_file_path, 'w') as h:
        json.dump(dev, h)


## Train and evaluate model

Train and evaluate the Named Entity Recognition model.

In [72]:
# Train the model

!python -m spacy project run all ./ner_demo

ℹ Running workflow 'all'
[1m
ℹ Skipping 'convert': nothing changed
[1m
ℹ Skipping 'train': nothing changed
[1m
ℹ Skipping 'evaluate': nothing changed


In [99]:
# Function to get spacy doc summary stats
def get_doc_stats(doc):

    total_tokens = doc.__len__()

    total_masculine = len([ent.text for ent in doc.ents if ent.label_ == "MAS"])
    total_feminine = len([ent.text for ent in doc.ents if ent.label_ == "FEM"])

    n = total_feminine + total_masculine

    if (n == 0):
        gt_score = 0.5
    else:
        x = (1/n)*(total_feminine - total_masculine)
        gt_score = 1/(1+math.exp(-x))


    result = {
        'total': total_tokens,
        'total_masculine': total_masculine,
        'total_feminine': total_feminine,
        'pct_masculine': round(total_masculine/total_tokens, 4),
        'pct_feminine': round(total_feminine/total_tokens, 4),
        'masc_ents': [ent.text for ent in doc.ents if ent.label_ == "MAS"],
        'fem_ents' : [ent.text for ent in doc.ents if ent.label_ == "FEM"],
        'gt_score' : gt_score
    }

    return result

In [100]:
# Load NER model
model_path = Path(os.path.join(current_directory, "ner_demo/training/model-best"))

nlp = spacy.load(model_path)

In [120]:
# Get random sample of job descriptions
description_eval_sample = pd.DataFrame(descriptions.sample(n=1000, random_state=99))

# Create clean description
description_eval_sample["description_clean"] = description_eval_sample['Job Description'].apply(pre_process_text)

# Create spacy Doc object for each job description
description_eval_sample['spacy'] = description_eval_sample['description_clean'].apply(lambda row: nlp(row))

# Get summary statistics for Doc object
description_eval_sample['token_summary'] = description_eval_sample['spacy'].apply(lambda row: get_doc_stats(row))

description_eval_sample = pd.concat([description_eval_sample.drop(['token_summary'], axis=1), description_eval_sample['token_summary'].apply(pd.Series)], axis=1)

KeyboardInterrupt: 

### Summary Statistics

Get summary statistics for evaluating solution performance

In [102]:
# Total and percentage masculine and feminine job descriptions across the entire corpus
total_masculine_corpus = len(description_eval_sample[description_eval_sample['total_masculine'] != 0])
pct_masculine_corpus = total_masculine_corpus/len(description_eval_sample)
total_feminine_corpus = len(description_eval_sample[description_eval_sample['total_feminine'] != 0])
pct_feminine_corpus = total_feminine_corpus/len(description_eval_sample)

# Mean percent of masculine and feminine words in a job advertisement across the entire corpus
mean_pct_masculine_corpus = description_eval_sample['pct_masculine'].mean()
mean_pct_feminine_corpus = description_eval_sample['pct_feminine'].mean()

# Mean gender target score across the entire corpus
mean_gt_score_corpus = description_eval_sample['gt_score'].mean()

# Top masculine entities and their percentage
all_masc_ents = description_eval_sample['masc_ents'].to_list()
all_masc_ents = pd.DataFrame([item for all_masc_ents in all_masc_ents for item in all_masc_ents])

top_masc = all_masc_ents.value_counts(normalize=True) * 100

# Top feminine entities and their percentage
all_fem_ents = description_eval_sample['fem_ents'].to_list()
all_fem_ents = pd.DataFrame([item for all_fem_ents in all_fem_ents for item in all_fem_ents])

top_fem = all_fem_ents.value_counts(normalize=True) * 100


print("There are %d job descriptions with masculine language (%2f percent)" % (total_masculine_corpus, pct_masculine_corpus))
print("There are %d job descriptions with feminine language (%2f percent)" % (total_feminine_corpus, pct_feminine_corpus))
print("There is an average of %2f masculine words per job advertisement" % (mean_pct_masculine_corpus))
print("There is an average of %2f feminine words per job advertisement" % (mean_pct_feminine_corpus))
print("The average gender target score across the entire corpus is %2f" % mean_gt_score_corpus)

print(top_masc[0:5])
print(top_fem[0:5])

There are 989 job descriptions with masculine language (0.989000 percent)
There are 989 job descriptions with feminine language (0.989000 percent)
There is an average of 0.014526 masculine words per job advertisement
There is an average of 0.011962 feminine words per job advertisement
The average gender target score across the entire corpus is 0.480419
ability    23.315274
data       10.141622
able        6.548807
project     6.045053
best        5.959510
dtype: float64
work           35.641451
service        20.808632
support        14.548257
communicate     4.801992
understand      4.055015
dtype: float64


In [119]:
description_eval_sample.sort_values("pct_masculine", ascending=False)

Unnamed: 0,Job Description,description_clean,spacy,total,total_masculine,total_feminine,pct_masculine,pct_feminine,masc_ents,fem_ents,gt_score
11483,"<div id=""jobDescriptionText"" class=""jobsearch-...",zappos e-commerce is seeking a senior analys...,"( , zappos, e, -, commerce, is, seeking, a, s...",388,20,4,0.0515,0.0103,"[analyst, challenging, finding, decision, anal...","[work, understand, interpersonal, communicate]",0.339244
8674,"<div id=""jobDescriptionText"" class=""jobsearch-...","comcast spotlight, the advertising sales div...","( , comcast, spotlight, ,, the, advertising, ...",846,39,7,0.0461,0.0083,"[most, best, ability, driven, individual, rese...","[work, connect, she, support, support, support...",0.332777
3446,"<div id=""jobDescriptionText"" class=""jobsearch-...",position purpose: the strategy department...,"( , position, purpose, :, , the, strategy,...",614,28,10,0.0456,0.0163,"[data, data, driven, decision, data, analyst, ...","[support, work, understand, communicate, she, ...",0.383745
23431,"<div id=""jobDescriptionText"" class=""jobsearch-...","are you curious, motivated, and forward-thin...","( , are, you, curious, ,, motivated, ,, and, ...",1559,66,19,0.0423,0.0122,"[most, challenging, project, project, project,...","[work, work, service, communicate, work, work,...",0.365182
1890,"<div id=""jobDescriptionText"" class=""jobsearch-...",job advert want to take your know-how...,"( , job, advert, , want, to, take, you...",980,41,4,0.0418,0.0041,"[data, data, best, lead, natural, driven, best...","[service, service, service, communicate]",0.305292
...,...,...,...,...,...,...,...,...,...,...,...
16076,"<div id=""jobDescriptionText"" class=""jobsearch-...",radco residential is seeking a leasing consu...,"( , radco, residential, is, seeking, a, leasi...",550,0,4,0.0000,0.0073,[],"[service, service, communicate, work]",0.731059
13987,"<div id=""jobDescriptionText"" class=""jobsearch-...",essential functions of job duties: supervi...,"( , essential, functions, of, job, duties, :,...",386,0,5,0.0000,0.0130,[],"[share, work, work, class, class]",0.731059
14826,"<div id=""jobDescriptionText"" class=""jobsearch-...",responsibilities: make cold calls and fo...,"( , responsibilities, :, , make, cold, ca...",101,0,0,0.0000,0.0000,[],[],0.500000
27591,"<div id=""jobDescriptionText"" class=""jobsearch-...",job summary: provides sales and administr...,"( , job, summary, :, , provides, sales, an...",438,0,6,0.0000,0.0137,[],"[support, support, support, work, support, sup...",0.731059


### Manual Inspection

Complete a manual inspection of the most masculine and most feminine job advertisements.

In [103]:
# Get top three masculine pct and feminine pct job advertisements
top_fem  = description_eval_sample.sort_values('gt_score', ascending=False).iloc[0:3]
top_masc = description_eval_sample.sort_values('gt_score', ascending=False).iloc[-3:]

In [104]:
# Settings for producing displacy images
colors = {"MAS": "linear-gradient(120deg, #a1c4fd, #a1c4fd)", "FEM": "linear-gradient(120deg, #fdcbf1, #fdcbf1)"}
options = {"ents": ["MAS", "FEM"], "colors": colors}

In [106]:
displacy.render(top_masc['spacy'].iloc[0], style='ent', options=options)

In [107]:
displacy.render(top_masc['spacy'].iloc[1], style='ent', options=options)

In [108]:
displacy.render(top_masc['spacy'].iloc[2], style='ent', options=options)

In [109]:
displacy.render(top_fem['spacy'].iloc[0], style='ent', options=options)

In [110]:
displacy.render(top_fem['spacy'].iloc[1], style='ent', options=options)

In [111]:
displacy.render(top_fem['spacy'].iloc[2], style='ent', options=options)