In [2]:
import csv
import json
import pandas as pd
import spacy
import math


from bs4 import BeautifulSoup
from spacy import displacy
from pathlib import Path
from cairosvg import svg2png

In [3]:
#train the model

!python -m spacy project run all

ℹ Running workflow 'all'
[1m
ℹ Skipping 'convert': nothing changed
[1m
ℹ Skipping 'train': nothing changed
[1m
ℹ Skipping 'evaluate': nothing changed


## Evaluate model performance

In [4]:
# Load jobs

# Load job data
df = pd.read_csv("C:\\Users\\chris\\DATA417\\data\\jobs_raw.zip")

# Get just job descriptions
descriptions = df['Job Description']


In [5]:
# Function to tidy job description data
def pre_process_text(description, html=True):

    text = description

    if(html):
        soup = BeautifulSoup(description, "html.parser")
        text = soup.get_text(separator=' ')

        # Replace newline characters with spaces
        text = text.replace('\n', ' ')

    return text


# Function to get spacy doc summary stats
def get_doc_stats(doc):

    total_tokens = doc.__len__()

    total_masculine = len([ent.text for ent in doc.ents if ent.label_ == "MAS"])
    total_feminine = len([ent.text for ent in doc.ents if ent.label_ == "FEM"])
    total_exclusive = len([ent.text for ent in doc.ents if ent.label_ == "EXC"])

    n = total_feminine + total_masculine
    x = (1/n)*(total_feminine - total_masculine)

    gt_score = 1/(1+math.exp(-x))

    result = {
        'total': total_tokens,
        'total_masculine': total_masculine,
        'total_feminine': total_feminine,
        'total_exclusive': total_exclusive,
        'pct_masculine': round(total_masculine/total_tokens, 4),
        'pct_feminine': round(total_feminine/total_tokens, 4),
        'pct_exclusive': round(total_exclusive/total_tokens, 4),
        'masc_ents': [ent.text for ent in doc.ents if ent.label_ == "MAS"],
        'fem_ents' : [ent.text for ent in doc.ents if ent.label_ == "FEM"],
        'exc_ents' : [ent.text for ent in doc.ents if ent.label_ == "EXC"],
        'gt_score' : gt_score
    }

    return result

In [6]:
# Get random sample of job descriptions
description_eval_sample = pd.DataFrame(descriptions.sample(n=100, random_state=2))

# Create clean description
description_eval_sample["description_clean"] = description_eval_sample['Job Description'].apply(pre_process_text)

In [7]:
# Load NER model
nlp = spacy.load("C:\\Users\\chris\\DATA417\\ner_demo\\training\\model-best")

In [8]:
doc = nlp("Test job description")

In [9]:
# Create spacy Doc object for each job description
description_eval_sample['spacy'] = description_eval_sample['description_clean'].apply(lambda row: nlp(row))

# Get summary statistics for Doc object
description_eval_sample['token_summary'] = description_eval_sample['spacy'].apply(lambda row: get_doc_stats(row))

description_eval_sample = pd.concat([description_eval_sample.drop(['token_summary'], axis=1), description_eval_sample['token_summary'].apply(pd.Series)], axis=1)

In [10]:
description_eval_sample

Unnamed: 0,Job Description,description_clean,spacy,total,total_masculine,total_feminine,total_exclusive,pct_masculine,pct_feminine,pct_exclusive,masc_ents,fem_ents,exc_ents,gt_score
16220,"<div id=""jobDescriptionText"" class=""jobsearch-...",Summary An experienced creative produc...,"( , Summary, , An, experienced, creativ...",1003,37,12,1,0.0369,0.0120,0.0010,"[creative, talent, creative, creative, able, l...","[work, understand, together, present, work, Wo...",[special],0.375146
26848,"<div id=""jobDescriptionText"" class=""jobsearch-...",Overview If you are a motivated in...,"( , Overview, , If, you, are, a, mo...",625,7,5,1,0.0112,0.0080,0.0016,"[individual, driven, driven, Proficient, abili...","[support, work, support, work, class]",[Disability],0.458430
26836,"<div id=""jobDescriptionText"" class=""jobsearch-...",If you have a background in sales or custo...,"( , If, you, have, a, background, in, sales...",304,4,3,0,0.0132,0.0099,0.0000,"[publication, finding, publication, Finding]","[service, work, service]",[],0.464346
22537,"<div id=""jobDescriptionText"" class=""jobsearch-...",Topflight Corporation has been providing in...,"( , Topflight, Corporation, , has, been, pro...",427,3,9,2,0.0070,0.0211,0.0047,"[conductive, Proficient, Ability]","[understand, work, service, work, understand, ...","[shrink, Special]",0.622459
16244,"<div id=""jobDescriptionText"" class=""jobsearch-...",BJ's Wholesale Club was the first retailer t...,"( , BJ, 's, Wholesale, Club, was, the, first,...",587,5,4,3,0.0085,0.0068,0.0051,"[talent, expert, test, test, data]","[service, communicate, support, support]","[sex, age, disability]",0.472251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18569,"<div id=""jobDescriptionText"" class=""jobsearch-...",Eaton’s Lighting Division is currently see...,"( , Eaton, ’s, Lighting, Division, is, curr...",1223,13,13,3,0.0106,0.0106,0.0025,"[Lead, unique, Active, able, analytical, drive...","[work, work, Work, Work, Service, work, Servic...","[sex, age, disability]",0.500000
4114,"<div id=""jobDescriptionText"" class=""jobsearch-...",Accelerate Your Career Drive global techn...,"( , Accelerate, Your, Career, , Drive, glo...",1079,12,16,2,0.0111,0.0148,0.0019,"[most, data, data, best, data, Lead, data, dat...","[service, support, work, Support, class, servi...","[disability, age]",0.535654
13655,"<div id=""jobDescriptionText"" class=""jobsearch-...",Redfin is redefining real estate in the cons...,"( , Redfin, is, redefining, real, estate, in,...",528,4,6,1,0.0076,0.0114,0.0019,"[finding, data, best, data]","[service, support, Work, support, instant, com...",[sick],0.549834
16305,"<div id=""jobDescriptionText"" class=""jobsearch-...","Job Location: Parsippany, N.J. . ...","( , Job, Location, :, Parsippany, ,, N.J....",474,9,7,3,0.0190,0.0148,0.0063,"[Analyst, insight, best, Lead, project, lead, ...","[course, Support, Work, support, work, Support...","[sex, age, disability]",0.468791


### Summary Statistics

Get summary statistics for evaluating solution performance

In [12]:
# Total and percentage masculine and feminine job descriptions across the entire corpus
total_masculine_corpus = len(description_eval_sample[description_eval_sample['total_masculine'] != 0])
pct_masculine_corpus = total_masculine_corpus/len(description_eval_sample)
total_feminine_corpus = len(description_eval_sample[description_eval_sample['total_feminine'] != 0])
pct_feminine_corpus = total_feminine_corpus/len(description_eval_sample)

# Mean percent of masculine and feminine words in a job advertisement across the entire corpus
mean_pct_masculine_corpus = description_eval_sample['pct_masculine'].mean()
mean_pct_feminine_corpus = description_eval_sample['pct_feminine'].mean()

# Mean gender target score across the entire corpus
mean_gt_score_corpus = description_eval_sample['gt_score'].mean()

# Top masculine entities and their percentage
all_masc_ents = description_eval_sample['masc_ents'].to_list()
all_masc_ents = pd.DataFrame([item for all_masc_ents in all_masc_ents for item in all_masc_ents])

top_masc = all_masc_ents.value_counts(normalize=True) * 100

# Top feminine entities and their percentage
all_fem_ents = description_eval_sample['fem_ents'].to_list()
all_fem_ents = pd.DataFrame([item for all_fem_ents in all_fem_ents for item in all_fem_ents])

top_fem = all_fem_ents.value_counts(normalize=True) * 100


print("There are %d job descriptions with masculine language (%2f percent)" % (total_masculine_corpus, pct_masculine_corpus))
print("There are %d job descriptions with feminine language (%2f percent)" % (total_feminine_corpus, pct_feminine_corpus))
print("There is an average of %2f masculine words per job advertisement" % (mean_pct_masculine_corpus))
print("There is an average of %2f feminine words per job advertisement" % (mean_pct_feminine_corpus))
print("The average gender target score across the entire corpus is %2f" % mean_gt_score_corpus)

print(top_masc[0:5])
print(top_fem[0:5])

There are 97 job descriptions with masculine language (0.970000 percent)
There are 100 job descriptions with feminine language (1.000000 percent)
There is an average of 0.013359 masculine words per job advertisement
There is an average of 0.012563 feminine words per job advertisement
The average gender target score across the entire corpus is 0.506421
ability    12.609971
Ability    11.827957
data        8.308895
able        6.451613
best        4.789834
dtype: float64
work       28.325688
service    18.807339
support     9.288991
Work        7.224771
Service     5.045872
dtype: float64


### Manual Inspection

Complete a manual inspection of the most masculine and most feminine job advertisements.

In [13]:
# Get top three masculine pct and feminine pct job advertisements
top_fem  = description_eval_sample.sort_values('gt_score', ascending=False).iloc[0:3]
top_masc = description_eval_sample.sort_values('gt_score', ascending=False).iloc[-3:]

In [14]:
# Settings for producing displacy images
colors = {"MAS": "linear-gradient(120deg, #a1c4fd, #a1c4fd)", "FEM": "linear-gradient(120deg, #fdcbf1, #fdcbf1)"}
options = {"ents": ["MAS", "FEM"], "colors": colors}

## Masculine

In [19]:
svg = displacy.render(top_masc['spacy'].iloc[0], style='ent', jupyter=False, options=options)

output_path = Path("C:\\Users\\chris\\DATA417\\assets\\displacy_images\\masc_1.svg")

output_path.open("w", encoding="utf-8").write(svg)

26024

In [21]:
displacy.render(top_masc['spacy'].iloc[1], style='ent', options=options)

In [75]:
displacy.render(top_masc['spacy'].iloc[2], style='ent', options=options)

## Feminine

In [76]:
displacy.render(top_fem['spacy'].iloc[0], style='ent', options=options)

In [77]:
displacy.render(top_fem['spacy'].iloc[1], style='ent', options=options)

In [78]:
displacy.render(top_fem['spacy'].iloc[2], style='ent', options=options)