# Generate samples from identity terms and job titles/occupations

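The goal of this notebook is to create a synthetic dataset of template sentences, pairing every identity term with every job title/occupation in both Danish and English, that can be used to measure occupational bias in sentiment analysis systems.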

In [1]:
# imports
import pandas as pd

# Load the two input spreadsheets (paths are relative to the working directory).
identities = pd.read_excel("identity_terms.xlsx")
occupations = pd.read_excel("occupations.xlsx")

# Report how many rows each table has and how many sentences their
# full pairing is expected to produce.
n_identities, n_occupations = len(identities), len(occupations)
print(n_identities, "identities and", n_occupations, "occupations.")
print("This should yield", n_identities * n_occupations, "sentences.")

# Preview the first rows of both tables.
display(identities.head(), occupations.head())

# merge data
# Cartesian product of the two dfs: every identity row is paired with every
# occupation row. `how="cross"` (pandas >= 1.2) replaces the temporary
# `key=1` column trick, which would overwrite and then drop any real column
# named `key` in either input.
corpus = pd.merge(identities, occupations, how="cross")

# A cross join must yield exactly len(left) * len(right) rows.
assert len(corpus) == len(identities) * len(occupations), "unexpected number of rows after cross join"

# create synthetic sentences

def _capitalize_first(sentence):
    """Return `sentence` with only its first character upper-cased.

    `str.capitalize()` is deliberately avoided: besides upper-casing the
    first character it lower-cases every other character, which mangles
    acronyms and proper nouns inside the sentence (e.g. "CEO" -> "ceo").
    """
    return sentence[:1].upper() + sentence[1:]

# in Danish: "<identity term> er <job title>."
corpus["sentence_DA"] = corpus["identity_term_DA"] + " er " + corpus["job_title_DA"] + "." # create sentence
corpus["sentence_DA"] = corpus["sentence_DA"].apply(_capitalize_first) # capitalize first letter of sentence only

# in English: "<identity term> is a(n) <job title>."
# The literal "a(n)" is kept as a placeholder for the article.
corpus["sentence_EN"] = corpus["identity_term_EN"] + " is a(n) " + corpus["job_title_EN"] + "." # create sentence
corpus["sentence_EN"] = corpus["sentence_EN"].apply(_capitalize_first) # capitalize first letter of sentence only

# show the size of the corpus and a preview of the generated sentences
print("\nResult:")
print(len(corpus), "sentences")
display(corpus.head())

# Write the corpus to an Excel workbook, leaving the DataFrame index out of the file.
output_file = "gender_corpus.xlsx"
corpus.to_excel(output_file, index=False)
print("Successfully saved corpus!")