In [42]:
import csv
import json
import pandas as pd
import spacy
import itertools

In [43]:
# Define the file paths for the CSV and JSONL files.
# Every input and output lives in the same data directory, so the directory is
# spelled out once; point DATA_DIR somewhere else to run on another machine.
DATA_DIR = 'C:\\Users\\chris\\DATA417\\data'

csv_file_path = DATA_DIR + '\\descriptions_clean.csv'
jsonl_file_path = DATA_DIR + '\\data.jsonl'
masculine_file_path = DATA_DIR + '\\masculine.csv'
# Double underscore kept on purpose: the word-list cell opens this exact name.
feminine__file_path = DATA_DIR + '\\feminine.csv'
exclusion_file_path = DATA_DIR + '\\exclusion.csv'

In [44]:
# Load the spaCy pipeline named 'en_core_web_sm' once at module level;
# match_words() calls this object to turn each text into a Doc of tokens.
nlp = spacy.load('en_core_web_sm')

In [45]:
# Load word lists
def load_wordlist(path):
    """Return the first column of the CSV file at ``path`` as a list of words.

    Blank lines (which ``csv.reader`` yields as empty rows) and rows whose
    first cell is empty are skipped instead of raising ``IndexError``.
    Surrounding whitespace is stripped so a stray space in the file cannot
    stop a word from matching a token.
    """
    with open(path, newline='') as f:
        return [row[0].strip() for row in csv.reader(f) if row and row[0].strip()]


masculine_words = load_wordlist(masculine_file_path)
feminine_words = load_wordlist(feminine__file_path)
exclusion_words = load_wordlist(exclusion_file_path)

# Label name -> word list. The keys are the label strings that match_words()
# attaches to every span it reports.
wordlists = {
    "MAS": masculine_words,
    "FEM": feminine_words,
    "EXC": exclusion_words,
}

In [46]:
# Words present in both the "MAS" and the "EXC" list; the bare name on the
# last line makes the notebook display the resulting set.
intersect = set(wordlists["MAS"]).intersection(wordlists["EXC"])
intersect

set()

In [47]:
# Define the function to match words in the text
def match_words(text):
    """Find every token of ``text`` that appears in one of the word lists.

    The text is tokenised with the module-level ``nlp`` pipeline and each
    token's lower-cased text is looked up in every list of the module-level
    ``wordlists`` dict.

    Args:
        text: The string to scan.

    Returns:
        A list of ``[start, end, list_name]`` entries, where ``start`` is the
        token's character offset in ``text`` (``token.idx``), ``end`` is
        ``start + len(token.text)`` and ``list_name`` is the ``wordlists`` key
        whose list contains the token. Entries are in document order and each
        (token, list) pair is reported at most once, even if the word occurs
        several times in its list.
    """
    # Build one lookup set per list so each token costs a hash lookup instead
    # of a scan over every word. List entries are lower-cased and stripped
    # because tokens are compared in lower case: a capitalised or padded
    # entry could otherwise never match.
    lookups = {
        list_name: {word.strip().lower() for word in word_list} - {""}
        for list_name, word_list in wordlists.items()
    }

    matches = []
    # Single pass over the document instead of one pass per list word
    for token in nlp(text):
        lowered = token.text.lower()
        for list_name, vocabulary in lookups.items():
            if lowered in vocabulary:
                matches.append(
                    [token.idx, token.idx + len(token.text), list_name]
                )
    return matches

In [48]:
# Define the function to process each row of the CSV file
def process_row(row):
    """Convert one CSV row into a ``{"text": ..., "label": ...}`` record.

    ``row`` is indexed with the key "Job Description"; that value becomes
    "text", and the result of ``match_words`` on it becomes "label".
    """
    description = row["Job Description"]
    return {"text": description, "label": match_words(description)}

In [49]:
# Read in the CSV file and create a list of processed rows.
# newline='' hands line-ending handling to the csv module, as its docs
# recommend, so quoted fields containing line breaks are parsed correctly.
# This also matches how the word-list files are opened.
with open(csv_file_path, newline='') as csvfile:
    processed_rows = [process_row(row) for row in csv.DictReader(csvfile)]

In [50]:
# Write the processed rows to a JSONL file: one JSON document per line
with open(jsonl_file_path, "w") as outfile:
    outfile.writelines(json.dumps(row) + "\n" for row in processed_rows)

In [51]:
# Split boundaries: records [0, TRAIN_END) go to train.json and
# records [TRAIN_END, DEV_END) go to dev.json.
TRAIN_END = 700
DEV_END = 1000

result = []

with open(jsonl_file_path, 'r') as f:
    # Iterate the file directly instead of loading every line with readlines()
    for line in f:
        data = json.loads(line)

        # Sort the [start, end, label] triples so equal ones become adjacent,
        # then keep one representative per run to drop duplicates.
        labels = [span for span, _ in itertools.groupby(sorted(data['label']))]

        result.append([data['text'], {"entities": labels}])

# The dev slice starts exactly where the train slice ends. It used to start
# at 701, which silently left record 700 out of both files.
train = result[:TRAIN_END]
dev = result[TRAIN_END:DEV_END]

# Outputs are written after the input file is closed
with open("train.json", 'w') as g:
    json.dump(train, g)

with open("dev.json", 'w') as h:
    json.dump(dev, h)
