In [None]:
# Load libraries
# Python
import spacy
import numpy as np
import pandas as pd
# PySpark
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.types import *

In [None]:
# Load data
# The CSV location comes from the "inputPath" notebook argument.
inputPath = getArgument("inputPath", "default")

# Read the CSV with a header row and inferred column types; a double quote is
# used both as the quote character and as the escape character.
df = (
    spark.read
    .format('csv')
    .options(
        header='true',
        inferSchema='true',
        quote="\"",
        escape="\"",
    )
    .load(inputPath)
)

In [None]:
# Define schema
# Return type of the entity UDF: an array of structs, one struct per entity,
# holding the entity text, its start/end character offsets and its label.
# Every struct field is declared non-nullable.
schema = ArrayType(
    StructType()
    .add("text", StringType(), nullable=False)
    .add("start_char", IntegerType(), nullable=False)
    .add("end_char", IntegerType(), nullable=False)
    .add("label", StringType(), nullable=False)
)

In [None]:
# Define function to get entities
def get_entities(text):
    """Extract named entities from ``text`` with the spaCy NER model.

    The model is loaded lazily on the first call and cached in the
    module-level global ``nlp``, so later calls in the same Python process
    reuse it instead of loading it again.

    Args:
        text: The string to analyse, or ``None``.

    Returns:
        A list with one ``[text, start_char, end_char, label]`` entry per
        entity in ``doc.ents``; an empty list when ``text`` is ``None``.
    """
    global nlp
    # A null input has nothing to analyse; the pipeline expects a string.
    if text is None:
        return []
    try:
        model = nlp
    except NameError:
        # First call in this process: load the model once and cache it.
        model = nlp = spacy.load('en_ner_base_V3')
    # Run the pipeline exactly once per input.
    doc = model(text)
    return [[e.text, e.start_char, e.end_char, e.label_] for e in doc.ents]

# Wrap get_entities as a Spark UDF whose result type is the entity-array schema.
get_entities_udf = udf(
    lambda x: get_entities(x),
    returnType=schema,
)

In [None]:
# Get Entities
# Apply the entity UDF to the 'description' column and keep the result in an
# 'entities' column alongside the original columns.
documents_df = df.withColumn(
    colName='entities',
    col=get_entities_udf('description'),
)
Finally, we will write the PySpark DataFrame as a Parquet file. Note that the output path is read from the `outputPath` argument, and any existing data at that path is overwritten.

In [None]:
# Write Parquet
# The destination comes from the "outputPath" notebook argument.
outPath = getArgument("outputPath", "default")

# Save the DataFrame with the entities column as Parquet, replacing anything
# already stored at the destination.
documents_df.write.save(outPath, format="parquet", mode="overwrite")