In [None]:
# Install pyspark
! pip install --ignore-installed pyspark

# Install Spark NLP
! pip install --ignore-installed spark-nlp

In [None]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) the Spark session used by the rest of the notebook.
# Without this, `spark` is undefined on a fresh kernel and the later
# `spark.createDataFrame(...)` cell fails with a NameError.
spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Apache Spark version: 3.0.1


This notebook uses the default configuration (useStorage=true). This parameter tells the annotator to serialize patterns file data with RocksDB storage when saving the model.

In [None]:
# One-row DataFrame whose single column, "text", holds the sample sentences.
data = spark.createDataFrame(
    [
        ["Lord Eddard Stark was the head of House Stark. John Snow lives in Winterfell."],
    ]
).toDF("text")

We are going to use a JSON file with the following format:


```
[
  {
    "label": "PERSON",
    "patterns": ["Jon", "John", "John Snow"]
  },
  {
    "label": "PERSON",
    "patterns": ["Eddard", "Eddard Stark"]
  },
  {
    "label": "LOCATION",
    "patterns": ["Winterfell"]
  }
]
```



In [None]:
# Wrap the raw "text" column as a "document" annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Produce "sentence" annotations from the document.
sentence_detector = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# Tokenize the document; the multi-word names are registered as tokenizer
# exceptions so they can be matched as whole tokens.
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
    .setExceptions(["John Snow", "Eddard Stark"])
)

# Entity ruler configured with the JSON patterns file.
entity_ruler = (
    EntityRulerApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("entity")
    .setPatternsResource("sample_data/patterns.json")
)

In [None]:
# Chain the stages in execution order and fit them on the sample data.
pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        entity_ruler,
    ]
)
model = pipeline.fit(data)

In [None]:
# Print the untruncated "entity" column produced by the fitted pipeline.
(
    model
    .transform(data)
    .select("entity")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|entity                                                                                                                                                                                                        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[chunk, 5, 16, Eddard Stark, [entity -> PERSON, sentence -> 0], []], [chunk, 47, 55, John Snow, [entity -> PERSON, sentence -> 1], []], [chunk, 66, 75, Winterfell, [entity -> LOCATION, sentence -> 1], []]]|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------

We can also define an `id` field to identify entities. The annotator also supports the JSON Lines format, as in the example below.

```
{"id": "names-with-j", "label": "PERSON", "patterns": ["Jon", "John", "John Snow"]}
{"id": "names-with-e", "label": "PERSON", "patterns": ["Eddard", "Eddard Stark"]}
{"id": "locations", "label": "LOCATION", "patterns": ["Winterfell"]}
```

In [None]:
# Entity ruler configured with the JSON Lines patterns file; the format is
# passed explicitly through `options`.
entity_ruler = (
    EntityRulerApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("entity")
    .setPatternsResource(
        "sample_data/patterns.jsonl",
        ReadAs.TEXT,
        options={"format": "JSONL"},
    )
)

In [None]:
# Rebuild and refit the pipeline with the current `entity_ruler`, then print
# the untruncated "entity" column.
pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        entity_ruler,
    ]
)
model = pipeline.fit(data)
(
    model
    .transform(data)
    .select("entity")
    .show(truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|entity                                                                                                                                                                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[chunk, 5, 16, Eddard Stark, [entity -> PERSON, id -> names-with-e, sentence -> 0], []], [chunk, 47, 55, John Snow, [entity -> PERSON, id -> names-with-j, sentence -> 1], []], [chunk, 66, 75, Winterfe

For the CSV file we use the following format:


```
PERSON|Jon
PERSON|John
PERSON|John Snow
LOCATION|Winterfell
```



In [None]:
# Entity ruler configured with the CSV patterns file; `options` gives the
# format and the (escaped) pipe delimiter.
entity_ruler_csv = (
    EntityRulerApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("entity")
    .setPatternsResource(
        "sample_data/patterns.csv",
        options={"format": "csv", "delimiter": "\\|"},
    )
)

In [None]:
# Same stages as before, with the CSV-backed entity ruler as the last stage.
pipeline_csv = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        entity_ruler_csv,
    ]
)
model_csv = pipeline_csv.fit(data)

In [None]:
# Print the untruncated "entity" column produced by the CSV-based model.
(
    model_csv
    .transform(data)
    .select("entity")
    .show(truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------+
|entity                                                                                                                                   |
+-----------------------------------------------------------------------------------------------------------------------------------------+
|[[chunk, 47, 55, John Snow, [entity -> PERSON, sentence -> 1], []], [chunk, 66, 75, Winterfell, [entity -> LOCATION, sentence -> 1], []]]|
+-----------------------------------------------------------------------------------------------------------------------------------------+



# Regex Patterns

This annotator can also find matches based on regex patterns defined in the `patterns` field. For example, we can use the JSON file below:

```
[
  {
    "id": "person-regex",
    "label": "PERSON",
    "patterns": ["\\w+\\s\\w+", "\\w+-\\w+"]
  },
  {
    "id": "locations-words",
    "label": "LOCATION",
    "patterns": ["Winterfell"]
  }
]
```

In [None]:
# Entity ruler configured with the regex patterns file, with pattern-regex
# matching switched on.
regex_entity_ruler = (
    EntityRulerApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("entity")
    .setPatternsResource("sample_data/regex_patterns.json")
    .setEnablePatternRegex(True)
)

In [None]:
# Same stages as before, with the regex-enabled entity ruler as the last stage.
regex_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        regex_entity_ruler,
    ]
)
regex_model = regex_pipeline.fit(data)

In [None]:
# Print the untruncated "entity" column produced by the regex-based model.
(
    regex_model
    .transform(data)
    .select("entity")
    .show(truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|entity                                                                                                                                                                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[chunk, 5, 16, Eddard Stark, [entity -> PERSON, id -> person-regex, sentence -> 0], []], [chunk, 47, 55, John Snow, [entity -> PERSON, id -> person-regex, sentence -> 1], []], [chunk