In [1]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

In [2]:
# Obtain the Spark session through Spark NLP's start() helper; the data is
# read through this session further down.
spark = sparknlp.start()

In [3]:
# Entry stage: takes the raw 'text' column as input and writes 'document'.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [4]:
# Tokenization stage: consumes 'document' and writes its output to 'token'.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [5]:
# Part-of-speech stage: the default pretrained PerceptronModel, fed by the
# 'document' and 'token' columns and writing its tags to 'pos'.
pos = (
    PerceptronModel.pretrained()
    .setInputCols('document', 'token')
    .setOutputCol('pos')
)

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]


In [6]:
# Chain the three stages in the order they must run.
pipeline = Pipeline(stages=[document, tokenizer, pos])

In [50]:
# Load the sample sentences, one line of the file per row, and name the
# single resulting column 'text' (the column the first stage reads).
data = (
    spark.read
    .text('./sample-sentences-en.txt')
    .toDF('text')
)

In [51]:
# Preview up to five rows of the raw input.
data.show(5)

+--------------------+
|                text|
+--------------------+
|Peter is a very g...|
|My life in Russia...|
|John and Peter ar...|
|Lucas Nogal Dunbe...|
|Europe is very cu...|
+--------------------+



In [52]:
# Fit the pipeline on the input data; the fitted model is what gets applied
# with transform() next.
model = pipeline.fit(data)

In [53]:
# Apply the fitted pipeline to the same data; the output keeps 'text' and adds
# the 'document', 'token' and 'pos' annotation columns.
result = model.transform(data)

In [54]:
# Preview up to five annotated rows (cell contents are truncated by default).
result.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                 pos|
+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[pos, 0, 4, NNP,...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[pos, 0, 1, PRP$...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[pos, 0, 3, NNP,...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[pos, 0, 4, NNP,...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[pos, 0, 5, NNP,...|
+--------------------+--------------------+--------------------+--------------------+



In [55]:
# Flatten the 'pos' annotations into plain array columns (one array per
# annotation field), rename them, and cache the frame because the queries
# that follow reuse it several times.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [56]:
# Confirm the flattened layout: every pos_* column is an array, and pos_meta
# holds one string-to-string map per element.
stored.printSchema()

root
 |-- text: string (nullable = true)
 |-- pos_begin: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_end: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos_meta: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [57]:
# Preview up to five rows of the flattened frame.
stored.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           pos_begin|             pos_end|          pos_result|            pos_meta|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[0, 6, 9, 11, 16,...|[4, 7, 9, 14, 19,...|[NNP, VBZ, DT, RB...|[[word -> Peter],...|
|My life in Russia...|[0, 3, 8, 11, 18,...|[1, 6, 9, 16, 19,...|[PRP$, NN, IN, NN...|[[word -> My], [w...|
|John and Peter ar...|[0, 5, 9, 15, 19,...|[3, 7, 13, 17, 26...|[NNP, CC, NNP, VB...|[[word -> John], ...|
|Lucas Nogal Dunbe...|[0, 6, 12, 23, 26...|[4, 10, 21, 24, 2...|[NNP, NNP, NNP, V...|[[word -> Lucas],...|
|Europe is very cu...|[0, 7, 10, 15, 23...|[5, 8, 13, 21, 26...|[NNP, VBZ, RB, RB...|[[word -> Europe]...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



---------
## Spark SQL Functions

In [58]:
from pyspark.sql.functions import *

In [59]:
# Keep only the rows whose tag array contains 'VBD'.
(
    stored
    .filter(array_contains('pos_result', 'VBD'))
    .show(5)
)

+----+---------+-------+----------+--------+
|text|pos_begin|pos_end|pos_result|pos_meta|
+----+---------+-------+----------+--------+
+----+---------+-------+----------+--------+



In [60]:
# Add the length of each tag array as 'token_count' and show it next to the
# tags themselves.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(5)
)

+--------------------+-----------+
|          pos_result|token_count|
+--------------------+-----------+
|[NNP, VBZ, DT, RB...|          7|
|[PRP$, NN, IN, NN...|          8|
|[NNP, CC, NNP, VB...|         15|
|[NNP, NNP, NNP, V...|         15|
|[NNP, VBZ, RB, RB...|         15|
+--------------------+-----------+



In [61]:
# Largest end offset recorded in each row's pos_end array.
(
    stored
    .select('text', array_max('pos_end'))
    .show(5)
)

+--------------------+------------------+
|                text|array_max(pos_end)|
+--------------------+------------------+
|Peter is a very g...|                27|
|My life in Russia...|                37|
|John and Peter ar...|                76|
|Lucas Nogal Dunbe...|                67|
|Europe is very cu...|                68|
+--------------------+------------------+



In [62]:
# De-duplicate each row's tags into 'unique_pos' and compare with the
# original tag array.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(5)
)

+--------------------+--------------------+
|          pos_result|          unique_pos|
+--------------------+--------------------+
|[NNP, VBZ, DT, RB...|[NNP, VBZ, DT, RB...|
|[PRP$, NN, IN, NN...|[PRP$, NN, IN, NN...|
|[NNP, CC, NNP, VB...|[NNP, CC, VBP, NN...|
|[NNP, NNP, NNP, V...|[NNP, VBZ, DT, RB...|
|[NNP, VBZ, RB, RB...|[NNP, VBZ, RB, JJ...|
+--------------------+--------------------+



In [63]:
# Group rows by their sorted set of distinct tags and count the rows in each
# group (up to ten groups are printed).
(
    stored
    .groupBy(array_sort(array_distinct('pos_result')))
    .count()
    .show(10)
)

+--------------------------------------+-----+
|array_sort(array_distinct(pos_result))|count|
+--------------------------------------+-----+
|                  [., CC, EX, JJ, N...|    1|
|                  [., IN, JJ, NN, N...|    1|
|                  [., CC, DT, IN, J...|    1|
|                  [., DT, IN, JJ, N...|    1|
|                  [., DT, JJ, NN, N...|    1|
+--------------------------------------+-----+



----------------
### SQL Functions with `col`

In [64]:
from pyspark.sql.functions import col

In [65]:
# Take element 0 of pos_meta (a map) and read its 'word' entry.
(
    stored
    .select(col('pos_meta').getItem(0).getItem('word'))
    .show(5)
)

+-----------------+
|pos_meta[0][word]|
+-----------------+
|            Peter|
|               My|
|             John|
|            Lucas|
|           Europe|
+-----------------+



-------------
### Spark NLP Annotation UDFs

In [66]:
# Print the complete, untruncated 'pos' annotations of a single row to see
# the fields each annotation carries.
result.select('pos').show(1, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pos                                                                                                                                                                                                                                                                    |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[pos, 0, 4, NNP, [word -> Peter], []], [pos, 6, 7, VBZ, [word -> is], []], [pos, 9, 9, DT, [word -> a], []], [pos, 11, 14, RB, [word -> very], []], [pos, 16, 19, JJ, [word -> good], []], [pos, 21, 26,

In [67]:
def nn_tokens(annotations):
    """Return the words of all annotations whose tag is exactly 'NN'.

    Args:
        annotations: iterable of annotation objects. Each one must expose a
            ``result`` attribute (compared against ``'NN'``) and a
            ``metadata`` mapping that contains a ``'word'`` key.

    Returns:
        list: ``metadata['word']`` of every annotation whose ``result`` equals
        ``'NN'``, in input order; an empty list when nothing matches.

    Raises:
        KeyError: if a matching annotation's metadata has no ``'word'`` key.
    """
    # A comprehension is used on purpose instead of the builtins filter() and
    # map(): this notebook star-imports pyspark.sql.functions, and PySpark
    # releases that export a column-level `filter` (3.1 and later) rebind that
    # name, which would make a builtin-style call fail when this runs.
    return [
        annotation.metadata['word']
        for annotation in annotations
        if annotation.result == 'NN'
    ]

In [68]:
from sparknlp.functions import *

In [69]:
from pyspark.sql.types import ArrayType, StringType

In [70]:
# Wrap nn_tokens as an annotation UDF declared to return array<string>, apply
# it to the 'pos' column and print the untruncated result.
(
    result
    .select(
        map_annotations(nn_tokens, ArrayType(StringType()))('pos')
        .alias('nn_tokens')
    )
    .show(truncate=False)
)

+---------+
|nn_tokens|
+---------+
|[person] |
|[life]   |
|[]       |
|[car]    |
|[]       |
+---------+

