In [1]:
import os

import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

In [2]:
# Session handle returned by Spark NLP's start(); used below to read the input file.
spark = sparknlp.start()

In [3]:
# First pipeline stage: reads the raw 'text' column and emits a 'document' column.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [4]:
# Second pipeline stage: consumes 'document' and emits a 'token' column.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [5]:
# Third pipeline stage: pretrained part-of-speech tagger reading 'document' and
# 'token' and writing 'pos'. No model name is passed, so the library default is
# fetched (the recorded run downloaded `pos_anc`).
pos = (
    PerceptronModel.pretrained()
    .setInputCols('document', 'token')
    .setOutputCol('pos')
)

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]


In [6]:
# Chain the three stages in dependency order: document -> token -> pos.
pipeline = Pipeline(stages=[document, tokenizer, pos])

In [7]:
# Location of the input corpus. The original local path stays as the default so
# existing runs behave the same; set the SAMPLE_SENTENCES_PATH environment
# variable to point at the file on another machine.
SAMPLE_SENTENCES_PATH = os.environ.get(
    'SAMPLE_SENTENCES_PATH', '/Users/saifa/sample-sentences-en.txt'
)

# Load the text file as a single-column DataFrame and name that column 'text',
# which is the column the DocumentAssembler stage reads.
data = spark.read.text(SAMPLE_SENTENCES_PATH).toDF('text')

In [8]:
# Preview the first five input rows.
data.show(n=5)

+--------------------+
|                text|
+--------------------+
|         Chapter 1 .|
|Once when I was s...|
|It was a picture ...|
|Here is a copy of...|
|In the book it sa...|
+--------------------+
only showing top 5 rows



In [9]:
# Fit the pipeline on the input DataFrame; the fitted model is applied in the next cell.
model = pipeline.fit(data)

In [10]:
# Annotate the input: adds the 'document', 'token' and 'pos' columns next to 'text'.
result = model.transform(data)

In [11]:
# Preview the first five annotated rows (cell contents are truncated by default).
result.show(n=5)

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                 pos|
+--------------------+--------------------+--------------------+--------------------+
|         Chapter 1 .|[[document, 0, 10...|[[token, 0, 6, Ch...|[[pos, 0, 6, NN, ...|
|Once when I was s...|[[document, 0, 13...|[[token, 0, 3, On...|[[pos, 0, 3, RB, ...|
|It was a picture ...|[[document, 0, 73...|[[token, 0, 1, It...|[[pos, 0, 1, PRP,...|
|Here is a copy of...|[[document, 0, 30...|[[token, 0, 3, He...|[[pos, 0, 3, RB, ...|
|In the book it sa...|[[document, 0, 87...|[[token, 0, 1, In...|[[pos, 0, 1, IN, ...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [28]:
# Flatten the 'pos' annotation structs into parallel array columns (one array per
# annotation field), rename them, and cache the result because the cells below
# query it repeatedly.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [52]:
# Show the column types: every pos_* column is an array, and pos_meta holds string->string maps.
stored.printSchema()

root
 |-- text: string (nullable = true)
 |-- pos_begin: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_end: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos_meta: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [29]:
# Preview the first five rows of the flattened POS columns.
stored.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           pos_begin|             pos_end|          pos_result|            pos_meta|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|         Chapter 1 .|          [0, 8, 10]|          [6, 8, 10]|         [NN, CD, .]|[[word -> Chapter...|
|Once when I was s...|[0, 5, 10, 12, 16...|[3, 8, 10, 14, 18...|[RB, WRB, PRP, VB...|[[word -> Once], ...|
|It was a picture ...|[0, 3, 7, 9, 17, ...|[1, 5, 7, 15, 18,...|[PRP, VBD, DT, NN...|[[word -> It], [w...|
|Here is a copy of...|[0, 5, 8, 10, 15,...|[3, 6, 8, 13, 16,...|[RB, VBZ, DT, NN,...|[[word -> Here], ...|
|In the book it sa...|[0, 3, 7, 12, 15,...|[1, 5, 10, 13, 18...|[IN, DT, NN, PRP,...|[[word -> In], [w...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



---------
## Spark SQL Functions

In [30]:
from pyspark.sql.functions import *

In [31]:
# Keep only the rows whose tag array contains 'VBD', then preview five of them.
(
    stored
    .filter(array_contains('pos_result', 'VBD'))
    .show(n=5)
)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           pos_begin|             pos_end|          pos_result|            pos_meta|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Once when I was s...|[0, 5, 10, 12, 16...|[3, 8, 10, 14, 18...|[RB, WRB, PRP, VB...|[[word -> Once], ...|
|It was a picture ...|[0, 3, 7, 9, 17, ...|[1, 5, 7, 15, 18,...|[PRP, VBD, DT, NN...|[[word -> It], [w...|
|In the book it sa...|[0, 3, 7, 12, 15,...|[1, 5, 10, 13, 18...|[IN, DT, NN, PRP,...|[[word -> In], [w...|
|And after some wo...|[0, 4, 10, 15, 20...|[2, 8, 13, 18, 23...|[CC, IN, DT, NN, ...|[[word -> And], [...|
|It looked like th...|[0, 3, 10, 15, 20...|[1, 8, 13, 18, 20...|[PRP, VBD, IN, DT...|[[word -> It], [w...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [32]:
# Add the length of each tag array as 'token_count' and show it beside the tags.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(n=5)
)

+--------------------+-----------+
|          pos_result|token_count|
+--------------------+-----------+
|         [NN, CD, .]|          3|
|[RB, WRB, PRP, VB...|         27|
|[PRP, VBD, DT, NN...|         16|
|[RB, VBZ, DT, NN,...|          8|
|[IN, DT, NN, PRP,...|         18|
+--------------------+-----------+
only showing top 5 rows



In [35]:
# Largest end offset per row, i.e. the maximum value in each 'pos_end' array.
(
    stored
    .select('text', array_max('pos_end'))
    .show(n=5)
)

+--------------------+------------------+
|                text|array_max(pos_end)|
+--------------------+------------------+
|         Chapter 1 .|                10|
|Once when I was s...|               130|
|It was a picture ...|                73|
|Here is a copy of...|                30|
|In the book it sa...|                87|
+--------------------+------------------+
only showing top 5 rows



In [33]:
# De-duplicate the tags within each row and show them beside the full tag array.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(n=5)
)

+--------------------+--------------------+
|          pos_result|          unique_pos|
+--------------------+--------------------+
|         [NN, CD, .]|         [NN, CD, .]|
|[RB, WRB, PRP, VB...|[RB, WRB, PRP, VB...|
|[PRP, VBD, DT, NN...|[PRP, VBD, DT, NN...|
|[RB, VBZ, DT, NN,...|[RB, VBZ, DT, NN,...|
|[IN, DT, NN, PRP,...|[IN, DT, NN, PRP,...|
+--------------------+--------------------+
only showing top 5 rows



In [50]:
# Count rows per distinct tag set; sorting the de-duplicated array makes the
# grouping key independent of the order in which tags appear in a sentence.
(
    stored
    .groupBy(array_sort(array_distinct('pos_result')))
    .count()
    .show(n=10)
)

+--------------------------------------+-----+
|array_sort(array_distinct(pos_result))|count|
+--------------------------------------+-----+
|                  [,, ., CC, DT, IN...|    1|
|                  [,, -, ., :, CC, ...|    1|
|                  ['', ., :, DT, PR...|    1|
|                  [,, ., CD, DT, IN...|    1|
|                          ['', ., WRB]|    1|
|                  [(, ), ,, ., CC, ...|    1|
|                  [., NNS, PRP, RB,...|    1|
|                  ['', ,, ., DT, IN...|    1|
|                           [., CD, NN]|    2|
|                  [,, ., DT, IN, JJ...|    1|
+--------------------------------------+-----+
only showing top 10 rows



----------------
### SQL Functions with `col`

In [36]:
from pyspark.sql.functions import col

In [39]:
# From each row take the first metadata map (index 0) and read its 'word' entry.
(
    stored
    .select(col('pos_meta').getItem(0).getItem('word'))
    .show(n=5)
)

+-----------------+
|pos_meta[0][word]|
+-----------------+
|          Chapter|
|             Once|
|               It|
|             Here|
|               In|
+-----------------+
only showing top 5 rows



-------------
### Spark NLP Annotation UDFs

In [40]:
# Print one complete, untruncated 'pos' value to show the annotation structure.
(
    result
    .select('pos')
    .show(n=1, truncate=False)
)

+-------------------------------------------------------------------------------------------------------------+
|pos                                                                                                          |
+-------------------------------------------------------------------------------------------------------------+
|[[pos, 0, 6, NN, [word -> Chapter], []], [pos, 8, 8, CD, [word -> 1], []], [pos, 10, 10, ., [word -> .], []]]|
+-------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [41]:
def nn_tokens(annotations):
    """Return the words of every annotation whose result is the tag 'NN'.

    Args:
        annotations: iterable of annotation objects, each exposing a
            ``result`` attribute (compared against ``'NN'``) and a
            ``metadata`` mapping that is indexed with ``'word'``.

    Returns:
        list: ``metadata['word']`` for each matching annotation, in input
        order; an empty list when nothing matches.

    Raises:
        KeyError: if a matching annotation's metadata has no ``'word'`` entry
            (assuming ``metadata`` is a plain dict).
    """
    # A comprehension is used instead of the built-in filter()/map(): this
    # notebook runs `from pyspark.sql.functions import *`, and on Spark
    # versions that ship a `filter` column function (3.1+) that wildcard
    # import rebinds the name `filter`, which would break this helper.
    return [
        annotation.metadata['word']
        for annotation in annotations
        if annotation.result == 'NN'
    ]

In [42]:
from sparknlp.functions import *

In [43]:
from pyspark.sql.types import ArrayType, StringType

In [44]:
# Wrap nn_tokens as an annotation UDF returning array<string>, apply it to the
# 'pos' column, and print the resulting 'nn_tokens' column untruncated.
(
    result
    .select(
        map_annotations(nn_tokens, ArrayType(StringType()))('pos')
        .alias('nn_tokens')
    )
    .show(truncate=False)
)

+---------------------------------------------------------+
|nn_tokens                                                |
+---------------------------------------------------------+
|[Chapter]                                                |
|[picture, book, primeval, forest]                        |
|[picture, boa, constrictor, act, animal]                 |
|[copy]                                                   |
|[book, whole]                                            |
|[digestion]                                              |
|[jungle]                                                 |
|[work, pencil, drawing]                                  |
|[]                                                       |
|[masterpiece, grown]                                     |
|[]                                                       |
|[hat]                                                    |
|[picture, hat]                                           |
|[picture, boa, constrictor, elephant]  