![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/6.Playground_DataFrames.ipynb)

# Spark DataFrames Playground

In [None]:
# Install pinned versions of PySpark and Spark NLP (-q keeps pip's output quiet).
!pip install -q pyspark==3.3.0 spark-nlp==4.2.0

In [2]:
import sparknlp
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

from sparknlp.annotator import *
from sparknlp.base import *

# Create the Spark session through Spark NLP's helper.
# For GPU training use: sparknlp.start(gpu = True)
# NOTE(review): the original hint also mentioned sparknlp.start(spark23 = True) for Spark 2.3;
# confirm that flag still exists in the pinned Spark NLP 4.2.0 before relying on it.
spark = sparknlp.start()

# Report the library versions in use.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Last expression of the cell: display the session object.
spark

Spark NLP version 4.2.0
Apache Spark version: 3.3.0


In [3]:
# Entry stage: reads the raw `text` column and emits `document` annotations.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [4]:
# Consumes the `document` column and writes its output to the `token` column.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [5]:
# Pretrained part-of-speech tagger (default model); it needs both the
# `document` and `token` columns and writes its tags to `pos`.
pos = (
    PerceptronModel.pretrained()
    .setInputCols(['document', 'token'])
    .setOutputCol('pos')
)

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]


In [6]:
# Chain the stages in execution order: assemble documents, tokenize, tag POS.
pipeline = Pipeline(stages=[document, tokenizer, pos])

In [None]:
# Fetch the sample sentences into the working directory; the next cell reads ./sample-sentences-en.txt.
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt

In [8]:
# Read the downloaded file as text and name the resulting column `text`,
# which is the input column the DocumentAssembler stage expects.
data = spark.read.text('./sample-sentences-en.txt').toDF('text')

In [9]:
# Preview the first five rows without truncating long sentences.
data.show(5, truncate=False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
+-----------------------------------------------------------------------------+



In [10]:
# Fit the pipeline on the sample sentences.
model = pipeline.fit(data)

In [11]:
# Run the fitted pipeline; the output carries `document`, `token` and `pos` columns next to `text`.
result = model.transform(data)

In [12]:
# Preview the first five annotated rows (long cell values are truncated in the display).
result.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                 pos|
+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[{document, 0, 27...|[{token, 0, 4, Pe...|[{pos, 0, 4, NNP,...|
|My life in Russia...|[{document, 0, 37...|[{token, 0, 1, My...|[{pos, 0, 1, PRP$...|
|John and Peter ar...|[{document, 0, 76...|[{token, 0, 3, Jo...|[{pos, 0, 3, NNP,...|
|Lucas Nogal Dunbe...|[{document, 0, 67...|[{token, 0, 4, Lu...|[{pos, 0, 4, NNP,...|
|Europe is very cu...|[{document, 0, 68...|[{token, 0, 5, Eu...|[{pos, 0, 5, NNP,...|
+--------------------+--------------------+--------------------+--------------------+



In [13]:
# Flatten the `pos` annotation fields into plain array columns with flat names,
# and cache the frame because the cells below query it repeatedly.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [14]:
# Inspect the column types of the flattened frame.
stored.printSchema()

root
 |-- text: string (nullable = true)
 |-- pos_begin: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_end: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos_meta: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [15]:
# Preview the first five rows of the flattened frame.
stored.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           pos_begin|             pos_end|          pos_result|            pos_meta|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[0, 6, 9, 11, 16,...|[4, 7, 9, 14, 19,...|[NNP, VBZ, DT, RB...|[{word -> Peter, ...|
|My life in Russia...|[0, 3, 8, 11, 18,...|[1, 6, 9, 16, 19,...|[PRP$, NN, IN, NN...|[{word -> My, sen...|
|John and Peter ar...|[0, 5, 9, 15, 19,...|[3, 7, 13, 17, 26...|[NNP, CC, NNP, VB...|[{word -> John, s...|
|Lucas Nogal Dunbe...|[0, 6, 12, 23, 26...|[4, 10, 21, 24, 2...|[NNP, NNP, NNP, V...|[{word -> Lucas, ...|
|Europe is very cu...|[0, 7, 10, 15, 23...|[5, 8, 13, 21, 26...|[NNP, VBZ, RB, RB...|[{word -> Europe,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



---------
## Spark SQL Functions

In [16]:
from pyspark.sql.functions import *

In [17]:
# Keep only the rows whose POS tag array contains 'VBZ'.
stored.filter(array_contains('pos_result', 'VBZ')).show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           pos_begin|             pos_end|          pos_result|            pos_meta|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[0, 6, 9, 11, 16,...|[4, 7, 9, 14, 19,...|[NNP, VBZ, DT, RB...|[{word -> Peter, ...|
|My life in Russia...|[0, 3, 8, 11, 18,...|[1, 6, 9, 16, 19,...|[PRP$, NN, IN, NN...|[{word -> My, sen...|
|Lucas Nogal Dunbe...|[0, 6, 12, 23, 26...|[4, 10, 21, 24, 2...|[NNP, NNP, NNP, V...|[{word -> Lucas, ...|
|Europe is very cu...|[0, 7, 10, 15, 23...|[5, 8, 13, 21, 26...|[NNP, VBZ, RB, RB...|[{word -> Europe,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [18]:
# Count the POS tags in each row and show the count beside the tag array.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(5)
)

+--------------------+-----------+
|          pos_result|token_count|
+--------------------+-----------+
|[NNP, VBZ, DT, RB...|          7|
|[PRP$, NN, IN, NN...|          8|
|[NNP, CC, NNP, VB...|         15|
|[NNP, NNP, NNP, V...|         15|
|[NNP, VBZ, RB, RB...|         15|
+--------------------+-----------+



In [19]:
# Largest end offset per row, shown next to the text.
stored.select('text', array_max('pos_end')).show(5)

+--------------------+------------------+
|                text|array_max(pos_end)|
+--------------------+------------------+
|Peter is a very g...|                27|
|My life in Russia...|                37|
|John and Peter ar...|                76|
|Lucas Nogal Dunbe...|                67|
|Europe is very cu...|                68|
+--------------------+------------------+



In [20]:
# De-duplicate the POS tags of each row and compare with the original array.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(5)
)

+--------------------+--------------------+
|          pos_result|          unique_pos|
+--------------------+--------------------+
|[NNP, VBZ, DT, RB...|[NNP, VBZ, DT, RB...|
|[PRP$, NN, IN, NN...|[PRP$, NN, IN, NN...|
|[NNP, CC, NNP, VB...|[NNP, CC, VBP, NN...|
|[NNP, NNP, NNP, V...|[NNP, VBZ, DT, RB...|
|[NNP, VBZ, RB, RB...|[NNP, VBZ, RB, JJ...|
+--------------------+--------------------+



In [21]:
# Group rows by their de-duplicated POS tag array and count each group.
stored.groupBy(array_distinct('pos_result')).count().show(10)

+--------------------------+-----+
|array_distinct(pos_result)|count|
+--------------------------+-----+
|      [NNP, CC, VBP, NN...|    1|
|      [NNP, VBZ, DT, RB...|    1|
|      [NNP, VBZ, RB, JJ...|    1|
|      [PRP$, NN, IN, NN...|    1|
|      [NNP, VBZ, DT, RB...|    1|
+--------------------------+-----+



## SQL Functions with `col`

In [22]:
from pyspark.sql.functions import col

In [23]:
# Pull the 'word' entry out of the first metadata map of each row.
stored.select(col('pos_meta').getItem(0).getItem('word')).show(5)

+-----------------+
|pos_meta[0][word]|
+-----------------+
|            Peter|
|               My|
|             John|
|            Lucas|
|           Europe|
+-----------------+



-------------
## Spark NLP Annotation UDFs

In [24]:
import pandas as pd
from sparknlp.functions import *
from pyspark.sql.types import ArrayType, StringType

In [25]:
# Show the complete `pos` annotation column for a single row, untruncated.
result.select('pos').show(1, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pos                                                                                                                                                                                                                                                                                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
@udf(ArrayType(StringType()))
def nn_annotation(res, meta):
    """Collect the words whose part-of-speech tag is ``NN``.

    Args:
        res: Sequence of POS tags for one row (the ``pos.result`` array).
        meta: Sequence of metadata maps aligned with ``res`` (the
            ``pos.metadata`` array); each map is read for its ``"word"`` key.

    Returns:
        The words tagged ``NN``, in their original order (possibly empty).
    """
    # The return type is declared as array<string> because the function
    # returns a Python list. With a plain StringType declaration the column
    # would hold a string rather than an array, so array functions such as
    # size() or array_contains() could not be applied to it downstream.
    return [entry["word"] for tag, entry in zip(res, meta) if tag == "NN"]

In [27]:
# Apply the UDF to the POS tag and metadata arrays, then show the extracted words in full.
(
    result
    .withColumn("nn_tokens", nn_annotation(col("pos.result"), col("pos.metadata")))
    .select("nn_tokens")
    .show(truncate=False)
)

+---------+
|nn_tokens|
+---------+
|[person] |
|[life]   |
|[]       |
|[car]    |
|[]       |
+---------+

