![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/6.Playground_DataFrames.ipynb)

# Spark DataFrames Playground

In [1]:
%%capture

# This is only needed to set up PySpark and Spark NLP on Colab
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/colab_setup.sh -O - | bash


# For Spark 2.4.x and Spark NLP 2.x.x, run the following instead
# !wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/colab_setup.sh
# !bash colab_setup.sh -p 2.4.x -s 2.x.x


<b>If you want to work with Spark 2.3, use the following setup instead:</b>
```
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz

!tar xf spark-2.3.0-bin-hadoop2.7.tgz
!pip install -q findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-2.3.0-bin-hadoop2.7"
! java -version

import findspark
findspark.init()
from pyspark.sql import SparkSession

! pip install --ignore-installed -q spark-nlp==2.7.5

import sparknlp

spark = sparknlp.start(spark23=True)
```

In [None]:
# Create the Spark session through Spark NLP's start() helper.
import sparknlp

spark = sparknlp.start()

# Wildcard imports: these bring the annotator classes used in the cells below
# (DocumentAssembler, Tokenizer, PerceptronModel) into the notebook namespace.
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

# Report the library versions this notebook is running against.
print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)

# Bare last expression: the notebook renders the session object as the cell output.
spark

In [3]:
# First stage: reads the raw 'text' column and writes the 'document' column.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [None]:
# Second stage: reads the 'document' column and writes the 'token' column.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [None]:
# Third stage: pretrained POS model; reads 'document' and 'token', writes 'pos'.
pos = (
    PerceptronModel.pretrained()
    .setInputCols('document', 'token')
    .setOutputCol('pos')
)

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]


In [None]:
# Chain the three stages in order: document assembly -> tokenization -> POS tagging.
pipeline = Pipeline(stages=[document, tokenizer, pos])

In [None]:
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt

In [None]:
# Load the downloaded file into a DataFrame whose single column is named 'text'.
data = (
    spark.read
    .text('./sample-sentences-en.txt')
    .toDF('text')
)

In [None]:
# Preview the first 5 rows of the input DataFrame.
data.show(5)

+--------------------+
|                text|
+--------------------+
|Peter is a very g...|
|My life in Russia...|
|John and Peter ar...|
|Lucas Nogal Dunbe...|
|Europe is very cu...|
+--------------------+



In [None]:
# Fit the three-stage pipeline on the input sentences.
model = pipeline.fit(data)

In [None]:
# Run the fitted pipeline over the same data; the output adds the
# 'document', 'token' and 'pos' annotation columns next to 'text'.
result = model.transform(data)

In [None]:
# Preview the first 5 annotated rows.
result.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                 pos|
+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[pos, 0, 4, NNP,...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[pos, 0, 1, PRP$...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[pos, 0, 3, NNP,...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[pos, 0, 4, NNP,...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[pos, 0, 5, NNP,...|
+--------------------+--------------------+--------------------+--------------------+



In [None]:
# Flatten the 'pos' annotations into one array column per field, rename the
# columns to pos_* and cache the result because the cells below reuse it.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [None]:
# Inspect the column types of the flattened POS DataFrame.
stored.printSchema()

root
 |-- text: string (nullable = true)
 |-- pos_begin: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_end: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- pos_result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos_meta: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [None]:
# Preview the first 5 rows of the flattened POS DataFrame.
stored.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           pos_begin|             pos_end|          pos_result|            pos_meta|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[0, 6, 9, 11, 16,...|[4, 7, 9, 14, 19,...|[NNP, VBZ, DT, RB...|[[word -> Peter],...|
|My life in Russia...|[0, 3, 8, 11, 18,...|[1, 6, 9, 16, 19,...|[PRP$, NN, IN, NN...|[[word -> My], [w...|
|John and Peter ar...|[0, 5, 9, 15, 19,...|[3, 7, 13, 17, 26...|[NNP, CC, NNP, VB...|[[word -> John], ...|
|Lucas Nogal Dunbe...|[0, 6, 12, 23, 26...|[4, 10, 21, 24, 2...|[NNP, NNP, NNP, V...|[[word -> Lucas],...|
|Europe is very cu...|[0, 7, 10, 15, 23...|[5, 8, 13, 21, 26...|[NNP, VBZ, RB, RB...|[[word -> Europe]...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



---------
## Spark SQL Functions

In [None]:
# NOTE(review): this wildcard import can shadow Python builtins in the notebook
# namespace (e.g. `sum`, `max`, `min`, and `filter` depending on the Spark
# version) — confirm, and prefer explicit imports if plain-Python helpers misbehave.
from pyspark.sql.functions import *

In [None]:
# Keep only the rows whose POS tag array contains 'VBD'.
(
    stored
    .filter(array_contains('pos_result', 'VBD'))
    .show(5)
)

+----+---------+-------+----------+--------+
|text|pos_begin|pos_end|pos_result|pos_meta|
+----+---------+-------+----------+--------+
+----+---------+-------+----------+--------+



In [None]:
# Add the length of each POS tag array as 'token_count' and show it next to the tags.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(5)
)

+--------------------+-----------+
|          pos_result|token_count|
+--------------------+-----------+
|[NNP, VBZ, DT, RB...|          7|
|[PRP$, NN, IN, NN...|          8|
|[NNP, CC, NNP, VB...|         15|
|[NNP, NNP, NNP, V...|         15|
|[NNP, VBZ, RB, RB...|         15|
+--------------------+-----------+



In [None]:
# Largest token end offset in each sentence.
(
    stored
    .select('text', array_max('pos_end'))
    .show(5)
)

+--------------------+------------------+
|                text|array_max(pos_end)|
+--------------------+------------------+
|Peter is a very g...|                27|
|My life in Russia...|                37|
|John and Peter ar...|                76|
|Lucas Nogal Dunbe...|                67|
|Europe is very cu...|                68|
+--------------------+------------------+



In [None]:
# De-duplicate each sentence's POS tags into 'unique_pos' and compare with the full list.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(5)
)

+--------------------+--------------------+
|          pos_result|          unique_pos|
+--------------------+--------------------+
|[NNP, VBZ, DT, RB...|[NNP, VBZ, DT, RB...|
|[PRP$, NN, IN, NN...|[PRP$, NN, IN, NN...|
|[NNP, CC, NNP, VB...|[NNP, CC, VBP, NN...|
|[NNP, NNP, NNP, V...|[NNP, VBZ, DT, RB...|
|[NNP, VBZ, RB, RB...|[NNP, VBZ, RB, JJ...|
+--------------------+--------------------+



In [None]:
# Count how many sentences share the same array of distinct POS tags.
(
    stored
    .groupBy(array_distinct('pos_result'))
    .count()
    .show(10)
)

+--------------------------+-----+
|array_distinct(pos_result)|count|
+--------------------------+-----+
|      [NNP, CC, VBP, NN...|    1|
|      [NNP, VBZ, DT, RB...|    1|
|      [NNP, VBZ, RB, JJ...|    1|
|      [PRP$, NN, IN, NN...|    1|
|      [NNP, VBZ, DT, RB...|    1|
+--------------------------+-----+



----------------
### SQL Functions with `col`

In [None]:
from pyspark.sql.functions import col

In [None]:
# For every row, read the 'word' entry from the first element of 'pos_meta'.
stored.select(
    col('pos_meta')
    .getItem(0)
    .getItem('word')
).show(5)

+-----------------+
|pos_meta[0][word]|
+-----------------+
|            Peter|
|               My|
|             John|
|            Lucas|
|           Europe|
+-----------------+



-------------
### Spark NLP Annotation UDFs

In [None]:
# Show the untruncated 'pos' annotations of the first row.
result.select('pos').show(1, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pos                                                                                                                                                                                                                                                                    |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[pos, 0, 4, NNP, [word -> Peter], []], [pos, 6, 7, VBZ, [word -> is], []], [pos, 9, 9, DT, [word -> a], []], [pos, 11, 14, RB, [word -> very], []], [pos, 16, 19, JJ, [word -> good], []], [pos, 21, 26,

In [None]:
def nn_tokens(annotations):
    """Return the words of all annotations tagged exactly ``'NN'``.

    Args:
        annotations: iterable of annotation objects; each one is read through
            its ``result`` attribute (the tag) and its ``metadata`` mapping,
            which must hold the tagged token under the ``'word'`` key.

    Returns:
        list: ``metadata['word']`` of every annotation whose ``result`` equals
        ``'NN'``, in input order (empty when nothing matches).
    """
    # A comprehension is used instead of the filter()/map() builtins on purpose:
    # the notebook runs `from pyspark.sql.functions import *` before this cell,
    # and on Spark releases whose functions module exports `filter` that import
    # shadows the builtin, which would break this function when it is called.
    return [
        annotation.metadata['word']
        for annotation in annotations
        if annotation.result == 'NN'
    ]

In [None]:
from sparknlp.functions import *

In [None]:
from pyspark.sql.types import ArrayType, StringType

In [None]:
# Apply nn_tokens to the 'pos' column as an array-of-strings UDF and show the result.
result.select(
    map_annotations(nn_tokens, ArrayType(StringType()))('pos').alias('nn_tokens')
).show(truncate=False)

+---------+
|nn_tokens|
+---------+
|[person] |
|[life]   |
|[]       |
|[car]    |
|[]       |
+---------+

