![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/spark-nlp-basics/playground-dataFrames.ipynb)

## 0. Colab Setup

In [None]:
# This is only to setup PySpark and Spark NLP on Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
[K     |████████████████████████████████| 215.7MB 57kB/s 
[K     |████████████████████████████████| 204kB 31.7MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 122kB 9.5MB/s 
[?25h

In [None]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

In [None]:
# Start Spark NLP and keep the handle it returns; the data-loading cell below
# reads the input file through `spark.read`.
spark = sparknlp.start()

In [None]:
# First pipeline stage: reads the raw 'text' column and writes a 'document' column.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [None]:
# Tokenization stage: consumes the 'document' column and writes a 'token' column.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [None]:
# Part-of-speech stage: loads the default pretrained perceptron model (downloaded on
# first use) and wires it to the 'document' and 'token' columns, writing to 'pos'.
pos = (
    PerceptronModel.pretrained()
    .setInputCols('document', 'token')
    .setOutputCol('pos')
)

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]


In [None]:
# Assemble the three stages in execution order: document -> tokens -> POS tags.
pipeline = Pipeline(stages=[document, tokenizer, pos])

In [None]:
data = spark.read.text('./sample-sentences-en.txt').toDF('text')

AnalysisException: ignored

In [None]:
# Preview the first 5 rows of the loaded text as a quick check of the read.
data.show(5)

In [None]:
# Fit the three-stage pipeline on the loaded sentences; the fitted `model` is
# what the next cell uses to annotate the data.
model = pipeline.fit(data)

In [None]:
# Run the fitted pipeline over the same DataFrame; `result` carries the 'pos'
# column that the cells below select from.
result = model.transform(data)

In [None]:
# Preview the first 5 annotated rows.
result.show(5)

In [None]:
# Flatten the nested 'pos' annotations into plain array columns (one array entry
# per annotation), give them flat names, and cache the result because every
# cell in the SQL-functions section reuses it.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [None]:
# Print the column names and types of the flattened `stored` DataFrame.
stored.printSchema()

In [None]:
# Preview the first 5 rows of `stored`.
stored.show(5)

---------
## Spark SQL Functions

In [None]:
from pyspark.sql.functions import *

In [None]:
# Keep only the rows whose 'pos_result' array contains the tag 'VBD'.
stored.filter(
    array_contains('pos_result', 'VBD')
).show(5)

In [None]:
# Add the length of each 'pos_result' array as 'token_count' and show it next
# to the tags it was computed from.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(5)
)

In [None]:
# Show each text next to the maximum value in its 'pos_end' array.
stored.select(
    'text',
    array_max('pos_end'),
).show(5)

In [None]:
# De-duplicate each row's tags into 'unique_pos' and compare with the original array.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(5)
)

In [None]:
# Count rows per distinct set of tags: each row's tags are de-duplicated and then
# sorted, so rows with the same tags in a different order land in the same group.
(
    stored
    .groupBy(array_sort(array_distinct('pos_result')))
    .count()
    .show(10)
)

----------------
### SQL Functions with `col`

In [None]:
from pyspark.sql.functions import col

In [None]:
# From the first element of each row's 'pos_meta' array, pull the value
# stored under the 'word' key.
stored.select(
    col('pos_meta')
    .getItem(0)
    .getItem('word')
).show(5)

-------------
### Spark NLP Annotation UDFs

In [None]:
# Print one untruncated row of the raw 'pos' column to see the annotation fields
# (such as result and metadata) that the function defined below reads.
result.select('pos').show(1, truncate=False)

In [None]:
def nn_tokens(annotations):
    """Return the words of the annotations tagged 'NN'.

    Args:
        annotations: Iterable of annotation objects. Each must expose a
            ``result`` attribute (the tag) and, for 'NN' entries, a
            ``metadata`` mapping with a 'word' key.

    Returns:
        list: ``metadata['word']`` of every annotation whose ``result`` equals
        'NN', in input order. Empty when nothing matches.

    Raises:
        KeyError: If an 'NN' annotation has no 'word' entry in its metadata.
    """
    # A comprehension is used instead of the built-in filter()/map() on purpose:
    # this notebook does `from pyspark.sql.functions import *`, and on PySpark
    # releases that export a `filter` column function the wildcard import shadows
    # the built-in, which would break a filter(lambda ..., annotations) call here.
    return [
        annotation.metadata['word']
        for annotation in annotations
        if annotation.result == 'NN'
    ]

In [None]:
from sparknlp.functions import *

In [None]:
from pyspark.sql.types import ArrayType, StringType

In [None]:
# Wrap nn_tokens with map_annotations (declared return type: array of strings),
# apply it to the 'pos' column, and show the output as 'nn_tokens' without truncation.
result.select(
    map_annotations(nn_tokens, ArrayType(StringType()))('pos')
    .alias('nn_tokens')
).show(truncate=False)