![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# 6. Spark DataFrames Playground v.2.6.3

In [0]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

# Report the installed Spark NLP library version before starting the session.
print("Spark NLP version", sparknlp.version())

# Start Spark NLP; the object bound to `spark` is reused by the later cells
# (for example `spark.read` when loading the sample sentences).
spark = sparknlp.start()

print("Apache Spark version:", spark.version)

# Bare trailing expression: the notebook renders the session object as the
# cell output.
spark

In [0]:
# Stage 1: reads the raw 'text' column and writes the 'document' column.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [0]:
# Stage 2: reads the 'document' column and writes the 'token' column.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [0]:
# Stage 3: pretrained tagger; reads 'document' and 'token', writes 'pos'.
# `pretrained()` is called without arguments, so the library's default model
# is used.
pos = (
    PerceptronModel.pretrained()
    .setInputCols('document', 'token')
    .setOutputCol('pos')
)

In [0]:
# Stages are listed in dependency order: each stage's input columns are
# produced by the stages before it.
pipeline = Pipeline().setStages([
    document,
    tokenizer,
    pos,
])

In [0]:
# Download the sample sentences with wget. No output path is given, so the
# file lands in the shell's working directory (presumably /databricks/driver,
# which is where the copy below reads it from; confirm on your cluster).
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt
  
# Copy the downloaded file from the driver's local disk to the DBFS root.
# `dbutils` is provided by the Databricks runtime; it is not imported here.
dbutils.fs.cp("file:/databricks/driver/sample-sentences-en.txt", "dbfs:/")

In [0]:
%fs ls "file:/databricks/driver"

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/__pycache__/,__pycache__/,4096
file:/databricks/driver/aclimdb_test.csv,aclimdb_test.csv,32715164
file:/databricks/driver/news_category_train.csv,news_category_train.csv,24032125
file:/databricks/driver/spam_ham_dataset.csv,spam_ham_dataset.csv,5502589
file:/databricks/driver/dataset_encoder.py,dataset_encoder.py,2543
file:/databricks/driver/sentence_grouper.py,sentence_grouper.py,953
file:/databricks/driver/aclimdb_train.csv,aclimdb_train.csv,33497180
file:/databricks/driver/toxic_train.snappy.parquet,toxic_train.snappy.parquet,2767307


In [0]:
# Load the sample file as a text DataFrame and name its single column 'text',
# which is the input column the DocumentAssembler stage expects.
# NOTE(review): the path is relative; presumably it resolves to the DBFS root
# the file was copied to. Confirm on the target cluster.
data = (
    spark.read
    .text('./sample-sentences-en.txt')
    .toDF('text')
)

In [0]:
# Preview the first 5 rows of the raw input.
data.show(5)

In [0]:
# Fit the three-stage pipeline on `data`; the fitted model is applied to the
# same DataFrame in the next cell.
model = pipeline.fit(data)

In [0]:
# Run the fitted pipeline over `data`, producing the annotated DataFrame.
result = model.transform(data)

In [0]:
# Preview the first 5 annotated rows.
result.show(5)

In [0]:
# Pull the individual fields of the 'pos' annotations out into their own
# columns, give them flat names, and cache the result because the cells
# below query it repeatedly.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [0]:
# Print the column names and types of the flattened DataFrame.
stored.printSchema()

In [0]:
# Preview the first 5 rows of the flattened DataFrame.
stored.show(5)

---------
## Spark SQL Functions

In [0]:
# Import only the SQL functions the cells below use. A wildcard import of
# pyspark.sql.functions rebinds Python built-ins such as `max`, `min`, `sum`,
# `round` and (depending on the PySpark version) `filter` in the notebook
# namespace, which breaks plain-Python helpers defined later that rely on
# those built-ins.
from pyspark.sql.functions import (
    array_contains,
    array_distinct,
    array_max,
    array_sort,
    size,
)

In [0]:
# Keep only the rows whose 'pos_result' array contains the tag 'VBD'.
(
    stored
    .filter(array_contains('pos_result', 'VBD'))
    .show(5)
)

In [0]:
# Add 'token_count' = number of elements in the 'pos_result' array.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(5)
)

In [0]:
# For each row, show the text next to the largest value in its 'pos_end' array.
(
    stored
    .select('text', array_max('pos_end'))
    .show(5)
)

In [0]:
# Add 'unique_pos' = the 'pos_result' array with duplicate tags removed.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(5)
)

In [0]:
# Count rows per set of tags: duplicates are dropped and the remainder sorted,
# so rows with the same tags in a different order land in the same group.
(
    stored
    .groupBy(array_sort(array_distinct('pos_result')))
    .count()
    .show(10)
)

----------------
### SQL Functions with `col`

In [0]:
from pyspark.sql.functions import col

In [0]:
# Take element 0 of the 'pos_meta' array and read its 'word' entry.
(
    stored
    .select(col('pos_meta').getItem(0).getItem('word'))
    .show(5)
)

-------------
### Spark NLP Annotation UDFs

In [0]:
# Show the 'pos' column of the first row without truncating the cell text, so
# the structure of the annotations handled below is visible.
result.select('pos').show(1, truncate=False)

In [0]:
def nn_tokens(annotations):
    """Return the words of the annotations tagged ``'NN'``.

    Written as a comprehension rather than with the ``filter``/``map``
    built-ins: this function lives in the notebook's global namespace, where
    wildcard imports can rebind those names to unrelated functions.

    Args:
        annotations: Iterable of annotation objects, each exposing a
            ``result`` attribute and a ``metadata`` mapping.

    Returns:
        list: ``metadata['word']`` of every annotation whose ``result``
        equals ``'NN'``, in input order. Empty when nothing matches.

    Raises:
        KeyError: If a matching annotation's ``metadata`` is a dict without
            a ``'word'`` entry.
    """
    return [
        annotation.metadata['word']
        for annotation in annotations
        if annotation.result == 'NN'
    ]

In [0]:
from sparknlp.functions import *

In [0]:
from pyspark.sql.types import ArrayType, StringType

In [0]:
# Wrap `nn_tokens` with map_annotations, declaring an array-of-strings return
# type, apply it to the 'pos' column, and show the untruncated result as
# 'nn_tokens'.
(
    result
    .select(
        map_annotations(nn_tokens, ArrayType(StringType()))('pos')
        .alias('nn_tokens')
    )
    .show(truncate=False)
)

End of Notebook # 6