![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Spark DataFrames Playground

In [3]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

# Print the installed Spark NLP version.
print("Spark NLP version", sparknlp.version())

# Print the Spark version reported by the active session.
# NOTE(review): `spark` is only assigned in the next cell (sparknlp.start()),
# so this cell relies on a `spark` object already being defined by the
# environment — presumably the session Databricks predefines; confirm.
print("Apache Spark version:", spark.version)

# Bare trailing expression: the notebook displays the `spark` object itself.
spark

In [4]:
# Bind `spark` to the object returned by sparknlp.start(); later cells use it
# as the Spark session (e.g. spark.read.text).
spark = sparknlp.start()

In [5]:
# First pipeline stage: reads the raw 'text' column and writes a 'document' column.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

In [6]:
# Second pipeline stage: consumes the 'document' column and writes a 'token' column.
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [7]:
# Third pipeline stage: a pretrained PerceptronModel (pretrained() is called with
# no arguments, so whichever model it selects by default is used). It consumes
# the 'document' and 'token' columns and writes a 'pos' column.
pos = (
    PerceptronModel.pretrained()
    .setInputCols('document', 'token')
    .setOutputCol('pos')
)

In [8]:
# Assemble the three stages, in execution order, into one Spark ML Pipeline.
pipeline = Pipeline().setStages([
    document,
    tokenizer,
    pos,
])

In [9]:
# Shell escape: fetch the sample sentences file with wget; it is saved under its
# remote name (sample-sentences-en.txt) in the current working directory.
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt

In [10]:
%fs ls "file:/databricks/driver"

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/eng.train,eng.train,3283420
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/sample-sentences-en.txt,sample-sentences-en.txt,284
file:/databricks/driver/ganglia/,ganglia/,4096
file:/databricks/driver/news_category_test.csv,news_category_test.csv,1504408
file:/databricks/driver/eng.testa,eng.testa,827443
file:/databricks/driver/derby.log,derby.log,726


In [11]:
# Copy the downloaded file from the driver's local filesystem (file:/ scheme)
# to the DBFS root (dbfs:/), where the next cell reads it with Spark.
dbutils.fs.cp(
    "file:/databricks/driver/sample-sentences-en.txt",
    "dbfs:/",
)

In [12]:
# Load the sample sentences, one row per line of the file, and name the single
# column 'text' (the input column the DocumentAssembler stage expects).
# The path is spelled out with the dbfs:/ scheme so the read targets exactly the
# copy placed at the DBFS root by the preceding dbutils.fs.cp call, instead of
# depending on how a bare relative path is resolved against the default
# filesystem / working directory.
data = spark.read.text('dbfs:/sample-sentences-en.txt').toDF('text')

In [13]:
# Preview the first 5 rows of the raw input.
data.show(n=5)

In [14]:
# Fit the pipeline on the input DataFrame to obtain the fitted model.
model = pipeline.fit(dataset=data)

In [15]:
# Run the fitted pipeline over the same DataFrame to produce the annotated output.
result = model.transform(dataset=data)

In [16]:
# Preview the first 5 annotated rows.
result.show(n=5)

In [17]:
# Pull the begin/end/result/metadata fields of the 'pos' column out as separate
# columns next to 'text', rename them with a pos_ prefix, and cache the
# resulting DataFrame because the cells below query it repeatedly.
stored = (
    result
    .select('text', 'pos.begin', 'pos.end', 'pos.result', 'pos.metadata')
    .toDF('text', 'pos_begin', 'pos_end', 'pos_result', 'pos_meta')
    .cache()
)

In [18]:
# Print the column names and types of the flattened `stored` DataFrame.
stored.printSchema()

In [19]:
# Preview the first 5 rows of the flattened DataFrame.
stored.show(n=5)

---------
## Spark SQL Functions

In [21]:
from pyspark.sql.functions import *

In [22]:
# Keep only the rows whose 'pos_result' array contains the tag 'VBD'.
(
    stored
    .filter(array_contains('pos_result', 'VBD'))
    .show(5)
)

In [23]:
# Add 'token_count' = number of elements in the 'pos_result' array, then show
# it side by side with the array it was computed from.
(
    stored
    .withColumn('token_count', size('pos_result'))
    .select('pos_result', 'token_count')
    .show(5)
)

In [24]:
# For each row, show the text together with the largest value in 'pos_end'.
(
    stored
    .select('text', array_max('pos_end'))
    .show(5)
)

In [25]:
# Add 'unique_pos' = 'pos_result' with duplicate tags removed, and show both.
(
    stored
    .withColumn('unique_pos', array_distinct('pos_result'))
    .select('pos_result', 'unique_pos')
    .show(5)
)

In [26]:
# Group rows by their sorted, de-duplicated tag array and count the rows in
# each group; display up to 10 groups.
(
    stored
    .groupBy(array_sort(array_distinct('pos_result')))
    .count()
    .show(10)
)

----------------
### SQL Functions with `col`

In [28]:
from pyspark.sql.functions import col

In [29]:
# From the first element of 'pos_meta', extract the value stored under 'word'.
(
    stored
    .select(col('pos_meta').getItem(0).getItem('word'))
    .show(5)
)

-------------
### Spark NLP Annotation UDFs

In [31]:
# Show the full, untruncated 'pos' column of the first row.
(
    result
    .select('pos')
    .show(1, truncate=False)
)

In [32]:
def nn_tokens(annotations):
    """Return the words of all annotations tagged 'NN'.

    Args:
        annotations: iterable of annotation objects, each exposing a
            ``result`` attribute (the tag) and a ``metadata`` mapping
            that has a 'word' key.

    Returns:
        list: ``metadata['word']`` of every annotation whose ``result``
        equals 'NN', in input order (empty list if none match).
    """
    # A comprehension is used instead of the built-in filter()/map() on
    # purpose: this notebook runs `from pyspark.sql.functions import *`
    # earlier, and in Spark versions where that module exports a `filter`
    # column function the built-in name is shadowed, so
    # filter(lambda ..., annotations) would call the Spark function instead.
    return [
        annotation.metadata['word']
        for annotation in annotations
        if annotation.result == 'NN'
    ]

In [33]:
from sparknlp.functions import *

In [34]:
from pyspark.sql.types import ArrayType, StringType

In [35]:
# Wrap nn_tokens with map_annotations (declared return type: array of strings),
# apply it to the 'pos' column, name the output 'nn_tokens', and display it
# without truncation.
(
    result
    .select(
        map_annotations(nn_tokens, ArrayType(StringType()))('pos')
        .alias('nn_tokens')
    )
    .show(truncate=False)
)