![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/spark-nlp-basics/spark_nlp_basics_functions.ipynb)

# Spark NLP Basic Functions

In [None]:
# Only run this cell when you are using Spark NLP on Google Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [None]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.pretrained import *

# Start Spark NLP and keep the returned session for the cells below.
spark = sparknlp.start()

# Report the library versions in effect for this run.
spark_nlp_version = sparknlp.version()
apache_spark_version = spark.version
print("Spark NLP version: ", spark_nlp_version)
print("Apache Spark version: ", apache_spark_version)

Spark NLP version:  4.3.0
Apache Spark version:  3.3.0


In [None]:
# One-row DataFrame with a single 'text' column, used as the pipeline input.
# NOTE(review): "goud" looks like a deliberate misspelling so the pipeline's
# spell column has something to act on — confirm before "fixing" it.
sample_rows = [['Peter is a goud person.']]
data = spark.createDataFrame(sample_rows).toDF('text')

In [None]:
# Load the pretrained pipeline named 'explain_document_ml'
# (the recorded output below shows it being downloaded).
pipeline = PretrainedPipeline('explain_document_ml')

explain_document_ml download started this may take some time.
Approx size to download 9.2 MB
[OK!]


In [None]:
# Run the pipeline over the sample DataFrame; per the table shown further down,
# the result keeps 'text' and adds document, sentence, token, spell, lemmas,
# stems and pos columns.
result = pipeline.transform(data)

In [None]:
# Preview the annotated DataFrame; long cell values are cut off with "..." in this view.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|               spell|              lemmas|               stems|                 pos|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a goud p...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 4, Pe...|[{token, 0, 4, Pe...|[{token, 0, 4, Pe...|[{token, 0, 4, pe...|[{pos, 0, 4, NNP,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
from sparknlp.functions import *

In [None]:
from sparknlp.annotation import Annotation

def my_annoation_map_function(annotations):
    """Re-label every item of ``annotations`` as a ``'my_own_type'`` annotation.

    Each input item must expose ``begin``, ``end`` and ``result``; those three
    values are carried over unchanged. Every new ``Annotation`` is additionally
    given the fixed dict ``{'my_key': 'custom_annotation_data'}`` and an empty
    list as its last two constructor arguments.

    Returns:
        A new list holding one ``Annotation`` per input item, in input order.
    """
    return [
        Annotation(
            'my_own_type',
            item.begin,
            item.end,
            item.result,
            {'my_key': 'custom_annotation_data'},
            [],
        )
        for item in annotations
    ]

In [None]:
# map_annotations has to be told the type of the column it produces so that
# Spark knows what to expect; here that is an array of Annotation, obtained
# from Annotation.arrayType().
to_custom_annotations = map_annotations(my_annoation_map_function, Annotation.arrayType())

result.select(to_custom_annotations('token')).toDF("my output").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|my output                                                                                                                                                                                                                                                                                                                                                                                                          |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Annotations can also be exploded: each entry of 'lemmas.result' becomes its
# own row in a new 'exploded' column.
exploded_lemmas = explode_annotations_col(result, 'lemmas.result', 'exploded')
exploded_lemmas.select('exploded').show()

+--------+
|exploded|
+--------+
|   Peter|
|      be|
|       a|
|   gourd|
|  person|
|       .|
+--------+

