This notebook shows how to export spaCy tokens and sentences to Spark NLP using the `SpacyToAnnotation` component.

### Exporting spaCy Tokens/Sentences

In [None]:
import spacy
from spacy.lang.en import English



In [None]:
# Run the small English spaCy pipeline over a short, three-sentence sample.
nlp = spacy.load("en_core_web_sm")
text = "Hello world! How are you today? I'm fine thanks."
doc = nlp(text)

# Text of every token, in document order.
tokens = [tok.text for tok in doc]
# One flag per token: True when the token carries a non-empty `whitespace_`.
token_spaces = [tok.whitespace_ != "" for tok in doc]
# Document-level index of the last token of each sentence.
sentence_ends = [sentence.end - 1 for sentence in doc.sents]

Create a dictionary with the data and export it to a JSON file.

In [None]:
import json

# Integer (0/1) view of the whitespace flags.
# NOTE(review): `spaces` is not referenced by the payload below, which stores
# the boolean `token_spaces` -- confirm whether anything else still needs it.
spaces = list(map(int, token_spaces))

# One record describing the document processed above.
data = dict(
    tokens=tokens,
    token_spaces=token_spaces,
    sentence_ends=sentence_ends,
)

# The file holds a JSON array of document records (a single record here).
json_data = json.dumps([data])

with open("./multi_doc_tokens.json", mode="w") as outfile:
    outfile.write(json_data)

In [None]:
# Print the exported JSON file to check what was written.
! cat ./multi_doc_tokens.json

[{"tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."], "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false], "sentence_ends": [2, 7, 12]}]

### Importing spaCy Tokens/Sentences into Spark NLP

To import this JSON file of tokens and sentences as Spark NLP annotations, follow the procedure below:

In [None]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import SpacyToAnnotation

# Report the installed Spark NLP version so the run can be reproduced.
print("Spark NLP version", sparknlp.version())

In [None]:
# Start a session through Spark NLP's helper and keep the handle in `spark`;
# the bare `spark` expression displays the returned object as the cell output.
spark = sparknlp.start()
spark

In [None]:
# Show the version string reported by the running session.
spark.version

'3.2.1'

In [None]:
# Reader used below to load the exported spaCy JSON file into Spark NLP.
# SpacyToAnnotation is imported once, in the Spark NLP imports cell above,
# so it is not re-imported here.
nlp_reader = SpacyToAnnotation()

In [None]:
# Load the file exported earlier in this notebook; the result is bound to
# `result` and inspected in the following cells.
result = nlp_reader.readJsonFile(spark, "./multi_doc_tokens.json")

In [None]:
# Display the DataFrame repr, which lists each column with its type.
result

DataFrame[document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [None]:
# Print the nested schema of the document, sentence and token columns.
result.printSchema()

root
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (

In [None]:
# Print the rows with truncate=False so the full annotation contents are shown.
result.show(truncate=False)

+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                     |sentence                                                                                                    