<img src="https://nlp.johnsnowlabs.com/assets/images/logo.png" width="180" height="50" style="float: left;">

In [8]:
import sys

#Spark ML and SQL
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import array_contains
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
#Spark NLP
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher

In [2]:
# Start the Spark session through Spark NLP's start() helper; the cells below use it as `spark`.
spark = sparknlp.start()

In [9]:
# Two sample sentences, loaded into a single-column DataFrame whose column is named "text".
sample_sentences = [
    "Yesterday I lost my blue unikorn .",
    "Through a note of introduction from Bettina.",
]
data = spark.createDataFrame(sample_sentences, StringType()).toDF("text")

In [11]:
  # Stage 1: read the raw "text" column and emit it as the "document" column.
  documentAssembler = (
      DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")
  )

  # Stage 2: tokenize the "document" column into the "token" column.
  tokenizer = (
      Tokenizer()
      .setInputCols(["document"])
      .setOutputCol("token")
  )

  # Stage 3: pretrained context spell checker, reading "token" and writing "spell_checked".
  # pretrained() is called on the class itself: it returns the loaded model, so there is
  # no need to construct an empty ContextSpellCheckerModel() first.
  # NOTE(review): with no arguments this presumably loads the library's default model,
  # which likely needs network access or a local model cache — confirm.
  ocrspellModel = (
      ContextSpellCheckerModel.pretrained()
      .setInputCols(["token"])
      .setOutputCol("spell_checked")
      .setTradeoff(10.0)
  )

  # Stage 4: finish the "spell_checked" annotations; the result is selected later
  # as the "finished_spell_checked" column.
  finisher = (
      Finisher()
      .setInputCols(["spell_checked"])
      .setValueSplitSymbol("@")
  )


In [12]:
# Chain the stages in the order their columns feed each other:
# text -> document -> token -> spell_checked -> finished_spell_checked.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    ocrspellModel,
    finisher,
])

# One-row placeholder used only to call fit(). Its column is named "text" — the
# DocumentAssembler's input column — so any stage that has to be fitted or run
# while fit() walks the pipeline can resolve its input. Built directly with
# createDataFrame; a single row needs neither an RDD round trip nor caching.
empty = spark.createDataFrame([["empty"]]).toDF("text")

pipeline_model = pipeline.fit(empty)

# Run the fitted pipeline on the sample sentences and print the corrected tokens in full.
pipeline_model.transform(data).select("finished_spell_checked").show(truncate=False)


+------------------------------------------------------+
|finished_spell_checked                                |
+------------------------------------------------------+
|[Yesterday, ,, lost, my, blue, uniform, .]            |
|[Through, a, note, of, introduction, from, Bettina, .]|
+------------------------------------------------------+

