In [5]:
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *


In [6]:
# Build a tiny, single-column DataFrame of sample sentences that contain
# misspelled words ("unikorn", "gane") for the spell checker to process.
# NOTE(review): `spark` is not defined in any visible cell; presumably an
# active SparkSession provided by the kernel or an earlier cell — confirm,
# otherwise this cell fails on a fresh kernel.
data = spark.sparkContext.parallelize([
    ["Yesterday I lost my blue unikorn ."],
    ["he is gane ."],
]).toDF(["region"]).cache()  # name the column in one step instead of toDF().toDF(...)

In [7]:
  # Stage 1: read the raw strings from the "region" column and expose them
  # as annotations in the "text" column.
  documentAssembler = (
      DocumentAssembler()
      .setInputCol("region")
      .setOutputCol("text")
  )

  # Stage 2: split the "text" annotations into tokens ("token" column).
  tokenizer = (
      Tokenizer()
      .setInputCols(["text"])
      .setOutputCol("token")
  )

  # Stage 3: context-aware spell checker. `pretrained()` is called on the
  # class itself, so no throwaway annotator instance is constructed first;
  # with no arguments it loads the library's default pretrained model.
  # The tradeoff parameter is set to 10.0 (see the Spark NLP docs for its
  # exact meaning).
  ocrspellModel = (
      ContextSpellCheckerModel.pretrained()
      .setInputCols(["token"])
      .setOutputCol("spell_checked")
      .setTradeoff(10.0)
  )

  # Stage 4: turn the "spell_checked" annotations into plain values; the
  # result is later selected as "finished_spell_checked".
  # NOTE(review): the recorded output is "@"-separated, so the value split
  # symbol set here does not appear to control the token separator —
  # setAnnotationSplitSymbol(" ") may have been intended; confirm.
  finisher = (
      Finisher()
      .setInputCols(["spell_checked"])
      .setValueSplitSymbol(" ")
  )


In [8]:
# Assemble the stages in execution order: document assembly, tokenization,
# spell checking, then finishing.
spell_check_stages = [documentAssembler, tokenizer, ocrspellModel, finisher]
pipeline = Pipeline(stages=spell_check_stages)

# Fit the pipeline on the sample data, then apply the fitted model to the
# same DataFrame.
fitted_pipeline = pipeline.fit(data)
checked_data = fitted_pipeline.transform(data)

# Display only the finisher's output column, without truncating long values.
checked_data.select("finished_spell_checked").show(truncate=False)


+----------------------------------+
|finished_spell_checked            |
+----------------------------------+
|Yesterday@,@lost@my@blue@uniform@.|
|he@is@gone@.                      |
+----------------------------------+

