<img src="https://nlp.johnsnowlabs.com/assets/images/logo.png" width="180" height="50" style="float: left;">

In [1]:
import sys
sys.path.append('../../')

#Spark ML and SQL
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import array_contains
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
#Spark NLP
from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher, LightPipeline

In [None]:
# Create (or reuse) a Spark session running on all local cores. The Spark NLP
# jar at /tmp/sparknlp.jar is registered both as a session jar and on the
# driver/executor classpaths.
spark = (
    SparkSession.builder
    .appName("Context_Spell_Checking")
    .master("local[*]")
    # Driver-side memory limits.
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "2G")
    # Location of the Spark NLP jar.
    .config("spark.jars", "/tmp/sparknlp.jar")
    .config("spark.driver.extraClassPath", "/tmp/sparknlp.jar")
    .config("spark.executor.extraClassPath", "/tmp/sparknlp.jar")
    # Upper bound for the Kryo serialization buffer.
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

In [None]:
# Sample input for the spell checker: whitespace-separated sentences that
# contain misspelled words (e.g. "unikorn", "gane", "fether").
data = [
    "Yesterday I lost my blue unikorn .",
    "he is gane .",
    "Through a note of introduction from Bettina he had meet her mother and fether .",
]

In [None]:
  # Stage 1: wrap the raw "text" column into a "document" annotation column.
  documentAssembler = (
      DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")
  )

  # Stage 2: split each document into tokens.
  tokenizer = (
      Tokenizer()
      .setInputCols(["document"])
      .setOutputCol("token")
  )

  # Stage 3: context-aware spell checker. `pretrained()` is a static loader, so
  # it is called on the class directly instead of on a throwaway instance; with
  # no arguments it loads Spark NLP's default pretrained model for this annotator.
  ocrspellModel = (
      ContextSpellCheckerModel.pretrained()
      .setInputCols(["token"])
      .setOutputCol("spell_checked")
  )

  # Stage 4: turn the "spell_checked" annotations into plain output values,
  # using "@" as the value split symbol.
  finisher = (
      Finisher()
      .setInputCols(["spell_checked"])
      .setValueSplitSymbol("@")
  )


In [None]:
# Chain the stages in dependency order:
# text -> document -> token -> spell_checked -> finished output.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    ocrspellModel,
    finisher,
])

# Pipeline.fit() requires a Spark DataFrame, not a Python list. Wrap the sample
# sentences in a single-column DataFrame whose column name ("text") matches the
# DocumentAssembler's input column.
text_df = spark.createDataFrame([(sentence,) for sentence in data], ["text"])

pipeline_model = pipeline.fit(text_df)
# To inspect the DataFrame-based output, run:
#   pipeline_model.transform(text_df).select("finished_spell_checked").show(truncate=False)


In [None]:
# Wrap the fitted pipeline in a LightPipeline (imported in the notebook's top
# import cell) so it can be applied directly to the plain Python strings in
# `data`; the result of annotate() is displayed as the cell output.
lp = LightPipeline(pipeline_model)
lp.annotate(data)