

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SPELL_CHECKER_EN.ipynb)




# **Spell check your text documents**

## 1. Colab Setup

Install dependencies

In [1]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.1.2 spark-nlp

Import dependencies

In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

Start Spark Session

In [3]:
# Start the session through Spark NLP's helper; `spark` is used below to
# build the input DataFrames.
spark = sparknlp.start()

## 2. Select the spell checker model and construct the pipeline

In [4]:
# Stage 1: turn the raw "text" column into "document" annotations.
document_assembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Stage 2: tokenize, splitting the listed prefixes/suffixes off each word.
# The possessive suffix is given with both the ASCII apostrophe and the
# typographic one (U+2019); the previous value used a LEFT single quote
# (U+2018), which never occurs as a possessive marker and so never matched.
tokenizer = RecursiveTokenizer()\
  .setInputCols(["document"])\
  .setOutputCol("token")\
  .setPrefixes(["\"", "(", "[", "\n"])\
  .setSuffixes([".", ",", "?", ")", "!", "'s", "’s"])

# Stage 3: pretrained context-aware spell checker; reads "token" and writes
# the corrected tokens to "corrected".
spell_model = ContextSpellCheckerModel\
    .pretrained('spellcheck_dl')\
    .setInputCols("token")\
    .setOutputCol("corrected")

# Stage 4 (light pipeline only): Finisher applied to the "corrected" column.
finisher = Finisher().setInputCols("corrected")

# Pipeline that ends with the Finisher; wrapped in a LightPipeline below.
light_pipeline = Pipeline(stages = [
                                    document_assembler,
                                    tokenizer,
                                    spell_model,
                                    finisher
                                    ])
## For comparison: same stages without the Finisher, so the "token" and
## "corrected" annotation columns stay available on the output DataFrame.
full_pipeline = Pipeline(
    stages = [
              document_assembler,
              tokenizer,
              spell_model
  ])

# Both pipelines are fitted on a one-row DataFrame holding an empty string.
empty_ds = spark.createDataFrame([[""]]).toDF("text")
pipeline_model = full_pipeline.fit(empty_ds)
l_pipeline_model = LightPipeline(light_pipeline.fit(empty_ds))

spellcheck_dl download started this may take some time.
Approximate size to download 111.4 MB
[OK!]


## 3. Create example inputs

In [5]:
# Example sentences to spell check: one string per list item.
input_list = [
    "Plaese alliow me tao introdduce myhelf, I am a man of waelth und tiaste",
]

## 4. Use the pipeline to create outputs

Full Pipeline

In [6]:
# Put the examples in a single-column ("text") pandas frame, hand it to Spark,
# then run the fitted full pipeline over it.
examples_pdf = pd.DataFrame({"text": input_list})
df = spark.createDataFrame(examples_pdf)
result = pipeline_model.transform(df)

Light Pipeline

In [7]:
# annotate() is called on one raw string here: the first entry of input_list.
first_example = input_list[0]
light_result = l_pipeline_model.annotate(first_example)

## 5. Visualize results

Visualize comparison as dataframe

In [8]:
# Zip the original and corrected token arrays, explode to one row per token
# pair, then project the two struct fields as named columns and print them.
token_pairs = F.explode(F.arrays_zip('token.result', 'corrected.result')).alias("cols")
comparison = result.select(token_pairs).select(
    F.expr("cols['0']").alias("original"),
    F.expr("cols['1']").alias("corrected"),
)
comparison.show(truncate=False)

+----------+---------+
|original  |corrected|
+----------+---------+
|Plaese    |Please   |
|alliow    |allow    |
|me        |me       |
|tao       |to       |
|introdduce|introduce|
|myhelf    |myself   |
|,         |,        |
|I         |I        |
|am        |am       |
|a         |a        |
|man       |man      |
|of        |of       |
|waelth    |wealth   |
|und       |und      |
|tiaste    |taste    |
+----------+---------+



Visualize light pipeline and finished result

In [9]:
# This finished result does not need parsing and can be used directly in any other task.
light_result['corrected']

['Please',
 'allow',
 'me',
 'to',
 'introduce',
 'myself',
 ',',
 'I',
 'am',
 'a',
 'man',
 'of',
 'wealth',
 'und',
 'taste']