![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

<H1>Context Based Clinical Spell Checker</H1>

In [0]:
import os
import json
import string
import numpy as np
import pandas as pd

import sparknlp
import sparknlp_jsl
from sparknlp.util import *
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel


# Pandas display settings: do not wrap wide frames across several blocks,
# show every column, and limit the printed width of a column to 100 characters.
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_columns", None)
pd.set_option("max_colwidth", 100)

# Report the library versions this notebook is running against.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# NOTE(review): `spark` is not created anywhere in this notebook; presumably
# the hosting runtime injects the session. The bare expression displays it
# as the cell output.
spark

In [0]:
# Stage 1: read the raw "text" column and expose it as the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: tokenize "document" into "token", configured with explicit lists of
# prefixes and suffixes (quotes, brackets, punctuation, possessive 's).
tokenizer = (
    RecursiveTokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
    .setPrefixes(["\"", "(", "[", "\n"])
    .setSuffixes([".", ",", "?", ")", "!", "'s"])
)

# Stage 3: pretrained clinical context spell checker; reads "token" and writes
# its result to "checked".
spellModel = (
    ContextSpellCheckerModel.pretrained("spellcheck_clinical", "en", "clinical/models")
    .setInputCols("token")
    .setOutputCol("checked")
)

In [0]:
# Chain the three stages defined above, in order, into one Spark ML pipeline.
pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellModel])

# One-row DataFrame with an empty "text" value, used only as the input to fit().
empty_ds = spark.createDataFrame([[""]]).toDF("text")

# Fit on the placeholder frame and wrap the resulting model in a LightPipeline;
# later cells call lp.annotate() on plain Python strings.
lp = LightPipeline(pipeline.fit(empty_ds))

OK! At this point we have our spell-checking pipeline set up as expected. Let's see what we can do with it. Consider the following sentences, in which the misspelled words are shown in bold:



_She was **treathed** with a five day course of **amoxicilin** for a **resperatory** **truct** infection._

_With pain well controlled on **orall** **meditation**, she was discharged to **reihabilitation** **facilitay**._


_Her **adominal** examination is soft, nontender, and **nonintended**_

_The patient was seen by the **entocrinology** service and she was discharged on 40 units of **unsilin** glargine at night_
      
_No **cute** distress_


Note that some of these errors are themselves valid English words (for example, _meditation_ and _cute_); the right correction can only be chosen by taking the surrounding context into account.

In [0]:
# Sample sentences containing the misspellings discussed above.
example = [
    "She was treathed with a five day course of amoxicilin for a resperatory truct infection . ",
    "With pain well controlled on orall meditation, she was discharged to reihabilitation facilitay.",
    "Her adominal examination is soft, nontender, and nonintended.",
    "The patient was seen by the entocrinology service and she was discharged on 40 units of unsilin glargine at night",
    "No cute distress",
]

# For each sentence, print the (original token, checked token) pairs side by side.
for pairs in lp.annotate(example):
    print(list(zip(pairs["token"], pairs["checked"])))

In [0]:
print("Corrected tokens:\n")

# One list of (original token, checked token) tuples per example sentence.
pair_list = [
    list(zip(pairs["token"], pairs["checked"]))
    for pairs in lp.annotate(example)
]

# Flatten across sentences, keeping only the pairs where the checker changed the token.
corrected_list = [
    (original, checked)
    for sentence_pairs in pair_list
    for original, checked in sentence_pairs
    if original != checked
]
corrected_list

## spellcheck_drug_norvig

In [0]:
# NOTE: this cell rebinds documentAssembler, tokenizer, pipeline, empty_ds and lp,
# replacing the objects built for the context spell checker earlier in the
# notebook. Re-run the earlier cells before going back to that section.

# Stage 1: read the raw "text" column and expose it as the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: tokenize "document" into "token" with the default Tokenizer settings.
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Stage 3: pretrained Norvig-Sweeting spell checker for drug names; reads "token"
# and writes its result to "corrected_token".
# The chain is parenthesised instead of using backslash continuations: the
# original ended with a dangling "\" that only parsed because the next line
# happened to be blank.
spell = (
    NorvigSweetingModel.pretrained("spellcheck_drug_norvig", "en", "clinical/models")
    .setInputCols("token")
    .setOutputCol("corrected_token")
)

# Chain the three stages, in order, into one Spark ML pipeline.
pipeline = Pipeline(stages=[documentAssembler, tokenizer, spell])

# One-row DataFrame with an empty "text" value, used only as the input to fit().
empty_ds = spark.createDataFrame([[""]]).toDF("text")

# Fit on the placeholder frame and wrap the resulting model in a LightPipeline;
# later cells call lp.annotate() on plain Python strings.
lp = LightPipeline(pipeline.fit(empty_ds))

In [0]:
# Sample sentence containing misspelled drug names.
example = [
    "You have to take Amrosia artemisiifoli , Oactra and a bit of Grastk and lastacaf ",
]

# For each sentence, print the (original token, corrected token) pairs side by side.
for pairs in lp.annotate(example):
    print(list(zip(pairs["token"], pairs["corrected_token"])))

In [0]:
print("Corrected tokens:\n")

# One list of (original token, corrected token) tuples per example sentence.
pair_list = [
    list(zip(pairs["token"], pairs["corrected_token"]))
    for pairs in lp.annotate(example)
]

# Flatten across sentences, keeping only the pairs where the spell checker changed the token.
corrected_list = [
    (original, corrected)
    for sentence_pairs in pair_list
    for original, corrected in sentence_pairs
    if original != corrected
]
corrected_list

End of Notebook #