In [None]:

pip install pyspark==3.2.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.2.1
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 58.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853643 sha256=84cb1fe201501a847e5f854cd94ce277e73c860376a102a35ad29f3a92a00562
  Stored in directory: /root/.cache/pip/wheels/58/94/83/915c9059e4b038e2d43a6058f307fe1c3e8536e5745f3b23b7
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [None]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

In [None]:
spark = sparknlp.start()

In [None]:
data = spark.createDataFrame([['Peter is a good person living in Italy.Paula is also good person.She lives in London ']]).toDF('text')

In [None]:
data.show(truncate=False)

+-------------------------------------------------------------------------------------+
|text                                                                                 |
+-------------------------------------------------------------------------------------+
|Peter is a good person living in Italy.Paula is also good person.She lives in London |
+-------------------------------------------------------------------------------------+



**DocumentAssembler : Read String Column and create Annotation Columns**

In [None]:
#Document Assember controls cleaning up of Source text and handling of special chatacters 
# and new lines througn Cleanup Mode

In [None]:
document=DocumentAssembler().setInputCol('text').setOutputCol('document').setCleanupMode('shrink')

In [None]:
# SentenceDetector: Splits Sentences in meaningful way
#Sentence Detector explode sentences improves parallelism in large text concentration

sentence= SentenceDetector().setInputCols('document').setOutputCol('sentence')

In [None]:
sentence.setExplodeSentences(True)

SentenceDetector_01fcf4c256b6

**Tokenizer**

In [None]:
tokenizer=Tokenizer().setInputCols('sentence').setOutputCol('token')

In [None]:
tokenizer.setExceptions(['e-mail'])

Tokenizer_422c2c3cafbe

**Spell Checker**

In [None]:
checker = NorvigSweetingModel.pretrained().setInputCols(['token']).setOutputCol('checked')

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


**Embedding**

In [None]:
embeddings = WordEmbeddingsModel.pretrained().setInputCols(['sentence','token']).setOutputCol('embeddings')

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


**Named Entity Recognition**

In [None]:
ner = NerDLModel.pretrained().setInputCols(['sentence','checked','embeddings']).setOutputCol('ner')

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [None]:
converter = NerConverter().setInputCols(['sentence','checked','ner']).setOutputCol('chunk')

**PipeLine**

In [None]:
from pyspark.ml import Pipeline

In [None]:
pipeline=Pipeline().setStages([document,sentence,tokenizer,embeddings,checker,ner,converter])

In [None]:
model = pipeline.fit(data)

In [None]:
result = model.transform(data)

In [None]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|             checked|                 ner|               chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a good p...|[{document, 0, 83...|[{document, 0, 83...|[{token, 0, 4, Pe...|[{word_embeddings...|[{token, 0, 4, Pe...|[{named_entity, 0...|[{chunk, 0, 4, Pe...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
# Analyze Pipline

result.select('sentence.result').show(truncate=False)

+--------------------------------------------------------------------------------------+
|result                                                                                |
+--------------------------------------------------------------------------------------+
|[Peter is a good person living in Italy.Paula is also good person.She lives in London]|
+--------------------------------------------------------------------------------------+



In [None]:
result.select('checked.result').show(truncate=False)

+----------------------------------------------------------------------------------------------------+
|result                                                                                              |
+----------------------------------------------------------------------------------------------------+
|[Peter, is, a, good, person, living, in, Italy.Paula, is, also, good, person.She, lives, in, London]|
+----------------------------------------------------------------------------------------------------+



In [None]:
result.select('ner.result').show(truncate=False)

+---------------------------------------------------------+
|result                                                   |
+---------------------------------------------------------+
|[B-PER, O, O, O, O, O, O, B-LOC, O, O, O, O, O, O, B-LOC]|
+---------------------------------------------------------+



In [None]:
result.select('chunk.result','chunk.begin','chunk.end').show(truncate=False)

+----------------------------+-----------+-----------+
|result                      |begin      |end        |
+----------------------------+-----------+-----------+
|[Peter, Italy.Paula, London]|[0, 33, 78]|[4, 43, 83]|
+----------------------------+-----------+-----------+



In [None]:
light=LightPipeline(model)

In [None]:
light.annotate('Bruno is living in Italy, and he is doing well.')

{'chunk': ['Bruno', 'Italy'],
 'checked': ['Bruno',
  'is',
  'living',
  'in',
  'Italy',
  ',',
  'and',
  'he',
  'is',
  'doing',
  'well',
  '.'],
 'document': ['Bruno is living in Italy, and he is doing well.'],
 'token': ['Bruno',
  'is',
  'living',
  'in',
  'Italy',
  ',',
  'and',
  'he',
  'is',
  'doing',
  'well',
  '.'],
 'ner': ['B-PER', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'embeddings': ['Bruno',
  'is',
  'living',
  'in',
  'Italy',
  ',',
  'and',
  'he',
  'is',
  'doing',
  'well',
  '.'],
 'sentence': ['Bruno is living in Italy, and he is doing well.']}