In [None]:
import os
import sys
# Add the directory two levels up (relative to the working directory) to the module search path.
# NOTE(review): presumably this makes the locally checked-out packages importable — confirm.
sys.path.append('../../')

# Print the version of the Python interpreter running this notebook.
print(sys.version)

from sparknlp.pretrained import ResourceDownloader
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector

from pyspark.sql import SparkSession

# Get (or reuse) a Spark session named "downloader-example" with master "local[1]".
spark = (
    SparkSession.builder
    .appName("downloader-example")
    .master("local[1]")
    .getOrCreate()
)

# Downloader instance used by the "download directly" examples below.
downloader = ResourceDownloader()


In [None]:
# Small in-memory dataset of (docID, text) rows that the examples below transform.
mock_rows = [
    (1, 'To be or not to be'),
    (2, 'This is it!'),
]
column_names = ['docID', 'text']

data = spark.createDataFrame(mock_rows, column_names)

In [None]:
# Download individual models by name through the downloader and apply them in turn.

# Step 1: fetch the "document_std" document assembler and run it on the mock data.
assembler = downloader.downloadModel(
    DocumentAssembler.reader,
    "document_std",
    "en",
)
output = assembler.transform(data)
output.show()

# Step 2: fetch the "sentence_std" sentence detector and run it on the assembler output.
detector = downloader.downloadModel(
    SentenceDetector.reader,
    "sentence_std",
    "en",
)
detector.transform(output).show()


In [None]:
# Download a whole pipeline model by name through the downloader.

# "pipeline_std": a simple pipeline with a document assembler and a tokenizer.
pipeline = downloader.downloadPipeline(
    "pipeline_std",
    "en",
)
pipeline.transform(data).show()

In [None]:
# download predefined - models
#
# DocumentAssembler and SentenceDetector are imported under aliases on purpose:
# the first cell already binds those names to the classes from sparknlp.base and
# sparknlp.annotator, and the "download directly - models" cell reads
# DocumentAssembler.reader / SentenceDetector.reader from them. Rebinding the names
# here would make that earlier cell pick up these predefined wrappers instead if it
# were re-run after this one.
from pretrained.en.models import (
    DocumentAssembler as PretrainedDocumentAssembler,
    SentenceDetector as PretrainedSentenceDetector,
    Tokenizer,
    Pos,
    Ner,
)

# Fetch the predefined standard document assembler and apply it to the mock data.
assembler_std = PretrainedDocumentAssembler.std()
assembler_std.transform(data).show()

# The remaining predefined models are only requested here; their results are not
# assigned, so the cell output is the value of the last call.
PretrainedSentenceDetector.std()
Tokenizer.std()
Pos.fast()
Ner.fast()

In [None]:
# download predefined - pipelines models
#
# SentenceDetector is imported under an alias on purpose: the first cell binds that
# name to the class from sparknlp.annotator, which the "download directly - models"
# cell relies on (SentenceDetector.reader). Rebinding it here would change what that
# earlier cell uses if it were re-run after this one.
from pretrained.en.pipelines import SentenceDetector as SentenceDetectorPipeline, Ner, Pos

# sentence detector: fetch the predefined pipeline and apply it to the mock data
sent_detector_std = SentenceDetectorPipeline.std()
sent_detector_std.transform(data).show()

# part of speech: keep the tagged frame in a variable before displaying it
pos_std = Pos.fast()
pos_tagged = pos_std.transform(data)
pos_tagged.show()

# named entity recognition
ner_std = Ner.fast()
ner_std.transform(data).show()
