In [None]:
import os
import sys
# Extend the module search path two directories up before the sparknlp imports
# below (presumably so a local checkout of the package is importable - confirm).
sys.path.append('../../')

# Print the version string of the interpreter running this notebook.
print(sys.version)

from sparknlp.pretrained import ResourceDownloader
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Start (or reuse) a local Spark session for this example; "local[1]" keeps
# it on a single worker thread.
spark = (
    SparkSession.builder
    .appName("downloader-example")
    .master("local[1]")
    .getOrCreate()
)

# Downloader instance shared by the cells that fetch pretrained resources.
downloader = ResourceDownloader()


In [None]:
# Mock corpus to experiment with: each row is a (docID, text) pair.
l = [(1, 'To be or not to be'), (2, 'This is it!')]

data = spark.createDataFrame(l, schema=['docID', 'text'])

In [None]:
# download directly - models
# Assemble the pipeline stages by hand and plug a downloaded pretrained model
# in as the last stage.

# Turn the raw "text" column into documents. The output column is named
# explicitly (instead of relying on the assembler's default) because the
# sentence detector below reads a column called "document".
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Split each document into sentences.
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

# Split each sentence into tokens.
tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# pos tagger: fetched through the downloader by name ("pos_fast") and
# language ("en"). No input/output columns are set here, so the model runs
# with whatever column configuration it was saved with.
pos = downloader.downloadModel(PerceptronModel, "pos_fast", "en")

pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, pos])

# Fit on the mock data and annotate the same rows.
output = pipeline.fit(data).transform(data)
output.show()


In [None]:
# download directly - pipeline models
# A whole pretrained pipeline is fetched in one call instead of being built
# stage by stage (simple pipeline with document assembler and tokenizer).
pipeline = downloader.downloadPipeline("pipeline_basic_fin", "en")

# The downloaded object is used for transform() directly, without a fit() step.
basic_fin_output = pipeline.transform(data)
basic_fin_output.show()

In [None]:
# Test clearCache for the same name/language pair that was downloaded above
# (presumably this drops the locally cached copy so that a later request
# downloads it again - confirm against the ResourceDownloader implementation).
downloader.clearCache("pipeline_basic_fin", "en")

In [None]:
# download predefined - pipelines
# BasicPipeline is a ready-made pipeline: no stages are assembled by hand here.
from sparknlp.pretrained.pipeline.en import BasicPipeline

# Annotate the "text" column of the mock DataFrame and show the result.
basic_data = BasicPipeline.annotate(data, "text")
basic_data.show()

In [None]:
# annotate quickly from a plain string (no DataFrame involved);
# as the last expression in the cell, the result is shown as the cell output
BasicPipeline().annotate("This world is made up of good and bad things")

In [None]:
# download predefined - models
# Fetch ready-made POS and NER models and run them on top of the output of
# the predefined basic pipeline.
from sparknlp.pretrained.model.en import *

# Part-of-speech model, wired to the "document" and "normal" columns.
pos = CloudPerceptronModel.retrieve()
(
    pos
    .setInputCols(["document", "normal"])
    .setOutputCol("pos")
)

# NER model; besides "normal" and "document" it consumes the "pos" column
# written by the tagger configured above.
ner = CloudNerCrfModel.retrieve()
(
    ner
    .setInputCols(["pos", "normal", "document"])
    .setOutputCol("ner")
)

# Produce the base annotations that both models read from.
annotation_pipeline = BasicPipeline.retrieve()
annotation_data = annotation_pipeline.transform(data)
annotation_data.show()

# Apply the models in dependency order: POS first, then NER on its output.
pos_tagged = pos.transform(annotation_data)
ner_tagged = ner.transform(pos_tagged)
ner_tagged.show()