In [1]:
import os
import sys
sys.path.append('../../')

print(sys.version)

from sparknlp.downloader import ResourceDownloader
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Build (or reuse, via getOrCreate) the Spark session for these examples.
# "local[1]" runs Spark locally with a single worker thread.
spark = (
    SparkSession.builder
    .appName("downloader-example")
    .master("local[1]")
    .getOrCreate()
)

# Downloader instance used by the cells below to fetch models and pipelines.
downloader = ResourceDownloader()


3.6.4 (v3.6.4:d48eceb, Dec 19 2017, 06:04:45) [MSC v.1900 32 bit (Intel)]


In [2]:
# Create a small mock DataFrame to play with: one (docID, text) row per document.
# The list is named `rows` rather than `l`, which is easily confused with `1`/`I`.
rows = [
    (1, 'To be or not to be'),
    (2, 'This is it!'),
]
columns = ['docID', 'text']

data = spark.createDataFrame(rows, columns)

In [3]:
# Download a model directly and use it as one stage of a regular Pipeline.

# Reads the raw "text" column (the output below shows the result in "document").
document_assembler = DocumentAssembler().setInputCol("text")

# document -> sentence
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# sentence -> token
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# POS tagger: the "pos_fast" English model, fetched through the downloader.
pos = downloader.downloadModel(PerceptronModel, "pos_fast", "en")

stages = [document_assembler, sentence_detector, tokenizer, pos]
pipeline = Pipeline(stages=stages)

# Fit the pipeline on the mock data, then annotate that same data.
fitted_pipeline = pipeline.fit(data)
output = fitted_pipeline.transform(data)
output.show()


+-----+------------------+--------------------+--------------------+--------------------+--------------------+
|docID|              text|            document|            sentence|               token|                 pos|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+
|    1|To be or not to be|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 1, To...|[[pos, 0, 1, TO, ...|
|    2|       This is it!|[[document, 0, 10...|[[document, 0, 10...|[[token, 0, 3, Th...|[[pos, 0, 3, DT, ...|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+



In [4]:
# Download a complete pipeline model directly.

# "pipeline_basic" (English). As the output below shows, it produces the
# finished document, token, normal, lemma and pos columns.
pipeline = downloader.downloadPipeline("pipeline_basic", "en")
pipeline_output = pipeline.transform(data)
pipeline_output.show()

+-----+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|docID|              text|   finished_document|      finished_token|     finished_normal|      finished_lemma|        finished_pos|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    1|To be or not to be|[To be or not to be]|[To, be, or, not,...|[to, be, or, not,...|[To, be, or, not,...|[TO, VB, CC, RB, ...|
|    2|       This is it!|       [This is it!]|   [This, is, it, !]|      [this, is, it]|   [This, be, it, !]|      [DT, VBZ, PRP]|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [5]:
# Test clearCache on the same resource the previous cell downloaded
# ("pipeline_basic", "en").
# NOTE(review): presumably this removes the locally cached copy so the next
# request downloads it again — confirm against the downloader implementation.
downloader.clearCache("pipeline_basic", "en")

In [6]:
# Download predefined pipelines through their ready-made wrapper classes.
from sparknlp.downloader.pipeline.en import BasicPipeline

# annotate() is given the DataFrame and the name of its text column.
basic_pipeline = BasicPipeline()
basic_data = basic_pipeline.annotate(data, "text")
basic_data.show()

+-----+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|docID|              text|   finished_document|      finished_token|     finished_normal|      finished_lemma|        finished_pos|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    1|To be or not to be|[To be or not to be]|[To, be, or, not,...|[to, be, or, not,...|[To, be, or, not,...|[TO, VB, CC, RB, ...|
|    2|       This is it!|       [This is it!]|   [This, is, it, !]|      [this, is, it]|   [This, be, it, !]|      [DT, VBZ, PRP]|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [8]:
# download predefined - models
# NOTE(review): per the traceback recorded below, retrieve() fails here with
# java.net.URISyntaxException ("Illegal character in path at index 16"): the
# cache path "cache_pretrained\ner_fast_en_..." contains a backslash, which
# java.net.URI rejects while the model's embeddings are being deserialized.
# The interpreter banner in the first cell shows an MSC (Windows) build, so
# this looks Windows-specific — confirm on another OS.
from sparknlp.downloader.model.en import *
ner = CloudNerCrfModel.retrieve()
# NOTE(review): the "finished_*" columns shown in the earlier outputs are plain
# string arrays, unlike the annotation columns ("token", "pos", ...) of the
# hand-built pipeline — verify these are the inputs the NER model expects.
ner.setInputCols("finished_pos", "finished_token", "finished_document")

Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.downloader.PythonResourceDownloader.downloadModel.
: java.net.URISyntaxException: Illegal character in path at index 16: cache_pretrained\ner_fast_en_1.5_2_1522103155443
	at java.net.URI$Parser.fail(URI.java:2848)
	at java.net.URI$Parser.checkChars(URI.java:3021)
	at java.net.URI$Parser.parseHierarchical(URI.java:3105)
	at java.net.URI$Parser.parse(URI.java:3063)
	at java.net.URI.<init>(URI.java:588)
	at com.johnsnowlabs.nlp.HasWordEmbeddings$class.deserializeEmbeddings(HasWordEmbeddings.scala:52)
	at com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfModel.deserializeEmbeddings(NerCrfModel.scala:18)
	at com.johnsnowlabs.nlp.embeddings.EmbeddingsReadable$class.onRead(EmbeddingsReadable.scala:8)
	at com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfModel$.onRead(NerCrfModel.scala:78)
	at com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfModel$.onRead(NerCrfModel.scala:78)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable$$anonfun$read$1.apply(ParamsAndFeaturesReadable.scala:29)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable$$anonfun$read$1.apply(ParamsAndFeaturesReadable.scala:29)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:17)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:6)
	at com.johnsnowlabs.downloader.ResourceDownloader$.downloadModel(ResourceDownloader.scala:79)
	at com.johnsnowlabs.downloader.PythonResourceDownloader$.downloadModel(ResourceDownloader.scala:133)
	at com.johnsnowlabs.downloader.PythonResourceDownloader.downloadModel(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Download predefined pipeline models and run each of them on the mock data.
# NOTE(review): the other cells import from sparknlp.downloader.*; confirm that
# this sparknlp.pretrained path exists in the installed version.
from sparknlp.pretrained.en.pipelines import *

# Part-of-speech tagging.
pos_std = CloudPOSPipeline.retrieveSmall()
pos_tagged = pos_std.transform(data)
pos_tagged.show()

# Named entity recognition.
ner_std = CloudNerCrfPipeline.retrieveSmall()
ner_tagged = ner_std.transform(data)
ner_tagged.show()
