In [1]:
import os
import sys
sys.path.append('../../')

print(sys.version)

from sparknlp.pretrained import ResourceDownloader
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector

from pyspark.sql import SparkSession

# Local Spark session (master "local[1]") shared by every cell below.
spark = (
    SparkSession.builder
    .appName("downloader-example")
    .master("local[1]")
    .getOrCreate()
)

# Downloader used by the "download directly" cells.
downloader = ResourceDownloader()


3.6.1 |Continuum Analytics, Inc.| (default, May 11 2017, 13:09:58) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
# Mock input to play with: one (docID, text) tuple per row.
l = [(1, 'To be or not to be'), (2, 'This is it!')]

# Column names are given in the same order as the tuple fields.
data = spark.createDataFrame(l, schema=['docID', 'text'])

In [4]:
# Download individual models by name and apply them one after another.

# Stage 1: the document assembler adds the `document` column.
assembler = downloader.downloadModel(
    DocumentAssembler.reader, "document_std", "en"
)
output = assembler.transform(data)
output.show()

# Stage 2: the sentence detector runs on the assembler output and adds
# the `sentence` column.
detector = downloader.downloadModel(
    SentenceDetector.reader, "sentence_std", "en"
)
detector.transform(output).show()


+-----+------------------+--------------------+
|docID|              text|            document|
+-----+------------------+--------------------+
|    1|To be or not to be|[[document,0,17,T...|
|    2|       This is it!|[[document,0,10,T...|
+-----+------------------+--------------------+

+-----+------------------+--------------------+--------------------+
|docID|              text|            document|            sentence|
+-----+------------------+--------------------+--------------------+
|    1|To be or not to be|[[document,0,17,T...|[[document,0,17,T...|
|    2|       This is it!|[[document,0,10,T...|[[document,0,10,T...|
+-----+------------------+--------------------+--------------------+



In [3]:
# Download a whole pipeline model in a single call.

# "pipeline_std": simple pipeline with document assembler and tokenizer.
pipeline = downloader.downloadPipeline("pipeline_std", "en")
tokenized = pipeline.transform(data)
tokenized.show()

+-----+------------------+--------------------+--------------------+
|docID|              text|            document|               token|
+-----+------------------+--------------------+--------------------+
|    1|To be or not to be|[[document,0,17,T...|[[token,0,1,To,Ma...|
|    2|       This is it!|[[document,0,10,T...|[[token,0,3,This,...|
+-----+------------------+--------------------+--------------------+



In [3]:
# download predefined - models
#
# The import is aliased on purpose: a bare `DocumentAssembler` here would rebind
# the class imported from sparknlp.base in the first cell, which the
# "download directly - models" cell still needs (it reads
# `DocumentAssembler.reader`). Without the alias, that cell's behavior depends
# on which cell was executed last.
from pretrained.en.models import DocumentAssembler as PretrainedDocumentAssembler

# `std()` returns the predefined assembler; applying it to `data` adds the
# `document` column.
assembler_std = PretrainedDocumentAssembler.std()
assembler_std.transform(data).show()

+-----+------------------+--------------------+
|docID|              text|            document|
+-----+------------------+--------------------+
|    1|To be or not to be|[[document,0,17,T...|
|    2|       This is it!|[[document,0,10,T...|
+-----+------------------+--------------------+



In [5]:
# download predefined - pipelines models
#
# `SentenceDetector` is aliased on purpose: the bare name would rebind the
# annotator class imported from sparknlp.annotator in the first cell, which the
# "download directly - models" cell still needs (it reads
# `SentenceDetector.reader`). `Ner` and `Pos` do not collide with any earlier
# import in this notebook, so they keep their names.
from pretrained.en.pipelines import SentenceDetector as SentenceDetectorPipeline, Ner, Pos

# sentence detector
sent_detector_std = SentenceDetectorPipeline.std()
sent_detector_std.transform(data).show()

# part of speech
# NOTE(review): in the saved run this step failed with IllegalArgumentException
# ("Was not able to download: pos_fast for language: Some(en) ..."), so nothing
# below it produced output. Confirm that `pos_fast` is published for the
# installed library / Spark versions before relying on this cell.
pos_std = Pos.fast()
pos_tagged = pos_std.transform(data)
pos_tagged.show()

# named entity recognition
ner_std = Ner.fast()
ner_std.transform(data).show()


+-----+------------------+--------------------+--------------------+
|docID|              text|            document|            sentence|
+-----+------------------+--------------------+--------------------+
|    1|To be or not to be|[[document,0,17,T...|[[document,0,17,T...|
|    2|       This is it!|[[document,0,10,T...|[[document,0,10,T...|
+-----+------------------+--------------------+--------------------+



IllegalArgumentException: 'requirement failed: Was not able to download: pos_fast for language: Some(en), with libVersion: 1.4.0 and spark version: 2.1.2 with downlader: com.johnsnowlabs.pretrained.S3ResourceDownloader@71259452'