In [1]:
import pyspark
import sparknlp

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [2]:
# Build (or reuse) the Spark session for this notebook. The Spark NLP jars
# are supplied explicitly through `spark.jars` as a comma-separated list.
spark = (
    SparkSession.builder
    .appName("Spark NLP Licensed")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    # Kryo serialization, with the maximum buffer size raised to 2000M.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars", "/home/i/projects/jsl/spark-nlp-internal/python/lib/sparknlp-jsl.jar,/home/i/IdeaProjects/spark-nlp/python/lib/sparknlp.jar")
    .getOrCreate()
)

In [3]:
# Stage 1: read the "text" column and emit it as the "document" column.
documenter = (
    sparknlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: sentence detector loaded from a local directory; consumes
# "document" and writes "sentences".
sentencerDL = (
    sparknlp.annotator.SentenceDetectorDLModel
    .load("/home/i/cache_pretrained/sentence_detector_dl_healthcare_en_2.6.0_2.4_1600001082565")
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# Stage 3: sentence-embedding model loaded from a local directory; consumes
# "sentences" and writes "sentence_embeddings".
sentEmbedder = (
    sparknlp.annotator.BertSentenceEmbeddings
    .load("/models/sbiobert")
    .setInputCols(["sentences"])
    .setOutputCol("sentence_embeddings")
)

# Chain the three stages in order: document -> sentences -> embeddings.
pipeline = Pipeline(stages=[documenter, sentencerDL, sentEmbedder])

In [6]:
# Three small example rows: an integer id and a free-text field.
# Row 3 repeats the sentence from row 1 and adds a variant ending in "apples_".
data = spark.createDataFrame(
    data=[
        (1, "John loves apples."),
        (2, "Mary loves oranges. John loves Mary."),
        (3, "John loves apples. John loves apples_."),
    ],
    schema=["id", "text"],
)

In [7]:
# Fit the pipeline on the example rows, then apply the fitted model to the
# same rows to obtain the annotated DataFrame.
results = (
    pipeline
    .fit(data)
    .transform(data)
)

In [18]:
# For every input row, print each sentence followed by the first five
# components of its embedding; rows are separated by blank lines.
for row in results.select("sentence_embeddings").collect():
    for annotation in row.sentence_embeddings:
        leading_values = ["{:>10.3f}".format(value) for value in annotation.embeddings[0:5]]
        print("{:<20}\t{}".format(annotation.result, "".join(leading_values)))
    print("\n")

John loves apples.  	    -0.112    -0.682     0.478     0.838     0.161


Mary loves oranges. 	    -0.643    -0.654    -0.006     1.731     0.138
John loves Mary.    	    -0.832    -0.404     0.148     0.496     0.850


John loves apples.  	    -0.112    -0.682     0.478     0.838     0.161
John loves apples_. 	    -0.025    -0.592     0.523     0.672     0.077


