In [1]:
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *

In [7]:
import getpass
import os

import sparknlp_jsl

# Start the licensed Spark NLP for Healthcare session.
#
# The license secret is a credential and must not be stored in the notebook.
# It is read from the SECRET environment variable (variable name is a
# convention -- TODO confirm it matches your deployment); if that is unset or
# empty, the user is prompted without echoing the input.
# NOTE(review): any secret that was previously committed in this cell should
# be treated as leaked and rotated.
spark = sparknlp_jsl.start(
    os.environ.get("SECRET")
    or getpass.getpass("Spark NLP for Healthcare license secret: ")
)  # Secret code provided as part of the license

In [8]:
# --- Pipeline stages -------------------------------------------------------
# Each stage reads the column(s) named in setInputCol(s) and writes the column
# named in setOutputCol. Statement order is kept so that the pretrained models
# are fetched in the same order as before.

# Raw "text" column -> "document" annotations.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "token".
token = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# "token" -> "normal"; every later token-level stage reads "normal", not "token".
normalizer = (
    Normalizer()
    .setInputCols("token")
    .setOutputCol("normal")
)

# Pretrained English "pos_anc" tagger. Its "pos" output is not consumed by any
# later stage of this pipeline; it is only available as an extra output column.
pos = (
    PerceptronModel.pretrained("pos_anc", lang = "en")
    .setInputCols("document", "normal")
    .setOutputCol("pos")
)

# Default pretrained word embeddings (no model name given).
wordEmbeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols("document", "normal")
    .setOutputCol("word_embeddings")
)

# Default pretrained NER model, fed by the embeddings above.
ner = (
    NerDLModel.pretrained()
    .setInputCols("document", "normal", "word_embeddings")
    .setOutputCol("ner")
)

# Turns the "ner" tags into "ner_converter" chunks, whitelisted to the "PER" label.
nerConverter = (
    NerConverter()
    .setInputCols("document", "normal", "ner")
    .setOutputCol("ner_converter")
    .setWhiteList(["PER"])
)

# Pretrained people disambiguator; consumes the person chunks and the embeddings.
disambiguator = (
    DisambiguatorModel.pretrained('people_disambiguator', 'en', 'clinical/models')
    .setInputCols("ner_converter", "word_embeddings")
    .setOutputCol("disambiguation")
)

# --- Assemble ----------------------------------------------------------------
# Stages run in list order, so every stage appears after the stages that
# produce its input columns.
pipeline = Pipeline().setStages(
    [
        document,
        token,
        normalizer,
        wordEmbeddings,
        pos,
        ner,
        nerConverter,
        disambiguator,
    ]
)

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.5 MB
[OK!]
people_disambiguator download started this may take some time.
Approximate size to download 54.1 MB
[OK!]


In [None]:
# Five sample sentences mentioning people, as (id, text) rows.
# Long sentences are split across adjacent string literals, which Python joins
# at compile time into a single string per row.
data = spark.createDataFrame([
    (
        1,
        "The 2015 CPAC featured Jamila Bey who became the first atheist activist to address CPAC's annual meeting.",
    ),
    (
        2,
        "The city is the heart of the state's 7th legislative district, represented in the Minnesota Senate by  Erik Simonson"
        " and in the Minnesota House of Representatives by Jennifer Schultz (District 7A) and Liz Olson (District 7B), all "
        "members of the Democratic-Farmer-Labor Party, which has long dominated the city's politics.",
    ),
    (
        3,
        "His father Dimitri (Zurab), a prince of the House of Orbeliani, served at the court of the last Georgian kings,"
        " while mother Khoreshan née Andronikashvili was a granddaughter, on her mother, Princess Elene’s, side, of Erekle"
        " II, the penultimate and popular king of Georgia, whose cult would later be introduced into Georgian literature by Grigol Orbeliani himself.",
    ),
    (
        4,
        "The disarmament of Krepost Sveaborg, denying the Reds from using it, was used in White propaganda by how Allan "
        "Staffans had duped the Russians and removed the locks without authorization",
    ),
    (
        5,
        "The show also had a contestant named Josephine Roberto (aka Banig),"
        " who later defeated Christina Aguilera on the way to become Female Vocalist Champion in the 1989 edition of Star Search in the United States.",
    ),
]).toDF('id', 'text')

In [42]:
# Single-sentence input with columns (id, text); this rebinds `data`, so the
# cells below operate on this one row only.
data = spark.createDataFrame(
    [(1, "Bill Gates is the founder of Microsoft.")]
).toDF('id', 'text')

In [43]:
# Preview the input frame; the row limit and truncation are the defaults, spelled out.
data.show(n=20, truncate=True)

+---+--------------------+
| id|                text|
+---+--------------------+
|  1|Bill Gates is the...|
+---+--------------------+



In [44]:
# Fit the pipeline on the input frame to obtain the model used for transform below.
model = pipeline.fit(dataset=data)

In [45]:
# Apply the fitted pipeline and print, without truncation, the NER chunk
# results next to the disambiguation results and their metadata.
(
    model.transform(data)
    .select(
        'ner_converter.result',
        'disambiguation.result',
        'disambiguation.metadata',
    )
    .show(truncate=False)
)

+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+
|result      |result                                                                                                                                                                                                                           |metadata                                                                                                   |
+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------