![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Spark NLP and Spark ML Pipelines

In [None]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp== 2.4.2

## Simple Topic Modeling

`Spark-NLP`
* DocumentAssembler
* SentenceDetector
* Tokenizer
* Normalizer
* POS tagger
* Chunker
* Finisher

`Spark ML`
* CountVectorizer
* TF-IDF
* LDA

In [1]:
import sys
import time

from pyspark.sql.functions import col
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, Tokenizer
from pyspark.ml.clustering import LDA, LDAModel

#Spark NLP
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import *

### Let's create a Spark Session for our app

In [2]:
# Create the Spark session through Spark NLP's start() helper.
spark = sparknlp.start()

# Report the library versions this notebook is running against.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:   2.4.2
Apache Spark version:  2.4.4


Let's download a sample of scientific abstracts from the PubMed dataset:
```
wget -N 	https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed-sample.csv -P /tmp
```

In [3]:
! wget -N 	https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed-sample.csv -P /tmp

--2020-02-11 19:15:13--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed-sample.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.95.141
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.95.141|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10484510 (10,0M) [text/csv]
Saving to: ‘/tmp/pubmed-sample.csv’


2020-02-11 19:15:15 (6,89 MB/s) - ‘/tmp/pubmed-sample.csv’ saved [10484510/10484510]



In [4]:
# Read the CSV (first row is the header), keep rows whose "AB" column is
# non-null, copy "AB" into a "text" column, then drop the "TI" and "AB" columns.
pubMedDF = (
    spark.read
    .option("header", "true")
    .csv("/tmp/pubmed-sample.csv")
    .filter("AB IS NOT null")
    .withColumn("text", col("AB"))
    .drop("TI", "AB")
)

In [5]:
# Print the schema, then preview the first 20 rows with long values truncated.
pubMedDF.printSchema()
pubMedDF.show(n=20, truncate=True)

root
 |-- text: string (nullable = true)

+--------------------+
|                text|
+--------------------+
|The human KCNJ9 (...|
|BACKGROUND: At pr...|
|OBJECTIVE: To inv...|
|Combined EEG/fMRI...|
|Kohlschutter synd...|
|Statistical analy...|
|The synthetic DOX...|
|Our objective was...|
|We conducted a ph...|
|"Monomeric sarcos...|
|We presented the ...|
|The literature de...|
|A novel approach ...|
|An HPLC-ESI-MS-MS...|
|The localizing an...|
|OBJECTIVE: To eva...|
|For the construct...|
|We report the res...|
|Intraparenchymal ...|
|It is known that ...|
+--------------------+
only showing top 20 rows



In [6]:
# Report the corpus size. A bare `pubMedDF.count()` that is not the last
# expression of the cell still runs a Spark job, but its result is never shown.
print("Number of rows before limiting:", pubMedDF.count())

# Keep at most 2000 rows for the rest of the notebook.
# NOTE: limit() is applied without an explicit ordering, so which rows are kept
# is up to Spark.
pubMedDF = pubMedDF.limit(2000)

### Let's create Spark-NLP Pipeline

In [7]:
# Spark NLP Pipeline: text -> document -> sentence -> token -> pos -> chunk -> finished_chunk

# Wrap the raw "text" column into Spark NLP's document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained part-of-speech tagger (downloaded on first use); its "pos"
# output column is consumed by the chunker below.
posTagger = (
    PerceptronModel.pretrained()
    .setInputCols(["sentence", "token"])
    .setOutputCol("pos")
)

# Extract chunks whose POS tags match either pattern: one or more <NNP> tags,
# or an optional <DT>, any number of <JJ>, then an <NN>.
chunker = (
    Chunker()
    .setInputCols(["sentence", "pos"])
    .setOutputCol("chunk")
    .setRegexParsers(["<NNP>+", "<DT>?<JJ>*<NN>"])
)

# Convert the chunk annotations into a plain column without metadata.
finisher = (
    Finisher()
    .setInputCols(["chunk"])
    .setIncludeMetadata(False)
)

nlp_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    posTagger,
    chunker,
    finisher,
]
nlpPipeline = Pipeline(stages=nlp_stages)

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]


In [8]:
# Fit the Spark NLP pipeline on the sample, then annotate the same DataFrame.
nlpPipelineModel = nlpPipeline.fit(pubMedDF)
nlpPipelineDF = nlpPipelineModel.transform(pubMedDF)

### Let's create Spark ML Pipeline

In [9]:
# Spark ML Pipeline: chunk counts -> IDF re-weighting -> LDA topics

# Term counts over the "finished_chunk" column produced by the Spark NLP pipeline.
# NOTE(review): minTF=10.0 only keeps chunks that occur at least 10 times within
# a single document, which looks very strict for short texts -- confirm intended.
cv = CountVectorizer(inputCol="finished_chunk", outputCol="features", vocabSize=1000, minDF=10.0, minTF=10.0)

# Re-weight the raw counts by inverse document frequency.
idf = IDF(inputCol="features", outputCol="idf")

# LDA reads the "features" column by default, which would silently bypass the
# IDF stage above; point it at the IDF output so the TF-IDF weights are used.
# A fixed seed makes the learned topics reproducible across runs.
lda = LDA(k=10, maxIter=5, featuresCol="idf", seed=42)

# Stage order matters: later cells look stages up by position.
mlPipeline = Pipeline(stages=[
    cv,
    idf,
    lda
])

### We are going to train the Spark ML Pipeline on the output of the Spark-NLP Pipeline

In [None]:
# Fit the Spark ML pipeline on the DataFrame produced by the Spark NLP pipeline
mlModel = mlPipeline.fit(nlpPipelineDF)

In [None]:
# Apply the fitted Spark ML pipeline to the same annotated DataFrame
mlPipelineDF = mlModel.transform(nlpPipelineDF)

In [None]:
# Preview the first 20 rows of the transformed DataFrame (long values truncated).
mlPipelineDF.show(n=20, truncate=True)

In [None]:
# The fitted LDA model is the last of the three stages in the fitted pipeline.
ldaModel = mlModel.stages[-1]

In [None]:
# Evaluate the fitted LDA model on the transformed DataFrame.
ll = ldaModel.logLikelihood(mlPipelineDF)
lp = ldaModel.logPerplexity(mlPipelineDF)

print("The lower bound on the log likelihood of the entire corpus: {}".format(ll))
print("The upper bound on perplexity: {}".format(lp))


In [None]:
# Describe topics: show the 3 top-weighted terms of each topic, untruncated.
print("The topics described by their top-weighted terms:")
top_terms_per_topic = ldaModel.describeTopics(maxTermsPerTopic=3)
top_terms_per_topic.show(truncate=False)

### Let's look at our topics
NOTE: More cleaning, filtering, playing around with `CountVectorizer`, and more iterations in `LDA` will result in better Topic Modelling results.

In [None]:
# Output topics. Each is a distribution over words (matching word count vectors)
vocab_size = ldaModel.vocabSize()
print("Learned topics (as distributions over vocab of " + str(vocab_size) + " words):")

# Up to 50 top-weighted term indices per topic.
topics = ldaModel.describeTopics(50)
topics_rdd = topics.rdd

# The first pipeline stage is the fitted CountVectorizer; its vocabulary maps
# term indices back to the chunk strings.
vocab = mlModel.stages[0].vocabulary

# Translate every topic's term indices into vocabulary entries.
topics_words = topics_rdd.map(
    lambda row: [vocab[term_idx] for term_idx in row['termIndices']]
).collect()

for idx, topic in enumerate(topics_words):
    print("topic: ", idx)
    print("----------")
    for word in topic:
        print(word)
    print("----------")