In [None]:
# This is only to setup PySpark and Spark NLP on Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [None]:
import sparknlp
# let's start Spark with Spark NLP
spark = sparknlp.start()

In [3]:
!wget -O aclimdb_train.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv
!wget -O aclimdb_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv

--2021-11-21 09:52:29--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.92.54
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.92.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33497180 (32M) [text/csv]
Saving to: ‘aclimdb_train.csv’


2021-11-21 09:52:29 (81.6 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]

--2021-11-21 09:52:30--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.92.54
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.92.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32715164 (31M) [text/csv]
Saving to: ‘aclimdb_test.csv’


2021-11-21 09:52:30 (46.9 MB/s) - ‘aclimdb_test.csv’ saved [32715164/32715164]



In [4]:
trainDataset = spark.read \
      .option("header", True) \
      .csv("aclimdb_train.csv")

testDataset = spark.read \
      .option("header", True) \
      .csv("aclimdb_test.csv")

trainDataset.show()

+--------------------+--------+
|                text|   label|
+--------------------+--------+
|This is an Excell...|positive|
|The Sarah Silverm...|positive|
|"Prom Night" is a...|negative|
|So often a band w...|positive|
|"Pet Sematary" is...|positive|
|I watched the fil...|negative|
|Boy this movie ha...|negative|
|Checking the spoi...|negative|
|Despite its rathe...|positive|
|Absolute masterpi...|positive|
|The tweedy profes...|positive|
|A movie best summ...|negative|
|Take young, prett...|negative|
|For months I've b...|negative|
|"Batman: The Myst...|positive|
|Well, it was funn...|negative|
|I have seen the s...|positive|
|Brainless film ab...|negative|
|Leave it to geniu...|negative|
|Seven Pounds star...|positive|
+--------------------+--------+
only showing top 20 rows



In [5]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [6]:
document = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

token = Tokenizer()\
  .setInputCols("document")\
  .setOutputCol("token")

norm = Normalizer()\
  .setInputCols(["token"])\
  .setOutputCol("normalized")\
  .setLowercase(True)

stops = StopWordsCleaner.pretrained()\
  .setInputCols("normalized")\
  .setOutputCol("cleanedToken")
  
doc2Vec = Doc2VecApproach()\
  .setInputCols("cleanedToken")\
  .setOutputCol("sentence_embeddings")\
  .setMaxSentenceLength(1000)\
  .setStepSize(0.025)\
  .setMinCount(5)\
  .setVectorSize(100)\
  .setNumPartitions(1)\
  .setMaxIter(1)\
  .setSeed(42)\
  .setStorageRef("doc2vec_aclImdb")\

sentimentdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("label")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        token,
        norm,
        stops,
        doc2Vec,
        sentimentdl
    ])

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[OK!]


In [7]:
pipelineModel = pipeline.fit(trainDataset)

In [8]:
!cd ~/annotator_logs && ls -l

total 4
-rw-r--r-- 1 root root 452 Nov 21 09:58 ClassifierDLApproach_b126569e5e91.log


In [9]:
!cat ~/annotator_logs/{sentimentdl.uid}.log

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 25000 - classes: 2
Epoch 0/5 - 6.51s - loss: 184.16612 - acc: 0.8153926 - batches: 391
Epoch 1/5 - 5.91s - loss: 178.30418 - acc: 0.8358334 - batches: 391
Epoch 2/5 - 5.65s - loss: 179.25107 - acc: 0.84036857 - batches: 391
Epoch 3/5 - 6.31s - loss: 178.86932 - acc: 0.84237176 - batches: 391
Epoch 4/5 - 5.80s - loss: 178.13194 - acc: 0.84489584 - batches: 391


In [10]:
prediction = pipelineModel.transform(testDataset)

In [11]:
from sklearn.metrics import classification_report

predsPd = prediction.select('label','text',"class.result").toPandas()
predsPd['result'] = predsPd['result'].apply(lambda x : x[0])
print (classification_report(predsPd['result'], predsPd['label']))

              precision    recall  f1-score   support

    negative       0.87      0.80      0.84     13575
    positive       0.79      0.86      0.82     11425

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



## Save and Restore
### Pipeline Model

It's pretty simple to save and restore an already trained Pipeline which is called `PipelineModel`:

In [14]:
# this is our PipelineModel after it was trained via .fit()
# as you can see we have all the stages inside this PipelineModel
pipelineModel.stages
# so once you save it on disk, it will include everything next time you load it!

[DocumentAssembler_2f9c0247af19,
 REGEX_TOKENIZER_1f492672ab16,
 NORMALIZER_5f6019207ea3,
 STOPWORDS_CLEANER_3e62acb2648b,
 Doc2VecModel_7921b49ae1a0,
 ClassifierDLModel_4fb2630de611]

In [17]:
pipelineModel.write().overwrite().save("./imdb_classifier_doc2vec_pipeline")

In [19]:
# let's load it back and try
loadedPipelineModel = PipelineModel.load("./imdb_classifier_doc2vec_pipeline")
loadedPipelineModel.stages
# we have all of our stages inside the loaded pipeline!

[DocumentAssembler_2f9c0247af19,
 REGEX_TOKENIZER_1f492672ab16,
 NORMALIZER_5f6019207ea3,
 STOPWORDS_CLEANER_3e62acb2648b,
 Doc2VecModel_7921b49ae1a0,
 ClassifierDLModel_4fb2630de611]

In [22]:
# you can use it with Spark NLP LightPipeline 
lp_loadedPipeline = LightPipeline(loadedPipelineModel)

lp_loadedPipeline.annotate("This movie was really good!")

{'class': ['positive'],
 'cleanedToken': ['movie', 'good'],
 'document': ['This movie was really good!'],
 'normalized': ['this', 'movie', 'was', 'really', 'good'],
 'sentence_embeddings': ['movie good'],
 'token': ['This', 'movie', 'was', 'really', 'good', '!']}

In [24]:
# or you can use it via DataFrame
from pyspark.sql.types import StringType

dfTest = spark.createDataFrame([
    "This movie is a delight for those of all ages. I have seen it several times and each time I am enchanted by the characters and magic. The cast is outstanding, the special effects delightful, everything most believable.",
    "This film was to put it simply rubbish. The child actors couldn't act, as can be seen by Harry's supposed surprise on learning he's a wizard. I'm a wizard! is said with such indifference you'd think he's not surprised at all."
], StringType()).toDF("text")

loadedPipelineModel\
  .transform(dfTest)\
  .select("class.result")\
  .show(2, False)

+----------+
|result    |
+----------+
|[positive]|
|[negative]|
+----------+



### Annotator Models
Now let's say you would like to only save the trained annotators inside your pipeline so you can load them inside another custom Pipeline

In [29]:
# all we need is to access that stage and save it on disk
pipelineModel.stages

[DocumentAssembler_2f9c0247af19,
 REGEX_TOKENIZER_1f492672ab16,
 NORMALIZER_5f6019207ea3,
 STOPWORDS_CLEANER_3e62acb2648b,
 Doc2VecModel_7921b49ae1a0,
 ClassifierDLModel_4fb2630de611]

In [30]:
print(pipelineModel.stages[-1])
print(pipelineModel.stages[-2])

ClassifierDLModel_4fb2630de611
Doc2VecModel_7921b49ae1a0


In [31]:
# let's save our ClassifierDL - let's mention it was trained by doc2vec_aclImdb as well
pipelineModel.stages[-1].write().overwrite().save("./classifierdl_doc2vec_aclImdb_model")

In [32]:
# and here is our trained Doc2VecModel
pipelineModel.stages[-2].write().overwrite().save("./doc2vec_aclImdb_model")