![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/doc2vec/Train_Doc2Vec_and_Text_Classification.ipynb)

# Document Embeddings with Doc2Vec

In [None]:
# Only run this cell when you are using Spark NLP on Google Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [None]:
import sparknlp
# Start Spark with Spark NLP. The object returned by `sparknlp.start()` is bound to
# `spark`, which later cells use for `spark.read` and `spark.createDataFrame`.
spark = sparknlp.start()

In [None]:
# Download the aclImdb sentiment corpus (train and test CSV files) from the
# John Snow Labs public S3 bucket into the current working directory.
# `-O` writes to a fixed file name, so re-running the cell overwrites the files.
!wget -O aclimdb_train.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv
!wget -O aclimdb_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv

--2023-02-20 15:54:06--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.231.104, 52.217.198.144, 52.216.212.120, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.231.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33497180 (32M) [text/csv]
Saving to: ‘aclimdb_train.csv’


2023-02-20 15:54:09 (14,6 MB/s) - ‘aclimdb_train.csv’ saved [33497180/33497180]

--2023-02-20 15:54:09--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.67.14, 3.5.19.152, 52.217.93.246, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.67.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Leng

In [None]:
# Both CSV files start with a header row, so column names are taken from it.
trainDataset = spark.read.csv("aclimdb_train.csv", header=True)
testDataset = spark.read.csv("aclimdb_test.csv", header=True)

# Preview the first rows of the training split.
trainDataset.show()

+--------------------+--------+
|                text|   label|
+--------------------+--------+
|This is an Excell...|positive|
|The Sarah Silverm...|positive|
|"Prom Night" is a...|negative|
|So often a band w...|positive|
|"Pet Sematary" is...|positive|
|I watched the fil...|negative|
|Boy this movie ha...|negative|
|Checking the spoi...|negative|
|Despite its rathe...|positive|
|Absolute masterpi...|positive|
|The tweedy profes...|positive|
|A movie best summ...|negative|
|Take young, prett...|negative|
|For months I've b...|negative|
|"Batman: The Myst...|positive|
|Well, it was funn...|negative|
|I have seen the s...|positive|
|Brainless film ab...|negative|
|Leave it to geniu...|negative|
|Seven Pounds star...|positive|
+--------------------+--------+
only showing top 20 rows



In [None]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
# Each stage is built as a parenthesised method chain rather than with backslash
# continuations, so a stray trailing backslash can never splice one statement
# into the next.

# Wrap the raw `text` column into the `document` annotation column.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
token = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Normalize the tokens, with lowercasing enabled.
norm = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
)

# Pretrained stop-word cleaner applied to the normalized tokens
# (`pretrained()` fetches the model when this cell runs).
stops = (
    StopWordsCleaner.pretrained()
    .setInputCols("normalized")
    .setOutputCol("cleanedToken")
)

# Trainable Doc2Vec stage: consumes the cleaned tokens and writes its output to
# `sentence_embeddings`. The seed is set explicitly, and the storage reference
# names the embeddings `doc2vec_aclImdb`.
doc2Vec = (
    Doc2VecApproach()
    .setInputCols("cleanedToken")
    .setOutputCol("sentence_embeddings")
    .setMaxSentenceLength(1000)
    .setStepSize(0.025)
    .setMinCount(5)
    .setVectorSize(100)
    .setNumPartitions(1)
    .setMaxIter(1)
    .setSeed(42)
    .setStorageRef("doc2vec_aclImdb")
)

# Trainable classifier on top of the Doc2Vec output, supervised by the `label`
# column. Output logs are enabled so the training log can be inspected later.
sentimentdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

# Assemble the stages in the order the data flows through them.
pipeline = Pipeline(
    stages=[
        document,
        token,
        norm,
        stops,
        doc2Vec,
        sentimentdl,
    ]
)

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[OK!]


In [None]:
# Fit every stage of the pipeline on the training set. The resulting fitted
# pipeline is what later cells use for prediction and for saving to disk.
pipelineModel = pipeline.fit(trainDataset)

In [None]:
# List the files in ~/annotator_logs (the classifier's training log is read from there next).
!cd ~/annotator_logs && ls -l

total 100
-rw-r--r-- 1 root root 446 20. Feb 15:55 ClassifierDLApproach_97ff5c76d735.log
-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_17606bbb7d1f.log
-rw-r--r-- 1 root root 313 10. Feb 16:54 ClassifierMetrics_1a6c515483ae.log
-rw-r--r-- 1 root root 323 10. Feb 16:54 ClassifierMetrics_2530315112a8.log
-rw-r--r-- 1 root root 314 10. Feb 16:54 ClassifierMetrics_3ccf43933a23.log
-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_55c7e364bf2b.log
-rw-r--r-- 1 root root 325 10. Feb 16:54 ClassifierMetrics_9290b613e8d7.log
-rw-r--r-- 1 root root 317 10. Feb 16:54 ClassifierMetrics_aa0e2812a3b9.log
-rw-r--r-- 1 root root 318 10. Feb 16:54 ClassifierMetrics_ad4cb4a650fa.log
-rw-r--r-- 1 root root 312 10. Feb 16:54 ClassifierMetrics_efc7f6345e79.log
-rw-r--r-- 1 root root 319 10. Feb 16:54 ClassifierMetrics_f571876aaa09.log
-rw-r--r-- 1 root root 320 26. Okt 09:23 NerDL_0f47f69f09e6.log
-rw-r--r-- 1 root root 320  2. Aug 2022  NerDL_10e337c8a3ef.log
-rw-r--r-- 1 root root 

In [None]:
# IPython substitutes `{sentimentdl.uid}` with the value of that Python expression,
# so this prints the log file named after the ClassifierDLApproach stage defined earlier.
!cat ~/annotator_logs/{sentimentdl.uid}.log

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 25000 - classes: 2
Epoch 0/5 - 2.27s - loss: 194.4157 - acc: 0.814335 - batches: 391
Epoch 1/5 - 1.74s - loss: 186.7701 - acc: 0.8377324 - batches: 391
Epoch 2/5 - 1.75s - loss: 184.50777 - acc: 0.8419792 - batches: 391
Epoch 3/5 - 1.79s - loss: 182.49121 - acc: 0.8430609 - batches: 391
Epoch 4/5 - 1.69s - loss: 180.77087 - acc: 0.8451843 - batches: 391


In [None]:
# Run the fitted pipeline over the test set; the classifier writes its
# predictions to the `class` column.
prediction = pipelineModel.transform(testDataset)

In [None]:
from sklearn.metrics import classification_report

# Bring only the columns needed for evaluation into a pandas DataFrame.
predsPd = prediction.select('label', 'text', 'class.result').toPandas()
# `class.result` holds a list per row; keep its first element as the predicted label.
predsPd['result'] = predsPd['result'].apply(lambda x: x[0])
# classification_report expects (y_true, y_pred): gold labels first, predictions
# second. Passing them the other way round swaps precision with recall and makes
# the `support` column count predictions instead of true labels.
print(classification_report(predsPd['label'], predsPd['result']))

              precision    recall  f1-score   support

    negative       0.86      0.82      0.84     13143
    positive       0.81      0.85      0.83     11857

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



## Save and Restore
### Pipeline Model

A trained `Pipeline` is called a `PipelineModel`, and it is simple to save it to disk and restore it later:

In [None]:
# `pipelineModel` is the PipelineModel returned by `pipeline.fit(...)`.
# Listing `.stages` shows every stage it contains, so once it is saved to disk
# all of these stages are included the next time it is loaded.
pipelineModel.stages

[DocumentAssembler_eb3006c82ed9,
 REGEX_TOKENIZER_62be3e2cd631,
 NORMALIZER_8c22ec321476,
 STOPWORDS_CLEANER_3e62acb2648b,
 Doc2VecModel_8e5707c8288a,
 ClassifierDLModel_9e82f8b9ca8b]

In [None]:
# Save the whole fitted pipeline to a local directory; `overwrite()` is requested
# so re-running the cell does not fail when the directory already exists.
pipelineModel.write().overwrite().save("./imdb_classifier_doc2vec_pipeline")

In [None]:
# Load the saved PipelineModel back from disk and list its stages; the list
# should match the stages of the pipeline that was saved.
# NOTE(review): `PipelineModel` is not imported by name in the visible cells;
# presumably one of the sparknlp star imports provides it — confirm.
loadedPipelineModel = PipelineModel.load("./imdb_classifier_doc2vec_pipeline")
loadedPipelineModel.stages

[DocumentAssembler_eb3006c82ed9,
 REGEX_TOKENIZER_62be3e2cd631,
 NORMALIZER_8c22ec321476,
 STOPWORDS_CLEANER_3e62acb2648b,
 Doc2VecModel_8e5707c8288a,
 ClassifierDLModel_9e82f8b9ca8b]

In [None]:
# Wrap the loaded PipelineModel in a Spark NLP LightPipeline, then annotate a
# plain Python string directly instead of building a DataFrame first.
lp_loadedPipeline = LightPipeline(loadedPipelineModel)

lp_loadedPipeline.annotate("This movie was really good!")

{'document': ['This movie was really good!'],
 'cleanedToken': ['movie', 'good'],
 'normalized': ['this', 'movie', 'was', 'really', 'good'],
 'sentence_embeddings': ['movie good'],
 'token': ['This', 'movie', 'was', 'really', 'good', '!'],
 'class': ['positive']}

In [None]:
# Alternatively, run the loaded pipeline on a regular Spark DataFrame.
from pyspark.sql.types import StringType

# Two example reviews, stored in a single string column named `text`.
dfTest = spark.createDataFrame(
    [
        "This movie is a delight for those of all ages. I have seen it several times and each time I am enchanted by the characters and magic. The cast is outstanding, the special effects delightful, everything most believable.",
        "This film was to put it simply rubbish. The child actors couldn't act, as can be seen by Harry's supposed surprise on learning he's a wizard. I'm a wizard! is said with such indifference you'd think he's not surprised at all.",
    ],
    StringType(),
).toDF("text")

# Show only the predicted class for each row, without truncating the cell text.
(
    loadedPipelineModel
    .transform(dfTest)
    .select("class.result")
    .show(n=2, truncate=False)
)

+----------+
|result    |
+----------+
|[positive]|
|[negative]|
+----------+



### Annotator Models
Now let's say you only want to save the trained annotators from your pipeline, so that you can load them into another custom pipeline.

In [None]:
# all we need is to access that stage and save it on disk
pipelineModel.stages

[DocumentAssembler_eb3006c82ed9,
 REGEX_TOKENIZER_62be3e2cd631,
 NORMALIZER_8c22ec321476,
 STOPWORDS_CLEANER_3e62acb2648b,
 Doc2VecModel_8e5707c8288a,
 ClassifierDLModel_9e82f8b9ca8b]

In [None]:
# Print the last two stages, one per line: the final stage first, then the one before it.
print(pipelineModel.stages[-1], pipelineModel.stages[-2], sep="\n")

ClassifierDLModel_9e82f8b9ca8b
Doc2VecModel_8e5707c8288a


In [None]:
# Save only the last stage (the trained ClassifierDL model). The directory name
# records that it was trained on the `doc2vec_aclImdb` embeddings.
pipelineModel.stages[-1].write().overwrite().save("./classifierdl_doc2vec_aclImdb_model")

In [None]:
# Save the second-to-last stage (the trained Doc2VecModel) to its own directory.
pipelineModel.stages[-2].write().overwrite().save("./doc2vec_aclImdb_model")