In [None]:
#Imports
import sys
sys.path.append('../../')

from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session with two worker threads.
# NOTE: the config keys must be spelled exactly; Spark silently ignores
# unknown keys, so a typo means the setting is never applied.
spark = (
    SparkSession.builder
    .master("local[2]")
    # "spark.jars" (plural) is the key that registers extra jars.
    .config("spark.jars", "lib/sparknlp.jar")
    .config("spark.driver.memory", "5g")
    # Upper bound on the serialized results collected to the driver.
    .config("spark.driver.maxResultSize", "2g")
    .getOrCreate()
)

In [None]:
# Read the sentiment sample set and keep at most 1000 rows for annotation.
data = (
    spark.read
    .parquet("../../../src/test/resources/sentiment.parquet")
    .limit(1000)
)

# Mark the frame for caching, then run an action so later cells reuse it.
data.cache()
data.count()

# Preview the input rows.
data.show()

In [None]:
### Document assembler: entry stage that reads the raw "text" column.
# The output column is not set here; the sentence detector below reads
# "document", so presumably that is the assembler's default (confirm).
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
)

# Standalone use, for debugging outside of the Pipeline:
#   assembled = document_assembler.transform(data)

In [None]:
### Sentence detector: reads "document" annotations, writes "sentence".
sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)
# This stage is applied through the Pipeline defined further down rather
# than by calling transform() on it directly.

In [None]:
### Tokenizer: reads "sentence" annotations, writes "token".
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)
# Standalone use, for debugging outside of the Pipeline:
#   tokenized = tokenizer.transform(assembled)

In [None]:
### Normalizer: reads "token" annotations, writes "normal".
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

In [None]:
### Spell checker: reads "normal" annotations, writes "spell".
# This is an estimator ("Approach"), so it is fitted when the Pipeline is.
spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
)

# Standalone use, for debugging outside of the Pipeline:
#   checked = spell_checker.fit(tokenized).transform(tokenized)

In [None]:
### Sentiment detector: reads "spell" and "sentence", writes "sentiment".
# Positive and negative example sources come from the test resources;
# corpus pruning is switched off.
sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPositiveSource("../../../src/test/resources/vivekn/positive")
    .setNegativeSource("../../../src/test/resources/vivekn/negative")
    .setPruneCorpus(False)
)


In [None]:
### Finisher: final stage, consumes the "sentiment" annotations with
### include-keys enabled.
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
)

In [None]:
# Stages in execution order; each one reads the column(s) produced by an
# earlier stage.
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher,
]
pipeline = Pipeline(stages=pipeline_stages)

# Fit the pipeline on the sample, then annotate that same sample.
sentiment_data = pipeline.fit(data).transform(data)
sentiment_data.show()

In [None]:
# Print the first five annotated rows in full (show() truncates long cells).
for row in sentiment_data.take(5):
    print(row)

In [None]:
# Persist the unfitted pipeline definition, replacing any previous copy.
pipeline.write().overwrite().save("./ps")

# Persist a fitted model as well. Note that this fits the pipeline on
# `data` again instead of reusing the model fitted earlier.
fitted_pipeline = pipeline.fit(data)
fitted_pipeline.write().overwrite().save("./ms")

In [None]:
from pyspark.ml import Pipeline,PipelineModel

In [None]:
# Round-trip check: reload the saved pipeline definition and fitted model.
# `load(path)` is the shortcut for `read().load(path)`.
Pipeline.load("./ps")
PipelineModel.load("./ms")