Vivekn sentiment analysis applies Vivek Narayanan's algorithm, trained from a positive corpus and a negative corpus

We call necessary imports

In [None]:
#Imports
import time
import sys
import os

from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher


We load SparkSession if not already there

In [None]:
# Build the local Spark session, or reuse one that already exists (getOrCreate).
# NOTE(review): the app name "ner" looks carried over from another example;
# it is kept unchanged here.
spark = (
    SparkSession.builder
    .appName("ner")
    .master("local[*]")
    .config("spark.driver.memory", "2G")
    .config("spark.driver.extraClassPath", "../../lib/sparknlp.jar")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

We create a small Spark DataFrame from a few in-memory sample sentences

In [None]:
# Sample reviews to annotate: one sentence per row.
reviews = [
    ["Harry Potter was a great movie!"],
    ["I totally recommend it"],
    ["Awesome, just awesome!!"],
    ["Very bad movie, don't even watch it"],
]

# Single-column DataFrame; the column is named "text".
data = spark.createDataFrame(reviews).toDF("text")

We create the document assembler, which will put the target text column into Annotation form

In [None]:
### Document assembler: reads the raw "text" column and emits it in annotation form.
# The output column is named explicitly, like every other stage in this
# notebook, because the sentence detector reads a column called "document".
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

### To inspect this stage on its own, transform the input directly:
#assembled = document_assembler.transform(data)

The sentence detector will parse sub sentences in every line

In [None]:
### Sentence detector: reads "document", writes "sentence"
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)
# Stand-alone check; needs a frame that already has the "document" column:
#sentence_data = sentence_detector.transform(assembled)

The tokenizer will match standard tokens

In [None]:
### Tokenizer: reads "sentence", writes "token"
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)
# Stand-alone check; needs a frame that already has the "sentence" column:
#tokenized = tokenizer.transform(sentence_data)

Normalizer will clean out the tokens

In [None]:
### Normalizer: reads "token", writes the cleaned tokens to "normal"
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

The spell checker will correct normalized tokens, this trains with a dictionary of english words

In [None]:
### Spell checker: reads "normal", writes "spell"
# Trained from a word dictionary plus a text corpus (paths are relative to
# the notebook's working directory).
spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
    .setDictionary("../../data/spell/words.txt")
    .setCorpus("../../data/spell/coca2017.txt")
)

# Stand-alone check.
# NOTE(review): this stage reads "normal", so the frame passed here presumably
# needs the normalizer applied first - confirm before uncommenting.
#checked = spell_checker.fit(tokenized).transform(tokenized)

We create the ViveknSentimentApproach and set the resources used to train it

In [None]:
### Sentiment detector: reads "spell" and "sentence", writes "sentiment"
# Trained from the positive and negative sources configured below.
# The chain is wrapped in parentheses instead of using backslash
# continuations, so no trailing "\" is left dangling after the last call
# (a dangling continuation would join whatever line follows into this
# expression).
sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPruneCorpus(0)
    .setPositiveSource("../../data/vivekn/training_positive")
    .setNegativeSource("../../data/vivekn/training_negative")
)


The finisher will utilize sentiment analysis output

In [None]:
### Finisher: consumes the "sentiment" column, keeping keys in the output
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
)

We fit and predict over data

In [None]:
# Stages are listed in execution order; each one reads columns written by
# an earlier stage.
stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher,
]
pipeline = Pipeline(stages=stages)

# Time only the fit() call, which trains the trainable stages on the sample data.
start = time.time()
sentiment_model = pipeline.fit(data)
end = time.time()
elapsed = end - start
print("Time elapsed pipeline process: " + str(elapsed))

In [None]:
# Apply the fitted pipeline to the same sample frame.
sentiment_data = sentiment_model.transform(data)

# Print up to 20 rows (show's default) without truncating long cell values.
sentiment_data.show(n=20, truncate=False)

We can take a sample back into the driver

In [None]:
# take(5) collects at most five rows to the driver; print them one by one.
sample_rows = sentiment_data.take(5)
for row in sample_rows:
    print(row)

LightPipeline is a faster way to run the fitted pipeline for tests, streaming, or small amounts of data

In [None]:
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline model so it can annotate a plain Python string.
light = LightPipeline(sentiment_model)

sample_text = "So boring, I didn't like it at all"
# Left as a bare expression so the notebook displays the returned annotations.
light.annotate(sample_text)