In [7]:
from __future__ import print_function
from builtins import range
from timeit import default_timer
import io
import json
import math
import os
import sys

# Out of Box PySpark Imports
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# KPSOI Imports
#from sentimentanalysis.settings import KP_CSV_FILE, KP_DATA_COL, KP_TEST_COL, \
#                                       SPARK_NLP_PATH, DATA_DIR
#sys.path.append(SPARK_NLP_PATH)
sys.path.append("../../")

# John Snow Labs Imports
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import *
from sparknlp.clinical.annotators import *
from sparknlp.common import RegexRule
from sparknlp.models import *

In [2]:
# data = spark.read.csv(KP_CSV_FILE, header=True) # extra commas breaking schema
# reference: https://github.com/databricks/spark-csv
#data = spark.read.format('com.databricks.spark.csv') \
#            .options(header='true', inferschema='true', escape='"') \
#            .load(KP_CSV_FILE)

# NOTE(review): `data`, KP_CSV_FILE, KP_DATA_COL and KP_TEST_COL are not
# defined by any active code visible in this notebook (the loader and the
# settings import are commented out) -- confirm where they come from before
# relying on Restart & Run All.

# Row count before any cleaning, reported together with the source file.
total_rows = data.count()
print('Loaded %i rows of data from file: %s' % (total_rows, KP_CSV_FILE))

# Discard rows whose comment column is null and report how many went away.
data = data.dropna(subset=[KP_DATA_COL])
print('Dropped %s rows with null data.' % (total_rows - data.count()))

# Attach a generated id column to every remaining row.
data = data.withColumn('pyspark_id', F.monotonically_increasing_id())

# Rows with a null KP_TEST_COL form the training set; rows with a value in
# that column form the test set.
train = data.filter(F.col(KP_TEST_COL).isNull())
test = data.filter(F.col(KP_TEST_COL).isNotNull())

ntrain, ntest = train.count(), test.count()

print('Loaded %s training and %s testing rows of data.' % (ntrain, ntest))

Loaded 5478 rows of data from file: /Users/Will/kpsoi/sentiment-analysis/data/Kaiser-Permanente-Review-Export-2017.csv
Dropped 941 rows with null data.
Loaded 3638 training and 899 testing rows of data.


In [3]:
# Collect the text of every training comment whose rating equals 1.
# Rows are indexed positionally (0 = comment, 1 = rating) because
# tuple-parameter lambdas such as `lambda (c, r): ...` are a syntax error
# on Python 3.
ones = train.select(KP_DATA_COL, 'rating') \
            .rdd.filter(lambda row: float(row[1]) == 1) \
            .map(lambda row: row[0]) \
            .collect()

# Write one comment per line as UTF-8. io.open with an explicit encoding
# behaves the same on Python 2 and 3; formatting `comment.encode('utf-8')`
# would write a "b'...'" repr on Python 3.
with io.open(os.path.join(DATA_DIR, 'bad_comments/onestars.txt'), 'w',
             encoding='utf-8') as badfile:
    for comment in ones:
        badfile.write(u'{}\n'.format(comment))

In [4]:
# Collect the text of every training comment whose rating equals 5.
# Rows are indexed positionally (0 = comment, 1 = rating) because
# tuple-parameter lambdas such as `lambda (c, r): ...` are a syntax error
# on Python 3.
fives = train.select(KP_DATA_COL, 'rating') \
            .rdd.filter(lambda row: float(row[1]) == 5) \
            .map(lambda row: row[0]) \
            .collect()

# Write one comment per line as UTF-8. io.open with an explicit encoding
# behaves the same on Python 2 and 3; formatting `comment.encode('utf-8')`
# would write a "b'...'" repr on Python 3.
with io.open(os.path.join(DATA_DIR, 'good_comments/fivestars.txt'), 'w',
             encoding='utf-8') as goodfile:
    for comment in fives:
        goodfile.write(u'{}\n'.format(comment))

In [5]:
# Reads the raw comment column and emits it as the 'document' column.
document_assembler = (
    DocumentAssembler()
    .setInputCol(KP_DATA_COL)
    .setOutputCol('document')
)

# No input columns are configured on the tokenizer in this cell; only its
# output column is named.
tokenizer = RegexTokenizer().setOutputCol('tokens')

# 'tokens' -> 'normalized'
normalizer = (
    Normalizer()
    .setInputCols(['tokens'])
    .setOutputCol('normalized')
)

# 'normalized' -> 'spelled_tokens'
spell_checker = (
    SpellChecker()
    .setInputCols(['normalized'])
    .setOutputCol('spelled_tokens')
)

# 'spelled_tokens' -> 'sentence'
sentence_detector = (
    SentenceDetector()
    .setInputCols(['spelled_tokens'])
    .setOutputCol('sentence')
)

# Part-of-speech tagger; its corpus path is resolved relative to
# SPARK_NLP_PATH.
pos_tagger = (
    POSTagger()
    .setInputCols(['spelled_tokens', 'sentence'])
    .setOutputCol('pos')
    .setCorpusPath(os.path.join(SPARK_NLP_PATH, '../src/test/resources/anc-pos-corpus'))
)

# Sentiment detector whose positive / negative sources are the five-star and
# one-star comment files under DATA_DIR.
sentiment_detector = (
    ViveknSentimentDetector()
    .setInputCols(['spelled_tokens', 'sentence'])
    .setOutputCol('sentiment_score')
    .setPositiveSource(os.path.join(DATA_DIR, 'good_comments/fivestars.txt'))
    .setNegativeSource(os.path.join(DATA_DIR, 'bad_comments/onestars.txt'))
)

In [6]:
# Annotators in the order the pipeline applies them.
stages = [document_assembler, tokenizer, normalizer, spell_checker,
          sentence_detector, pos_tagger, sentiment_detector]

# Fit on the training rows only.
pipeline = Pipeline(stages=stages)
model = pipeline.fit(train)

# Both frames are rebound to their transformed versions, so this cell is not
# meant to be executed twice without re-running the split that creates them.
train, test = model.transform(train), model.transform(test)

In [7]:
# Get the accuracy of our PySpark Model

def get_score(sentiment_score):
    """Return the 'sda' metadata entry of the first annotation.

    `sentiment_score` must be an indexable, non-empty sequence whose first
    element provides `asDict()`; the value stored under
    ['metadata']['sda'] of that dict is returned unchanged.
    """
    first_annotation = sentiment_score[0]
    metadata = first_annotation.asDict()['metadata']
    return metadata['sda']

# Use an explicit name for the timer: `_` is IPython's "last output"
# variable, so assigning to it hides that feature and reads poorly.
start_time = default_timer()

print('Calculating accuracy for initial word lists model on %s labeled rows.\n' % ntest)

# Four separate Spark actions below read `results`; caching it avoids
# re-running the whole annotation pipeline for each of them.
results = test.select(KP_TEST_COL, 'sentiment_score').cache()

# Rows are indexed positionally (0 = human label, 1 = sentiment annotations)
# because tuple-parameter lambdas (`lambda (sa, sp): ...`) are a syntax error
# on Python 3.
pos_rdd = results.rdd.filter(lambda row: row[0] == '+')
num_pos = float(pos_rdd.count())

neg_rdd = results.rdd.filter(lambda row: row[0] == '-')
num_neg = float(neg_rdd.count())

pos_correct = pos_rdd \
        .filter(lambda row: get_score(row[1]).lower() == 'positive') \
        .count()

neg_correct = neg_rdd \
        .filter(lambda row: get_score(row[1]).lower() == 'negative') \
        .count()

# Report NaN instead of raising ZeroDivisionError when a class (or the whole
# test set) has no rows.
pos_accuracy = pos_correct / num_pos if num_pos else float('nan')
neg_accuracy = neg_correct / num_neg if num_neg else float('nan')

accuracy = (pos_correct + neg_correct) / float(ntest) if ntest else float('nan')

print('Viveken Sentiment Model Results\n'
      'Positive Accuracy: %s%%\nNegative Accuracy %s%%\n'
      'Overall Accuracy %s%%\nTime: %s s\n' % \
      (round(pos_accuracy * 100, 2), round(neg_accuracy * 100, 2),
       round(accuracy * 100, 2), default_timer() - start_time))

Calculating accuracy for initial word lists model on 899 labeled rows.

Viveken Sentiment Model Results
Positive Accuracy: 74.0%
Negative Accuracy 99.0%
Overall Accuracy 82.31%
Time: 1784.92652798 s

