# Amazon sentiment demo

In [1]:
import bz2
import re
import numpy as np

## 1. Read data sample

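You can skip this step if you do not want to test on real Amazon data. In that case, define `review` (and `review_label`) yourself before running the inference cells below, since they rely on these variables.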

In [2]:
# Read the bz2-compressed train/test samples as lists of raw byte lines.
# The `with` blocks close each archive as soon as it has been read, even if
# reading fails part-way (the original left both file handles open).
with bz2.BZ2File('../data/train.s.ft.txt.bz2') as train_file:
    train_file_lines = train_file.readlines()
with bz2.BZ2File('../data/test.s.ft.txt.bz2') as test_file:
    test_file_lines = test_file.readlines()

In [3]:
def _decode_lines(raw_lines):
    """Return the given byte lines decoded as UTF-8 text."""
    return [raw.decode('utf-8') for raw in raw_lines]


# Convert both samples from bytes to str. Not re-runnable without re-reading
# the files: str objects have no .decode().
train_file_lines = _decode_lines(train_file_lines)
test_file_lines = _decode_lines(test_file_lines)

In [4]:
# Label is the first space-separated token: '__label__1' maps to 0, anything else to 1.
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]

# Review text is everything after the first space. `[:-1]` drops the last
# character (presumably the trailing newline), then the text is lower-cased
# and every digit is replaced by '0'.
# The pattern is a raw string: '\d' in a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
train_sentences = [
    re.sub(r'\d', '0', x.split(' ', 1)[1][:-1].lower())
    for x in train_file_lines
]

In [5]:
# Label is the first space-separated token: '__label__1' maps to 0, anything else to 1.
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file_lines]

# Review text is everything after the first space. `[:-1]` drops the last
# character (presumably the trailing newline), then the text is lower-cased
# and every digit is replaced by '0'.
# The pattern is a raw string: '\d' in a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
test_sentences = [
    re.sub(r'\d', '0', x.split(' ', 1)[1][:-1].lower())
    for x in test_file_lines
]

In [6]:
def _mask_urls(sentences):
    """Replace URL-like tokens with "<url>", modifying `sentences` in place.

    Only sentences containing one of the web-address markers are rewritten.
    In those, every run of non-space characters that ends in a dot followed
    by three lowercase letters is substituted.
    """
    for idx, text in enumerate(sentences):
        if any(marker in text for marker in ('www.', 'http:', 'https:', '.com')):
            sentences[idx] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)


_mask_urls(train_sentences)
_mask_urls(test_sentences)

In [7]:
from textwrap import TextWrapper

# Wrapper used to pretty-print long reviews: every line after the first is
# prefixed with a tab and two spaces.
wrapper = TextWrapper()
wrapper.subsequent_indent = '\t  '

In [8]:
# Use the first test review and its ground-truth label as the demo example.
review, review_label = test_sentences[0], test_labels[0]

# Wrap the review so it prints as a readable multi-line paragraph.
wrapped_review = "\n".join(wrapper.wrap(review))
print("Sentence:", wrapped_review)
print("Label:   ", review_label)

Sentence: great cd: my lovely pat has one of the great voices of her generation.
	  i have listened to this cd for years and i still love it. when i'm
	  in a good mood it makes me feel better. a bad mood just evaporates
	  like sugar in the rain. this cd just oozes life. vocals are jusat
	  stuunning and lyrics just kill. one of life's hidden gems. this is
	  a desert isle cd in my book. why she never made it big is just
	  beyond me. everytime i play this, no matter black, white, young,
	  old, male, female everybody says one thing "who was that singing ?"
Label:    1


## 2. Perform an inference

In [9]:
from hydrosdk import Cluster, Application
import grpc

In [10]:
# Connection settings for the Hydrosphere cluster. Replace the placeholder
# addresses with the HTTP and gRPC endpoints of your own instance.
cluster = Cluster(
    http_address="<hydrosphere-http-address>",
    grpc_address="<hydrosphere-grpc-address>",
    # TLS settings: turn `ssl` off if your Hydrosphere instance doesn't have
    # TLS certificates installed.
    # NOTE(review): presumably `grpc_credentials` should be dropped as well in that case — confirm.
    ssl=True,
    grpc_credentials=grpc.ssl_channel_credentials()
)

### Tokenizer

For the tokenization we use the tokenizer model for which we've created an application.

In [11]:
# Look up the tokenizer application on the cluster; replace the placeholder
# with the name of your application.
app1 = Application.find(cluster, "<application-name>")
# NOTE(review): presumably blocks until the application has finished starting — confirm against the hydrosdk docs.
app1.lock_while_starting()
# Predictor object used below to send requests to the application.
predictor1 = app1.predictor()

In [12]:
# Send the raw review text to the tokenizer under the "sentence" input key.
result1 = predictor1.predict({"sentence": review})

In [13]:
# Display the tokenizer response (a dict with a "tokenized" array, see the output below).
result1

{'tokenized': array([   95,    21,  1531,  4475,    44,    24,     7,     1,    30,
         1917,     7,    79,  1957,     3,    20,  1030,     5,     8,
           95,    11,   138,     2,     3,   127,    81,     6,    51,
          110,    10,     4,    32,  2062,     6,   209,    43,   229,
           91,     4,   128,  2062,    36,    33,  2304,    10,     1,
         2274,     8,    95,    36,   154,  1031,    23,     2,   646,
           36,  1528,    24,     7,  6335,  2461,  4235,     8,     9,
            4,  3888, 12033,    95,    10,    21,    19,   182,    97,
          122,   129,     6,   220,     9,    36,   861,    43,  2693,
            3,   235,     8,    54,   615,   429,   576,   459,   134,
         1776,  1351,  2198,   498,    24,   162,    72,    13,    12,
          910])}

### Estimator

For the sentiment prediction we use the estimator model for which we've created an application.

In [14]:
# Look up the estimator application on the cluster; replace the placeholder
# with the name of your application.
app2 = Application.find(cluster, "<application-name>")
# NOTE(review): presumably blocks until the application has finished starting — confirm against the hydrosdk docs.
app2.lock_while_starting()
# Predictor object used below to send requests to the application.
predictor2 = app2.predictor()

In [15]:
# Feed the "tokenized" array returned by the tokenizer into the estimator.
result2 = predictor2.predict({"tokenized": result1["tokenized"]})

In [16]:
# Display the estimator response (a dict with "confidence" and "label", see the output below).
result2

{'confidence': 0.9660149216651917, 'label': 1}

In [17]:
# Compare the estimator's prediction with the ground-truth label of the review.
for caption, value in (
    ("Prediction confidence:", result2["confidence"]),
    ("Prediction label:", result2["label"]),
    ("Actual label:", review_label),
):
    print(caption, value)

Prediction confidence: 0.9660149216651917
Prediction label: 1
Actual label: 1


### Pipeline

For this test we've created a pipeline application that consists of two stages:

- tokenization
- sentiment estimation

In this app we pass the whole review text and receive its sentiment prediction.

In [18]:
# Look up the pipeline application on the cluster; replace the placeholder
# with the name of your application.
app3 = Application.find(cluster, "<application-name>")
# NOTE(review): presumably blocks until the application has finished starting — confirm against the hydrosdk docs.
app3.lock_while_starting()
# Predictor object used below to send requests to the application.
predictor3 = app3.predictor()

In [19]:
# Send the raw review text under the "sentence" input key; the response is
# read below for its "confidence" and "label" fields.
result3 = predictor3.predict({"sentence": review})

In [20]:
# Compare the pipeline's prediction with the ground-truth label of the review.
for caption, value in (
    ("Prediction confidence:", result3["confidence"]),
    ("Prediction label:", result3["label"]),
    ("Actual label:", review_label),
):
    print(caption, value)

Prediction confidence: 0.9660149216651917
Prediction label: 1
Actual label: 1
