# Predicting sentiment from product reviews

# Fire up GraphLab Create
(See [Getting Started with SFrames](/notebooks/Week%201/Getting%20Started%20with%20SFrames.ipynb) for setup instructions)

In [1]:
import graphlab

# Read some product review data

Loading reviews for a set of baby products. 

In [2]:
# Load the baby-product reviews from the SFrame stored at 'amazon_baby.gl/'
# (the path is relative, so it is resolved against the current working directory).
products = graphlab.SFrame('amazon_baby.gl/')

2016-06-12 16:04:40,733 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.9 started. Logging: /tmp/graphlab_server_1465747480.log


This non-commercial license of GraphLab Create is assigned to jzhao59@illinois.edu and will expire on June 11, 2017. For commercial licensing options, visit https://dato.com/buy/.


# Build the word count vector for each review

In [3]:
# Add a 'word_count' column holding, for each review, a dict of token -> count.
# Tokens keep their attached punctuation (the saved preview shows keys such as
# 'parents!!' and 'puppet.'), so exact-word lookups will not match those variants.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

In [4]:
# Preview the first rows of the frame, including the 'word_count' column.
products.head()

name,review,rating,word_count
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0,"{'and': 5, 'stink': 1, 'because': 1, 'ordered': ..."
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'and': 3, 'love': 1, 'it': 2, 'highly': 1, ..."
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ..."
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,"{'ingenious': 1, 'and': 3, 'love': 2, ..."
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,"{'and': 2, 'parents!!': 1, 'all': 2, 'puppet.': ..."
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,"{'and': 2, 'cute': 1, 'help': 2, 'doll': 1, ..."
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,"{'shop': 1, 'be': 1, 'is': 1, 'it': 1, 'as': ..."
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,"{'feeding,': 1, 'and': 2, 'all': 1, 'right': 1, ..."
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,"{'and': 1, 'help': 1, 'give': 1, 'is': 1, ..."
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'journal.': 1, 'all': 1, 'standarad': 1, ..."


In [4]:
# Select the 'ipynb' target for GraphLab Canvas.
# NOTE(review): presumably this makes later .show() calls render inline in the
# notebook -- confirm against the GraphLab Canvas documentation.
graphlab.canvas.set_target('ipynb')

## Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.  Reviews with a rating of 4 or higher will be considered positive, while the ones with rating of 2 or lower will have a negative sentiment.   

In [5]:
# Drop every 3-star review: those are treated as neutral and left out of the task.
# NOTE: this rebinds `products`, so the unfiltered frame is only recoverable by
# re-running the load cell.
products = products[products['rating'] != 3]

In [6]:
# Binary label: a rating of 4 or higher marks the review as positive sentiment.
products['sentiment'] = (products['rating'] >= 4)

# Assignment

## Organize the original data

In [7]:
# Hand-picked sentiment words; each one becomes a count column and a model feature.
# Keep the order as-is: it determines the order in which the columns are added.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate',
]

In [9]:
def selected_count(data, words):
    """Return how many times a single word occurs in one review.

    Args:
        data: mapping of word -> count for one review (one 'word_count' entry).
        words: the word to look up. Despite the plural name this is a single
            key, not a collection; the name is kept so existing calls still work.

    Returns:
        The count stored for ``words``, or 0 when the word is not a key of ``data``.
    """
    # dict.get performs the membership test and the lookup in one step.
    return data.get(words, 0)

# Add one count column per selected word, named after the word itself.
for word in selected_words:
    # Bind the current word as a default argument. A plain closure would look the
    # loop variable up only when the lambda actually runs, which gives the wrong
    # word if the apply is evaluated after the loop has moved on.
    products[word] = products['word_count'].apply(
        lambda counts, word=word: selected_count(counts, word))

In [10]:
# Spot-check the first two rows to confirm the per-word count columns are present.
products[:2]

name,review,rating,word_count,sentiment,awesome,great
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'and': 3, 'love': 1, 'it': 2, 'highly': 1, ...",1,0,0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ...",1,0,0

fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0,0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0


## Questions 1 and 2

In [22]:
# Print, for each selected word, the sum of its count column over `products`.
for item in selected_words:
    print('%s: \t%s' % (item, products[item].sum()))

awesome: 	2002
great: 	42420
fantastic: 	873
amazing: 	1305
love: 	40277
horrible: 	659
bad: 	3197
terrible: 	673
awful: 	345
wow: 	131
hate: 	1057


## Split the dataset and create the model

In [11]:
# Randomly split the reviews with a 0.8 fraction going to training; the fixed
# seed keeps the split reproducible across runs.
train_data, test_data = products.random_split(0.8, seed=0)

In [12]:
# Fit a logistic classifier for 'sentiment' that uses only the selected-word
# count columns as features.
# NOTE(review): the held-out test set is also passed as validation_set, so it is
# not an untouched test set if validation metrics guide any modelling choice.
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                           target='sentiment',
                                                           features=selected_words,
                                                           validation_set=test_data)

## Questions 3 and 4

In [13]:
# List the learned coefficients, most positive first, printing up to
# 12 rows (11 words plus the intercept) and 5 columns.
(selected_words_model['coefficients']
    .sort('value', ascending=False)
    .print_rows(12, 5))

+-------------+-------+-------+------------------+------------------+
|     name    | index | class |      value       |      stderr      |
+-------------+-------+-------+------------------+------------------+
|     love    |  None |   1   |  1.39989834302   | 0.0287147460124  |
| (intercept) |  None |   1   |  1.36728315229   | 0.00861805467824 |
|   awesome   |  None |   1   |  1.05800888878   |  0.110865296265  |
|   amazing   |  None |   1   |  0.892802422508  |  0.127989503231  |
|  fantastic  |  None |   1   |  0.891303090304  |  0.154532343591  |
|    great    |  None |   1   |  0.883937894898  | 0.0217379527921  |
|     wow     |  None |   1   | -0.0541450123333 |  0.275616449416  |
|     bad     |  None |   1   | -0.985827369929  | 0.0433603009142  |
|     hate    |  None |   1   |  -1.40916406276  | 0.0771983993506  |
|    awful    |  None |   1   |  -1.76469955631  |  0.134679803365  |
|   horrible  |  None |   1   |  -1.99651800559  | 0.0973584169028  |
|   terrible  |  Non

## Question 5

In [22]:
# Evaluate on the held-out split. The result is a dict of metrics (accuracy, auc,
# confusion_matrix, f1_score, log_loss, precision, recall, roc_curve).
# NOTE(review): in the saved output 27976 of the 33304 test rows are positive, so
# always predicting "positive" would already reach about 0.84 accuracy.
selected_words_model.evaluate(test_data)

{'accuracy': 0.8431419649291376,
 'auc': 0.6648096413721418,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        0        |  234  |
 |      1       |        0        |  130  |
 |      0       |        1        |  5094 |
 |      1       |        1        | 27846 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.914242563530107,
 'log_loss': 0.40547471103659266,
 'precision': 0.8453551912568306,
 'recall': 0.9953531598513011,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+-------+------+
 | threshold | fpr | tpr |   p   |  n   |
 +-----------+-----+-----+-------+------+
 |    0.0    | 1.0 | 1.0 | 27976 | 5328 |
 |   1e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   2e-05   |

In [23]:
# Open the model's 'Evaluation' view.
# NOTE(review): presumably rendered through GraphLab Canvas using the target set
# earlier in the notebook -- confirm.
selected_words_model.show(view='Evaluation')