In [21]:
import graphlab
# Load the product-review data from the on-disk SFrame directory
# (path is relative to the notebook's working directory).
products = graphlab.SFrame('amazon_baby.gl/')

In [22]:
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    Handles byte strings (Python 2 ``str``) as well as text strings
    (Python 2 ``unicode`` / Python 3 ``str``). The two-argument
    ``translate(None, chars)`` form is only valid for byte strings, so text
    strings go through a deletion table instead.

    Parameters
    ----------
    text : bytes, str or unicode
        The string to clean.

    Returns
    -------
    Same type as `text`, without any character of ``string.punctuation``.
    """
    import string
    if isinstance(text, bytes):
        # Byte strings: the second argument lists the bytes to delete.
        return text.translate(None, string.punctuation.encode('ascii'))
    # Text strings: mapping a code point to None deletes it.
    return text.translate(dict.fromkeys(map(ord, string.punctuation)))

# Fill missing reviews with '' *before* cleaning. fillna() hands back a new
# SFrame rather than modifying `products`, so the result must be re-assigned
# for the replacement to take effect.
products = products.fillna(column='review', value='')
products['review_clean'] = products['review'].apply(remove_punctuation)

In [23]:
# NOTE(review): the frame returned by fillna() is only displayed as this
# cell's output; it is not assigned, so `products` itself appears to be left
# unchanged by this cell — confirm against the SFrame.fillna documentation.
products.fillna(column='review', value='')

name,review,rating,review_clean
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0,These flannel wipes are OK but in my opinion not ...
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,it came early and was not disappointed i love ...
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,Very soft and comfortable and warmer than it ...
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,This is a product well worth the purchase I ...
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,All of my kids have cried nonstop when I tried to ...
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,When the Binky Fairy came to our house we didnt ...
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,Lovely book its bound tightly so you may no ...
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,Perfect for new parents We were able to keep ...
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,A friend of mine pinned this product on Pinte ...
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,This has been an easy way for my nanny to record ...


In [24]:
# Keep only rows whose rating is not exactly 3; the next step labels ratings
# above 3 as +1 and everything else as -1.
products = products[products['rating'] != 3]

In [26]:
def _rating_to_sentiment(rating):
    """Label a rating: +1 when it is strictly above 3, otherwise -1."""
    if rating > 3:
        return +1
    return -1

# One sentiment label per review, derived from its star rating.
products['sentiment'] = products['rating'].apply(_rating_to_sentiment)

# Split into training and test sets

In [28]:
# Random split with fraction 0.8 going to the first frame (training) and the
# remainder to the second (test); the fixed seed keeps the split repeatable.
train_data, test_data = products.random_split(.8, seed=1)

# Build the word count vector for each review

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Use this token pattern (\b\w+\b) to keep single-letter words as tokens.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data (one column per word)
# and convert the training reviews into a sparse word-count matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test reviews into a sparse matrix using the same
# word-to-column mapping; only transform() is called, so the vocabulary is
# not re-learned from the test data.
test_matrix = vectorizer.transform(test_data['review_clean'])

# Train a sentiment classifier with logistic regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [34]:
# Fit a logistic-regression classifier on the training word counts, with the
# +1/-1 sentiment column as the target. No constructor arguments are passed,
# so the library defaults apply (they are echoed in the repr printed below).
model = LogisticRegression()
model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Q1

In [40]:
# Number of learned coefficients that are zero or positive.
nonnegative_mask = model.coef_ >= 0
print(nonnegative_mask.sum())

85823


In [41]:
# Container type of the coefficients, then the number of coefficients in the
# single row of the coefficient matrix.
print(type(model.coef_))
print(model.coef_.shape[1])

<type 'numpy.ndarray'>
121712


## Q2

In [42]:
# Slice three consecutive test rows (positions 10, 11 and 12) as a small
# sample whose predictions can be checked by hand.
sample_test_data = test_data[10:13]
print sample_test_data

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
|   Our Baby Girl Memory Book   | Absolutely love it and all... |  5.0   |
| Wall Decor Removable Decal... | Would not purchase again o... |  2.0   |
| New Style Trailing Cherry ... | Was so excited to get this... |  1.0   |
+-------------------------------+-------------------------------+--------+
+-------------------------------+-----------+
|          review_clean         | sentiment |
+-------------------------------+-----------+
| Absolutely love it and all... |     1     |
| Would not purchase again o... |     -1    |
| Was so excited to get this... |     -1    |
+-------------------------------+-----------+
[3 rows x 5 columns]



In [43]:
# Raw (uncleaned) text of the first sampled review.
sample_test_data[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [46]:
# Raw (uncleaned) text of the second sampled review.
sample_test_data[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [50]:
# Re-use the already fitted vectorizer so the sample rows get the same
# word-to-column mapping as the training matrix.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
# One raw decision score per sampled review; predict_sentiment below turns
# the sign of each score into a class label.
scores = model.decision_function(sample_test_matrix)
print scores

[  5.61045832  -3.14770511 -10.4220766 ]


In [52]:
def predict_sentiment(scores):
    """Turn decision scores into class labels.

    Each score strictly greater than zero becomes +1; every other score
    (including exactly zero) becomes -1. Returns a list with one label per
    input score, in the same order.
    """
    return [1 if score > 0 else -1 for score in scores]

In [53]:
# Apply the sign rule to the three sample scores computed above.
print predict_sentiment(scores)

[1, -1, -1]


In [54]:
def predict_prob(scores):
    """Map decision scores to probabilities with the logistic sigmoid.

    For each score ``s`` the result is ``1 / (1 + exp(-s))``. Returns a list
    with one value per input score, in the same order.
    """
    import numpy as np
    return [1. / (1. + np.exp(-score)) for score in scores]

In [55]:
# Convert the three sample scores into probabilities via predict_prob.
print predict_prob(scores)

[0.99635395051971587, 0.041181798490862391, 2.9767112857786577e-05]


## Q3

In [91]:
# Sigmoid of the decision score for every test review, stored as a new column.
test_data['prob'] = predict_prob(model.decision_function(test_matrix))
# Show the 20 reviews with the highest probability. print_rows() writes the
# table itself and returns nothing useful, so it is not wrapped in `print`
# (which would emit a stray "None" after the table).
test_data.sort(sort_columns='prob', ascending=False).print_rows(num_rows=20)

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
| P'Kolino Silly Soft Seatin... | I've purchased both the P'... |  4.0   |
| Roan Rocco Classic Pram St... | Great Pram Rocco!!!!!!I bo... |  5.0   |
| Freemie Hands-Free Conceal... | I absolutely love this pro... |  5.0   |
| Graco Pack 'n Play Element... | My husband and I assembled... |  4.0   |
| Evenflo X Sport Plus Conve... | After seeing this in Paren... |  5.0   |
| Baby Jogger City Mini GT S... | Amazing, Love, Love, Love ... |  5.0   |
| Graco FastAction Fold Jogg... | Graco's FastAction Jogging... |  5.0   |
| Buttons Cloth Diaper Cover... | We are big Best Bottoms fa... |  4.0   |
| Britax 2012 B-Agile Stroll... | [I got this stroller for m... |  4.0   |
| Mamas &amp; Papas 2014 Urb... | After much research I purc... |  4.0   |
| Evenflo 6 Pack Classic 

## Q4

In [92]:
# Show the 20 reviews with the lowest 'prob' value. print_rows() writes the
# table itself and returns nothing useful, so it is not wrapped in `print`
# (which would emit a stray "None" after the table).
test_data.sort(sort_columns='prob', ascending=True).print_rows(num_rows=20)

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
| Fisher-Price Ocean Wonders... | We have not had ANY luck w... |  2.0   |
| Levana Safe N'See Digital ... | This is the first review I... |  1.0   |
| Safety 1st Exchangeable Ti... | I thought it sounded great... |  1.0   |
| Adiri BPA Free Natural Nur... | I will try to write an obj... |  2.0   |
| VTech Communications Safe ... | This is my second video mo... |  1.0   |
| The First Years True Choic... | Note: we never installed b... |  1.0   |
| Safety 1st High-Def Digita... | We bought this baby monito... |  1.0   |
| Cloth Diaper Sprayer--styl... | I bought this sprayer out ... |  1.0   |
| Motorola Digital Video Bab... | DO NOT BUY THIS BABY MONIT... |  1.0   |
| Philips AVENT Newborn Star... | It's 3am in the morning an... |  1.0   |
| Cosco Alpha Omega Elite

# Compute accuracy of the classifier

## Q5

In [93]:
# Display the test frame; the output below includes the 'prob' column that
# was added to it earlier.
test_data

name,review,rating,review_clean,sentiment
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,This has been an easy way for my nanny to record ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,I love this journal and our nanny uses it ...,1
Nature's Lullabies First Year Sticker Calendar ...,"I love this little calender, you can keep ...",5.0,I love this little calender you can keep ...,1
Nature's Lullabies Second Year Sticker Calendar ...,"I had a hard time finding a second year calendar, ...",5.0,I had a hard time finding a second year calendar ...,1
"Lamaze Peekaboo, I Love You ...","One of baby's first and favorite books, and i ...",4.0,One of babys first and favorite books and it is ...,1
"Lamaze Peekaboo, I Love You ...",My son loved this book as an infant. It was ...,5.0,My son loved this book as an infant It was per ...,1
"Lamaze Peekaboo, I Love You ...",Our baby loves this book & has loved it for a ...,5.0,Our baby loves this book has loved it for a while ...,1
"SoftPlay Giggle Jiggle Funbook, Happy Bear ...",This bear is absolutely adorable and I would ...,2.0,This bear is absolutely adorable and I would ...,-1
SoftPlay Peek-A-Boo Where's Elmo A Childr ...,I bought two for recent baby showers! The book ...,5.0,I bought two for recent baby showers The boo ...,1
Baby's First Year Undated Wall Calendar with ...,I searched high and low for a first year cale ...,5.0,I searched high and low for a first year cale ...,1

prob
0.778357867964
0.999999258526
0.934550814871
0.999977964941
0.98022388167
0.999952869283
0.998739797898
0.808889363637
0.998481718924
0.997288349874


In [95]:
# Class label predicted by the full model for every test review, stored next
# to the true label.
test_data['predicted_sentiment'] = model.predict(test_matrix)

In [97]:
# Element-wise comparison: one flag per row marking whether the predicted
# label equals the true sentiment.
test_data['correct'] = (test_data['sentiment'] == test_data['predicted_sentiment'])

In [100]:
# Accuracy = (# correct predictions) / (# test rows). Sum the flags first and
# divide once; the 1.0 factor forces float division. This matches the form
# used by the later accuracy cells instead of dividing every element before
# summing.
print('accuracy = ' + str(1.0*sum(test_data['correct'])/len(test_data['correct'])))

accuracy = 0.932235421166


# Learn another classifier with fewer words

In [101]:
# The 20 hand-picked words the reduced model is restricted to.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [102]:
# Count only the words in `significant_words` (a fixed 20-word vocabulary).
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
# Word-count matrices for the training and test reviews over that vocabulary.
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

# Train a logistic regression model on a subset of data

In [103]:
# Second classifier: same estimator with default arguments, but trained only
# on the 20-word count matrix.
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [137]:
# Pair every significant word with the coefficient the reduced model learned
# for it, then pull out the words whose coefficient is strictly positive.
simple_model_coef_table = graphlab.SFrame({
    'word': significant_words,
    'coefficients': simple_model.coef_.flatten(),
})
has_positive_coefficient = simple_model_coef_table['coefficients'] > 0
simple_model_positive_word = simple_model_coef_table[has_positive_coefficient]['word']

In [138]:
# Print up to 20 rows of the coefficient table, largest coefficient first.
simple_model_coef_table.sort(sort_columns='coefficients', ascending=False).print_rows(num_rows=20)

+-----------------+--------------+
|   coefficients  |     word     |
+-----------------+--------------+
|  1.67307389259  |    loves     |
|  1.50981247669  |   perfect    |
|  1.36368975931  |     love     |
|  1.19253827349  |     easy     |
|  0.943999590572 |    great     |
|  0.520185762718 |    little    |
|  0.503760457768 |     well     |
|  0.190908572065 |     able     |
| 0.0855127794631 |     old      |
| 0.0588546711528 |     car      |
| -0.209562864535 |     less     |
| -0.320556236734 |   product    |
| -0.362166742274 |    would     |
| -0.511379631799 |     even     |
| -0.621168773642 |     work     |
| -0.898030737715 |    money     |
|  -1.65157634497 |    broke     |
|  -2.03369861394 |    waste     |
|  -2.10933109032 |    return    |
|  -2.3482982195  | disappointed |
+-----------------+--------------+
[20 rows x 2 columns]



In [110]:
# vocabulary_ maps each word to its column index in the count matrix, and a
# dict's key order is not guaranteed to follow those indices. Order the words
# by column index so that row i of the table pairs word i with coef_[0][i].
vocabulary = vectorizer.vocabulary_
words_by_column = sorted(vocabulary, key=vocabulary.get)
model_coef_table = graphlab.SFrame({'word': words_by_column,
                                    'coefficient': model.coef_.flatten()})
# Words to which the full model assigns a strictly positive coefficient.
model_positive_word = model_coef_table[model_coef_table['coefficient'] > 0]['word']

## Q6

In [139]:
# For each word with a positive coefficient in the reduced model, report
# whether the full model also gives that exact word a positive coefficient.
# SArray.contains() performs a substring test on string elements (so 'car'
# would also match e.g. 'carseat'); an exact lookup in a set avoids those
# false matches and scans the full word list only once.
full_model_positive_words = set(model_positive_word)
for word in simple_model_positive_word:
    if word in full_model_positive_words:
        print('yes')
    else:
        print('no')

yes
yes
yes
yes
yes
yes
yes
yes
yes
yes


# Comparing Models

In [120]:
# Predictions of both models on the training set ...
train_data['predicted_sentiment'] = model.predict(train_matrix)
train_data['predicted_sentiment_simple'] = simple_model.predict(train_matrix_word_subset)
# ... and per-row flags marking whether each prediction matches the true label.
train_data['correct'] = (train_data['predicted_sentiment'] == train_data['sentiment'])
train_data['correct_simple'] = (train_data['predicted_sentiment_simple'] == train_data['sentiment'])

## Q7

In [129]:
# Training accuracy = number of correct rows / number of rows, per model.
train_accuracy = 1.0*sum(train_data['correct'])/len(train_data['correct'])
train_accuracy_simple = 1.0*sum(train_data['correct_simple'])/len(train_data['correct_simple'])
print('training accuracy = ' + str(train_accuracy))
print('training accuracy of simple model = ' + str(train_accuracy_simple))

training accuracy = 0.967769982611
training accuracy of simple model = 0.866822570007


## Q8

In [130]:
# Predictions of both models on the test set. 'predicted_sentiment' and
# 'correct' were already filled in by earlier cells and are recomputed here
# with the same expressions so this section is self-contained.
test_data['predicted_sentiment'] = model.predict(test_matrix)
test_data['predicted_sentiment_simple'] = simple_model.predict(test_matrix_word_subset)
# Per-row flags marking whether each prediction matches the true label.
test_data['correct'] = (test_data['predicted_sentiment'] == test_data['sentiment'])
test_data['correct_simple'] = (test_data['predicted_sentiment_simple'] == test_data['sentiment'])

In [131]:
# Test accuracy = number of correct rows / number of rows, per model.
test_accuracy = 1.0*sum(test_data['correct'])/len(test_data['correct'])
test_accuracy_simple = 1.0*sum(test_data['correct_simple'])/len(test_data['correct_simple'])
print('test accuracy = ' + str(test_accuracy))
print('test accuracy of simple model = ' + str(test_accuracy_simple))

test accuracy = 0.932235421166
test accuracy of simple model = 0.869360451164


# Baseline: Majority class prediction

## Q9

In [132]:
# Class balance of the training set: +1 rows first, then -1 rows.
train_positive_rows = train_data[train_data['sentiment'] == 1]
train_negative_rows = train_data[train_data['sentiment'] == -1]
print(len(train_positive_rows))
print(len(train_negative_rows))

112164
21252


In [134]:
# Class balance of the test set: +1 rows first, then -1 rows.
test_positive_rows = test_data[test_data['sentiment'] == 1]
test_negative_rows = test_data[test_data['sentiment'] == -1]
print(len(test_positive_rows))
print(len(test_negative_rows))

28095
5241


In [135]:
# Baseline that always predicts +1 (the larger class in the counts printed
# above): its accuracy is simply the share of +1 rows in the test set.
print 'majority class accuracy on test set = ' + str(1.0*len(test_data[test_data['sentiment'] == 1])/len(test_data))

majority class accuracy on test set = 0.842782577394
