In [1]:
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
products = pd.read_csv("amazon_baby.csv")

# fill missing values with empty string
products.review.fillna("", inplace=True)

In [4]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# 1. Use .apply() to build a new feature with the counts for each of the selected_words

In [33]:
from collections import defaultdict

import nltk


def count_words(row):
    word_counter = defaultdict(int)
    for word in nltk.word_tokenize(row):
        word_counter[word.lower()] += 1
    return word_counter

In [25]:
products['word_counts'] = products.review.apply(count_words)

In [26]:
products.head()

Unnamed: 0,name,review,rating,word_counts
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,"{'these': 1, 'flannel': 1, 'wipes': 3, 'are': ..."
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 3, 'came': 1, 'early': 1, 'and': 3, 'wa..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'very': 1, 'soft': 1, 'and': 2, 'comfortable'..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'this': 4, 'is': 4, 'a': 2, 'product': 2, 'we..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'all': 2, 'of': 1, 'my': 1, 'kids': 2, 'have'..."


In [32]:
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible','bad', 'terrible', 'awful', 'wow', 'hate'
]

In [45]:
def count_rows_with_selected_words(row):
    counter = {}
    for word in selected_words:
        counter[word] = row.get(word, 0)
    return counter

rows = [
    products.word_counts.apply(count_rows_with_selected_words)
]

AttributeError: 'DataFrame' object has no attribute 'word_counts'

In [43]:
products = pd.concat([products.drop('word_counts', axis='columns'), pd.DataFrame(rows)], axis='columns')

In [44]:
products.head()

Unnamed: 0,name,review,rating,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,0,0,0,0,0,0,0,0,0,0,0
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,0,0,0,0,1,0,0,0,0,0,0
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,0,0,0,0,0,0,0,0,0,0,0
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,0,0,0,0,2,0,0,0,0,0,0
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,0,1,0,0,1,0,0,0,0,0,0


In [132]:
# count the frequency of each selected word 
products[selected_words].sum().sort_values()

wow            445
awful          737
horrible      1223
terrible      1250
hate          1265
fantastic     1733
amazing       2677
awesome       3991
bad           4837
love         43614
great        58588
dtype: int64

**Question 1**: Out of the 11 words in selected_words, which one is most used in the reviews in the dataset?
* great

**Question 2**: Out of the 11 words in selected_words, which one is least used in the reviews in the dataset?
* wow

# 2. Create a new sentiment analysis model using only the selected_words as features:

## Define what's a positive and a negative sentiment

In [50]:
products['sentiment'] = products.rating >= 3
products.sentiment = products.sentiment.astype('int')

In [52]:
products.sentiment.value_counts()

1    157038
0     26493
Name: sentiment, dtype: int64

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# store the feature matrix in "X"
X = products[selected_words]
y = products.sentiment

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [55]:
selected_words_model = LogisticRegression(solver='lbfgs', max_iter=300)

In [57]:
selected_words_model.fit(X_train, y_train)

LogisticRegression(max_iter=300)

In [58]:
y_pred_class = selected_wordspredicted_sentiment_model.predict(X_test)

In [110]:
y_pred_proba = selected_words_model.predict_proba(X_test)[:, 1]

In [62]:
import numpy as np

In [82]:
pd.DataFrame(
    selected_words_model.coef_.flatten(),
    index=selected_words, columns=['coefficient']
).sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient
love,1.293147
awesome,1.03657
amazing,1.021343
fantastic,0.888352
great,0.812487
wow,-0.090775
bad,-0.796074
hate,-1.229229
awful,-1.909762
terrible,-2.001889


**Question 3**: Out of the 11 words in selected_words, which one got the most positive weight in the selected_words_model? 
* love

**Question 4**: Out of the 11 words in selected_words, which one got the most negative weight in the selected_words_model?
* horrible

In [117]:
# add predict_sentiment column to the products dataframe
products = pd.concat((products, pd.Series(y_pred_proba, name='predict_sentiment')), axis=1)

# 3. Comparing the accuracy of different sentiment analysis model

In [118]:
from sklearn import metrics

In [119]:
metrics.accuracy_score(y_test, y_pred_class)

0.8571117225597298

**Question 5**: Which of the following ranges contains the accuracy of the selected_words_model on the test_data?
* 0.841 to 0.871

**Question 6**: Which of the following ranges contains the accuracy of the sentiment_model in the IPython Notebook from lecture on the test_data?
* 0.901 to 0.931

**Null accuracy**: accuracy that could be achieved by always predicting the most frequent class

In [120]:
products.sentiment.value_counts()[1] / len(products)

0.8556483645814603

**Question 7**: Which of the following ranges contains the accuracy of the majority class classifier, which simply predicts the majority class on the test_data?
* .811 to .843

* Previously, we achieved **0.9319960420976883** accuracy.

**Question 8**: How do you compare the different learned models with the baseline approach where we are just predicting the majority class?
* classifier with all words was better than the other two, but the others were almost the same.

# 4. Interpreting the difference in performance between the models

In [124]:
diaper_champ_reviews = products.loc[products.name == "Baby Trend Diaper Champ", ['name', 'rating', 'predict_sentiment']]

In [126]:
diaper_champ_reviews

Unnamed: 0,name,rating,predict_sentiment
312,Baby Trend Diaper Champ,4,0.815670
313,Baby Trend Diaper Champ,3,0.815670
314,Baby Trend Diaper Champ,1,0.957395
315,Baby Trend Diaper Champ,5,0.395923
316,Baby Trend Diaper Champ,5,0.815670
...,...,...,...
640,Baby Trend Diaper Champ,4,0.941609
641,Baby Trend Diaper Champ,5,0.908858
642,Baby Trend Diaper Champ,5,0.908858
643,Baby Trend Diaper Champ,2,0.815670


In [129]:
diaper_champ_reviews.sort_values(by='predict_sentiment', ascending=False, inplace=True)

In [133]:
diaper_champ_reviews.head()

Unnamed: 0,name,rating,predict_sentiment
604,Baby Trend Diaper Champ,5,0.997932
587,Baby Trend Diaper Champ,3,0.995352
526,Baby Trend Diaper Champ,5,0.994001
380,Baby Trend Diaper Champ,4,0.992505
390,Baby Trend Diaper Champ,5,0.992505


**Question 9**: Which of the following ranges contains the ‘predicted_sentiment’ for the most positive review for ‘Baby Trend Diaper Champ’, according to the sentiment_model from the IPython Notebook from lecture?

* Without examin anything 0.9 to 1.0

**Question 10**: Consider the most positive review for ‘Baby Trend Diaper Champ’ according to the sentiment_model from the IPython Notebook from lecture. Which of the following ranges contains the predicted_sentiment for this review, if we use the selected_words_model to analyze it?

* 0.7 to 0.8

**Question 11**: Why is the value of the predicted_sentiment for the most positive review found using the sentiment_model much more positive than the value predicted using the selected_words_model?
* None

In [131]:
diaper_champ_reviews.tail()

Unnamed: 0,name,rating,predict_sentiment
611,Baby Trend Diaper Champ,5,0.564157
473,Baby Trend Diaper Champ,5,0.522284
415,Baby Trend Diaper Champ,5,0.395923
315,Baby Trend Diaper Champ,5,0.395923
418,Baby Trend Diaper Champ,5,0.370369
