# Import useful libraries

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

# Read data

In [2]:
products = pd.read_csv('amazon_baby.csv')

# Explore data

In [3]:
products.head(10)

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4


# Text Process

In [4]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [5]:
def remove_punctuation(text):
    import string
    # The first and second params are characters needed to be replaced
    # The third one is to be deleted
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

products['review_clean'] = products['review'].apply(remove_punctuation)

In [6]:
products.head(10)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4,Lovely book its bound tightly so you may not b...
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,Perfect for new parents We were able to keep t...
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,A friend of mine pinned this product on Pinter...
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...


In [7]:
# Transform rating into categorical data
products = products[products['rating'] != 3]

def create_sentiment(rating):
    if rating > 3:
        return 1
    else:
        return 0

products['sentiment'] = products['rating'].apply(create_sentiment)
products.head(10)

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4,Lovely book its bound tightly so you may not b...,1
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,Perfect for new parents We were able to keep t...,1
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,A friend of mine pinned this product on Pinter...,1
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1


In [8]:
# Accuracy of majority class classifier
products['sentiment'].sum() / len(products)

0.8411233448474381

# All Words

## Vectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = 'english', min_df = 10, max_features = 5000)
f = cv.fit_transform(products['review_clean'])

## Get Features' name and amount

In [10]:
# Construct DataFrame for each Feature
feature_names = cv.get_feature_names()
feature_count = f.sum(axis = 0).tolist()[0]

df = pd.DataFrame(list(zip(feature_names, feature_count)), 
                  columns = ['Feature', 'Count']).sort_values(by = ['Count'], ascending = False)

In [11]:
df

Unnamed: 0,Feature,Count
327,baby,65155
1896,great,54398
2285,just,49383
4703,use,46722
2438,like,44420
...,...,...
3116,picnic,83
673,calmed,83
3086,performs,83
4847,weighing,83


## Train Model & Calculate Weight

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test, indice_train, indice_test = train_test_split(f, products['sentiment'], products.index, test_size=0.2, random_state=0)
lreg = LogisticRegression()
model = lreg.fit(X_train, y_train)

y_pred = model.predict(X_test)
model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9202122874876315

## Predict Sentiment

In [14]:
# indice are used to match columns
# .predict_proba() returns probability for each class
products.loc[indice_train, 'predicted_sentiment'] = model.predict_proba(X_train)[:, 1]
products.loc[indice_test, 'predicted_sentiment'] = model.predict_proba(X_test)[:, 1]
products.loc[(products.name=='Baby Trend Diaper Champ') & (products.index.isin(indice_train))].sort_values(by=['predicted_sentiment'], ascending=False)

Unnamed: 0,name,review,rating,review_clean,sentiment,predicted_sentiment
376,Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper pa...",5,This is absolutely by far the best diaper pail...,1,1.000000
414,Baby Trend Diaper Champ,We have been using our Diaper Champ for almost...,5,We have been using our Diaper Champ for almost...,1,0.999999
604,Baby Trend Diaper Champ,I have been using this diaper pail for 41/2 mo...,5,I have been using this diaper pail for 412 mon...,1,0.999998
451,Baby Trend Diaper Champ,"As a first time mother, I wanted to get the be...",5,As a first time mother I wanted to get the bes...,1,0.999997
512,Baby Trend Diaper Champ,Bottom line: Using regular bags saves big time...,5,Bottom line Using regular bags saves big time ...,1,0.999994
...,...,...,...,...,...,...
504,Baby Trend Diaper Champ,This thing freakin' stinks. Literally. I chang...,1,This thing freakin stinks Literally I changed ...,0,0.009999
484,Baby Trend Diaper Champ,Worst diaper pale ever!! I've had mine for 2 ...,1,Worst diaper pale ever Ive had mine for 2 yea...,0,0.003564
550,Baby Trend Diaper Champ,"Ok, so the idea of the Diaper Champ is awesome...",1,Ok so the idea of the Diaper Champ is awesomei...,0,0.000516
357,Baby Trend Diaper Champ,This is the worst diaper pail ever! It was gr...,1,This is the worst diaper pail ever It was gre...,0,0.000463


# Selected Words

## Vectorizer

In [15]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
cv2 = CountVectorizer(stop_words='english', min_df=2, vocabulary=selected_words)
f2 = cv2.fit_transform(products['review_clean'])

## Get Features' name and amount

In [16]:
feature_count = f2.sum(axis=0).tolist()[0]
feature_names = cv2.get_feature_names()

df2 = pd.DataFrame(list(zip(feature_names,feature_count)), columns = ['Feature', 'Count']).sort_values(by=['Count'], ascending=False)
df2

Unnamed: 0,Feature,Count
1,great,54398
4,love,41515
6,bad,4044
0,awesome,3735
3,amazing,2539
2,fantastic,1604
7,terrible,1101
10,hate,1090
5,horrible,1085
8,awful,657


## Train Model & Calculate Weight

In [17]:
X2_train, X2_test, y2_train, y2_test, indice2_train, indice2_test = train_test_split(f2,
                                                                                    products['sentiment'],
                                                                                    products.index,
                                                                                    test_size = 0.2,
                                                                                    random_state = 0)

In [18]:
model2 = lreg.fit(X2_train, y2_train)

In [19]:
pd.DataFrame(list(zip(cv2.get_feature_names(), model2.coef_[0])), columns=['Feature','Weight']).sort_values(by='Weight', ascending=False)

Unnamed: 0,Feature,Weight
4,love,1.387334
0,awesome,1.178728
3,amazing,1.033787
2,fantastic,0.899419
1,great,0.865369
9,wow,-0.08073
6,bad,-0.987045
10,hate,-1.433868
8,awful,-2.074177
7,terrible,-2.197423


In [20]:
model2.score(X2_test, y2_test)

0.8451320799976013

## Predict Sentiment

In [21]:
products.loc[indice2_train, 'Predict_Sentiment2'] = model2.predict_proba(X2_train)[:,1]
products.loc[indice2_test, 'Predict_Sentiment2'] = model2.predict_proba(X2_test)[:,1]
products.loc[(products['name']=='Baby Trend Diaper Champ') & (products.index.isin(indice2_train))].sort_values(by='Predict_Sentiment2', ascending=False)

Unnamed: 0,name,review,rating,review_clean,sentiment,predicted_sentiment,Predict_Sentiment2
329,Baby Trend Diaper Champ,I LOVE LOVE LOVE this product! It is SO much e...,4,I LOVE LOVE LOVE this product It is SO much ea...,1,0.999783,0.998293
446,Baby Trend Diaper Champ,I received my Diaper Champ at my baby shower f...,5,I received my Diaper Champ at my baby shower f...,1,0.999187,0.995954
459,Baby Trend Diaper Champ,"Love it, love it, love it! This lives up to t...",5,Love it love it love it This lives up to the ...,1,0.999730,0.995954
367,Baby Trend Diaper Champ,"Let me just say, I LOVE THIS PRODUCT!! I used...",5,Let me just say I LOVE THIS PRODUCT I used th...,1,0.999979,0.988971
426,Baby Trend Diaper Champ,I love this diaper pale and wouldn't dream of ...,5,I love this diaper pale and wouldnt dream of t...,1,0.997101,0.988593
...,...,...,...,...,...,...,...
580,Baby Trend Diaper Champ,This is the best thing ever. Easy to use and n...,5,This is the best thing ever Easy to use and no...,1,0.566145,0.588304
317,Baby Trend Diaper Champ,Two girlfriends and two family members put me ...,5,Two girlfriends and two family members put me ...,1,0.891497,0.524031
445,Baby Trend Diaper Champ,For my first born I purchased the Diaper Genie...,5,For my first born I purchased the Diaper Genie...,1,0.485617,0.281387
489,Baby Trend Diaper Champ,I registered for this product after reading th...,2,I registered for this product after reading th...,0,0.000417,0.281387
