# Import Libraries

In [1]:
import pandas as pd

# Read Data

In [2]:
products = pd.read_csv('amazon_baby.csv')
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# Data Processing

## Creating Sentiment

In [3]:
# transform datatype
products['review'] = products['review'].astype('str')
products['rating'] = products['rating'].astype('int')

# Remove neutral review
products = products[products['rating'] != 3]

def create_sentiment(review):
    if review > 3:
        return 1
    else:
        return 0
    
# Use .apply() to create column "sentiment"
products['sentiment'] = products['rating'].apply(create_sentiment)

In [4]:
products.head()

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1


## Remove Punctuation

In [5]:
def remove_punctuation(review):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return review.translate(translator)

In [6]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [7]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1,When the Binky Fairy came to our house we didn...


In [8]:
# Calculate Portion of Positive Review
products['sentiment'].sum() / len(products)

0.8411233448474381

# Sentiment Model

In [9]:
# Copy a df1 from df
df1 = products.copy()

from sklearn.feature_extraction.text import CountVectorizer
#cv = CountVectorizer(stop_words='english')
cv = CountVectorizer()
f = cv.fit_transform(df1['review_clean'])

## Construct DataFrame for features

In [10]:
features_name = cv.get_feature_names()
features_count = f.sum(axis=0).tolist()[0]

pd.DataFrame(list(zip(features_name, features_count)), columns=['Feature', 'Count']).sort_values(by=['Count'], ascending=False)

Unnamed: 0,Feature,Count
123235,the,666231
12183,and,414160
66632,it,376053
126381,to,372951
66290,is,229242
...,...,...
56890,greenodor,1
56889,greenmountaindiaperscom,1
56888,greenmonkeys,1
56887,greenland,1


## Train Model

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, indice_train, indice_test = train_test_split(f, df1['sentiment'], df1.index, 
                                                                              test_size = 0.2, random_state=0)

In [12]:
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression()
sentiment_model = lreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
# Accuracy
sentiment_model.score(X_test, y_test)

0.9294473928817727

### Predict Sentiment

In [14]:
df1.loc[indice_train, 'predict_sentiment'] = sentiment_model.predict_proba(X_train)[:, 1] # Only need to proba for "1"
df1.loc[indice_test, 'predict_sentiment'] = sentiment_model.predict_proba(X_test)[:, 1]

df1.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1,it came early and was not disappointed i love ...,0.808291
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1,Very soft and comfortable and warmer than it l...,0.97845
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1,This is a product well worth the purchase I h...,0.999859
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0.997157
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1,When the Binky Fairy came to our house we didn...,0.999986


In [15]:
# For Baby Trend Diaper Champ’

# Train Set
df1_ = df1[(df1['name'] == 'Baby Trend Diaper Champ') & (df1.index.isin(indice_train))].sort_values(by=['predict_sentiment'], ascending=False)
df1_.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
376,Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper pa...",5,1,This is absolutely by far the best diaper pail...,1.0
414,Baby Trend Diaper Champ,We have been using our Diaper Champ for almost...,5,1,We have been using our Diaper Champ for almost...,1.0
320,Baby Trend Diaper Champ,I originally put this item on my baby registry...,5,1,I originally put this item on my baby registry...,1.0
328,Baby Trend Diaper Champ,Diaper Champ or Diaper Genie? That was my dile...,5,1,Diaper Champ or Diaper Genie That was my dilem...,1.0
604,Baby Trend Diaper Champ,I have been using this diaper pail for 41/2 mo...,5,1,I have been using this diaper pail for 412 mon...,0.999999


In [16]:
# Sort by Rating
df1_r = df1[(df1['name'] == 'Baby Trend Diaper Champ') & (df1.index.isin(indice_train))].sort_values(by=['rating'], ascending=False)
df1_r.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
474,Baby Trend Diaper Champ,I registered for this after reading all the re...,5,1,I registered for this after reading all the re...,0.924605
516,Baby Trend Diaper Champ,Hands down....get this diaper pail... The only...,5,1,Hands downget this diaper pail The only reason...,0.73973
498,Baby Trend Diaper Champ,I bought this after being gievn a diaper genie...,5,1,I bought this after being gievn a diaper genie...,0.997927
499,Baby Trend Diaper Champ,This is a great product and a good value for t...,5,1,This is a great product and a good value for t...,0.999778
501,Baby Trend Diaper Champ,It's so much better then the diaper genie. No ...,5,1,Its so much better then the diaper genie No sp...,0.96056


In [17]:
# Test Set
df1__ = df1[ (df1['name'] == 'Baby Trend Diaper Champ') & (df1.index.isin(indice_test)) ].sort_values(by=['predict_sentiment'], ascending=False)
df1__.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
377,Baby Trend Diaper Champ,"At only 3 weeks old, my son goes through about...",5,1,At only 3 weeks old my son goes through about ...,0.999999
571,Baby Trend Diaper Champ,We did alot of research on diaper pails before...,2,0,We did alot of research on diaper pails before...,0.999999
402,Baby Trend Diaper Champ,I love my Diaper Champ. My son is 3 months old...,4,1,I love my Diaper Champ My son is 3 months old ...,0.999994
518,Baby Trend Diaper Champ,I LOOOVE this diaper pail! Its the easiest to...,5,1,I LOOOVE this diaper pail Its the easiest to ...,0.999887
486,Baby Trend Diaper Champ,"This is my second child. With my first, I went...",5,1,This is my second child With my first I went t...,0.999652


In [18]:
# Sort by Rating
df1__r = df1[(df1['name'] == 'Baby Trend Diaper Champ') & (df1.index.isin(indice_train))].sort_values(by=['rating'], ascending=False)
df1__r.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
474,Baby Trend Diaper Champ,I registered for this after reading all the re...,5,1,I registered for this after reading all the re...,0.924605
516,Baby Trend Diaper Champ,Hands down....get this diaper pail... The only...,5,1,Hands downget this diaper pail The only reason...,0.73973
498,Baby Trend Diaper Champ,I bought this after being gievn a diaper genie...,5,1,I bought this after being gievn a diaper genie...,0.997927
499,Baby Trend Diaper Champ,This is a great product and a good value for t...,5,1,This is a great product and a good value for t...,0.999778
501,Baby Trend Diaper Champ,It's so much better then the diaper genie. No ...,5,1,Its so much better then the diaper genie No sp...,0.96056


# Selected Words Model

In [19]:
# Copy a df2 from df
df2 = products.copy()

# Vectorizer review
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
#cv2 = CountVectorizer(stop_words='english', vocabulary=selected_words)
cv2 = CountVectorizer(vocabulary=selected_words)
f2 = cv2.fit_transform(df2['review_clean'])

## Visualize the frequency of selected words

In [20]:
features_count2 = f2.sum(axis=0).tolist()[0] # .tolist() is a list of list ( e.g.[[]] ), [0] extracts the only list from it
features_name2 = cv2.get_feature_names()

pd.DataFrame(list(zip(features_name2, features_count2)), columns = ['Feature', 'Count']).sort_values(by=['Count'], ascending=False)

Unnamed: 0,Feature,Count
1,great,54398
4,love,41515
6,bad,4044
0,awesome,3735
3,amazing,2539
2,fantastic,1604
7,terrible,1101
10,hate,1090
5,horrible,1085
8,awful,657


## Train Model

In [21]:
X_train2, X_test2, y_train2, y_test2, indice_train2, indice_test2 = train_test_split(f2, df2['sentiment'], df2.index,
                                                                                    test_size=0.2, random_state=0)

In [22]:
lreg = LogisticRegression()
selected_words_model = lreg.fit(X_train2, y_train2)

In [23]:
# Accuracy
selected_words_model.score(X_test2, y_test2)

0.8451320799976013

In [24]:
# Weight (coefficient)
# .coef_ is a list of list in numpy.array -> ( [ [] ] ); [0] extracts the first and only element from the array
pd.DataFrame(list(zip(cv2.get_feature_names(), selected_words_model.coef_[0])), columns=['Feature', 'Weight']).sort_values(by=['Weight'],
                                                                                                                        ascending=False)

Unnamed: 0,Feature,Weight
4,love,1.387334
0,awesome,1.178728
3,amazing,1.033787
2,fantastic,0.899419
1,great,0.865369
9,wow,-0.08073
6,bad,-0.987045
10,hate,-1.433868
8,awful,-2.074177
7,terrible,-2.197423


### Predict Sentiment

In [25]:
df2.loc[indice_train2, 'predict_sentiment'] = selected_words_model.predict_proba(X_train2)[:, 1]
df2.loc[indice_test2, 'predict_sentiment'] = selected_words_model.predict_proba(X_test2)[:, 1]

In [26]:
# Train Set
df2_ = df2[ (df2['name'] == 'Baby Trend Diaper Champ') & (df2.index.isin(indice_train2)) ].sort_values(by=['predict_sentiment'],
                                                                                                      ascending=False)
df2_.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
329,Baby Trend Diaper Champ,I LOVE LOVE LOVE this product! It is SO much e...,4,1,I LOVE LOVE LOVE this product It is SO much ea...,0.998293
446,Baby Trend Diaper Champ,I received my Diaper Champ at my baby shower f...,5,1,I received my Diaper Champ at my baby shower f...,0.995954
459,Baby Trend Diaper Champ,"Love it, love it, love it! This lives up to t...",5,1,Love it love it love it This lives up to the ...,0.995954
367,Baby Trend Diaper Champ,"Let me just say, I LOVE THIS PRODUCT!! I used...",5,1,Let me just say I LOVE THIS PRODUCT I used th...,0.988971
426,Baby Trend Diaper Champ,I love this diaper pale and wouldn't dream of ...,5,1,I love this diaper pale and wouldnt dream of t...,0.988593


In [27]:
# Sort by Rating
df2_r = df2[(df2['name'] == 'Baby Trend Diaper Champ') & (df2.index.isin(indice_train2))].sort_values(by=['rating'], ascending=False)
df2_r.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
474,Baby Trend Diaper Champ,I registered for this after reading all the re...,5,1,I registered for this after reading all the re...,0.793147
516,Baby Trend Diaper Champ,Hands down....get this diaper pail... The only...,5,1,Hands downget this diaper pail The only reason...,0.793147
498,Baby Trend Diaper Champ,I bought this after being gievn a diaper genie...,5,1,I bought this after being gievn a diaper genie...,0.793147
499,Baby Trend Diaper Champ,This is a great product and a good value for t...,5,1,This is a great product and a good value for t...,0.901088
501,Baby Trend Diaper Champ,It's so much better then the diaper genie. No ...,5,1,Its so much better then the diaper genie No sp...,0.588304


In [28]:
# Test Set
df2__ = df2[ (df2['name'] == 'Baby Trend Diaper Champ') & (df2.index.isin(indice_test2)) ].sort_values(by=['predict_sentiment'],
                                                                                                      ascending=False)
df2__.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
322,Baby Trend Diaper Champ,We had 2 diaper Genie's both given to us as a ...,4,1,We had 2 diaper Genies both given to us as a g...,0.983994
456,Baby Trend Diaper Champ,I love this diaper pail. It keeps the diapers ...,4,1,I love this diaper pail It keeps the diapers f...,0.983994
402,Baby Trend Diaper Champ,I love my Diaper Champ. My son is 3 months old...,4,1,I love my Diaper Champ My son is 3 months old ...,0.973318
377,Baby Trend Diaper Champ,"At only 3 weeks old, my son goes through about...",5,1,At only 3 weeks old my son goes through about ...,0.973318
601,Baby Trend Diaper Champ,This is a great for dirty Diapers. I have had ...,5,1,This is a great for dirty Diapers I have had n...,0.973318


In [29]:
# Sort by Rating
df2__r = df2[(df2['name'] == 'Baby Trend Diaper Champ') & (df2.index.isin(indice_test2))].sort_values(by=['rating'], ascending=False)
df2__r.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,predict_sentiment
644,Baby Trend Diaper Champ,I see that there are complaints of stinkiness ...,5,1,I see that there are complaints of stinkiness ...,0.793147
403,Baby Trend Diaper Champ,I agree it's better than the genie. I own the...,5,1,I agree its better than the genie I own the g...,0.588304
493,Baby Trend Diaper Champ,My husband and I registered for the Diaper Cha...,5,1,My husband and I registered for the Diaper Cha...,0.793147
486,Baby Trend Diaper Champ,"This is my second child. With my first, I went...",5,1,This is my second child With my first I went t...,0.938851
481,Baby Trend Diaper Champ,I love this pail. You just drop the diaper in...,5,1,I love this pail You just drop the diaper in ...,0.938851
