In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
products=pd.read_csv('amazon_baby.csv')
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [3]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    183213 non-null  object
 1   review  182702 non-null  object
 2   rating  183531 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


## Perform text cleaning

In [4]:
products = products.fillna({'review':' '})


In [5]:
def remove_punctuation(text):
    import string
    return text.translate( string.punctuation) 

products['review_clean'] = products['review'].apply(remove_punctuation)

## Extract Sentiments

In [6]:
products=products[products['rating']!=3]

In [7]:

products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)
print(len(products))
products.head()


166752


Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed. i love...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase. I ...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried non-stop when I trie...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"When the Binky Fairy came to our house, we did...",1


In [8]:
train_data=products.sample(frac=0.8,random_state=200) 
test_data=products.drop(train_data.index)
print(len(train_data))
print(len(test_data))

133402
33350


In [33]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [10]:
from sklearn.linear_model import LogisticRegression
sentiment_model=LogisticRegression()
sentiment_model.fit(train_matrix,train_data['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
array=np.sum(np.array(sentiment_model.coef_) >= 0, axis=0)

In [12]:
array.shape

(57326,)

In [13]:
count=0
for i in range (array.shape[0]):
    if array[i]==1:
        count=count+1
print(count)

39975


In [14]:
sample_test_data = test_data[10:13]
sample_test_matrix=test_matrix[10:13]
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
66,My Kindergarten Year - A Keepsake Book,I'm so glad I stumbled upon these gem. My son ...,5,I'm so glad I stumbled upon these gem. My son ...,1
75,Reusable Flannel (15) Baby Wipes 100% Cotton B...,I love these wipes. They are so simple and so...,5,I love these wipes. They are so simple and so...,1
77,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,We bought the pins as my 6 year old Autistic s...,1


In [15]:
sample_test_data.iloc[0,1]

"I'm so glad I stumbled upon these gem. My son has already started answering questions and I think it will be a great memory to look back years later to see how he answered the questions."

In [16]:
sample_test_data.iloc[1,1]

"I love these wipes.  They are so simple and soft and they are a great price for reusable wipes.  They actually get my baby clean and dry during diaper changes.  And I know that I'll find a million other uses for them after our diaper days are done.  I'm ordering more of them right now!"

In [17]:
sample_test_data.iloc[2,1]

"We bought the pins as my 6 year old Autistic son was able to open or break open normal safety pins we needed to use for his night time clothes.  These new pins are significantly more sturdy and he hasn't learned how to open them and only once has he broken one open.  I seem to recall that their shipping price was more than a bit on the high side of reasonable though given the size of the package..."

In [18]:
scores=sentiment_model.predict(sample_test_matrix)
scores

array([1, 1, 1], dtype=int64)

In [19]:
import math
def calculate_proba(scores):
    """ Calculate the probability predictions from the scores.
    """
    proba_preds = []
    for score in scores:
        proba_pred =  1 / (1 + math.exp(-score))
        proba_preds.append(proba_pred)
    return proba_preds

calculate_proba(scores)

[0.7310585786300049, 0.7310585786300049, 0.7310585786300049]

In [20]:
probas = sentiment_model.predict_proba(sample_test_matrix)
probas

array([[6.82651078e-03, 9.93173489e-01],
       [6.14553475e-04, 9.99385447e-01],
       [2.50576329e-03, 9.97494237e-01]])

In [21]:
probabilities=sentiment_model.predict_proba(test_matrix)
test_data['probability']=probabilities[:,1]

In [22]:
print(probabilities[:,1].max())
print(probabilities[:,1].min())

1.0
1.2676921814169826e-20


In [23]:
test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed. i love...,1,0.876889
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"When the Binky Fairy came to our house, we did...",1,0.999968
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999
15,Nature's Lullabies First Year Sticker Calendar,I bought this calender for myself for my secon...,4,I bought this calender for myself for my secon...,1,0.999548
26,Baby's First Journal - Green,"Extremely useful! As a new mom, tired and inex...",5,"Extremely useful! As a new mom, tired and inex...",1,0.981855


In [25]:
test_data1=test_data.copy()
test_data1.sort_values(by=['probability'], inplace=True,ascending=False)
test_data1.head(20)

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
168086,Buttons Cloth Diaper Cover - One Size - 8 Colo...,Buttons vs. Best Bottoms reviewFirst thing I w...,5,Buttons vs. Best Bottoms reviewFirst thing I w...,1,1.0
88659,"ERGObaby Original Baby Carrier, Galaxy Grey",We purchased this carrier after a recommendati...,5,We purchased this carrier after a recommendati...,1,1.0
109197,Britax Boulevard 70 Convertible Car Seat (Prev...,A little background on me: I have three kids (...,5,A little background on me: I have three kids (...,1,1.0
147975,"Baby Jogger City Mini GT Single Stroller, Shad...",Let me start by saying that I have gone throug...,5,Let me start by saying that I have gone throug...,1,1.0
94921,"Baby Jogger Summit XC Single Stroller, Red/Black",EDIT 10/1//2011: Just wanted to add that I ha...,5,EDIT 10/1//2011: Just wanted to add that I ha...,1,1.0
140780,"Diono RadianR100 Convertible Car Seat, Dune",i bought this when the seat was owned by Sunsh...,5,i bought this when the seat was owned by Sunsh...,1,1.0
69511,"Joovy Ergo Caboose Tandem Stroller, Black",I've had this stroller for a little more than ...,5,I've had this stroller for a little more than ...,1,1.0
116083,Joovy Ergo Caboose Tandem Stroller Black,I've had this stroller for a little more than ...,5,I've had this stroller for a little more than ...,1,1.0
109122,Britax Marathon 70 Convertible Car Seat (Previ...,I have been using this version of the Britax M...,5,I have been using this version of the Britax M...,1,1.0
106455,"Quinny Senzz 2011 Fashion Stroller, Star",I am very pleased overall with the Quinny Senz...,4,I am very pleased overall with the Quinny Senz...,1,1.0


In [26]:
test_data1.sort_values(by=['probability'], inplace=True)
test_data1.head(20)

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
147902,Graco Pack 'n Play Playard - Dempsey,My disappointment with this product prompted m...,1,My disappointment with this product prompted m...,-1,1.2676919999999999e-20
175191,"Zooper Twist Escape Stroller, Summer Day",I had to return this stroller for three reason...,1,I had to return this stroller for three reason...,-1,5.759905e-19
27310,Evenflo Expressions Plus High Chair - 3's Company,PLEASE HEED THE OTHER REVIEWERS WARNINGS ON TH...,1,PLEASE HEED THE OTHER REVIEWERS WARNINGS ON TH...,-1,1.34225e-13
68194,Evenflo Crosstown Soft Portable Travel Gate,I'm sure that this product passed all the form...,1,I'm sure that this product passed all the form...,-1,3.060661e-12
111536,ERGO Baby Carrier - Performance Grey,I got this because my 10 month was pushing the...,1,I got this because my 10 month was pushing the...,-1,8.20926e-12
41581,"Newborn Baby Pea in The Pod Halloween Costume,...","Looks really cute, however, the cloth smells f...",1,"Looks really cute, however, the cloth smells f...",-1,1.014314e-11
178360,iPad Travel Case,I have rated and left reviews for many items o...,1,I have rated and left reviews for many items o...,-1,1.649664e-11
75995,"Peg-Perego Tatamia High Chair, White Latte",Edited to Add 6/4/2010: Just wanted to add th...,1,Edited to Add 6/4/2010: Just wanted to add th...,-1,4.988625e-11
59062,"Peg Perego Primo Viaggio Infant Car Seat, Kiwi",WARNING to all owners newer model Peg Perego S...,1,WARNING to all owners newer model Peg Perego S...,-1,2.577857e-10
127472,"Argington Organic Bam Bam Crib Complete, Ebony",Please do not buy this crib. I so wanted to lo...,1,Please do not buy this crib. I so wanted to lo...,-1,5.730416e-10


In [39]:
print(test_data['sentiment'].head())
print(sentiment_model.predict(test_matrix))

1     1
5     1
10    1
15    1
26    1
Name: sentiment, dtype: int64
[1 1 1 ... 1 1 1]


In [57]:
pred=sentiment_model.predict(test_matrix)
pred.shape

(33350,)

In [55]:
actual=test_data['sentiment'].values
actual.shape

(33350,)

In [62]:
num_correct = sum(pred == actual)
accuracy = num_correct/test_matrix.shape[0]


In [63]:
accuracy

0.9295652173913044