In [1]:
import pandas as pd
data = pd.read_csv('amazon_baby.csv')

In [2]:
data.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# remove punctuations

In [3]:
def remove_punctuation(text):
    import string
    text = text.str.replace('[^\w\s]', '')
    return text

In [4]:
data = data.fillna({'review':''})
data['review_clean'] = remove_punctuation(data['review'])

In [5]:
data.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


# create sentiments

In [6]:
data = data[data['rating'] != 3]

In [7]:
data['sentiment'] = data['rating'].apply(lambda rating: +1 if rating > 3 else -1)

In [8]:
data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [9]:
import json
with open('module-2-assignment-train-idx.json', 'r') as f:
    train_idx = json.loads(f.read())
with open('module-2-assignment-test-idx.json', 'r') as f:
    test_idx = json.loads(f.read())

In [10]:
train_data = data.iloc[train_idx]
test_data = data.iloc[test_idx]
print(len(train_data) , len(test_data), len(data))

133416 33336 166752


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print(train_matrix.shape, test_matrix.shape)

(133416, 121741) (33336, 121741)


# logitstic regression to classify sentiment

In [12]:
from sklearn.linear_model import LogisticRegression

In [None]:
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])

In [15]:
print('# of coefs > 0: ',sentiment_model.coef_[sentiment_model.coef_ >= 0].shape[0])

# of coefs > 0:  90357


In [16]:
sample_test_data = test_data[10:13]
print(sample_test_data)
print(sample_test_data.iloc[0]['review'])
print(sample_test_data.iloc[1]['review'])

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  
Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.
Would not purchase again or recommend. The decals were t

In [17]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[ 4.67796559 -2.90300633 -9.63054837]


# predict

In [18]:
def predict_from_score(scores):
    y = []
    for score in scores:
        pred = 1 if score > 0 else -1
        y.append(pred)
    return y

In [19]:
pred2 = predict_from_score(scores)
pred1 = sentiment_model.predict(sample_test_matrix)
print(pred1, pred2)

[ 1 -1 -1] [1, -1, -1]


In [20]:
import numpy as np
def predict_probability(scores):
    y = []
    return 1/(1+np.exp(-scores))

In [21]:
predict_probability(scores)

array([9.90787744e-01, 5.20051492e-02, 6.56867022e-05])

In [22]:
scores = sentiment_model.decision_function(test_matrix)
idx = sorted(range(len(scores)), key= lambda i:scores[i], reverse=False)
for i in idx[0:20]:
    print(scores[i], test_data.iloc[i]['name'])

-30.82382766428859 The First Years True Choice P400 Premium Digital Monitor, 2 Parent Unit
-30.219675938614003 Fisher-Price Ocean Wonders Aquarium Bouncer
-26.368313275166827 VTech Communications Safe &amp; Sounds Full Color Video and Audio Monitor
-25.848844308591676 Levana Safe N'See Digital Video Baby Monitor with Talk-to-Baby Intercom and Lullaby Control (LV-TW501)
-21.498950817011824 Adiri BPA Free Natural Nurser Ultimate Bottle Stage 1 White, Slow Flow (0-3 months)
-21.476286487788897 Safety 1st High-Def Digital Monitor
-21.203876181473714 One Step Ahead Hide-Away Extra Long Bed Rail
-20.52187158240507 Peg-Perego Tatamia High Chair, White Latte
-20.290819741023718 Baby Trend Inertia Infant Car Seat - Horizon
-20.009888988956764 Samsung SEW-3037W Wireless Pan Tilt Video Baby Monitor Infrared Night Vision and Zoom, 3.5 inch
-19.735471624318922 Cloth Diaper Sprayer--styles may vary
-19.499210630961176 Regalo My Cot Portable Bed, Royal Blue
-19.452995780851516 Snuza Portable Baby Mov

# calculate accuracy

In [23]:
pred = sentiment_model.predict(test_matrix)
print('accuracy of sentiment model on test data ',1-np.sum(abs(pred - test_data['sentiment']))/2/test_data['sentiment'].shape)
num = 0
for p,t in zip(pred,test_data['sentiment']):
    if p == t:
        num += 1
print(num/pred.shape[0])

accuracy of sentiment model on test data  [0.93202544]
0.9320254379649628


# train another simpler model

In [24]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [25]:
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression()

In [26]:
simple_model_coef_table = pd.DataFrame({'word': significant_words, 'coefficient':simple_model.coef_.flatten()})

In [27]:
simple_model_coef_table.sort_values('coefficient')[::-1]

Unnamed: 0,word,coefficient
6,loves,1.673269
5,perfect,1.510263
0,love,1.363697
2,easy,1.192219
1,great,0.94395
4,little,0.520174
7,well,0.50376
8,able,0.190937
3,old,0.085424
9,car,0.058813


In [28]:
verify_set = vectorizer.transform(significant_words)
res = sentiment_model.predict(verify_set)
for word, result in zip(significant_words, res):
    print(word, simple_model_coef_table[simple_model_coef_table['word']==word]['coefficient'] , result)

love 0    1.363697
Name: coefficient, dtype: float64 1
great 1    0.94395
Name: coefficient, dtype: float64 1
easy 2    1.192219
Name: coefficient, dtype: float64 1
old 3    0.085424
Name: coefficient, dtype: float64 1
little 4    0.520174
Name: coefficient, dtype: float64 1
perfect 5    1.510263
Name: coefficient, dtype: float64 1
loves 6    1.673269
Name: coefficient, dtype: float64 1
well 7    0.50376
Name: coefficient, dtype: float64 1
able 8    0.190937
Name: coefficient, dtype: float64 1
car 9    0.058813
Name: coefficient, dtype: float64 1
broke 10   -1.652144
Name: coefficient, dtype: float64 1
less 11   -0.209348
Name: coefficient, dtype: float64 1
even 12   -0.511456
Name: coefficient, dtype: float64 1
waste 13   -2.034489
Name: coefficient, dtype: float64 -1
disappointed 14   -2.348478
Name: coefficient, dtype: float64 -1
work 15   -0.621307
Name: coefficient, dtype: float64 1
product 16   -0.320491
Name: coefficient, dtype: float64 1
money 17   -0.898062
Name: coefficient, 

# comparing two models

In [29]:
def compute_accuracy(model, train_matrix, output):
    pred = model.predict(train_matrix)
    num = 0
    for p,t in zip(pred,output):
        if p == t:
            num += 1
    return num/pred.shape[0]

In [30]:
pred_train_senti = sentiment_model.predict(train_matrix)
print('accuracy of sentiment model on training data ', 1-np.sum(abs(pred_train_senti - train_data['sentiment'])/2)/pred_train_senti.shape[0], 
      compute_accuracy(sentiment_model, train_matrix, train_data['sentiment']))
pred_train_simple = simple_model.predict(train_matrix_word_subset)
print('accuracy of simpler model on training data ', 1- np.sum(abs(pred_train_simple - train_data['sentiment'])/2)/pred_train_simple.shape[0] , 
      compute_accuracy(simple_model, train_matrix_word_subset, train_data['sentiment']))

accuracy of sentiment model on training data  0.9473826227738802 0.9473826227738802
accuracy of simpler model on training data  0.8668225700065959 0.8668225700065959


In [31]:
print('accuracy of sentiment model on test data ', compute_accuracy(sentiment_model, test_matrix, test_data['sentiment']))
print('accuracy of simpler model on test data', compute_accuracy(simple_model, test_matrix_word_subset, test_data['sentiment']))

accuracy of sentiment model on test data  0.9320254379649628
accuracy of simpler model on test data 0.8693604511639069


In [33]:
from sklearn.dummy import DummyClassifier
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_model.fit(train_matrix, train_data['sentiment'])
print('accuracy of majority classifier on training data ', compute_accuracy(dummy_model, train_matrix, train_data['sentiment']))
print('accuracy of majority classifier on test data ',compute_accuracy(dummy_model, test_matrix, test_data['sentiment']))

accuracy of majority classifier on training data  0.8407087605684476
accuracy of majority classifier on test data  0.8427825773938085
