![image.png](attachment:image.png)

In [14]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import json
import string

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc

![image-2.png](attachment:image-2.png)

In [15]:
products = pd.read_csv('../data_sets/amazon_baby_subset.csv')
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [16]:
products.isnull().sum()

name          90
review       241
rating         0
sentiment      0
dtype: int64

![image.png](attachment:image.png)

In [17]:
# Name of first 10 products in the data-set
products['name'].head(10)

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [18]:
# Try counting the number of +ve and -ve reviews
positive_count = 0
negative_count = 0
for i in products['sentiment']:
    if i==1:  positive_count+=1
    else:     negative_count+=1
print('Count of Positive reviews : ',positive_count)
print('Count of Negative reviews : ',negative_count)

Count of Positive reviews :  26579
Count of Negative reviews :  26493


In [19]:
print('Count of Positive reviews : ',len(products[products['sentiment']==1]))
print('Count of Negative reviews : ',len(products[products['sentiment']==-1]))

Count of Positive reviews :  26579
Count of Negative reviews :  26493


![image.png](attachment:image.png)

In [20]:
with open('../data_sets/important_words.json','r') as f:
    important_words = json.load(f)

print(important_words)

['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy', 'little', 'seat', 'old', 'well', 'get', 'also', 'really', 'son', 'time', 'bought', 'product', 'good', 'daughter', 'much', 'loves', 'stroller', 'put', 'months', 'car', 'still', 'back', 'used', 'recommend', 'first', 'even', 'perfect', 'nice', 'bag', 'two', 'using', 'got', 'fit', 'around', 'diaper', 'enough', 'month', 'price', 'go', 'could', 'soft', 'since', 'buy', 'room', 'works', 'made', 'child', 'keep', 'size', 'small', 'need', 'year', 'big', 'make', 'take', 'easily', 'think', 'crib', 'clean', 'way', 'quality', 'thing', 'better', 'without', 'set', 'new', 'every', 'cute', 'best', 'bottles', 'work', 'purchased', 'right', 'lot', 'side', 'happy', 'comfortable', 'toy', 'able', 'kids', 'bit', 'night', 'long', 'fits', 'see', 'us', 'another', 'play', 'day', 'money', 'monitor', 'tried', 'thought', 'never', 'item', 'hard', 'plastic', 'however', 'disappointed', 'reviews', 'something', 'going', 'pump', 'bottle', 'cup', 'waste', 'retu

In [21]:
print("Number of important words : ",len(important_words))

Number of important words :  193


![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [22]:
print('Before')
print(products.isnull().sum())

# Replacing the NaN values with None --> For removing the punctuations
products = products.fillna({'review':''})

print('\nAfter')
print(products.isnull().sum())

Before
name          90
review       241
rating         0
sentiment      0
dtype: int64

After
name         90
review        0
rating        0
sentiment     0
dtype: int64


In [28]:
def remove_punctuation(text):
    ## if the above punctuations are present it will be removed and replaced with NaN values
    mapping = str.maketrans(' ' , ' ' , string.punctuation)
    return text.translate(mapping)

In [30]:
products['review_clean'] = products['review'].apply(remove_punctuation)
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


![image.png](attachment:image.png)

In [35]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [40]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [39]:
print(important_words)

['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy', 'little', 'seat', 'old', 'well', 'get', 'also', 'really', 'son', 'time', 'bought', 'product', 'good', 'daughter', 'much', 'loves', 'stroller', 'put', 'months', 'car', 'still', 'back', 'used', 'recommend', 'first', 'even', 'perfect', 'nice', 'bag', 'two', 'using', 'got', 'fit', 'around', 'diaper', 'enough', 'month', 'price', 'go', 'could', 'soft', 'since', 'buy', 'room', 'works', 'made', 'child', 'keep', 'size', 'small', 'need', 'year', 'big', 'make', 'take', 'easily', 'think', 'crib', 'clean', 'way', 'quality', 'thing', 'better', 'without', 'set', 'new', 'every', 'cute', 'best', 'bottles', 'work', 'purchased', 'right', 'lot', 'side', 'happy', 'comfortable', 'toy', 'able', 'kids', 'bit', 'night', 'long', 'fits', 'see', 'us', 'another', 'play', 'day', 'money', 'monitor', 'tried', 'thought', 'never', 'item', 'hard', 'plastic', 'however', 'disappointed', 'reviews', 'something', 'going', 'pump', 'bottle', 'cup', 'waste', 'retu

Now we proceed with *Step 2*. For each word in **important_words**, we compute a count for the number of times the word occurs in the review. We will store this count in a separate column (one for each word). The result of this feature processing is a single column for each word in **important_words** which keeps a count of the number of times the respective word occurs in the review text.


**Note:** There are several ways of doing this. In this assignment, we use the built-in *count* function for Python lists. Each review string is first split into individual words and the number of occurances of a given word is counted.

![image.png](attachment:image.png)

In [41]:
print(products['perfect'])

0        0
1        0
2        0
3        1
4        0
        ..
53067    0
53068    0
53069    0
53070    0
53071    0
Name: perfect, Length: 53072, dtype: int64


In [None]:
## From the above it is clear that
## 1st review --> No perfect
## 2nd review --> No perfect
## 3rd review --> No perfect
## 4th review --> 1 perfect word

## 53071th review --> No perfect

In [64]:
print("Number of reviews with")
print("No perfect word : ", len(products[products['perfect'] == 0]))
print("1 perfect word  : ",len(products[products['perfect'] == 1]))
print("2 perfect words : ",len(products[products['perfect'] == 2]))
print("3 perfect words : ",len(products[products['perfect'] == 3]))
print("4 perfect words : ",len(products[products['perfect'] == 4]))
print("5 perfect words : ",len(products[products['perfect'] == 5]))
print("6 perfect words : ",len(products[products['perfect'] == 6]))

Number of reviews with
No perfect word :  50117
1 perfect word  :  2731
2 perfect words :  202
3 perfect words :  16
4 perfect words :  6
5 perfect words :  0
6 perfect words :  0


In [63]:
## Each review will have exactly 4 'perfect' words
for i in products[products['perfect'] == 4]['review']:
    print("\n\n",i)



 I don't think there is a "perfect" double stroller out there for $300.  There are single strollers that cost way more than this one and those are not perfect either.  So I'm giving it 4 stars because it can be improved by the maker.  The child trays are small.  The sun shades are small.  But I have one of those single canopy sunshade and it works great with this stroller since it's not too wide.  The stickers for the word "Aria" were coming off but I put it back and it seemed to stay.  Other than that, the size is perfect.  My 2 year old and 6 months old daughters are very happy riding in this stroller.  Some peoople said the weight differences could throw it off balance and might cause it hard to turn or move around. But I don't find that is a problem.  It's light and very easy to move around.  Both trays does swing open individually from the middle out.  It has separate seat reclining which is perfect.  The closing and opening are very easy to use. I have a Prius and this stroller

![image.png](attachment:image.png).

In [72]:
products['contains_perfect'] = products['perfect'].apply(lambda s : 1 if s>=1 else 0)
products['contains_perfect'].head()

0    0
1    0
2    0
3    1
4    0
Name: contains_perfect, dtype: int64

In [73]:
sum(products['contains_perfect'])

2955

![image.png](attachment:image.png)

In [75]:
print('The number of reviews containing the word "perfect" ',
      sum(products['contains_perfect']))

The number of reviews containing the word "perfect"  2955


![image.png](attachment:image.png)

![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [96]:
def get_numpy_data(data_sframe,features,label):
    products['constant']=1
    features = ['constant']+features
    feature_frame = data_sframe[features]
    feature_matrix = feature_frame.as_matrix()
    label_array    = data_sframe[labels]
    label_matrix   = label_array.as_matrix()
    return (feature_matrix,label_array)

In [98]:
feature_matrix,sentiment = get_numpy_data(products,important_words,'sentiment')

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['baby', 'one', 'great', 'love', 'use',\n       ...\n       'babies', 'won', 'tub', 'almost', 'either'],\n      dtype='object', length=193). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"