## Loading Dataset

In [51]:
import pandas as pd
import numpy as np

In [52]:
# Reading the csv file and displaying the first 5 rows of the dataframe.
products = pd.read_csv(r'Dataset/amazon_baby.csv')
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


## Cleaning the Data

In [53]:
def remove_punctuation(text):
    """
    The function takes in a string, and returns a new string with all of the punctuation removed. 
    
    The function uses Python's built-in string.punctuation variable, which is a string of all the punctuation characters. 
    
    The function str.translate() takes in a translation table, which you can generate using the maketrans() helper function
    in the string library. 
    
    :param text: The text to be processed
    :return: A string with all punctuation removed.
    """
    import string
    return str(text).translate(str.maketrans('', '', string.punctuation)) 

# Applying the remove_punctuation function to the review column of the products dataframe.
products['review_clean'] = products['review'].apply(remove_punctuation)

# Displaying the first 5 rows of the dataframe.
products.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [54]:
# Counting the number of missing values in each column.
products.isna().sum()

name            318
review          829
rating            0
review_clean      0
dtype: int64

In [55]:
# Replacing all the empty name, reviews with an empty string.
products = products.fillna({'name':'', 'review':''})

In [56]:
# Filtering out all the products with rating 3.
products = products[products['rating'] != 3]
# Creating a new column called 'sentiment' and assigning it a value of +1 if the rating is greater than 3 and -1 if the rating is less than 3.
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

## Train Test Split

In [57]:
# The below code is loading the train and test index files into the train_index and test_index variables.Index files are used to follow the same implementation done in assignment as it's not done on pandas.
import json

temp_file = open('Dataset/module-2-assignment-train-idx.json')
train_index = json.load(temp_file)
temp_file.close()
temp_file = open('Dataset/module-2-assignment-test-idx.json')
test_index = json.load(temp_file)
temp_file.close()

In [84]:
# The above code is splitting the data into training and test data.
train_data = products.iloc[train_index]
test_data = products.iloc[test_index]
# Reseting the index
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
# Assigning the sentiment column to the train_target and test_target variables.
train_target = train_data['sentiment']
test_target = test_data['sentiment']

## Encoding the data

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

## Logistic Regression model

In [62]:
from sklearn.linear_model import LogisticRegression

In [171]:
# Fitting a logistic regression model to the training data.
lr_model = LogisticRegression()
lr_model.fit(train_matrix, train_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Minor EDA as per assigment

In [98]:
# Selecting rows 10 to 12 from the test_data dataframe.
sample_data = test_data.iloc[10:13]
sample_data

Unnamed: 0,name,review,rating,review_clean,sentiment
10,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
11,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
12,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [103]:
sample_data.loc[10]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [104]:
sample_data.loc[11]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [106]:
# Transforming the sample data into a matrix and then using the decision function to calculate the scores.
sample_test_matrix = vectorizer.transform(sample_data['review_clean'])
scores = lr_model.decision_function(sample_test_matrix)
scores

array([  5.14554318,  -3.20447669, -11.07916963])

In [108]:
# Applying the sigmoid function to each element in the list `scores`.
[1/(1+np.exp(elem)) for elem in scores]

[0.005791571597551336, 0.9610023956909771, 0.9999845698174172]

## Simpler Model

In [140]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [141]:
# The below code is creating a vectorizer that only uses the 20 significant words for encoding.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [142]:
# Fitting a logistic regression model to the training data.
simple_lr_model = LogisticRegression()
simple_lr_model.fit(train_matrix_word_subset,train_target)

## Questions

> Question 1
>
> How many weights are greater than or equal to 0?

In [69]:
# Counting the number of coefficients that are greater than 0.
(lr_model.coef_>0).sum()

90262

> Question 2
> 
> Of the three data points in sample_test_data, which one has the lowest probability of being classified as a positive review?


In [159]:
# Predicting the probability of the first class (class 1) for the first three rows of the test matrix.
lr_model.predict_proba(test_matrix[10:13])[:,1]

array([9.94208428e-01, 3.89976043e-02, 1.54301826e-05])

> Question 3
> 
> Which of the following products are represented in the 20 most positive reviews?

In [139]:
# Zipping the index of the test data with the probability of the first class (1)
index_prob = list(zip(test_data.index, lr_model.predict_proba(test_matrix)[:,1]))
# Sorting the list of tuples by the second element in the tuple which is probability in descending order.
index_prob.sort(key=lambda x:x[1],reverse=True)
# Getting the first 20 positive predictions index by unziping the tuples and making it to a list
top_20_positive_index = list(list(zip(*index_prob[:20]))[0])
test_data.loc[top_20_positive_index]

Unnamed: 0,name,review,rating,review_clean,sentiment
2570,"Stork Craft Beatrice Combo Tower Chest, White",I bought the tower despite the bad reviews and...,5,I bought the tower despite the bad reviews and...,1
9555,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1
11923,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1
15732,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1
18112,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1
20743,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1
21531,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1
24286,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1
24899,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1
25554,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1



> Question 4
> 
> Which of the following products are represented in the 20 most negative reviews?
>

In [138]:
# Zipping the index of the test data with the probability of the second class (-1)
index_prob = list(zip(test_data.index, lr_model.predict_proba(test_matrix)[:,0]))
# Sorting the list of tuples by the second element in the tuple which is probability in descending order.
index_prob.sort(key=lambda x:x[1], reverse=True)
# Getting the first 20 negative predictions index by unziping the tuples and making it to a list
top_20_negative_index = list(list(zip(*index_prob[:20]))[0])
test_data.loc[top_20_negative_index]

Unnamed: 0,name,review,rating,review_clean,sentiment
17069,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1
2931,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1
21700,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1
28184,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1
9655,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1
30373,Samsung SEW-3037W Wireless Pan Tilt Video Baby...,Reviewers. You failed me!This thing worked for...,1,Reviewers You failed meThis thing worked for 2...,-1
14711,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1
8818,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1
31928,Baby Trend Inertia Infant Car Seat - Horizon,"I really wanted to love this seat; however, I ...",1,I really wanted to love this seat however I wo...,-1
10814,Ellaroo Mei Tai Baby Carrier - Hershey,This is basically an overpriced piece of fabri...,1,This is basically an overpriced piece of fabri...,-1


> Question 5
> 
> What is the accuracy of the sentiment_model on the test_data? Round your answer to 2 decimal places (e.g. 0.76).
>

In [137]:
# Calculating the accuracy of the model.
from sklearn import metrics
metrics.accuracy_score(test_target, lr_model.predict(test_matrix))

0.9320554355651548

> Question 7
> 
> Consider the coefficients of simple_model. There should be 21 of them, an intercept term + one for each word in significant_words.
> 
> How many of the 20 coefficients (corresponding to the 20 significant_words and excluding the intercept term) are positive for the simple_model?
>


In [143]:
# Counting the number of coefficients that are greater than 0.
(simple_lr_model.coef_>0).sum()

10

> Question 8
> 
> Are the positive words in the simple_model also positive words in the sentiment_model?
> 

In [151]:
# Checking if the positive words in the simple_model are the positive words in the sentiment_model.
temp_df = pd.DataFrame(zip(test_data.index,lr_model.predict(test_matrix)), columns=['Indexval','Rank'])
for x in zip(test_data.index,simple_lr_model.predict(test_matrix_word_subset)):
    if x[0]==1:
        if x[1] in temp_df['Indexval'].values.tolist():
            continue
        else:
            print("The positive words in the simple_model are not the positive words in the sentiment_model")
            break
print("The positive words in the simple_model are the positive words in the sentiment_model")

The positive words in the simple_model are the positive words in the sentiment_model


> Question 9
> 
> Which model (sentiment_model or simple_model) has higher accuracy on the TRAINING set?
>

In [157]:
# Calculating the accuracy of the model.
print("Train Sentiment Model :",metrics.accuracy_score(train_target, lr_model.predict(train_matrix)))
print("Train Simpler Model :",metrics.accuracy_score(train_target, simple_lr_model.predict(train_matrix_word_subset)))

Train Sentiment Model : 0.9479222881813276
Train Simpler Model : 0.8668225700065959


> Question 10
> 
> Which model (sentiment_model or simple_model) has higher accuracy on the TEST set?
>

In [158]:
# Calculating the accuracy of the model.
print("Test Sentiment Model :",metrics.accuracy_score(test_target, lr_model.predict(test_matrix)))
print("Test Simpler Model :",metrics.accuracy_score(test_target, simple_lr_model.predict(test_matrix_word_subset)))

Test Sentiment Model : 0.9320554355651548
Test Simpler Model : 0.8693604511639069


> Question 11
> 
> Enter the accuracy of the majority class classifier model on the test_data. Round your answer to two decimal places (e.g. 0.76).
>

In [169]:
#Majority classifier predicts all the data with the major class found in training
majority_value = train_target.value_counts().head(1).index[0]
print("Test Majority Model :",metrics.accuracy_score(test_target, [majority_value for i in range(0,len(test_target))]))

Test Majority Model : 0.8427825773938085


> Question 12
> 
> Is the sentiment_model definitely better than the majority class classifier (the baseline)?
>


In [170]:
print("Test Sentiment Model :",metrics.accuracy_score(test_target, lr_model.predict(test_matrix)))
print("Test Majority Model :",metrics.accuracy_score(test_target, [majority_value for i in range(0,len(test_target))]))

Test Sentiment Model : 0.9320554355651548
Test Majority Model : 0.8427825773938085
