# Use Case 2: Amazon Products reviews Sentiment Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup  
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# importing the data set into pandas data frame and viewing the first five rows
reviews = pd.read_csv('1429_1.csv')
reviews.head()

Unnamed: 0,id,name,asins,brand,categories,keys,manufacturer,reviews.date,reviews.dateAdded,reviews.dateSeen,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username
0,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,This product so far has not disappointed. My c...,Kindle,,,Adapter
1,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,great for beginner or experienced person. Boug...,very fast,,,truman
2,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,,,DaveZ
3,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,4.0,http://reviews.bestbuy.com/3545/5620406/review...,I've had my Fire HD 8 two weeks now and I love...,Good!!!,,,Shacks
4,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-12T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,,,explore42


## Exploration

> As We will be using only the review text and the review rating so I'll drop all the other columns

In [3]:
# choosing the columns of interest and changing their name
reviews = reviews[['reviews.text', 'reviews.rating']]
reviews.rename(columns={'reviews.text': 'text', 
                       'reviews.rating': 'rating'}, inplace=True)
reviews.head()

Unnamed: 0,text,rating
0,This product so far has not disappointed. My c...,5.0
1,great for beginner or experienced person. Boug...,5.0
2,Inexpensive tablet for him to use and learn on...,5.0
3,I've had my Fire HD 8 two weeks now and I love...,4.0
4,I bought this for my grand daughter when she c...,5.0


In [4]:
# viewing the general information about the variables
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 2 columns):
text      34659 non-null object
rating    34627 non-null float64
dtypes: float64(1), object(1)
memory usage: 541.7+ KB


In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 2 columns):
text      34659 non-null object
rating    34627 non-null float64
dtypes: float64(1), object(1)
memory usage: 541.7+ KB


In [6]:
# checking for missing values
reviews.isna().sum()

text       1
rating    33
dtype: int64

> The number of the missing values is too small incomparison with the size of the data set , So I will choose to drop the missing values

In [7]:
#droping the missing values
reviews.dropna(axis=0, inplace=True)

In [8]:
#checking the success of removing the missing values
#this should output zero
reviews.isna().sum().sum()

0

In [9]:
# convert the rating column to `int` dtype
reviews.rating = reviews.rating.astype(int)

In [10]:
#checking for duplicates
reviews.duplicated().sum()

0

In [11]:
# viewing summary statistics
reviews.describe()

Unnamed: 0,rating
count,34626.0
mean,4.584561
std,0.73566
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [12]:
# checking the distribution of the rating column
reviews.rating.value_counts()

5    23774
4     8541
3     1499
1      410
2      402
Name: rating, dtype: int64

> It seems that the data set is skewed with much fewer negative reviews than positive reviews. This suggets that `stratified sampling` can be the right approach in this case

## Preprocessing

### Spliting the data set into trian and test sets using stratified sampling and using upsampling to balance the dataset

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit
spliter = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
for train_index, test_index in spliter.split(reviews, reviews.rating): 
    strat_train = reviews.reindex(train_index)
    strat_test = reviews.reindex(test_index)

In [14]:
print('train shape is {} and test shape is {}'.format(strat_train.shape, strat_test.shape))

train shape is (27700, 2) and test shape is (6926, 2)


In [15]:
print('Train porportions \n')
display(strat_train["rating"].value_counts()/len(strat_train))
print('Test_proportions \n--------------------------------------------------------------------------------')
display(strat_test["rating"].value_counts()/len(strat_test))

Train porportions 



5.0    0.686570
4.0    0.247004
3.0    0.042383
1.0    0.011986
2.0    0.011119
Name: rating, dtype: float64

Test_proportions 
--------------------------------------------------------------------------------


5.0    0.684089
4.0    0.244874
3.0    0.046636
2.0    0.013428
1.0    0.009818
Name: rating, dtype: float64

### Using Upsampling to try to balance the training set

In [16]:
from sklearn.utils import resample
train_1 = strat_train.query('rating == 1')
train_2 = strat_train.query('rating == 2')
train_3 = strat_train.query('rating == 3')
train_4 = strat_train.query('rating == 4')
train_5 = strat_train.query('rating == 5')
train_1_up = resample(train_1, replace=True, n_samples=6000, random_state=123)
train_2_up = resample(train_2, replace=True, n_samples=6000, random_state=123)
train_3_up = resample(train_3, replace=True, n_samples=7000, random_state=123)
strat_train_up = pd.concat([train_1_up, train_2_up, train_3_up, train_4, train_5])

Segregate ratings from 1-5 into positive, neutral, and negative.

In [17]:
def to_sentiment(rating):
    if (rating == 5) or (rating == 4):
        return "Positive"
    elif rating == 3:
        return "Neutral"
    elif (rating == 2) or (rating == 1):
        return "Negative"
strat_train_up["sentiment"] = strat_train_up["rating"].apply(to_sentiment)
strat_test["sentiment"] = strat_test["rating"].apply(to_sentiment)
strat_train.sample(5)

Unnamed: 0,text,rating
32654,Easy to find different apps on the Amazon fire...,4.0
25440,My review is not a review of the device as muc...,5.0
25087,"I just love this toy, with time she will get b...",5.0
10680,Checking e-mail is a breeze. All the apps you ...,5.0
20096,It's a surprise to getting this kindle in this...,5.0


In [18]:
X_train = strat_train_up["text"]
y_train = strat_train_up["sentiment"]
X_test = strat_test["text"]
y_test = strat_test["sentiment"]

The following text preprocessing are implemented to convert raw reviews to cleaned review, so that it will be easier for us to do feature extraction in the next step.
* remove html tags using BeautifulSoup
* remove non-character such as digits and symbols
* convert to lower case
* remove stop words such as "the" and "and" if needed
* convert to root words by stemming if needed

In [19]:
def clean_text(raw_text, remove_stopwords=False, stemming=False, split_text=False):
    '''
    Convert a raw review to a cleaned review
    '''
    text = BeautifulSoup(raw_text, 'lxml').get_text()  #remove html
    letters_only = re.sub("[^a-zA-Z]", " ", text)  # remove non-character
    words = letters_only.lower().split() # convert to lower case 
    
    if remove_stopwords: # remove stopword
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    if stemming==True: # stemming
        stemmer = SnowballStemmer('english') 
        words = [stemmer.stem(w) for w in words]
        
    if split_text==True:  # split text
        return (words)
    
    return( " ".join(words))

In [20]:
X_train = X_train.dropna()
X_test = X_test.dropna()
y_train = y_train.dropna()
y_test = y_test.dropna()

In [21]:
# preparing text data in training and test sets
X_train_clean = []
X_test_clean = []

for row in X_train:
    X_train_clean.append(clean_text(row))    
for d in X_test:
    X_test_clean.append(clean_text(row))
print('Show a cleaned review in the training set : \n',  X_train_clean[10])

Show a cleaned review in the training set : 
 can not print from this device using the touch screen is very difficult


## Feature Extraction and Modeling

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline

### CountVectorizer with Mulinomial Naive Bayes

In [23]:
# Fit and transform the training data to a document-term matrix using CountVectorizer
countVect = CountVectorizer() 
X_train_countVect = countVect.fit_transform(X_train_clean)
print("Number of features : %d \n" %len(countVect.get_feature_names())) #6378 
print("Show some feature names : \n", countVect.get_feature_names()[::1000])


# Train MultinomialNB classifier
mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)

Number of features : 12096 

Show some feature names : 
 ['aa', 'bec', 'combines', 'display', 'fingerprint', 'hills', 'lastlonger', 'neiece', 'powerwhite', 'rights', 'sprinkler', 'transportations', 'xbox']


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
def modelEvaluation(predictions):
    '''
    Print model evaluation to predicted result 
    '''
    print ("\nAccuracy on test set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [25]:
# Evaluate the model on validaton set
predictions = mnb.predict(countVect.transform(X_test_clean))
modelEvaluation(predictions)


Accuracy on test set: 0.9300

Classification report : 
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       161
     Neutral       0.00      0.00      0.00       323
    Positive       0.93      1.00      0.96      6434

    accuracy                           0.93      6918
   macro avg       0.31      0.33      0.32      6918
weighted avg       0.86      0.93      0.90      6918


Confusion Matrix : 
 [[   0    0  161]
 [   0    0  323]
 [   0    0 6434]]


### TfidfVectorizer with Logistic Regression

In [26]:
# Fit and transform the training data to a document-term matrix using TfidfVectorizer 
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)
print("Number of features : %d \n" %len(tfidf.get_feature_names())) #1722
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

Number of features : 5504 

Show some feature names : 
 ['00', 'coffee', 'fort', 'me', 'repairs', 'towards']


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
# Evaluate on the validaton set
predictions = lr.predict(tfidf.transform(X_test_clean))
modelEvaluation(predictions)


Accuracy on test set: 0.9300

Classification report : 
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       161
     Neutral       0.00      0.00      0.00       323
    Positive       0.93      1.00      0.96      6434

    accuracy                           0.93      6918
   macro avg       0.31      0.33      0.32      6918
weighted avg       0.86      0.93      0.90      6918


Confusion Matrix : 
 [[   0    0  161]
 [   0    0  323]
 [   0    0 6434]]
