# Bag of Words Model on Amazon Dataset

Load packages and datasets

In [10]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from Word2VecUtility3 import Word2VecUtility3
import pandas as pd
import numpy as np
from sklearn import svm

In [2]:
data = pd.read_csv('AFF_Evenly_Sampled.csv', sep=',', index_col=False, encoding='utf-8')

In [3]:
print('A quick look at the reviews:')
print(data.head())
data['Score'].value_counts() 

A quick look at the reviews:
   Score                                               Text
0      1  I like this brand. I didn't realize I was orde...
1      1  Being my wife is a licensed cosmetologist and ...
2      1  If you are looking for an upgrade from the sta...
3      1  I am so allergic to too many artificial sweete...
4      1  I have not been able to find this locally and ...


1    82000
0    82000
Name: Score, dtype: int64

In [4]:
#split dataset into train/test sets
train_data = data.sample(frac=0.7,random_state=200)
test_data = data.drop(train_data.index)

train_data.to_csv('train_Even82k.csv', index=False, sep=',', encoding='utf-8')
test_data.to_csv('test_Even82k.csv', index=False, sep=',', encoding='utf-8')

In [5]:
#load train/test sets
train = pd.read_csv('train_Even82k.csv', index_col=False)
test = pd.read_csv('test_Even82k.csv', index_col=False)

# print ("The number of training samples are: %r") % (len(train))
# print ("The number of testing samples are: %r \n") % (len(test))

#make sure the train/test tests are formatted correctly.
print(train.iloc[:2])
print(test.iloc[:2])

   Score                                               Text
0      1  I had been frustrated trying to figure out how...
1      0  I tried this product in hopes that it would in...
   Score                                               Text
0      1  I like this brand. I didn't realize I was orde...
1      1  If you are looking for an upgrade from the sta...


# Text Processing & Bag of Words Model

In [6]:
# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list. Word2VecUtility is a text processing function imported from another file. 

print("Cleaning and parsing the Amazon reviews...\n")
for i in range( 0, len(train["Text"])):
    clean_train_reviews.append(" ".join(Word2VecUtility3.review_to_wordlist(train["Text"][i], True)))

clean_test_reviews = []

print("Cleaning and parsing the test set reviews...\n")
for i in range(0,len(test["Text"])):
    clean_test_reviews.append(" ".join(Word2VecUtility3.review_to_wordlist(test["Text"][i], True)))


Cleaning and parsing the Amazon reviews...



Create Bag of Words

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None,max_features=5000)

train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Train SVM

In [11]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier

Parameter Tuning with GridsearchCV

In [29]:
parameters = {
    'loss': ('log', 'hinge'),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001]
}

print()
print("GRID SEARCH:")
grid_search = GridSearchCV(SGDClassifier(), parameters, cv=3)
grid_search.fit(train_data_features, train['Score'])


GRID SEARCH:




GridSearchCV(cv=3, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2', 'elasticnet'], 'alpha': [0.001, 0.0001, 1e-05, 1e-06], 'loss': ('log', 'hinge')},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [31]:
# View the accuracy score
print('Best score for data:', grid_search.best_score_)
# View the best parameters for the model found using grid search
print('Best loss:', grid_search.best_estimator_.loss)
print('Best penalty:', grid_search.best_estimator_.penalty)
print('Best alpha:', grid_search.best_estimator_.alpha)

Best score for data: 0.8903135888501742
Best loss: hinge
Best penalty: elasticnet
Best alpha: 1e-05


In [32]:
SVM = SGDClassifier(loss='hinge', penalty='elasticnet', alpha=1e-05, max_iter=5, random_state=42)
SVM = SVM.fit( train_data_features, train["Score"] )

# Testing Stage

In [33]:
# Apply the classifier trained using training data to test set, and view the accuracy score
SVM.score(test_data_features, test['Score'])

Cleaning and parsing the test set reviews...

Predicting test labels...



0.89361788617886184

# Naive Bayes 

In [35]:
vectorizer.vocabulary_.get(u'algorithm')

In [34]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_data_features, train["Score"])

In [35]:
clf.score(test_data_features, test['Score'])

0.86402439024390243

# Try SVM BoW with Radomly Distributed Dataset

In [12]:
df = pd.read_csv('AFF_Randomly_Distributed_164k.csv', sep=',', index_col=False)

In [13]:
#split dataset into train/test sets
#changed names so that we don't contaminate data 
train_data = df.sample(frac=0.7,random_state=200)
test_data = df.drop(train_data.index)

train_data.to_csv('train_Random.csv', index=False, sep=',', encoding='utf-8')
test_data.to_csv('test_Random.csv', index=False, sep=',', encoding='utf-8')

In [14]:
#load train/test sets
train1 = pd.read_csv('train_Random.csv', index_col=False)
test1 = pd.read_csv('test_Random.csv', index_col=False)

print(("The number of training samples are: %r") % (len(train1)))
print(("The number of testing samples are: %r \n") % (len(test1)))

#make sure the train/test tests are formatted correctly.
print(train1.iloc[:2])
print(test1.iloc[:2])
#print(train1['Text'][0])

The number of training samples are: 114800
The number of testing samples are: 49200 

   Score                                               Text
0      0  I was a bit disappointed with these figs.  The...
1      1  I think the first thing that got me looking at...
   Score                                               Text
0      1  White Gold is the BEST honey ever!  It's smoot...
1      0  The product itself is fine, but it is a tiny t...


In [15]:
# Initialize an empty list to hold the clean reviews
clean_train_reviews1 = []

print("Cleaning and parsing the Amazon reviews...\n")
for i in range( 0, len(train1["Text"])):
    clean_train_reviews1.append(" ".join(Word2VecUtility3.review_to_wordlist(train1["Text"][i], True)))

clean_test_reviews1 = []

print("Cleaning and parsing the test set reviews...\n")
for i in range(0,len(test1["Text"])):
    clean_test_reviews1.append(" ".join(Word2VecUtility3.review_to_wordlist(test1["Text"][i], True)))

Cleaning and parsing the Amazon reviews...

Cleaning and parsing the test set reviews...



In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None,max_features=5000)

train_data_features1 = vectorizer.fit_transform(clean_train_reviews1)
train_data_features1 = train_data_features1.toarray()

test_data_features1 = vectorizer.transform(clean_test_reviews1)
test_data_features1 = test_data_features1.toarray()

In [18]:
parameters = {
    'loss': ('log', 'hinge'),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001]
}

print()
print("GRID SEARCH:")
grid_search1 = GridSearchCV(SGDClassifier(), parameters, cv=3)
grid_search1.fit(train_data_features1, train1['Score'])


GRID SEARCH:




GridSearchCV(cv=3, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ('log', 'hinge'), 'penalty': ['l1', 'l2', 'elasticnet'], 'alpha': [0.001, 0.0001, 1e-05, 1e-06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [19]:
# View the accuracy score
print('Best score for data:', grid_search1.best_score_)
# View the best parameters for the model found using grid search
#print('Best loss:', grid_search1.best_estimator_.loss)
print('Best penalty:', grid_search1.best_estimator_.penalty)
print('Best alpha:', grid_search1.best_estimator_.alpha)

Best score for data: 0.922456445993
Best penalty: l2
Best alpha: 1e-05


In [20]:
SVM1 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, max_iter=5, random_state=42)
SVM1 = SVM1.fit( train_data_features1, train1["Score"] )

In [21]:
# Apply the classifier trained using training data to test set, and view the accuracy score
SVM1.score(test_data_features1, test1['Score'])

0.92863821138211378

In [56]:
SVM2 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)
SVM2 = SVM2.fit( train_data_features1, train1["Score"] )

In [57]:
SVM2.score(test_data_features1, test1['Score'])

0.84626016260162606

# Naive Bayes 

In [48]:
clf = MultinomialNB().fit(train_data_features1, train1["Score"])

In [50]:
clf.score(test_data_features1, test1['Score'])

0.84626016260162606