# Bag of Words Model on Amazon Dataset

Load packages and datasets

In [1]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from Word2VecUtility3 import Word2VecUtility3
import pandas as pd
import numpy as np
from sklearn import svm

In [2]:
data = pd.read_csv('Yelp_Evenly_Sampled164k.csv', sep=',', index_col=False, encoding='utf-8')

In [3]:
print('A quick look at the reviews:')
print(data.head())
data['stars'].value_counts() 

A quick look at the reviews:
   stars                                               text
0      1  I just got my reading glasses back and have no...
1      1  Absolutely the "Best of Phoenix". Caring compa...
2      1  I've been coming here for about 4 years now. I...
3      1  The food was hot and tasty. The garlic knots a...
4      1  This place is delicious!! I love the salmon an...


1    82000
0    82000
Name: stars, dtype: int64

In [4]:
#split dataset into train/test sets
train_data = data.sample(frac=0.7,random_state=200)
test_data = data.drop(train_data.index)

train_data.to_csv('train_Even82k.csv', index=False, sep=',', encoding='utf-8')
test_data.to_csv('test_Even82k.csv', index=False, sep=',', encoding='utf-8')

In [5]:
#load train/test sets
train = pd.read_csv('train_Even82k.csv', index_col=False)
test = pd.read_csv('test_Even82k.csv', index_col=False)

# print ("The number of training samples are: %r") % (len(train))
# print ("The number of testing samples are: %r \n") % (len(test))

#make sure the train/test tests are formatted correctly.
print(train.iloc[:2])
print(test.iloc[:2])

   stars                                               text
0      1  This store has the nIcest most knowledgeable p...
1      0  I am a barbeque EXPERT and I have to say...thi...
   stars                                               text
0      1  I just got my reading glasses back and have no...
1      1  I've been coming here for about 4 years now. I...


# Text Processing & Bag of Words Model

In [6]:
# Initialize an empty list to hold the clean reviews
clean_train_reviews = []


for i in range( 0, len(train["text"])):
    clean_train_reviews.append(" ".join(Word2VecUtility3.review_to_wordlist(train["text"][i], True)))

clean_test_reviews = []

print("Cleaning and parsing the test set reviews...\n")
for i in range(0,len(test["text"])):
    clean_test_reviews.append(" ".join(Word2VecUtility3.review_to_wordlist(test["text"][i], True)))


Cleaning and parsing the test set reviews...



  ' that document to Beautiful Soup.' % decoded_markup


Create Bag of Words

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None,max_features=5000)

train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Train SVM

In [8]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier

Parameter Tuning with GridsearchCV

In [13]:
parameters = {
    'loss': ('log', 'hinge'),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001]
}

print()
print("GRID SEARCH:")
grid_search = GridSearchCV(SGDClassifier(), parameters, cv=3)
grid_search.fit(train_data_features, train['stars'])


GRID SEARCH:




GridSearchCV(cv=3, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2', 'elasticnet'], 'loss': ('log', 'hinge'), 'alpha': [0.001, 0.0001, 1e-05, 1e-06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [14]:
# View the accuracy score
print('Best score for data:', grid_search.best_score_)
# View the best parameters for the model found using grid search
print('Best loss:', grid_search.best_estimator_.loss)
print('Best penalty:', grid_search.best_estimator_.penalty)
print('Best alpha:', grid_search.best_estimator_.alpha)

Best score for data: 0.928527874564
Best loss: log
Best penalty: elasticnet
Best alpha: 1e-05


In [9]:
SVM = SGDClassifier(loss='hinge', penalty='elasticnet', alpha=1e-05, max_iter=5, random_state=42)
SVM = SVM.fit( train_data_features, train["stars"] )

# Testing Stage

In [10]:
# Apply the classifier trained using training data to test set, and view the accuracy score
SVM.score(test_data_features, test['stars'])

0.93158536585365859

# Naive Bayes 

In [21]:
vectorizer.vocabulary_.get(u'algorithm')

In [23]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_data_features, train["stars"])

In [24]:
clf.score(test_data_features, test['stars'])

0.88491869918699184

# Try SVM BoW with Radomly Distributed Dataset

In [11]:
df = pd.read_csv('Yelp_Randomly_Distributed_164k.csv', sep=',', index_col=False)

In [12]:
#split dataset into train/test sets
#changed names so that we don't contaminate data 
train_data = df.sample(frac=0.7,random_state=200)
test_data = df.drop(train_data.index)

train_data.to_csv('train_Random.csv', index=False, sep=',', encoding='utf-8')
test_data.to_csv('test_Random.csv', index=False, sep=',', encoding='utf-8')

In [13]:
#load train/test sets
train1 = pd.read_csv('train_Random.csv', index_col=False)
test1 = pd.read_csv('test_Random.csv', index_col=False)

print(("The number of training samples are: %r") % (len(train1)))
print(("The number of testing samples are: %r \n") % (len(test1)))

#make sure the train/test tests are formatted correctly.
print(train1.iloc[:2])
print(test1.iloc[:2])
#print(train1['Text'][0])

The number of training samples are: 114800
The number of testing samples are: 49200 

   stars                                               text
0      1  As the owner of a small business, I know just ...
1      1  They trying to kill me here - downtown is alre...
   stars                                               text
0      1  When it comes to Indian food this place is alw...
1      1  I think this shop is probably more of a 3.5, b...


In [14]:
# Initialize an empty list to hold the clean reviews
clean_train_reviews1 = []

print("Cleaning and parsing the Amazon reviews...\n")
for i in range( 0, len(train1["text"])):
    clean_train_reviews1.append(" ".join(Word2VecUtility3.review_to_wordlist(train1["text"][i], True)))

clean_test_reviews1 = []

print("Cleaning and parsing the test set reviews...\n")
for i in range(0,len(test1["text"])):
    clean_test_reviews1.append(" ".join(Word2VecUtility3.review_to_wordlist(test1["text"][i], True)))

Cleaning and parsing the Amazon reviews...

Cleaning and parsing the test set reviews...



In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None,max_features=5000)

train_data_features1 = vectorizer.fit_transform(clean_train_reviews1)
train_data_features1 = train_data_features1.toarray()

test_data_features1 = vectorizer.transform(clean_test_reviews1)
test_data_features1 = test_data_features1.toarray()

In [16]:
parameters = {
    'loss': ('log', 'hinge'),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001]
}

print()
print("GRID SEARCH:")
grid_search1 = GridSearchCV(SGDClassifier(), parameters, cv=3)
grid_search1.fit(train_data_features1, train1['stars'])


GRID SEARCH:




GridSearchCV(cv=3, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2', 'elasticnet'], 'loss': ('log', 'hinge'), 'alpha': [0.001, 0.0001, 1e-05, 1e-06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [24]:
# View the accuracy score
print('Best score for data:', grid_search1.best_score_)
# View the best parameters for the model found using grid search
print('Best loss:', grid_search1.best_estimator_.loss)
print('Best penalty:', grid_search1.best_estimator_.penalty)
print('Best alpha:', grid_search1.best_estimator_.alpha)

Best score for data: 0.940165505226
Best loss: log
Best penalty: elasticnet
Best alpha: 1e-05


In [25]:
SVM1 = SGDClassifier(loss='hinge', penalty='elasticnet', alpha=1e-05, max_iter=5, random_state=42)
SVM1 = SVM1.fit( train_data_features1, train1["stars"] )

In [19]:
# Apply the classifier trained using training data to test set, and view the accuracy score
SVM1.score(test_data_features1, test1['stars'])

0.92871951219512194

# Naive Bayes 

In [22]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_data_features1, train1["stars"])

In [23]:
clf.score(test_data_features1, test1['stars'])

0.89762195121951216