# Bag of Words Model on Yelp Dataset

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

from Word2VecUtility import Word2VecUtility

In [3]:
# Load the full review dump. index_col=False stops pandas from using the
# first column of the file as the row index.
data = pd.read_csv('Yelp_reviews.csv', sep=',', index_col=False)

print('A quick look at the reviews:')
# info() must be *called* to print the column/dtype summary; the bare
# attribute `data.info` only evaluates to the bound method object.
data.info()
# Only the last expression of a cell is rendered, so the preview goes last.
data.head()

A quick look at the reviews:


Unnamed: 0,stars,text
0,5,If you enjoy service by someone who is as comp...
1,5,After being on the phone with Verizon Wireless...
2,5,Great service! Corey is very service oriented....
3,5,Highly recommended. Went in yesterday looking ...
4,4,I walked in here looking for a specific piece ...


In [4]:
# Show the rating and the full text of the first review (index label 0).
first_rating = data.stars[0]
first_text = data.text[0]
print(first_rating)
print(first_text)

5
If you enjoy service by someone who is as competent as he is personable, I would recommend Corey Kaplan highly. The time he has spent here has been very productive and working with him educational and enjoyable. I hope not to need him again (though this is highly unlikely) but knowing he is there if I do is very nice. By the way, I'm not from El Centro, CA. but Scottsdale, AZ.


Sample data

In [5]:
# Work on a fixed-size random subset for now so the later steps run faster.
size = 100000
subdata = data.sample(n=size, random_state=520)

# Keep only the rows that actually have review text.
subdata = subdata[subdata['text'].notnull()]
print(subdata.index)

# Persist the sample (without the index) so it can be reloaded directly.
subdata.to_csv('yelp_review_sub100k.csv', sep=',', index=False, encoding='utf-8')

Int64Index([2807975, 1755175, 1813426, 2712021,  235453, 2221447, 2599015,
            1501646, 1623687, 3236951,
            ...
            2213442, 2187233, 3756712, 3637900,  343447,  556202, 3295730,
            2160073, 1753741, 2807019],
           dtype='int64', length=99999)


In [6]:
# Drop the reference to the full dataset, then keep the sample under the
# name `data` and remove the temporary `subdata` name.
del data
data = subdata
del subdata

In [7]:
# Read the saved sample back from disk and preview its first five rows.
data = pd.read_csv('yelp_review_sub100k.csv', index_col=False)
print(data.head(5))

   stars                                               text
0      3  I don't visit dollar stores often and when I d...
1      2  The food was perfectly adequate, nothing speci...
2      2  Our server, Cookie, has been fabulous both tim...
3      1  DO NOT go here, I made reservations for a holi...
4      5  My husband and I stopped in for a foot rub and...


In [26]:
# Ratings of 3 are treated as neutral and are excluded from the analysis.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the label assignment through .loc in the following cell operates
# on a real frame and does not raise SettingWithCopyWarning.
data = data[data.stars != 3].copy()
print(data.head())
data['stars'].value_counts()

   stars                                               text
1      2  The food was perfectly adequate, nothing speci...
2      2  Our server, Cookie, has been fabulous both tim...
3      1  DO NOT go here, I made reservations for a holi...
4      5  My husband and I stopped in for a foot rub and...
6      5  Always great, fresh food. Quick service. \n\nT...


5    40839
4    24767
1    13162
2     8744
Name: stars, dtype: int64

In [27]:
# Collapse the star ratings into binary sentiment labels:
#   1-2 stars -> 0 (negative), 4-5 stars -> 1 (positive).
# CAUTION: this cell is not idempotent. After one run the labels are 0/1,
# and since 1 <= 2 a second run would map every row to 0.
is_negative = data.stars <= 2
is_positive = data.stars >= 4
data.loc[is_negative, 'stars'] = 0
data.loc[is_positive, 'stars'] = 1

data['stars'].value_counts()

1    65606
0    21906
Name: stars, dtype: int64

In [29]:
# Sanity check: the first five rows should now carry 0/1 labels
# (compare with the preview printed before relabelling).
print(data.head(5))

   stars                                               text
1      0  The food was perfectly adequate, nothing speci...
2      0  Our server, Cookie, has been fabulous both tim...
3      0  DO NOT go here, I made reservations for a holi...
4      1  My husband and I stopped in for a foot rub and...
6      1  Always great, fresh food. Quick service. \n\nT...


In [30]:
# 70/30 split: draw the training rows at random with a fixed seed; every
# row whose index label was not drawn becomes test data.
train_data = data.sample(random_state=200, frac=0.7)
test_data = data.drop(train_data.index)

# Persist both parts without their index column.
train_data.to_csv('train.csv', sep=',', encoding='utf-8', index=False)
test_data.to_csv('test.csv', sep=',', encoding='utf-8', index=False)

In [31]:
# Reload the two splits from disk.
train = pd.read_csv('train.csv', index_col=False)
test = pd.read_csv('test.csv', index_col=False)

# Report the split sizes.
print("The number of training samples are: %r" % len(train))
print("The number of testing samples are: %r \n" % len(test))

# Preview the first two rows of each split as a format check.
print(train.head(2))
print(test.head(2))

The number of training samples are: 61258
The number of testing samples are: 26254 

   stars                                               text
0      1  Best place ever!  I was so scared and they mad...
1      1  I've had the last week off before starting my ...
   stars                                               text
0      1  My husband and I stopped in for a foot rub and...
1      1  Always great, fresh food. Quick service. \n\nT...


# Text Processing & Bag of Words Model

In [33]:
# Build the list of cleaned training reviews.
# Word2VecUtility.review_to_wordlist is a text-processing helper imported
# from another file; its word list is re-joined into one space-separated
# string per review. The second argument (True) is passed through unchanged
# -- presumably it enables stop-word removal; confirm in Word2VecUtility.
# `train` was read with a default 0..n-1 index, so iterating the column
# visits the same reviews, in the same order, as indexing by position.
print("Cleaning and parsing the Amazon reviews...\n")
clean_train_reviews = [
    " ".join(Word2VecUtility.review_to_wordlist(review, True))
    for review in train["text"]
]

Cleaning and parsing the Amazon reviews...



Create Bag of Words

In [34]:
# Set up the bag-of-words extractor for the training set.
print("Creating the bag of words...\n")

# CountVectorizer is scikit-learn's bag-of-words tool. It works on words,
# with no custom tokenizer, preprocessor or stop-word list, and is limited
# to 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

Creating the bag of words...



# Train SVM

In [35]:
# fit_transform() does two things: it learns the vocabulary from the cleaned
# training reviews, then turns those reviews into feature vectors. Its input
# is a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# The features are handed to the classifier exactly as returned. The former
# bare `np.asarray(train_data_features)` statement threw its result away, so
# it never converted anything and has been removed.

# ******* Train an SVM using the bag of words
print("Training the SVM (this may take a while)...")

# SGDClassifier with hinge loss and an L2 penalty is a linear SVM trained by
# stochastic gradient descent; random_state fixes the shuffling seed.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 and removed in
# 0.21. On newer versions replace it with `max_iter=5, tol=None` -- confirm
# against the installed scikit-learn version before changing.
SVM = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)

# Features are the bag of words; the response is the 0/1 sentiment label.
SVM = SVM.fit(train_data_features, train["stars"])

Training the SVM (this may take a while)...


# Testing Stage

In [36]:
# Clean the test reviews in the same way as the training reviews.
print("Cleaning and parsing the test set movie reviews...\n")
clean_test_reviews = [
    " ".join(Word2VecUtility.review_to_wordlist(review, True))
    for review in test["text"]
]

# Vectorise with the vocabulary already learned from the training set
# (transform, not fit_transform). The former bare np.asarray(...) statement
# discarded its result and has been removed.
test_data_features = vectorizer.transform(clean_test_reviews)

# Use the trained SVM to predict sentiment labels for the test set.
print("Predicting test labels...\n")
result = SVM.predict(test_data_features)

# Pair every prediction with the true label.
# NOTE: the "id" column holds the true label (test["stars"]), not a review
# identifier; the column names are kept so the CSV layout is unchanged.
output = pd.DataFrame(data={"id": test["stars"], "Sentiment": result})

# Write the comma-separated output file. quoting=3 is csv.QUOTE_NONE.
# The path is named once so the confirmation message reports the file that
# was really written (it previously named a different file).
output_path = 'Bag_of_Words_modelSVM100k.csv'
output.to_csv(output_path, index=False, quoting=3)
print("Wrote results to %s" % output_path)

Cleaning and parsing the test set movie reviews...

Predicting test labels...

Wrote results to Bag_of_Words_model.csv


In [37]:
# Accuracy: the fraction of test reviews whose predicted label equals the
# true label.
matches = result == test['stars']
accuraccy = matches.mean()

print(accuraccy)

0.933305401082


# Balanced Dataset

In [10]:
# Load the evenly sampled (balanced) dataset.
df = pd.read_csv('Yelp_Evenly_Sampled.csv', index_col=False)

# Inspect the frame that was just loaded. The preview and the class counts
# previously looked at `data`, i.e. whatever an earlier cell had left under
# that name, instead of the new `df`.
print(df.iloc[:5])
df['stars'].value_counts()

   stars                                               text
0      1  I just got my reading glasses back and have no...
1      1  Absolutely the "Best of Phoenix". Caring compa...
2      1  I've been coming here for about 4 years now. I...
3      1  The food was hot and tasty. The garlic knots a...
4      1  This place is delicious!! I love the salmon an...


1    100000
0    100000
Name: stars, dtype: int64

In [11]:
# 70/30 split of the balanced data. Separate variable and file names keep
# these splits apart from the ones made for the first experiment.
train_balanced_data = df.sample(random_state=200, frac=0.7)
test_balanced_data = df.drop(train_balanced_data.index)

# Persist both parts without their index column.
train_balanced_data.to_csv('train_balanced.csv', sep=',', encoding='utf-8', index=False)
test_balanced_data.to_csv('test_balanced.csv', sep=',', encoding='utf-8', index=False)

In [12]:
# Reload the balanced splits from disk.
train1 = pd.read_csv('train_balanced.csv', index_col=False)
test1 = pd.read_csv('test_balanced.csv', index_col=False)

# Report the split sizes.
print("The number of training samples are: %r" % len(train1))
print("The number of testing samples are: %r \n" % len(test1))

# Preview the first two rows of each split as a format check.
print(train1.head(2))
print(test1.head(2))

The number of training samples are: 140000
The number of testing samples are: 60000 

   stars                                               text
0      0  I agree with most of the reviews on the site. ...
1      0  The thing I hate about them is that I can neve...
   stars                                               text
0      1  I just got my reading glasses back and have no...
1      1  I've been coming here for about 4 years now. I...


In [15]:
# Build the list of cleaned training reviews for the balanced data.
# Word2VecUtility.review_to_wordlist is a text-processing helper imported
# from another file; its word list is re-joined into one space-separated
# string per review. The second argument (True) is passed through unchanged
# -- presumably it enables stop-word removal; confirm in Word2VecUtility.
# `train1` was read with a default 0..n-1 index, so iterating the column
# visits the same reviews, in the same order, as indexing by position.
print("Cleaning and parsing the Amazon reviews...\n")
clean_train_reviews1 = [
    " ".join(Word2VecUtility.review_to_wordlist(review, True))
    for review in train1["text"]
]

Cleaning and parsing the Amazon reviews...





 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' that document to Beautiful Soup.' % decoded_markup


In [16]:
# Set up the bag-of-words extractor for the balanced training set.
print("Creating the bag of words...\n")

# CountVectorizer is scikit-learn's bag-of-words tool. It works on words,
# with no custom tokenizer, preprocessor or stop-word list, and is limited
# to 5000 features.
vectorizer1 = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

Creating the bag of words...



In [17]:
# NOTE: this cell repeats the preceding cell. Running it simply rebuilds an
# identical, not-yet-fitted `vectorizer1`.
print("Creating the bag of words...\n")

# CountVectorizer is scikit-learn's bag-of-words tool, configured with the
# same settings as above: word analyzer, no custom tokenizer, preprocessor
# or stop-word list, limited to 5000 features.
vectorizer1 = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

Creating the bag of words...



In [19]:
# fit_transform() does two things: it learns the vocabulary from the cleaned
# balanced training reviews, then turns those reviews into feature vectors.
# Its input is a list of strings.
train_data_features1 = vectorizer1.fit_transform(clean_train_reviews1)

# The features are handed to the classifier exactly as returned. The former
# bare `np.asarray(train_data_features1)` statement threw its result away,
# so it never converted anything and has been removed.

# ******* Train an SVM using the bag of words
print("Training the SVM (this may take a while)...")

# SGDClassifier with hinge loss and an L2 penalty is a linear SVM trained by
# stochastic gradient descent; random_state fixes the shuffling seed.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 and removed in
# 0.21. On newer versions replace it with `max_iter=5, tol=None` -- confirm
# against the installed scikit-learn version before changing.
SVM1 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)

# Features are the bag of words; the response is the 0/1 sentiment label.
SVM1 = SVM1.fit(train_data_features1, train1["stars"])

Training the SVM (this may take a while)...


In [20]:
# Clean the balanced test reviews in the same way as the training reviews.
print("Cleaning and parsing the test set movie reviews...\n")
clean_test_reviews1 = [
    " ".join(Word2VecUtility.review_to_wordlist(review, True))
    for review in test1["text"]
]

# Vectorise with the vocabulary already learned from the balanced training
# set (transform, not fit_transform). The former bare np.asarray(...)
# statement discarded its result and has been removed.
test_data_features1 = vectorizer1.transform(clean_test_reviews1)

# Use the trained SVM to predict sentiment labels for the test set.
print("Predicting test labels...\n")
result1 = SVM1.predict(test_data_features1)

# Pair every prediction with the true label.
# NOTE: the "id" column holds the true label (test1["stars"]), not a review
# identifier; the column names are kept so the CSV layout is unchanged.
output1 = pd.DataFrame(data={"id": test1["stars"], "Sentiment": result1})

# Write the comma-separated output file. quoting=3 is csv.QUOTE_NONE.
# The path is named once so the confirmation message reports the file that
# was really written (it previously named a different file).
output_path1 = 'Bag_of_Words_Model_BalancedSVM82k.csv'
output1.to_csv(output_path1, index=False, quoting=3)
print("Wrote results to %s" % output_path1)

Cleaning and parsing the test set movie reviews...

Predicting test labels...

Wrote results to Bag_of_Words_model.csv


In [22]:
# Accuracy on the balanced test set: the fraction of reviews whose predicted
# label equals the true label.
matches1 = result1 == test1['stars']
accuraccy1 = matches1.mean()

print(accuraccy1)

0.927716666667
