In [1]:
import numpy as np
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import pickle
from sklearn.metrics import mean_squared_error

In [2]:
def parseData(fname):
    """Lazily yield one parsed record per non-blank line of ``fname``.

    Each line is passed to ``eval`` and the resulting object is yielded, so
    the file is expected to hold one Python literal expression per line.

    Args:
        fname: path of the text file to read.

    Yields:
        The object produced by evaluating each non-blank line.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving it to the garbage collector.
    with open(fname) as f:
        for l in f:
            if not l.strip():
                # eval() raises SyntaxError on an empty expression.
                continue
            # SECURITY: eval() executes arbitrary code. Only use this on
            # trusted files; prefer json.loads / ast.literal_eval otherwise.
            yield eval(l)

In [3]:
# Number of leading reviews used for training; everything after it is held
# out as the test set.
TRAIN_SIZE = 200000

print("Reading data...")
data = list(parseData("yelp_training_set_review.json"))
print("done")

train_set = data[:TRAIN_SIZE]
test_set = data[TRAIN_SIZE:]

Reading data...
done


In [4]:
### Lower-case every training review, drop punctuation, then tally n-grams
wordCount = defaultdict(int)  # not referenced again in the visible cells
punctuation = set(string.punctuation)
bigramCounter = defaultdict(int)
unigramCounter = defaultdict(int)
reviews = []

for d in train_set:
    # Normalised text: lower-cased, with every punctuation character removed.
    r = ''.join(ch for ch in d['text'].lower() if ch not in punctuation)
    reviews.append(r)
    allWords = r.split()
    for token in allWords:
        unigramCounter[token] += 1
    # Each pair of adjacent tokens is one bigram; a review with fewer than
    # two tokens contributes none.
    for first, second in zip(allWords, allWords[1:]):
        bigramCounter[first + " " + second] += 1

print('the number of unique bigrams = ', len(bigramCounter))
print('the number of unique unigrams = ', len(unigramCounter))

the number of unique bigrams =  3120774
the number of unique unigrams =  205981


In [5]:
# (count, unigram) pairs, most frequent first. The pairs are all distinct, so
# a descending sort yields the same order as sort-then-reverse.
unigramCount = sorted(((count, word) for word, count in unigramCounter.items()), reverse=True)

print('20 most frequently occurring unigrams =', unigramCount[:20])


# Same ranking for the bigrams.
bigramCount = sorted(((count, pair) for pair, count in bigramCounter.items()), reverse=True)

print('20 most frequently occurring bigrams =', bigramCount[:20])

20 most frequently occurring unigrams = [(1333785, 'the'), (869970, 'and'), (723860, 'i'), (709764, 'a'), (602827, 'to'), (413064, 'of'), (403915, 'was'), (343766, 'it'), (343663, 'is'), (304557, 'for'), (294277, 'in'), (238356, 'that'), (224749, 'my'), (216843, 'with'), (212645, 'but'), (206895, 'this'), (203836, 'you'), (187123, 'they'), (184673, 'on'), (176520, 'have')]
20 most frequently occurring bigrams = [(88972, 'of the'), (79341, 'it was'), (76992, 'in the'), (74682, 'and the'), (61262, 'this place'), (57373, 'on the'), (53955, 'i was'), (53588, 'and i'), (45569, 'to the'), (44849, 'the food'), (44473, 'for the'), (43837, 'i have'), (43485, 'for a'), (38949, 'is a'), (38663, 'i had'), (36925, 'if you'), (35793, 'to be'), (32973, 'with the'), (32930, 'with a'), (32725, 'at the')]


In [9]:
##################################################
# vocabulary: the 1000 most frequent bigrams     #
##################################################
# bigramCount holds (count, bigram) pairs; keep only the bigram text.
bigramWords_sample = [bigram for _, bigram in bigramCount[:1000]]
# Map each bigram to its column in the feature vector.
bigramWordId_sample = {bigram: column for column, bigram in enumerate(bigramWords_sample)}

In [10]:
def feature(datum):
    """Build the bigram-count feature vector for one review.

    ``datum['text']`` is lower-cased, stripped of punctuation and split on
    whitespace. Every adjacent word pair that is a key of
    ``bigramWordId_sample`` increments its column. A constant 1 is appended
    as the offset term.

    Note: the name ``feature`` is re-defined by later cells of this notebook,
    so which version runs depends on the last definition cell executed.

    Args:
        datum: mapping with a ``'text'`` string.

    Returns:
        list of ``len(bigramWords_sample) + 1`` numbers; the last one is 1.
    """
    feat = [0] * len(bigramWords_sample)
    r = ''.join([c for c in datum['text'].lower() if c not in punctuation])
    allWords = r.split()
    for word1, word2 in zip(allWords, allWords[1:]):
        # Dict lookup instead of scanning the vocabulary list for every
        # bigram of every review.
        column = bigramWordId_sample.get(word1 + " " + word2)
        if column is not None:
            feat[column] += 1
    feat.append(1)  # offset
    return feat

In [11]:
# Design matrices come from whichever `feature` is currently defined;
# targets are the star ratings.
X_train = list(map(feature, train_set))
y_train = [review['stars'] for review in train_set]
X_test = list(map(feature, test_set))
y_test = [review['stars'] for review in test_set]

In [12]:
# With regularization: ridge regression with alpha = 1.0. The estimator's own
# intercept is disabled (fit_intercept=False).
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
# mean_squared_error is documented as (y_true, y_pred): labels first.
print('new predictor mean_squared_error using 1000 most common bigrams =', mean_squared_error(y_test, predictions))

new predictor mean_squared_error using 1000 most common bigrams = 1.017259375190573


In [13]:
# Rebuild the (count, term) pairs. This rebinds unigramCount / bigramCount to
# lists in dictionary order, i.e. no longer sorted by frequency.
unigramCount = [(count, word) for word, count in unigramCounter.items()]
bigramCount = [(count, pair) for pair, count in bigramCounter.items()]
# Unigrams and bigrams compete in one ranking, most frequent first.
countBothWord = sorted(unigramCount + bigramCount, reverse=True)
bothWords = [term for _, term in countBothWord[:2000]]
# term -> column and column -> term lookups for the 2000 retained terms.
both_key_freq = {term: column for column, term in enumerate(bothWords)}
both_freq_key = dict(enumerate(bothWords))

In [14]:
def feature(datum):
    """Build the unigram + bigram count feature vector for one review.

    ``datum['text']`` is lower-cased, stripped of punctuation and split on
    whitespace. Every token and every adjacent token pair that is a key of
    ``both_key_freq`` increments its column. A constant 1 is appended as the
    offset term.

    Note: this replaces the earlier ``feature`` definition and is itself
    replaced by a later cell of this notebook.

    Args:
        datum: mapping with a ``'text'`` string.

    Returns:
        list of ``len(bothWords) + 1`` numbers; the last one is 1.
    """
    feat = [0] * len(bothWords)
    r = ''.join([c for c in datum['text'].lower() if c not in punctuation])
    words_temp = r.split()
    bigramWordsData = [w1 + " " + w2 for w1, w2 in zip(words_temp, words_temp[1:])]
    for term in words_temp + bigramWordsData:
        # Dict lookup instead of scanning the 2000-entry vocabulary list for
        # every token of every review.
        column = both_key_freq.get(term)
        if column is not None:
            feat[column] += 1
    feat.append(1)  # offset
    return feat

In [15]:
# Re-featurise both splits with the `feature` function currently in scope;
# targets are the star ratings.
X_train = list(map(feature, train_set))
y_train = [review['stars'] for review in train_set]
X_test = list(map(feature, test_set))
y_test = [review['stars'] for review in test_set]

In [16]:
# With regularization: ridge regression with alpha = 1.0. The estimator's own
# intercept is disabled (fit_intercept=False).
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
# mean_squared_error is documented as (y_true, y_pred): labels first.
print('new predictor mean_squared_error using both 2000 most common unigram and bigrams =', mean_squared_error(y_test, predictions))

new predictor mean_squared_error using both 2000 most common unigram and bigrams = 0.8210778176158229


In [17]:
##################################################
#  the idea of visualization of ratings          #
##################################################

# Pair every learned coefficient with its column index. The range stops one
# short of len(theta), so the last coefficient is left out.
weightCounter = sorted((theta[column], column) for column in range(len(theta) - 1))
# Largest weight first.
weightCounter.reverse()

In [18]:
print('most positive 20 statement(unigrams and bigrams) in sense of stars rating:')
for rank in range(20):
    column = weightCounter[rank][1]
    print(both_freq_key[column])

# NOTE: this flips the shared list in place, so running the cell a second
# time swaps which terms appear under each heading.
weightCounter.reverse()

print('most negative 20 statement(unigrams and bigrams) in sense of stars rating:')
for rank in range(20):
    column = weightCounter[rank][1]
    print(both_freq_key[column])

most positive 20 statement(unigrams and bigrams) in sense of stars rating:
outstanding
5 stars
love this
incredible
excellent
awesome
die
amazing
cant wait
fantastic
helpful
never had
better than
i never
great food
not only
wonderful
highly recommend
will definitely
best
most negative 20 statement(unigrams and bigrams) in sense of stars rating:
worst
horrible
rude
terrible
mediocre
overpriced
poor
bland
dirty
will not
sorry
wont be
money
unfortunately
slow
would not
dry
average
greasy
guess


In [19]:
##################################################
#  with 2000-dimensional tf-idf representations  #
##################################################

# temp:      token -> ascending list of indices of the reviews containing it
# freq_temp: token -> number of reviews containing it (document frequency)
temp = defaultdict(list)
freq_temp = defaultdict(int)

# Flat list of every token in the cleaned training reviews.
words = []
for review_text in reviews:
    words.extend(review_text.split())
uniqueWords = set(words)

for review_idx, review_text in enumerate(reviews):
    for token in review_text.split():
        postings = temp[token]
        # Reviews are visited in order, so checking the last stored index is
        # enough to avoid recording the same review twice.
        if not postings or postings[-1] != review_idx:
            postings.append(review_idx)

for token in uniqueWords:
    freq_temp[token] = len(temp[token])

In [20]:
def calTfIdf(word, r):
    """Return the tf-idf weight of ``word`` within the review text ``r``.

    tf is the number of whitespace-separated tokens of ``r`` equal to
    ``word``. idf is ``log10(N / df)``, where ``N = len(reviews)`` and ``df``
    is the document frequency stored in ``freq_temp``.

    Args:
        word: the term to score.
        r: the (already cleaned) review text.

    Returns:
        ``tf * idf``, or 0.0 when ``word`` has no recorded document frequency
        (previously this raised ZeroDivisionError).
    """
    tf = r.split().count(word)
    # .get() avoids inserting a zero entry into the defaultdict for words it
    # has never seen.
    doc_freq = freq_temp.get(word, 0)
    if doc_freq == 0:
        return 0.0
    N = len(reviews)

    idf = np.log10(N * 1.0 / doc_freq)
    return tf * idf

In [21]:
# Source ranking for the tf-idf vocabulary: the combined unigram + bigram
# (count, term) pairs, most frequent first. The unigram-only list that used
# to be built here was overwritten on the next line, so it is dropped.
# sorted() makes a copy, so countBothWord is no longer mutated via an alias.
counts = sorted(countBothWord, reverse=True)

In [22]:
# The 2000 highest-ranked terms, their column positions, and a set view.
words = [term for _, term in counts[:2000]]
wordId = {term: column for column, term in enumerate(words)}
wordSet = set(words)

In [23]:
def feature(datum):
    """Build the tf-idf feature vector for one review.

    ``datum['text']`` is lower-cased, stripped of punctuation and split on
    whitespace. Each distinct token that is a key of ``wordId`` gets
    ``calTfIdf(token, text)`` in its column. A constant 1 is appended as the
    offset term.

    Only single tokens are looked up, so any vocabulary entry containing a
    space (a bigram) keeps the value 0.

    Note: this replaces the earlier ``feature`` definitions in this notebook.

    Args:
        datum: mapping with a ``'text'`` string.

    Returns:
        list of ``len(words) + 1`` numbers; the last one is 1.
    """
    feat = [0] * len(words)
    r = ''.join([c for c in datum['text'].lower() if c not in punctuation])
    # Score each distinct token once: calTfIdf already counts every
    # occurrence, so repeating the call per occurrence only repeats work.
    for w in set(r.split()):
        # Dict lookup instead of scanning the vocabulary list.
        column = wordId.get(w)
        if column is not None:
            feat[column] = calTfIdf(w, r)

    feat.append(1)  # offset
    return feat

In [24]:
# Featurise both splits with the `feature` function currently in scope;
# targets are the star ratings.
X_train = list(map(feature, train_set))
y_train = [review['stars'] for review in train_set]
X_test = list(map(feature, test_set))
y_test = [review['stars'] for review in test_set]

In [25]:
# Ridge regression (alpha = 1.0) without a separately fitted intercept;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)

In [27]:
# NOTE(review): the label below says "1000 unigrams + 1000 bigrams", but the
# vocabulary (`words`) is the top 2000 entries of the combined count list.
# Confirm the intended wording before relying on it.
# mean_squared_error is documented as (y_true, y_pred): labels first.
print('the new model with 1000-dimensional unigrams and 1000-dimensional bigrams tf-idf representations MSE =', mean_squared_error(y_test, predictions))

the new model with 1000-dimensional unigrams and 1000-dimensional bigrams tf-idf representations MSE = 0.8210518201753847
