In [1]:
import nltk
import numpy as np
import sklearn
from collections import defaultdict
import string
import gzip
from nltk.corpus import stopwords
import random
import math
import tensorflow as tf

In [2]:
def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

In [3]:
data = [l for l in readGz('train.json.gz')if 'categoryID' in l]

[{'rating': 4.0, 'reviewHash': 'R567271252', 'businessID': 'B423621081', 'unixReviewTime': 1378865039, 'reviewText': u"I'm a vegetarian, but every so often I want a hotdog with lots of toppings.  And a tall can of beer.  Frank has got that covered.  And they have a cool warehouse space with some pinball machines.  Prices are a little high for hotdogs...fancy hotdogs, but hotdogs nonetheless.  Good location and service, but gets crowded and loud.", 'userID': 'U985379327', 'reviewTime': u'Sep 10, 2013', 'categories': [u'American Restaurant', u'Cafe', u'Hot Dog Restaurant'], 'categoryID': 0}, {'rating': 4.0, 'reviewHash': 'R985248711', 'businessID': 'B734024511', 'unixReviewTime': 1372977506, 'reviewText': u"Asia Cafe is hands down the best Chinese food in Austin. Their menu has about 100 different options and it's really authentic. The spicy fish and the garlic pork are two of my favorites. It's a bit far from downtown Austin, right on the outskirts of Round Rock, but it's worth the driv

In [4]:
print len(data)

70195


In [5]:
punctuation = set(string.punctuation)
stemmer = nltk.stem.porter.PorterStemmer()
#first try use stemmer
stopword = stopwords.words('english')

In [6]:
wordCount = defaultdict(int)
for d in data:
    review = ''.join(c for c in d['reviewText'].lower() if c not in punctuation)
    wordList = review.split()
    for w in wordList:
        #w = stemmer.stem(w)
        if w not in stopword:
            wordCount[w] += 1
#first try not using stemmer

In [7]:
print len(wordCount)

51283


In [8]:
count = [(wordCount[w], w ) for w in wordCount]
count.sort()
count.reverse()

commonWords = [count[i][1] for i in range(1000)]
wordDict = defaultdict(int)
for i in range(1000):
    wordDict[commonWords[i]] = i

#The 1000 most common words

In [9]:
random.shuffle(data)
train_label = [d['categoryID'] for d in data[:58000]]
train_data = data[:58000]
validation_label = [d['categoryID'] for d in data[58000:70195]]
validation_data = data[58000:70195]

In [11]:
#calculate tf-idf
#tf can be calculated when extracting feature
#idf calculated here
idf = [0 for i in range(1000)]
for d in train_data:
    review = ''.join(c for c in d['reviewText'].lower() if c not in punctuation)
    wordList = review.split()
    for w in commonWords:
        if w in wordList:
            idf[wordDict[w]] += 1.0
            
idf = np.array([math.log(70195.0/f) for f in idf])

[1.2523499189734044, 1.4176520594136381, 1.4451327026559233, 1.5799600253779253, 2.0678134213377324, 2.316861313794336, 2.385492978847886, 2.2712415982107252, 2.326152902128436, 2.419335824563028, 2.4826156654418363, 2.628330820965022, 2.626359600141428, 2.7631032584828534, 2.6725045853006995, 2.7476442298867907, 2.7708095522867744, 2.7689909566502977, 2.7843248192865695, 2.793127285203596, 2.8204874824174726, 2.9451087667833087, 2.952994599627237, 2.9510854213574356, 2.9226118351406605, 2.9645268526404105, 2.9829220201687106, 3.0357710432843055, 3.2317080020962576, 3.028384394245468, 2.98517490763243, 2.913647894285305, 3.094395886631829, 3.048304779431562, 3.1163328655084146, 3.0432126611947248, 3.054026824858805, 3.2823939014305887, 3.1800359915519367, 3.1513323495220265, 3.185187986461365, 3.204660089874185, 3.147677253244766, 3.209235146244199, 3.2767174434257837, 3.2669539411579325, 3.2624796607630113, 3.2706978616673976, 3.2850539828015504, 3.293460604721261, 3.3699919607485747,

In [13]:
def feature(datum):
    #count tf-idf
    review = ''.join(c for c in d['reviewText'].lower() if c not in punctuation)
    wordList = review.split()
    tf = [0 for i in range(1000)]
    for w in wordList:
        if w in commonWords:
            tf[wordDict[w]] += 1.0
    tf = np.array(tf)
    tfidf = np.multiply(tf,idf)
    return tfidf

In [14]:
train_feature = np.array([feature(d) for d in train_data])
validation_feature = np.array([feature(d) for d in validation_data])

In [15]:
fc_size = 300
input_size = 1000
output_size = 10
regularization_rate = 0.0001
learning_rate = 0.00001
batch_size = 200
max_iter = 60000
#tensorflow learning hyperpatameters

In [None]:
def calc(X, regularizer):
    with tf.variable_scope('fc1'):
        w1 = tf.get_variable(name = 'weight', shape = [input_size, fc_size], initializer = tf.truncated_normal_initializer(stddev = 0.1))
        b1 = tf.get_variable(name = 'bias', shape = [fc_size], initializer = tf.constant_initializer(0.1))
        fc1 = tf.nn.relu(tf.matmul(X, w1)+b1)
        tf.add_to_collection('losses', regularizer(w1))
    
    with tf.variable_scope('fc2'):
        w2 = tf.get_variable(name = 'weight', shape = [fc_size, output_size], initializer = tf.truncated_normal_initializer(stddev = 0.1))
        b2 = tf.get_variable(name = 'bias', shape = [output_size], initializer = tf.constant_initializer(0.1))
        fc2 = tf.matmul(fc1, w2) + b2
        tf.add_to_collection('losses', regularizer(w2))
    
    return fc2
#A neural network with one hidden layer

In [16]:
def train():
    X = tf.placeholder(tf.float64, [None, input_size], name = 'input-X')
    y = tf.placeholder(tf.int64, [None], name = 'input-Y')
    
    regularizer = tf.contrib.layers.l2_regularizer(regularization_rate)
    
    y_ = calc(X, regularizer)
    y_predict = tf.argmax(y_,1)
    correct_prediction = tf.cast(tf.equal(y_predict, y),tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = y_, labels = y)
    loss = cross_entropy + tf.add_n(tf.get_collection('losses'))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        for i in range(max_iter):
            sample = np.random.randint(0, 58000, batch_size)
            x_batch = train_data[sample]
            y_batch = train_label[sample]
            
            _, loss_value = sess.run([train_step, loss], feed_dict = {X:x_batch,y:y_batch})
            if i % 1000 == 0:
                print("After %d iters, loss on training is %f."%(i, loss_value))
                acc = sess.run(accuracy, feed_dict = {X:validation_feature, y:validation_label})
                print("After %d iters, accuracy on validation is %f"%(i, acc))

In [None]:
train()