In [1]:
import pandas as pd
import numpy as np
import datetime
import time
from matplotlib import pyplot as plt

In [2]:
review=pd.read_json('/Users/Paul/Desktop/chinese_restaurants_review/review_processed.json', lines=True)

In [4]:
review[0:3]

Unnamed: 0,stars,token,useful
0,5,"[place, gem, friendly, attentive, service, foo...",0
1,5,"[perhaps, closest, pho, restaurant, port, cred...",0
2,4,"[happened, stumble, upon, little, quaint, rest...",1


In [5]:
from gensim.models import word2vec



In [3]:
num_features = 300   # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10         # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [13]:
begin = time.time()
w2v_model = word2vec.Word2Vec(review.token, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)
print ('Total time spent:', time.time() - begin)

Total time spent: 193.356201171875


In [4]:
w2v_model=word2vec.Word2Vec.load('/Users/Paul/Desktop/chinese_restaurants_review/w2v_model')

In [44]:
w2v_model.similarity('good', 'awesome')

0.51868450256167309

In [47]:
def Average(token):
    
    # Here 300 is the number of neurals in the hidden layer
    feature_vec = np.zeros((num_features,), dtype="float32")
    
    n_words = 0
    # we have trained a word2vec model named 'w2v_model'
    
    # make a set of words we have learned in model
    word_set = set(w2v_model.wv.index2word)
    
    # only arverage the words that has certain correlation/similarity with the judgement words "good" or "bad". The others are ignored.
    for x in token:
        if x in word_set and (w2v_model.similarity('good', x)>=0 or w2v_model.similarity('bad', x)>=0):
            n_words+= 1
            feature_vec += w2v_model[x]
    
    return np.divide(feature_vec,float(n_words))

In [50]:
import time
begin = time.time()
review['w2v_feature'] =review['token'].apply(lambda x: Average(x))
review['w2v_feature']=review['w2v_feature']*(review['useful']+1)
print ('Total time spent:', time.time() - begin)

Total time spent: 1610.818256855011


In [52]:
review[0:3]

Unnamed: 0,stars,token,useful,w2v_feature
0,5,"[place, gem, friendly, attentive, service, foo...",0,"[0.198498, 0.120218, -0.0277819, 0.185768, 0.6..."
1,5,"[perhaps, closest, pho, restaurant, port, cred...",0,"[0.0406628, 0.392262, -0.204646, 0.13495, 0.44..."
2,4,"[happened, stumble, upon, little, quaint, rest...",1,"[-0.225141, 0.415271, -0.686797, 1.06412, 0.66..."


In [53]:
ix = []
for i in range(review.shape[0]):
    if np.any(np.isnan(review.w2v_feature.iloc[i])): ix.append(i)
review = review.drop(ix, axis= 0)

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
feature = np.array([x for x in review.w2v_feature])
X_train,  X_test, y_train, y_test = train_test_split(feature, review['stars'], test_size = 0.2, random_state = 1024)

In [55]:
forest = RandomForestClassifier(n_estimators=100, max_depth=15)
model_w2v= forest.fit(X_train, y_train)

In [56]:
predict=model_w2v.predict(X_test)

In [57]:
#test accuracy
np.sum(predict==y_test)/(len(y_test))

0.53094264912504785

In [58]:
np.sum([(predict>=3) & (y_test>=3)])/(np.sum(y_test>=3))

0.951664601051327

In [24]:
review.to_json('/Users/Paul/Desktop/chinese_restaurants_review/review_w2v.json', orient='records', lines=True)

In [None]:
review=pd.read_json('/Users/Paul/Desktop/chinese_restaurants_review/review_w2v.json', lines=True)

In [27]:
import tensorflow as tf

In [28]:
feature_size = 300
n_node_hl1 = 100
n_node_hl2 = 100
n_class = 5
batch_size = 200
n_epoch = 10

In [29]:
# placeholers
x = tf.placeholder('float', [None, feature_size])

# should assign shape?
y = tf.placeholder('float', [None, 5])

In [30]:
def NN_model(data):
    # Weights and bias of each layer
    
    hl1 = {'weights': tf.Variable(tf.random_normal([feature_size, n_node_hl1])),
            'bias': tf.Variable(tf.random_normal([n_node_hl1])) }
    hl2 = {'weights': tf.Variable(tf.random_normal([n_node_hl1, n_node_hl2])),
            'bias': tf.Variable(tf.random_normal([n_node_hl2])) }

    output_layer = { 'weights': tf.Variable(tf.random_normal([n_node_hl2, n_class])),
                'bias' : tf.Variable(tf.random_normal([n_class]))}
    
    # Output of each layer
    # Relu((x*W + bias))
    s1 = tf.add(tf.matmul(data, hl1['weights']),hl1['bias'])
    a1 = tf.nn.relu(s1)
    
    s2 = tf.add(tf.matmul(a1, hl2['weights']), hl2['bias'])
    a2 = tf.nn.relu(s2)
    
    output = tf.add(tf.matmul(a2, output_layer['weights']), output_layer['bias'])
    
    #return tf.reshape(tf.cast(tf.argmax(output,1),'float'), [batch_size, 1])
    return output

In [33]:
def train_NN_model(x,y, n_epoch):
    prediction = NN_model(x)
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits = prediction, labels=y))
    
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    
    
    for epoch in range(n_epoch):
        epoch_loss = 0
        
        # cut into batches
        i = 0
        while i < len(X_train):
            start = i
            end = i + batch_size
            batch_X = X_train[start: end]
            
            # the original y is single numeric labels, we transform into one hot 
            # but the out put of tf.one_hot is a tensor node, so we need to sess.run(batch_y)
            batch_y = \
            tf.one_hot(np.array(y_train[start: end]-1),5,on_value=1.0, off_value=0.0)
        
            _, c = sess.run([optimizer, cost],feed_dict={x: batch_X, y:sess.run(batch_y)})
            
            epoch_loss += c
            
            i += batch_size
        print  ('Epoch:', epoch, 'loss:', epoch_loss)
    
    
    correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    
    one_hot_y_test = tf.one_hot(np.array(y_test-1),5,on_value=1.0, off_value=0.0)
    print ('Accuracy', accuracy.eval(feed_dict={x: X_test, y: sess.run(one_hot_y_test)}))

In [32]:
begin = time.time()
train_NN_model(x,y,n_epoch)
print ('Total time spent:', time.time() - begin)

Epoch: 0 loss: 6776505.1095
Epoch: 1 loss: 2184694.57111
Epoch: 2 loss: 1029324.89551
Epoch: 3 loss: 497760.010345
Epoch: 4 loss: 303728.63221
Epoch: 5 loss: 232924.048691
Epoch: 6 loss: 199989.409782
Epoch: 7 loss: 182444.223709
Epoch: 8 loss: 170315.700058
Epoch: 9 loss: 162480.446579
Accuracy 0.446481
Total time spent: 8609.180506944656
